diff --git a/icu4c/source/common/ucnv.c b/icu4c/source/common/ucnv.c index 007184007f1..d347df02125 100644 --- a/icu4c/source/common/ucnv.c +++ b/icu4c/source/common/ucnv.c @@ -611,7 +611,7 @@ static void _reset(UConverter *converter, UConverterResetChoice choice, } if(choice!=UCNV_RESET_TO_UNICODE) { converter->fromUnicodeStatus = 0; - converter->fromUSurrogateLead = 0; + converter->fromUChar32 = 0; converter->invalidUCharLength = converter->charErrorBufferLength = 0; } @@ -864,7 +864,7 @@ _fromUnicodeWithCallback(UConverterFromUnicodeArgs *pArgs, UErrorCode *err) { converterSawEndOfInput= (UBool)(U_SUCCESS(*err) && pArgs->flush && pArgs->source==pArgs->sourceLimit && - cnv->fromUSurrogateLead==0); + cnv->fromUChar32==0); /* no callback called yet for this iteration */ calledCallback=FALSE; @@ -911,13 +911,11 @@ _fromUnicodeWithCallback(UConverterFromUnicodeArgs *pArgs, UErrorCode *err) { * (continue converting by breaking out of only the inner loop) */ break; - } else if(pArgs->flush && cnv->fromUSurrogateLead!=0) { + } else if(pArgs->flush && cnv->fromUChar32!=0) { /* * the entire input stream is consumed * and there is a partial, truncated input sequence left */ - cnv->invalidUCharBuffer[0]=(UChar)cnv->fromUSurrogateLead; - cnv->invalidUCharLength=1; /* inject an error and continue with callback handling */ *err=U_TRUNCATED_CHAR_FOUND; @@ -970,20 +968,15 @@ _fromUnicodeWithCallback(UConverterFromUnicodeArgs *pArgs, UErrorCode *err) { /* callback handling */ { UChar32 codePoint; - int32_t i; - /* get the first code point */ - i=0; - errorInputLength=cnv->invalidUCharLength; - if(errorInputLength>0) { - U16_NEXT(cnv->invalidUCharBuffer, i, errorInputLength, codePoint); - } else { - /* should never occur because errors should be caused by some input */ - codePoint=U_SENTINEL; - } + /* get and write the code point */ + codePoint=cnv->fromUChar32; + errorInputLength=0; + U16_APPEND_UNSAFE(cnv->invalidUCharBuffer, errorInputLength, codePoint); + cnv->invalidUCharLength=(int8_t)errorInputLength; /* set the converter state to deal with the next character */ - cnv->fromUSurrogateLead=0; + cnv->fromUChar32=0; /* call the callback function */ cnv->fromUCharErrorBehaviour(cnv->fromUContext, pArgs, diff --git a/icu4c/source/common/ucnv2022.c b/icu4c/source/common/ucnv2022.c index ecc60e32ab4..22a84a25d77 100644 --- a/icu4c/source/common/ucnv2022.c +++ b/icu4c/source/common/ucnv2022.c @@ -1279,70 +1279,6 @@ getEndOfBuffer_2022(const char** source, return sourceLimit; } -/* - * From Unicode Callback helper function - */ -static void -fromUnicodeCallback(UConverterFromUnicodeArgs* args,const UChar32 sourceChar,const UChar** pSource, - unsigned char** pTarget,int32_t** pOffsets,UConverterCallbackReason reason, UErrorCode* err){ - - /*variables for callback */ - const UChar* saveSource =NULL; - char* saveTarget =NULL; - int32_t* saveOffsets =NULL; - int currentOffset =0; - int saveIndex =0; - int32_t* offsets = *pOffsets; - const UChar* source = *pSource; - unsigned char* target = *pTarget; - - args->converter->invalidUCharLength = 0; - - if(sourceChar>0xffff){ - args->converter->invalidUCharBuffer[args->converter->invalidUCharLength++] =(uint16_t)(((sourceChar)>>10)+0xd7c0); - args->converter->invalidUCharBuffer[args->converter->invalidUCharLength++] =(uint16_t)(((sourceChar)&0x3ff)|0xdc00); - } - else{ - args->converter->invalidUCharBuffer[args->converter->invalidUCharLength++] =(UChar)sourceChar; - } - if(offsets) - currentOffset = *(offsets-1)+1; - - saveSource = args->source; - saveTarget = args->target; - saveOffsets = args->offsets; - args->target = (char*)target; - args->source = source; - args->offsets = offsets; - - /*copies current values for the ErrorFunctor to update */ - /*Calls the ErrorFunctor */ - args->converter->fromUCharErrorBehaviour ( args->converter->fromUContext, - args, - args->converter->invalidUCharBuffer, - args->converter->invalidUCharLength, - (UChar32) (sourceChar), - reason, - err); - - saveIndex = args->target - (char*)target; - if(args->offsets){ - args->offsets = saveOffsets; - while(saveIndex-->0){ - *offsets = currentOffset; - offsets++; - } - } - target = (unsigned char*)args->target; - *pTarget=target; - *pOffsets=offsets; - args->source=saveSource; - args->target=saveTarget; - args->offsets=saveOffsets; - args->converter->fromUSurrogateLead=0x00; - -} - /* * To Unicode Callback helper function */ @@ -1528,7 +1464,6 @@ UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args UChar32 sourceChar =0x0000; const char* escSeq = NULL; int len =0; /*length of escSeq chars*/ - UConverterCallbackReason reason; UConverterSharedData* sharedData=NULL; UBool useFallback; @@ -1556,7 +1491,7 @@ UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args currentType = &converterData->currentType; /* check if the last codepoint of previous buffer was a lead surrogate*/ - if(args->converter->fromUSurrogateLead!=0 && target< targetLimit) { + if((sourceChar = args->converter->fromUChar32)!=0 && target< targetLimit) { goto getTrail; } @@ -1700,17 +1635,13 @@ UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args } } else{ - /* if we cannot find the character after checking all codepages * then this is an error */ - reason = UCNV_UNASSIGNED; - *err = U_INVALID_CHAR_FOUND; /*check if the char is a First surrogate*/ if(UTF_IS_SURROGATE(sourceChar)) { if(UTF_IS_SURROGATE_FIRST(sourceChar)) { - args->converter->fromUSurrogateLead=(UChar)sourceChar; getTrail: /*look ahead to find the trail surrogate*/ if(source < sourceLimit) { @@ -1718,36 +1649,31 @@ getTrail: UChar trail=(UChar) *source; if(UTF_IS_SECOND_SURROGATE(trail)) { source++; - sourceChar=UTF16_GET_PAIR_VALUE(args->converter->fromUSurrogateLead, trail); - args->converter->fromUSurrogateLead=0x00; - reason =UCNV_UNASSIGNED; + sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail); *err = U_INVALID_CHAR_FOUND; /* convert this surrogate code point */ /* exit this condition tree */ } else { /* this is an unmatched lead code unit (1st surrogate) */ /* callback(illegal) */ - reason=UCNV_ILLEGAL; *err=U_ILLEGAL_CHAR_FOUND; } } else { /* no more input */ *err = U_ZERO_ERROR; - break; } } else { /* this is an unmatched trail code unit (2nd surrogate) */ /* callback(illegal) */ - reason=UCNV_ILLEGAL; *err=U_ILLEGAL_CHAR_FOUND; } + } else { + /* callback(unassigned) for a BMP code point */ + *err = U_INVALID_CHAR_FOUND; } - /* Call the callback function*/ - fromUnicodeCallback(args,sourceChar,&source,&target,&offsets,reason,err); - initIterState = *currentState; - if (U_FAILURE (*err)){ - break; - } + + args->converter->fromUChar32=sourceChar; + break; } } /* end if(myTargetIndexconverter->fromUnicodeStatus; - if(args->converter->fromUSurrogateLead!=0 && target converter->fromUChar32)!=0 && target converter->fromUSurrogateLead=(UChar)sourceChar; getTrail: /*look ahead to find the trail surrogate*/ if(source < sourceLimit) { @@ -2154,38 +2076,32 @@ getTrail: UChar trail=(UChar) *source; if(UTF_IS_SECOND_SURROGATE(trail)) { source++; - sourceChar=UTF16_GET_PAIR_VALUE(args->converter->fromUSurrogateLead, trail); - args->converter->fromUSurrogateLead=0x00; + sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail); *err = U_INVALID_CHAR_FOUND; - reason =UCNV_UNASSIGNED; /* convert this surrogate code point */ /* exit this condition tree */ } else { /* this is an unmatched lead code unit (1st surrogate) */ /* callback(illegal) */ - reason=UCNV_ILLEGAL; *err=U_ILLEGAL_CHAR_FOUND; } } else { /* no more input */ *err = U_ZERO_ERROR; - break; } } else { /* this is an unmatched trail code unit (2nd surrogate) */ /* callback(illegal) */ - reason=UCNV_ILLEGAL; *err=U_ILLEGAL_CHAR_FOUND; } + } else { + /* callback(unassigned) for a BMP code point */ + *err = U_INVALID_CHAR_FOUND; } - args->converter->fromUnicodeStatus = (int32_t)isTargetByteDBCS; - /* Call the callback function*/ - fromUnicodeCallback(args,sourceChar,&source,&target,&offsets,reason,err); - isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus; - if (U_FAILURE (*err)){ - break; - } + args->converter->fromUChar32=sourceChar; + args->converter->fromUnicodeStatus = (int32_t)isTargetByteDBCS; + break; } } /* end if(myTargetIndexsharedData; /* check if the last codepoint of previous buffer was a lead surrogate*/ - if(args->converter->fromUSurrogateLead!=0 && target< targetLimit) { + if((sourceChar = args->converter->fromUChar32)!=0 && target< targetLimit) { goto getTrail; } @@ -2591,7 +2506,6 @@ UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args /*check if the char is a First surrogate*/ if(UTF_IS_SURROGATE(sourceChar)) { if(UTF_IS_SURROGATE_FIRST(sourceChar)) { - args->converter->fromUSurrogateLead=(UChar)sourceChar; getTrail: /*look ahead to find the trail surrogate*/ if(source < sourceLimit) { @@ -2599,28 +2513,28 @@ getTrail: UChar trail=(UChar) *source; if(UTF_IS_SECOND_SURROGATE(trail)) { source++; - /*(((args->converter->fromUSurrogateLead)<<10L)+(trail)-((0xd800<<10L)+0xdc00-0x10000))*/ - sourceChar=UTF16_GET_PAIR_VALUE(args->converter->fromUSurrogateLead, trail); - args->converter->fromUSurrogateLead=0x00; - /* convert this surrogate code point */ + sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail); + args->converter->fromUChar32=0x00; + /* convert this supplementary code point */ /* exit this condition tree */ } else { /* this is an unmatched lead code unit (1st surrogate) */ /* callback(illegal) */ - reason=UCNV_ILLEGAL; *err=U_ILLEGAL_CHAR_FOUND; - goto callback; + args->converter->fromUChar32=sourceChar; + break; } } else { /* no more input */ + args->converter->fromUChar32=sourceChar; break; } } else { /* this is an unmatched trail code unit (2nd surrogate) */ /* callback(illegal) */ - reason=UCNV_ILLEGAL; *err=U_ILLEGAL_CHAR_FOUND; - goto callback; + args->converter->fromUChar32=sourceChar; + break; } } @@ -2755,20 +2669,12 @@ getTrail: } else{ - /* if we cannot find the character after checking all codepages * then this is an error */ - reason = UCNV_UNASSIGNED; *err = U_INVALID_CHAR_FOUND; -callback: - - fromUnicodeCallback(args,sourceChar,&source,&target,&offsets,reason,err); - initIterState = *currentState; - - if (U_FAILURE (*err)){ - break; - } + args->converter->fromUChar32=sourceChar; + break; } } /* end if(myTargetIndex0 && t0 && t0) { + if(cnv!=NULL) { + t=(char *)cnv->charErrorBuffer; + cnv->charErrorBufferLength=(int8_t)length; + do { + *t++=(uint8_t)*bytes++; + } while(--length>0); + } + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + } +} diff --git a/icu4c/source/common/ucnv_cnv.h b/icu4c/source/common/ucnv_cnv.h index 844d9a1e1fc..12d32c51572 100644 --- a/icu4c/source/common/ucnv_cnv.h +++ b/icu4c/source/common/ucnv_cnv.h @@ -229,4 +229,12 @@ ucnv_getNonSurrogateUnicodeSet(const UConverter *cnv, UConverterUnicodeSet which, UErrorCode *pErrorCode); +U_CFUNC void +ucnv_fromUWriteBytes(UConverter *cnv, + const char *bytes, int32_t length, + char **target, const char *targetLimit, + int32_t **offsets, + int32_t sourceIndex, + UErrorCode *pErrorCode); + #endif /* UCNV_CNV */ diff --git a/icu4c/source/common/ucnv_u32.c b/icu4c/source/common/ucnv_u32.c index f547e97b5f6..f845e3084f1 100644 --- a/icu4c/source/common/ucnv_u32.c +++ b/icu4c/source/common/ucnv_u32.c @@ -311,10 +311,10 @@ T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args, temp[0] = 0; - if (args->converter->fromUSurrogateLead) + if (args->converter->fromUChar32) { - ch = args->converter->fromUSurrogateLead; - args->converter->fromUSurrogateLead = 0; + ch = args->converter->fromUChar32; + args->converter->fromUChar32 = 0; goto lowsurogate; } @@ -346,7 +346,7 @@ lowsurogate: else if (!args->flush) { /* ran out of source */ - args->converter->fromUSurrogateLead = (UChar)ch; + args->converter->fromUChar32 = ch; break; } #endif @@ -396,10 +396,10 @@ T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args, temp[0] = 0; - if (args->converter->fromUSurrogateLead) + if (args->converter->fromUChar32) { - ch = args->converter->fromUSurrogateLead; - args->converter->fromUSurrogateLead = 0; + ch = args->converter->fromUChar32; + args->converter->fromUChar32 = 0; goto lowsurogate; } @@ -423,7 +423,7 @@ lowsurogate: else if (!args->flush) { /* ran out of source */ - args->converter->fromUSurrogateLead = (UChar)ch; + args->converter->fromUChar32 = ch; break; } #endif @@ -790,10 +790,10 @@ T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args, temp[3] = 0; - if (args->converter->fromUSurrogateLead) + if (args->converter->fromUChar32) { - ch = args->converter->fromUSurrogateLead; - args->converter->fromUSurrogateLead = 0; + ch = args->converter->fromUChar32; + args->converter->fromUChar32 = 0; goto lowsurogate; } @@ -817,7 +817,7 @@ lowsurogate: else if (!args->flush) { /* ran out of source */ - args->converter->fromUSurrogateLead = (UChar)ch; + args->converter->fromUChar32 = ch; break; } #endif @@ -867,10 +867,10 @@ T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args, temp[3] = 0; - if (args->converter->fromUSurrogateLead) + if (args->converter->fromUChar32) { - ch = args->converter->fromUSurrogateLead; - args->converter->fromUSurrogateLead = 0; + ch = args->converter->fromUChar32; + args->converter->fromUChar32 = 0; goto lowsurogate; } @@ -894,7 +894,7 @@ lowsurogate: else if (!args->flush) { /* ran out of source */ - args->converter->fromUSurrogateLead = (UChar)ch; + args->converter->fromUChar32 = ch; break; } #endif diff --git a/icu4c/source/common/ucnv_u8.c b/icu4c/source/common/ucnv_u8.c index c28778a045f..b7471c27596 100644 --- a/icu4c/source/common/ucnv_u8.c +++ b/icu4c/source/common/ucnv_u8.c @@ -443,14 +443,14 @@ U_CFUNC void T_UConverter_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args, const UChar *sourceLimit = args->sourceLimit; const unsigned char *targetLimit = (unsigned char *) args->targetLimit; UBool isCESU8 = (UBool)(args->converter->sharedData == &_CESU8Data); - uint32_t ch, ch2; + UChar32 ch, ch2; int16_t indexToWrite; char temp[4]; - if (cnv->fromUSurrogateLead && myTarget < targetLimit) + if (cnv->fromUChar32 && myTarget < targetLimit) { - ch = cnv->fromUSurrogateLead; - cnv->fromUSurrogateLead = 0; + ch = cnv->fromUChar32; + cnv->fromUChar32 = 0; goto lowsurrogate; } @@ -494,63 +494,21 @@ lowsurrogate: } else { /* this is an unmatched lead code unit (1st surrogate) */ /* callback(illegal) */ - ch2 = ch; + cnv->fromUChar32 = ch; + *err = U_ILLEGAL_CHAR_FOUND; + break; } } else { /* no more input */ - cnv->fromUSurrogateLead = (UChar)ch; + cnv->fromUChar32 = ch; break; } } else { /* this is an unmatched trail code unit (2nd surrogate) */ /* callback(illegal) */ - ch2 = ch; - } - - if(ch2 != 0) { - /* call the callback function with all the preparations and post-processing */ + cnv->fromUChar32 = ch; *err = U_ILLEGAL_CHAR_FOUND; - - /* update the arguments structure */ - args->source=mySource; - args->target=(char *)myTarget; - - /* write the code point as code units */ - cnv->invalidUCharBuffer[0] = (UChar)ch2; - cnv->invalidUCharLength = 1; - - /* call the callback function */ - cnv->fromUCharErrorBehaviour(cnv->fromUContext, args, cnv->invalidUCharBuffer, 1, ch2, UCNV_ILLEGAL, err); - - /* get the converter state from UConverter */ - ch = cnv->fromUSurrogateLead; - cnv->fromUSurrogateLead = 0; - - myTarget=(uint8_t *)args->target; - mySource=args->source; - - /* - * If the callback overflowed the target, then we need to - * stop here with an overflow indication. - */ - if(*err==U_BUFFER_OVERFLOW_ERROR) { - break; - } else if(U_FAILURE(*err)) { - /* break on error */ - break; - } else if(cnv->charErrorBufferLength>0) { - /* target is full */ - *err=U_BUFFER_OVERFLOW_ERROR; - break; - /* - * } else if(ch != 0) { ... - * ### TODO 2002jul01 markus: It looks like this code (from ucnvmbcs.c) - * does not handle the case where the callback leaves ch=fromUSurrogateLead!=0 . - * We would have to check myTargetsourceLimit; const unsigned char *targetLimit = (unsigned char *) args->targetLimit; UBool isCESU8 = (UBool)(args->converter->sharedData == &_CESU8Data); - uint32_t ch, ch2; + UChar32 ch, ch2; int32_t offsetNum, nextSourceIndex; int16_t indexToWrite; char temp[4]; - if (cnv->fromUSurrogateLead && myTarget < targetLimit) + if (cnv->fromUChar32 && myTarget < targetLimit) { - ch = cnv->fromUSurrogateLead; - cnv->fromUSurrogateLead = 0; + ch = cnv->fromUChar32; + cnv->fromUChar32 = 0; offsetNum = -1; nextSourceIndex = 0; goto lowsurrogate; @@ -664,69 +622,21 @@ lowsurrogate: } else { /* this is an unmatched lead code unit (1st surrogate) */ /* callback(illegal) */ - ch2 = ch; + cnv->fromUChar32 = ch; + *err = U_ILLEGAL_CHAR_FOUND; + break; } } else { /* no more input */ - cnv->fromUSurrogateLead = (UChar)ch; + cnv->fromUChar32 = ch; break; } } else { /* this is an unmatched trail code unit (2nd surrogate) */ /* callback(illegal) */ - ch2 = ch; - } - - if(ch2 != 0) { - /* call the callback function with all the preparations and post-processing */ + cnv->fromUChar32 = ch; *err = U_ILLEGAL_CHAR_FOUND; - - /* update the arguments structure */ - args->source=mySource; - args->target=(char *)myTarget; - args->offsets=myOffsets; - - /* write the code point as code units */ - cnv->invalidUCharBuffer[0] = (UChar)ch2; - cnv->invalidUCharLength = 1; - - /* call the callback function */ - cnv->fromUCharErrorBehaviour(cnv->fromUContext, args, cnv->invalidUCharBuffer, 1, ch2, UCNV_ILLEGAL, err); - - /* get the converter state from UConverter */ - ch = cnv->fromUSurrogateLead; - cnv->fromUSurrogateLead = 0; - - /* update target and deal with offsets if necessary */ - myOffsets=ucnv_updateCallbackOffsets(myOffsets, ((uint8_t *)args->target)-myTarget, offsetNum); - myTarget=(uint8_t *)args->target; - - /* update the source pointer and index */ - offsetNum=nextSourceIndex+(args->source-mySource); - mySource=args->source; - - /* - * If the callback overflowed the target, then we need to - * stop here with an overflow indication. - */ - if(*err==U_BUFFER_OVERFLOW_ERROR) { - break; - } else if(U_FAILURE(*err)) { - /* break on error */ - break; - } else if(cnv->charErrorBufferLength>0) { - /* target is full */ - *err=U_BUFFER_OVERFLOW_ERROR; - break; - /* - * } else if(ch != 0) { ... - * ### TODO 2002jul01 markus: It looks like this code (from ucnvmbcs.c) - * does not handle the case where the callback leaves ch=fromUSurrogateLead!=0 . - * We would have to check myTargetoffsets; /* get the converter state from UConverter */ - c=cnv->fromUSurrogateLead; + c=cnv->fromUChar32; prev=(int32_t)cnv->fromUnicodeStatus; if(prev==0) { prev=BOCU1_ASCII_PREV; @@ -667,7 +667,7 @@ getTrail: } /* set the converter state back into UConverter */ - cnv->fromUSurrogateLead= c<0 ? (UChar)-c : 0; + cnv->fromUChar32= c<0 ? -c : 0; cnv->fromUnicodeStatus=(uint32_t)prev; /* write back the updated pointers */ @@ -701,7 +701,7 @@ _Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs, targetCapacity=pArgs->targetLimit-pArgs->target; /* get the converter state from UConverter */ - c=cnv->fromUSurrogateLead; + c=cnv->fromUChar32; prev=(int32_t)cnv->fromUnicodeStatus; if(prev==0) { prev=BOCU1_ASCII_PREV; @@ -888,7 +888,7 @@ getTrail: } /* set the converter state back into UConverter */ - cnv->fromUSurrogateLead= c<0 ? (UChar)-c : 0; + cnv->fromUChar32= c<0 ? -c : 0; cnv->fromUnicodeStatus=(uint32_t)prev; /* write back the updated pointers */ diff --git a/icu4c/source/common/ucnvhz.c b/icu4c/source/common/ucnvhz.c index 8572607775f..669edaadc8e 100644 --- a/icu4c/source/common/ucnvhz.c +++ b/icu4c/source/common/ucnvhz.c @@ -69,7 +69,7 @@ _HZOpen(UConverter *cnv, const char *name,const char *locale,uint32_t options, U cnv->toUnicodeStatus = 0; cnv->fromUnicodeStatus= 0; cnv->mode=0; - cnv->fromUSurrogateLead=0x0000; + cnv->fromUChar32=0x0000; cnv->extraInfo = uprv_malloc (sizeof (UConverterDataHZ)); if(cnv->extraInfo != NULL){ ((UConverterDataHZ*)cnv->extraInfo)->gbConverter = ucnv_open("ibm-1386",errorCode); @@ -108,7 +108,7 @@ _HZReset(UConverter *cnv, UConverterResetChoice choice){ } if(choice!=UCNV_RESET_TO_UNICODE) { cnv->fromUnicodeStatus= 0; - cnv->fromUSurrogateLead=0x0000; + cnv->fromUChar32=0x0000; if(cnv->extraInfo != NULL){ ((UConverterDataHZ*)cnv->extraInfo)->isEscapeAppended = FALSE; ((UConverterDataHZ*)cnv->extraInfo)->targetIndex = 0; @@ -347,7 +347,6 @@ UConverter_fromUnicode_HZ_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args, UConverterDataHZ *myConverterData=(UConverterDataHZ*)args->converter->extraInfo; UBool isTargetUCharDBCS = (UBool) myConverterData->isTargetUCharDBCS; UBool oldIsTargetUCharDBCS = isTargetUCharDBCS; - UConverterCallbackReason reason; UBool isEscapeAppended =FALSE; int len =0; const char* escSeq=NULL; @@ -356,7 +355,7 @@ UConverter_fromUnicode_HZ_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args, *err = U_ILLEGAL_ARGUMENT_ERROR; return; } - if(args->converter->fromUSurrogateLead!=0 && myTargetIndex < targetLength) { + if(args->converter->fromUChar32!=0 && myTargetIndex < targetLength) { goto getTrail; } /*writing the char to the output stream */ @@ -440,16 +439,12 @@ UConverter_fromUnicode_HZ_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args, } else{ - /* oops.. the code point is unassingned - * set the error and reason - */ - reason =UCNV_UNASSIGNED; - *err =U_INVALID_CHAR_FOUND; + /* oops.. the code point is unassigned */ /*Handle surrogates */ /*check if the char is a First surrogate*/ if(UTF_IS_SURROGATE(mySourceChar)) { if(UTF_IS_SURROGATE_FIRST(mySourceChar)) { - args->converter->fromUSurrogateLead=(UChar)mySourceChar; + args->converter->fromUChar32=mySourceChar; getTrail: /*look ahead to find the trail surrogate*/ if(mySourceIndex < mySourceLength) { @@ -457,87 +452,32 @@ getTrail: UChar trail=(UChar) args->source[mySourceIndex]; if(UTF_IS_SECOND_SURROGATE(trail)) { ++mySourceIndex; - mySourceChar=UTF16_GET_PAIR_VALUE(args->converter->fromUSurrogateLead, trail); - args->converter->fromUSurrogateLead=0x00; + mySourceChar=UTF16_GET_PAIR_VALUE(args->converter->fromUChar32, trail); + args->converter->fromUChar32=0x00; /* there are no surrogates in GB2312*/ *err = U_INVALID_CHAR_FOUND; - reason=UCNV_UNASSIGNED; /* exit this condition tree */ } else { /* this is an unmatched lead code unit (1st surrogate) */ /* callback(illegal) */ - reason=UCNV_ILLEGAL; *err=U_ILLEGAL_CHAR_FOUND; } } else { /* no more input */ *err = U_ZERO_ERROR; - break; } } else { /* this is an unmatched trail code unit (2nd surrogate) */ /* callback(illegal) */ - reason=UCNV_ILLEGAL; *err=U_ILLEGAL_CHAR_FOUND; } + } else { + /* callback(unassigned) for a BMP code point */ + *err = U_INVALID_CHAR_FOUND; } - { - int32_t saveIndex=0; - int32_t currentOffset = (args->offsets) ? *(offsets-1)+1:0; - char * saveTarget = args->target; - const UChar* saveSource = args->source; - int32_t *saveOffsets = args->offsets; - - args->converter->invalidUCharLength = 0; - - if(mySourceChar>0xffff){ - args->converter->invalidUCharBuffer[args->converter->invalidUCharLength++] =(uint16_t)(((mySourceChar)>>10)+0xd7c0); - args->converter->invalidUCharBuffer[args->converter->invalidUCharLength++] =(uint16_t)(((mySourceChar)&0x3ff)|0xdc00); - } - else{ - args->converter->invalidUCharBuffer[args->converter->invalidUCharLength++] =(UChar)mySourceChar; - } - - myConverterData->isTargetUCharDBCS = (UBool)isTargetUCharDBCS; - args->target += myTargetIndex; - args->source += mySourceIndex; - args->offsets = args->offsets?offsets:0; - - - saveIndex = myTargetIndex; - /*copies current values for the ErrorFunctor to update */ - /*Calls the ErrorFunctor */ - args->converter->fromUCharErrorBehaviour ( args->converter->fromUContext, - args, - args->converter->invalidUCharBuffer, - args->converter->invalidUCharLength, - (UChar32) (mySourceChar), - reason, - err); - /*Update the local Indexes so that the conversion - *can restart at the right points - */ - myTargetIndex = (int32_t)(args->target - (char*)myTarget); - mySourceIndex = (int32_t)(args->source - mySource); - args->offsets = saveOffsets; - saveIndex = myTargetIndex - saveIndex; - if(args->offsets){ - args->offsets = saveOffsets; - while(saveIndex-->0){ - *offsets = currentOffset; - offsets++; - } - } - isTargetUCharDBCS=myConverterData->isTargetUCharDBCS; - args->source = saveSource; - args->target = saveTarget; - args->offsets = saveOffsets; - args->converter->fromUSurrogateLead=0x00; - if (U_FAILURE (*err)) - break; - - } + args->converter->fromUChar32=mySourceChar; + break; } } else{ diff --git a/icu4c/source/common/ucnvisci.c b/icu4c/source/common/ucnvisci.c index 14eb4cdb6e7..67454446859 100644 --- a/icu4c/source/common/ucnvisci.c +++ b/icu4c/source/common/ucnvisci.c @@ -116,7 +116,7 @@ typedef struct{ MaskEnum currentMaskFromUnicode; /* mask for current state in toUnicode */ MaskEnum currentMaskToUnicode; /* mask for current state in toUnicode */ MaskEnum defMaskToUnicode; /* mask for default state in toUnicode */ - UBool isFirstBuffer; + UBool isFirstBuffer; /* boolean for fromUnicode to see if we need to announce the first script */ char name[30]; }UConverterDataISCII; @@ -197,13 +197,12 @@ _ISCIIReset(UConverter *cnv, UConverterResetChoice choice){ data->contextCharToUnicode=NO_CHAR_MARKER; } if(choice!=UCNV_RESET_TO_UNICODE) { - cnv->fromUSurrogateLead=0x0000; + cnv->fromUChar32=0x0000; data->contextCharFromUnicode=0x00; data->currentMaskFromUnicode=data->defDeltaToUnicode; data->currentDeltaFromUnicode=data->defDeltaToUnicode; + data->isFirstBuffer=TRUE; } - data->isFirstBuffer=TRUE; - } /** @@ -811,7 +810,6 @@ UConverter_fromUnicode_ISCII_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args, int32_t* offsets = args->offsets; uint32_t targetByteUnit = 0x0000; UChar32 sourceChar = 0x0000; - UConverterCallbackReason reason; UBool useFallback; UConverterDataISCII *converterData; uint16_t newDelta=0; @@ -828,7 +826,7 @@ UConverter_fromUnicode_ISCII_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args, newDelta=converterData->currentDeltaFromUnicode; range = (uint16_t)(newDelta/DELTA); - if(args->converter->fromUSurrogateLead!=0 && target converter->fromUChar32)!=0) { goto getTrail; } @@ -946,16 +944,10 @@ UConverter_fromUnicode_ISCII_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args, } } else{ - /* oops.. the code point is unassingned - * set the error and reason - */ - reason =UCNV_UNASSIGNED; - *err =U_INVALID_CHAR_FOUND; - + /* oops.. the code point is unassigned */ /*check if the char is a First surrogate*/ if(UTF_IS_SURROGATE(sourceChar)) { if(UTF_IS_SURROGATE_FIRST(sourceChar)) { - args->converter->fromUSurrogateLead=(UChar)sourceChar; getTrail: /*look ahead to find the trail surrogate*/ if(source < sourceLimit) { @@ -963,94 +955,32 @@ getTrail: UChar trail= (*source); if(UTF_IS_SECOND_SURROGATE(trail)) { source++; - sourceChar=UTF16_GET_PAIR_VALUE(args->converter->fromUSurrogateLead, trail); - args->converter->fromUSurrogateLead=0x00; - reason =UCNV_UNASSIGNED; + sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail); *err =U_INVALID_CHAR_FOUND; /* convert this surrogate code point */ /* exit this condition tree */ } else { /* this is an unmatched lead code unit (1st surrogate) */ /* callback(illegal) */ - sourceChar = args->converter->fromUSurrogateLead; - reason=UCNV_ILLEGAL; *err=U_ILLEGAL_CHAR_FOUND; } } else { /* no more input */ *err = U_ZERO_ERROR; - break; } } else { /* this is an unmatched trail code unit (2nd surrogate) */ /* callback(illegal) */ - reason=UCNV_ILLEGAL; *err=U_ILLEGAL_CHAR_FOUND; } + } else { + /* callback(unassigned) for a BMP code point */ + *err = U_INVALID_CHAR_FOUND; } - { - /*variables for callback */ - const UChar* saveSource =NULL; - char* saveTarget =NULL; - int32_t* saveOffsets =NULL; - int currentOffset =0; - int32_t saveIndex =0; - args->converter->invalidUCharLength = 0; - - if(sourceChar>0xffff){ - /* we have got a surrogate pair... dissable and populate the invalidUCharBuffer */ - args->converter->invalidUCharBuffer[args->converter->invalidUCharLength++] - =(uint16_t)(((sourceChar)>>10)+0xd7c0); - args->converter->invalidUCharBuffer[args->converter->invalidUCharLength++] - =(uint16_t)(((sourceChar)&0x3ff)|0xdc00); - } - else{ - args->converter->invalidUCharBuffer[args->converter->invalidUCharLength++] - =(UChar)sourceChar; - } - - if(offsets){ - currentOffset = *(offsets-1)+1; - } - saveSource = args->source; - saveTarget = args->target; - saveOffsets = args->offsets; - args->target = (char*)target; - args->source = source; - args->offsets = offsets; - - /*copies current values for the ErrorFunctor to update */ - /*Calls the ErrorFunctor */ - args->converter->fromUCharErrorBehaviour ( args->converter->fromUContext, - args, - args->converter->invalidUCharBuffer, - args->converter->invalidUCharLength, - (UChar32) (sourceChar), - reason, - err); - - saveIndex = (int32_t)(args->target - (char*)target); - if(args->offsets){ - args->offsets = saveOffsets; - while(saveIndex-->0){ - *offsets = currentOffset; - offsets++; - } - } - target = (unsigned char*)args->target; - args->source=saveSource; - args->target=saveTarget; - args->offsets=saveOffsets; - args->converter->fromUSurrogateLead=0x00; - - if (U_FAILURE (*err)){ - break; - } - } + args->converter->fromUChar32=sourceChar; + break; } - - }/* end while(mySourceIndexfromUSurrogateLead; + cp=cnv->fromUChar32; /* sourceIndex=-1 if the current character began in the previous buffer */ sourceIndex= cp==0 ? 0 : -1; @@ -299,7 +299,7 @@ getTrail: } } else { /* no more input */ - cnv->fromUSurrogateLead=(UChar)cp; + cnv->fromUChar32=cp; break; } } else { @@ -308,14 +308,7 @@ getTrail: } *pErrorCode= U_IS_SURROGATE(cp) ? U_ILLEGAL_CHAR_FOUND : U_INVALID_CHAR_FOUND; - - /* write the code point as code units */ - { - int32_t i=0; - U16_APPEND_UNSAFE(cnv->invalidUCharBuffer, i, cp); - cnv->invalidUCharLength=(int8_t)i; - } - + cnv->fromUChar32=cp; break; } diff --git a/icu4c/source/common/ucnvmbcs.c b/icu4c/source/common/ucnvmbcs.c index 46f211e17d4..1dafcf7600a 100644 --- a/icu4c/source/common/ucnvmbcs.c +++ b/icu4c/source/common/ucnvmbcs.c @@ -315,11 +315,14 @@ static void _MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs, UErrorCode *pErrorCode); -static void -fromUCallback(UConverter *cnv, - const void *context, UConverterFromUnicodeArgs *pArgs, - UChar32 codePoint, - UConverterCallbackReason reason, UErrorCode *pErrorCode); +static UChar32 +_extFromU(UConverter *cnv, const UConverterSharedData *sharedData, + UChar32 cp, + const UChar **source, const UChar *sourceLimit, + char **target, const char *targetLimit, + int32_t **offsets, int32_t sourceIndex, + UBool useFallback, UBool flush, + UErrorCode *pErrorCode); static void toUCallback(UConverter *cnv, @@ -819,7 +822,7 @@ _MBCSOpen(UConverter *cnv, cnv->toULength=0; /* byteIndex */ /* fromUnicode */ - cnv->fromUSurrogateLead=0; + cnv->fromUChar32=0; cnv->fromUnicodeStatus=1; /* prevLength */ #endif } @@ -2141,7 +2144,6 @@ _MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, int32_t prevSourceIndex, sourceIndex, nextSourceIndex; - UConverterCallbackReason reason; uint32_t stage2Entry; uint32_t value; int32_t length, prevLength; @@ -2178,7 +2180,7 @@ _MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, } /* get the converter state from UConverter */ - c=cnv->fromUSurrogateLead; + c=cnv->fromUChar32; prevLength=cnv->fromUnicodeStatus; /* sourceIndex=-1 if the current character began in the previous buffer */ @@ -2246,9 +2248,8 @@ getTrail: } else { /* this is an unmatched lead code unit (1st surrogate) */ /* callback(illegal) */ - reason=UCNV_ILLEGAL; *pErrorCode=U_ILLEGAL_CHAR_FOUND; - goto callback; + break; } } else { /* no more input */ @@ -2257,9 +2258,8 @@ getTrail: } else { /* this is an unmatched trail code unit (2nd surrogate) */ /* callback(illegal) */ - reason=UCNV_ILLEGAL; *pErrorCode=U_ILLEGAL_CHAR_FOUND; - goto callback; + break; } } @@ -2422,8 +2422,32 @@ getTrail: * There is no way with this data structure for fallback output * for other than U+0000 to be a zero byte. */ - /* callback(unassigned) */ - goto unassigned; + +unassigned: + /* try an extension mapping */ + pArgs->source=source; + c=_extFromU(cnv, cnv->sharedData, + c, &source, sourceLimit, + (char **)&target, (char *)target+targetCapacity, + &offsets, sourceIndex, + (UBool)UCNV_FROM_U_USE_FALLBACK(cnv, c), pArgs->flush, + pErrorCode); + nextSourceIndex+=(int32_t)(source-pArgs->source); + prevLength=cnv->fromUnicodeStatus; /* restore SISO state */ + + if(U_FAILURE(*pErrorCode)) { + /* not mappable or buffer overflow */ + break; + } else { + /* a mapping was written to the target, continue */ + + /* normal end of conversion: prepare for a new character */ + if(offsets!=NULL) { + prevSourceIndex=sourceIndex; + sourceIndex=nextSourceIndex; + } + continue; + } } /* write the output character bytes from value and length */ @@ -2529,69 +2553,6 @@ getTrail: sourceIndex=nextSourceIndex; } continue; - - /* - * This is the same ugly trick as in ToUnicode(), for the - * same reasons... - */ -unassigned: - reason=UCNV_UNASSIGNED; - *pErrorCode=U_INVALID_CHAR_FOUND; -callback: - /* call the callback function with all the preparations and post-processing */ - /* update the arguments structure */ - pArgs->source=source; - pArgs->target=(char *)target; - pArgs->offsets=offsets; - - /* set the converter state in UConverter to deal with the next character */ - cnv->fromUSurrogateLead=0; - /* - * Do not save the prevLength SISO state because prevLength is set for - * the character that is now not output because it is unassigned or it is - * a fallback that is not taken. - * The above branch for MBCS_OUTPUT_2_SISO has saved the previous state already. - * See comments there. - */ - prevSourceIndex=sourceIndex; - - /* call the callback function */ - fromUCallback(cnv, cnv->fromUContext, pArgs, c, reason, pErrorCode); - - /* get the converter state from UConverter */ - c=cnv->fromUSurrogateLead; - prevLength=cnv->fromUnicodeStatus; - - /* update target and deal with offsets if necessary */ - offsets=ucnv_updateCallbackOffsets(offsets, ((uint8_t *)pArgs->target)-target, sourceIndex); - target=(uint8_t *)pArgs->target; - - /* update the source pointer and index */ - sourceIndex=nextSourceIndex+(pArgs->source-source); - source=pArgs->source; - targetCapacity=(uint8_t *)pArgs->targetLimit-target; - - /* - * If the callback overflowed the target, then we need to - * stop here with an overflow indication. - */ - if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) { - break; - } else if(U_FAILURE(*pErrorCode)) { - /* break on error */ - c=0; - break; - } else if(cnv->charErrorBufferLength>0) { - /* target is full */ - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - break; - } - - /* - * We do not need to repeat the statements from the normal - * end of the conversion because we already updated all the - * necessary variables. - */ } else { /* target is full */ *pErrorCode=U_BUFFER_OVERFLOW_ERROR; @@ -2630,7 +2591,7 @@ callback: } /* set the converter state back into UConverter */ - cnv->fromUSurrogateLead=(UChar)c; + cnv->fromUChar32=c; cnv->fromUnicodeStatus=prevLength; /* write back the updated pointers */ @@ -2656,7 +2617,6 @@ _MBCSDoubleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, int32_t sourceIndex, nextSourceIndex; - UConverterCallbackReason reason; uint32_t stage2Entry; uint32_t value; int32_t length, prevLength; @@ -2681,7 +2641,7 @@ _MBCSDoubleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, } /* get the converter state from UConverter */ - c=cnv->fromUSurrogateLead; + c=cnv->fromUChar32; prevLength=cnv->fromUnicodeStatus; /* sourceIndex=-1 if the current character began in the previous buffer */ @@ -2735,9 +2695,8 @@ getTrail: } else { /* this is an unmatched lead code unit (1st surrogate) */ /* callback(illegal) */ - reason=UCNV_ILLEGAL; *pErrorCode=U_ILLEGAL_CHAR_FOUND; - goto callback; + break; } } else { /* no more input */ @@ -2746,9 +2705,8 @@ getTrail: } else { /* this is an unmatched trail code unit (2nd surrogate) */ /* callback(illegal) */ - reason=UCNV_ILLEGAL; *pErrorCode=U_ILLEGAL_CHAR_FOUND; - goto callback; + break; } } @@ -2774,8 +2732,28 @@ getTrail: * There is no way with this data structure for fallback output * for other than U+0000 to be a zero byte. */ - /* callback(unassigned) */ - goto unassigned; + +unassigned: + /* try an extension mapping */ + pArgs->source=source; + c=_extFromU(cnv, cnv->sharedData, + c, &source, sourceLimit, + (char **)&target, (char *)target+targetCapacity, + &offsets, sourceIndex, + (UBool)UCNV_FROM_U_USE_FALLBACK(cnv, c), pArgs->flush, + pErrorCode); + nextSourceIndex+=(int32_t)(source-pArgs->source); + + if(U_FAILURE(*pErrorCode)) { + /* not mappable or buffer overflow */ + break; + } else { + /* a mapping was written to the target, continue */ + + /* normal end of conversion: prepare for a new character */ + sourceIndex=nextSourceIndex; + continue; + } } /* write the output character bytes from value and length */ @@ -2815,62 +2793,6 @@ getTrail: c=0; sourceIndex=nextSourceIndex; continue; - - /* - * This is the same ugly trick as in ToUnicode(), for the - * same reasons... - */ -unassigned: - reason=UCNV_UNASSIGNED; - *pErrorCode=U_INVALID_CHAR_FOUND; -callback: - /* call the callback function with all the preparations and post-processing */ - /* update the arguments structure */ - pArgs->source=source; - pArgs->target=(char *)target; - pArgs->offsets=offsets; - - /* set the converter state in UConverter to deal with the next character */ - cnv->fromUSurrogateLead=0; - cnv->fromUnicodeStatus=prevLength; - - /* call the callback function */ - fromUCallback(cnv, cnv->fromUContext, pArgs, c, reason, pErrorCode); - - /* get the converter state from UConverter */ - c=cnv->fromUSurrogateLead; - prevLength=cnv->fromUnicodeStatus; - - /* update target and deal with offsets if necessary */ - offsets=ucnv_updateCallbackOffsets(offsets, ((uint8_t *)pArgs->target)-target, sourceIndex); - target=(uint8_t *)pArgs->target; - - /* update the source pointer and index */ - sourceIndex=nextSourceIndex+(pArgs->source-source); - source=pArgs->source; - targetCapacity=(uint8_t *)pArgs->targetLimit-target; - - /* - * If the callback overflowed the target, then we need to - * stop here with an overflow indication. - */ - if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) { - break; - } else if(U_FAILURE(*pErrorCode)) { - /* break on error */ - c=0; - break; - } else if(cnv->charErrorBufferLength>0) { - /* target is full */ - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - break; - } - - /* - * We do not need to repeat the statements from the normal - * end of the conversion because we already updated all the - * necessary variables. - */ } else { /* target is full */ *pErrorCode=U_BUFFER_OVERFLOW_ERROR; @@ -2879,7 +2801,7 @@ callback: } /* set the converter state back into UConverter */ - cnv->fromUSurrogateLead=(UChar)c; + cnv->fromUChar32=c; cnv->fromUnicodeStatus=prevLength; /* write back the updated pointers */ @@ -2905,7 +2827,6 @@ _MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, int32_t sourceIndex, nextSourceIndex; - UConverterCallbackReason reason; uint16_t value, minValue; UBool hasSupplementary; @@ -2934,7 +2855,7 @@ _MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, hasSupplementary=(UBool)(cnv->sharedData->table->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY); /* get the converter state from UConverter */ - c=cnv->fromUSurrogateLead; + c=cnv->fromUChar32; /* sourceIndex=-1 if the current character began in the previous buffer */ sourceIndex= c==0 ? 0 : -1; @@ -2982,9 +2903,8 @@ getTrail: } else { /* this is an unmatched lead code unit (1st surrogate) */ /* callback(illegal) */ - reason=UCNV_ILLEGAL; *pErrorCode=U_ILLEGAL_CHAR_FOUND; - goto callback; + break; } } else { /* no more input */ @@ -2993,9 +2913,8 @@ getTrail: } else { /* this is an unmatched trail code unit (2nd surrogate) */ /* callback(illegal) */ - reason=UCNV_ILLEGAL; *pErrorCode=U_ILLEGAL_CHAR_FOUND; - goto callback; + break; } } @@ -3016,65 +2935,28 @@ getTrail: /* normal end of conversion: prepare for a new character */ c=0; sourceIndex=nextSourceIndex; - continue; } else { /* unassigned */ - /* - * We allow a 0 byte output if the Unicode code point is - * U+0000 and also if the "assigned" bit is set for this entry. - * There is no way with this data structure for fallback output - * for other than U+0000 to be a zero byte. - */ - /* callback(unassigned) */ - } unassigned: - reason=UCNV_UNASSIGNED; - *pErrorCode=U_INVALID_CHAR_FOUND; -callback: - /* call the callback function with all the preparations and post-processing */ - /* update the arguments structure */ - pArgs->source=source; - pArgs->target=(char *)target; - pArgs->offsets=offsets; + /* try an extension mapping */ + pArgs->source=source; + c=_extFromU(cnv, cnv->sharedData, + c, &source, sourceLimit, + (char **)&target, (char *)target+targetCapacity, + &offsets, sourceIndex, + (UBool)UCNV_FROM_U_USE_FALLBACK(cnv, c), pArgs->flush, + pErrorCode); + nextSourceIndex+=(int32_t)(source-pArgs->source); - /* set the converter state in UConverter to deal with the next character */ - cnv->fromUSurrogateLead=0; + if(U_FAILURE(*pErrorCode)) { + /* not mappable or buffer overflow */ + break; + } else { + /* a mapping was written to the target, continue */ - /* call the callback function */ - fromUCallback(cnv, cnv->fromUContext, pArgs, c, reason, pErrorCode); - - /* get the converter state from UConverter */ - c=cnv->fromUSurrogateLead; - - /* update target and deal with offsets if necessary */ - offsets=ucnv_updateCallbackOffsets(offsets, ((uint8_t *)pArgs->target)-target, sourceIndex); - target=(uint8_t *)pArgs->target; - - /* update the source pointer and index */ - sourceIndex=nextSourceIndex+(pArgs->source-source); - source=pArgs->source; - targetCapacity=(uint8_t *)pArgs->targetLimit-target; - - /* - * If the callback overflowed the target, then we need to - * stop here with an overflow indication. - */ - if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) { - break; - } else if(U_FAILURE(*pErrorCode)) { - /* break on error */ - c=0; - break; - } else if(cnv->charErrorBufferLength>0) { - /* target is full */ - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - break; + /* normal end of conversion: prepare for a new character */ + sourceIndex=nextSourceIndex; + } } - - /* - * We do not need to repeat the statements from the normal - * end of the conversion because we already updated all the - * necessary variables. - */ } else { /* target is full */ *pErrorCode=U_BUFFER_OVERFLOW_ERROR; @@ -3083,7 +2965,7 @@ callback: } /* set the converter state back into UConverter */ - cnv->fromUSurrogateLead=(UChar)c; + cnv->fromUChar32=c; /* write back the updated pointers */ pArgs->source=source; @@ -3113,7 +2995,6 @@ _MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs, int32_t sourceIndex; - UConverterCallbackReason reason; uint16_t value, minValue; /* set up the local pointers */ @@ -3140,7 +3021,7 @@ _MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs, } /* get the converter state from UConverter */ - c=cnv->fromUSurrogateLead; + c=cnv->fromUChar32; /* sourceIndex=-1 if the current character began in the previous buffer */ sourceIndex= c==0 ? 0 : -1; @@ -3237,15 +3118,6 @@ unrolled: continue; } else if(!UTF_IS_SURROGATE(c)) { /* normal, unassigned BMP character */ - /* - * We allow a 0 byte output if the Unicode code point is - * U+0000 and also if the "assigned" bit is set for this entry. - * There is no way with this data structure for fallback output - * for other than U+0000 to be a zero byte. - */ - /* callback(unassigned) */ - reason=UCNV_UNASSIGNED; - *pErrorCode=U_INVALID_CHAR_FOUND; } else if(UTF_IS_SURROGATE_FIRST(c)) { getTrail: if(source0) { *offsets++=sourceIndex++; --count; } - /* offset and sourceIndex are now set for the current character */ + /* offsets and sourceIndex are now set for the current character */ } - /* update the arguments structure */ - pArgs->source=source; - pArgs->target=(char *)target; - pArgs->offsets=offsets; + /* try an extension mapping */ + lastSource=source; + c=_extFromU(cnv, cnv->sharedData, + c, &source, sourceLimit, + (char **)&target, (char *)target+targetCapacity, + &offsets, sourceIndex, + (UBool)UCNV_FROM_U_USE_FALLBACK(cnv, c), pArgs->flush, + pErrorCode); + sourceIndex+=length+(int32_t)(source-lastSource); + lastSource=source; - /* set the converter state in UConverter to deal with the next character */ - cnv->fromUSurrogateLead=0; - - /* call the callback function */ - fromUCallback(cnv, cnv->fromUContext, pArgs, c, reason, pErrorCode); - - /* get the converter state from UConverter */ - c=cnv->fromUSurrogateLead; - - /* update target and deal with offsets if necessary */ - offsets=ucnv_updateCallbackOffsets(offsets, ((uint8_t *)pArgs->target)-target, sourceIndex); - target=(uint8_t *)pArgs->target; - - /* update the source pointer and index */ - sourceIndex+=length+(pArgs->source-source); - source=lastSource=pArgs->source; - targetCapacity=(uint8_t *)pArgs->targetLimit-target; - length=sourceLimit-source; - if(lengthcharErrorBufferLength>0) { - /* target is full */ - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + if(U_FAILURE(*pErrorCode)) { + /* not mappable or buffer overflow */ break; + } else { + /* a mapping was written to the target, continue */ } #if MBCS_UNROLL_SINGLE_FROM_BMP @@ -3357,7 +3203,7 @@ getTrail: } /* set the converter state back into UConverter */ - cnv->fromUSurrogateLead=(UChar)c; + cnv->fromUChar32=c; /* write back the updated pointers */ pArgs->source=source; @@ -3672,35 +3518,53 @@ const UConverterSharedData _MBCSData={ 0 }; -/* GB 18030 special handling ------------------------------------------------ */ +/* conversion extensions for input not in the main table -------------------- */ -/* definition of LINEAR macros and gb18030Ranges see near the beginning of the file */ +/* + * Hardcoded extension handling for GB 18030. + * Definition of LINEAR macros and gb18030Ranges see near the beginning of the file. + * + * In the future, conversion extensions may handle m:n mappings and delta tables, + * see http://oss.software.ibm.com/cvs/icu/~checkout~/icuhtml/design/conversion/conversion_extensions.html + * + * If an input character cannot be mapped, then these functions set an error + * code. The framework will then call the callback function. + */ -/* the callback functions handle GB 18030 specially */ -static void -fromUCallback(UConverter *cnv, - const void *context, UConverterFromUnicodeArgs *pArgs, - UChar32 codePoint, - UConverterCallbackReason reason, UErrorCode *pErrorCode) { - int32_t i; +/* + * TODO when implementing real extensions, review whether the useFallback parameter + * should get cnv->useFallback or the full resolution considering cp as well + */ - if((cnv->options&_MBCS_OPTION_GB18030)!=0 && reason==UCNV_UNASSIGNED) { +/* + * @return if(U_FAILURE) return the code point for cnv->fromUChar32 + * else return 0 after output has been written to the target + */ +static UChar32 +_extFromU(UConverter *cnv, const UConverterSharedData *sharedData, + UChar32 cp, + const UChar **source, const UChar *sourceLimit, + char **target, const char *targetLimit, + int32_t **offsets, int32_t sourceIndex, + UBool useFallback, UBool flush, + UErrorCode *pErrorCode) { + /* GB 18030 */ + if(cnv!=NULL && (cnv->options&_MBCS_OPTION_GB18030)!=0) { const uint32_t *range; + int32_t i; range=gb18030Ranges[0]; for(i=0; iinvalidUCharBuffer, i, codePoint); - cnv->invalidUCharLength=(int8_t)i; - - /* call the normal callback function */ - cnv->fromUCharErrorBehaviour(context, pArgs, cnv->invalidUCharBuffer, i, codePoint, reason, pErrorCode); + /* no mapping */ + *pErrorCode=U_INVALID_CHAR_FOUND; + return cp; } +/* GB 18030 special handling ------------------------------------------------ */ + static void toUCallback(UConverter *cnv, const void *context, UConverterToUnicodeArgs *pArgs, diff --git a/icu4c/source/common/ucnvscsu.c b/icu4c/source/common/ucnvscsu.c index ee526352586..6f8bf9d307f 100644 --- a/icu4c/source/common/ucnvscsu.c +++ b/icu4c/source/common/ucnvscsu.c @@ -181,7 +181,7 @@ _SCSUReset(UConverter *cnv, UConverterResetChoice choice) { break; } - cnv->fromUSurrogateLead=0; + cnv->fromUChar32=0; } } @@ -216,8 +216,6 @@ _SCSUClose(UConverter *cnv) { /* SCSU-to-Unicode conversion functions ------------------------------------- */ -/* ### TODO check operator precedence | << + < */ - static void _SCSUToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, UErrorCode *pErrorCode) { @@ -1059,7 +1057,7 @@ _SCSUFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, dynamicWindow=scsu->fromUDynamicWindow; currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; - c=cnv->fromUSurrogateLead; + c=cnv->fromUChar32; /* sourceIndex=-1 if the current character began in the previous buffer */ sourceIndex= c==0 ? 0 : -1; @@ -1386,18 +1384,11 @@ getTrailUnicode: } endloop: - if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) { - /* c is an unpaired surrogate */ - cnv->invalidUCharBuffer[0]=(UChar)c; - cnv->invalidUCharLength=1; - c=0; - } - /* set the converter state back into UConverter */ scsu->fromUIsSingleByteMode=isSingleByteMode; scsu->fromUDynamicWindow=dynamicWindow; - cnv->fromUSurrogateLead=(UChar)c; + cnv->fromUChar32=c; /* write back the updated pointers */ pArgs->source=source; @@ -1553,7 +1544,7 @@ _SCSUFromUnicode(UConverterFromUnicodeArgs *pArgs, dynamicWindow=scsu->fromUDynamicWindow; currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; - c=cnv->fromUSurrogateLead; + c=cnv->fromUChar32; /* similar conversion "loop" as in toUnicode */ loop: @@ -1851,18 +1842,11 @@ getTrailUnicode: } endloop: - if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) { - /* c is an unpaired surrogate */ - cnv->invalidUCharBuffer[0]=(UChar)c; - cnv->invalidUCharLength=1; - c=0; - } - /* set the converter state back into UConverter */ scsu->fromUIsSingleByteMode=isSingleByteMode; scsu->fromUDynamicWindow=dynamicWindow; - cnv->fromUSurrogateLead=(UChar)c; + cnv->fromUChar32=c; /* write back the updated pointers */ pArgs->source=source;