ICU-2449 refactor conversion - call fromUnicode callbacks only from ucnv.c framework

X-SVN-Rev: 12667
This commit is contained in:
Markus Scherer 2003-07-24 00:28:47 +00:00
parent cab6c351eb
commit 8ab9f23f3a
13 changed files with 319 additions and 741 deletions

View file

@ -611,7 +611,7 @@ static void _reset(UConverter *converter, UConverterResetChoice choice,
}
if(choice!=UCNV_RESET_TO_UNICODE) {
converter->fromUnicodeStatus = 0;
converter->fromUSurrogateLead = 0;
converter->fromUChar32 = 0;
converter->invalidUCharLength = converter->charErrorBufferLength = 0;
}
@ -864,7 +864,7 @@ _fromUnicodeWithCallback(UConverterFromUnicodeArgs *pArgs, UErrorCode *err) {
converterSawEndOfInput=
(UBool)(U_SUCCESS(*err) &&
pArgs->flush && pArgs->source==pArgs->sourceLimit &&
cnv->fromUSurrogateLead==0);
cnv->fromUChar32==0);
/* no callback called yet for this iteration */
calledCallback=FALSE;
@ -911,13 +911,11 @@ _fromUnicodeWithCallback(UConverterFromUnicodeArgs *pArgs, UErrorCode *err) {
* (continue converting by breaking out of only the inner loop)
*/
break;
} else if(pArgs->flush && cnv->fromUSurrogateLead!=0) {
} else if(pArgs->flush && cnv->fromUChar32!=0) {
/*
* the entire input stream is consumed
* and there is a partial, truncated input sequence left
*/
cnv->invalidUCharBuffer[0]=(UChar)cnv->fromUSurrogateLead;
cnv->invalidUCharLength=1;
/* inject an error and continue with callback handling */
*err=U_TRUNCATED_CHAR_FOUND;
@ -970,20 +968,15 @@ _fromUnicodeWithCallback(UConverterFromUnicodeArgs *pArgs, UErrorCode *err) {
/* callback handling */
{
UChar32 codePoint;
int32_t i;
/* get the first code point */
i=0;
errorInputLength=cnv->invalidUCharLength;
if(errorInputLength>0) {
U16_NEXT(cnv->invalidUCharBuffer, i, errorInputLength, codePoint);
} else {
/* should never occur because errors should be caused by some input */
codePoint=U_SENTINEL;
}
/* get and write the code point */
codePoint=cnv->fromUChar32;
errorInputLength=0;
U16_APPEND_UNSAFE(cnv->invalidUCharBuffer, errorInputLength, codePoint);
cnv->invalidUCharLength=(int8_t)errorInputLength;
/* set the converter state to deal with the next character */
cnv->fromUSurrogateLead=0;
cnv->fromUChar32=0;
/* call the callback function */
cnv->fromUCharErrorBehaviour(cnv->fromUContext, pArgs,

View file

@ -1279,70 +1279,6 @@ getEndOfBuffer_2022(const char** source,
return sourceLimit;
}
/*
 * From Unicode Callback helper function
 *
 * Invokes the converter's fromUnicode error callback for one code point on
 * behalf of a conversion loop that works on local source/target/offsets
 * pointers instead of the fields in args.
 *
 * args       - conversion arguments; source, target and offsets are pointed
 *              at the caller's local positions for the duration of the
 *              callback and restored to their previous values afterwards
 * sourceChar - the code point handed to the callback; it is also written
 *              as UTF-16 into the converter's invalidUCharBuffer
 * pSource    - in: the caller's current source position (read only here;
 *              it is not written back)
 * pTarget    - in/out: the caller's current target position; on return it
 *              is set to wherever the callback left args->target
 * pOffsets   - in/out: the caller's current offsets position, or a pointer
 *              to NULL if offsets are not tracked
 * reason     - callback reason passed through unchanged
 * err        - error code passed to, and possibly modified by, the callback
 */
static void
fromUnicodeCallback(UConverterFromUnicodeArgs* args,const UChar32 sourceChar,const UChar** pSource,
unsigned char** pTarget,int32_t** pOffsets,UConverterCallbackReason reason, UErrorCode* err){
/*variables for callback */
const UChar* saveSource =NULL;
char* saveTarget =NULL;
int32_t* saveOffsets =NULL;
int currentOffset =0;
int saveIndex =0;
int32_t* offsets = *pOffsets;
const UChar* source = *pSource;
unsigned char* target = *pTarget;
/* rebuild the invalid-input buffer from scratch for this code point */
args->converter->invalidUCharLength = 0;
if(sourceChar>0xffff){
/* supplementary code point: store it as a lead/trail surrogate pair */
args->converter->invalidUCharBuffer[args->converter->invalidUCharLength++] =(uint16_t)(((sourceChar)>>10)+0xd7c0);
args->converter->invalidUCharBuffer[args->converter->invalidUCharLength++] =(uint16_t)(((sourceChar)&0x3ff)|0xdc00);
}
else{
args->converter->invalidUCharBuffer[args->converter->invalidUCharLength++] =(UChar)sourceChar;
}
/*
 * The offset reported for the callback output is one more than the
 * previously written offset.
 * NOTE(review): this reads the entry just before the current offsets
 * position, so it assumes at least one offset has already been written
 * to the array - confirm against the callers.
 */
if(offsets)
currentOffset = *(offsets-1)+1;
/* remember the caller-visible argument fields so they can be restored below */
saveSource = args->source;
saveTarget = args->target;
saveOffsets = args->offsets;
/* let the callback see (and advance) the conversion loop's current positions */
args->target = (char*)target;
args->source = source;
args->offsets = offsets;
/*copies current values for the ErrorFunctor to update */
/*Calls the ErrorFunctor */
args->converter->fromUCharErrorBehaviour ( args->converter->fromUContext,
args,
args->converter->invalidUCharBuffer,
args->converter->invalidUCharLength,
(UChar32) (sourceChar),
reason,
err);
/* number of bytes the callback wrote to the target */
saveIndex = args->target - (char*)target;
if(args->offsets){
args->offsets = saveOffsets;
/* write one offset entry (all the same value) per byte the callback produced */
while(saveIndex-->0){
*offsets = currentOffset;
offsets++;
}
}
/* hand the advanced target and offsets positions back to the caller */
target = (unsigned char*)args->target;
*pTarget=target;
*pOffsets=offsets;
/* restore the argument fields; any change the callback made to args->source is discarded */
args->source=saveSource;
args->target=saveTarget;
args->offsets=saveOffsets;
/* the pending lead surrogate, if any, has been dealt with */
args->converter->fromUSurrogateLead=0x00;
}
/*
* To Unicode Callback helper function
*/
@ -1528,7 +1464,6 @@ UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args
UChar32 sourceChar =0x0000;
const char* escSeq = NULL;
int len =0; /*length of escSeq chars*/
UConverterCallbackReason reason;
UConverterSharedData* sharedData=NULL;
UBool useFallback;
@ -1556,7 +1491,7 @@ UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args
currentType = &converterData->currentType;
/* check if the last codepoint of previous buffer was a lead surrogate*/
if(args->converter->fromUSurrogateLead!=0 && target< targetLimit) {
if((sourceChar = args->converter->fromUChar32)!=0 && target< targetLimit) {
goto getTrail;
}
@ -1700,17 +1635,13 @@ UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args
}
}
else{
/* if we cannot find the character after checking all codepages
* then this is an error
*/
reason = UCNV_UNASSIGNED;
*err = U_INVALID_CHAR_FOUND;
/*check if the char is a First surrogate*/
if(UTF_IS_SURROGATE(sourceChar)) {
if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
args->converter->fromUSurrogateLead=(UChar)sourceChar;
getTrail:
/*look ahead to find the trail surrogate*/
if(source < sourceLimit) {
@ -1718,36 +1649,31 @@ getTrail:
UChar trail=(UChar) *source;
if(UTF_IS_SECOND_SURROGATE(trail)) {
source++;
sourceChar=UTF16_GET_PAIR_VALUE(args->converter->fromUSurrogateLead, trail);
args->converter->fromUSurrogateLead=0x00;
reason =UCNV_UNASSIGNED;
sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
*err = U_INVALID_CHAR_FOUND;
/* convert this surrogate code point */
/* exit this condition tree */
} else {
/* this is an unmatched lead code unit (1st surrogate) */
/* callback(illegal) */
reason=UCNV_ILLEGAL;
*err=U_ILLEGAL_CHAR_FOUND;
}
} else {
/* no more input */
*err = U_ZERO_ERROR;
break;
}
} else {
/* this is an unmatched trail code unit (2nd surrogate) */
/* callback(illegal) */
reason=UCNV_ILLEGAL;
*err=U_ILLEGAL_CHAR_FOUND;
}
} else {
/* callback(unassigned) for a BMP code point */
*err = U_INVALID_CHAR_FOUND;
}
/* Call the callback function*/
fromUnicodeCallback(args,sourceChar,&source,&target,&offsets,reason,err);
initIterState = *currentState;
if (U_FAILURE (*err)){
break;
}
args->converter->fromUChar32=sourceChar;
break;
}
} /* end if(myTargetIndex<myTargetLength) */
else{
@ -2045,7 +1971,6 @@ UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args
UBool isTargetByteDBCS;
UBool oldIsTargetByteDBCS;
UConverterDataISO2022 *converterData;
UConverterCallbackReason reason;
UConverterSharedData* sharedData;
UBool useFallback;
int32_t length =0;
@ -2070,7 +1995,7 @@ UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args
}
isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus;
if(args->converter->fromUSurrogateLead!=0 && target <targetLimit) {
if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
goto getTrail;
}
while(source < sourceLimit){
@ -2140,13 +2065,10 @@ UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args
/* oops.. the code point is unassigned
* set the error and reason
*/
reason =UCNV_UNASSIGNED;
*err =U_INVALID_CHAR_FOUND;
/*check if the char is a First surrogate*/
if(UTF_IS_SURROGATE(sourceChar)) {
if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
args->converter->fromUSurrogateLead=(UChar)sourceChar;
getTrail:
/*look ahead to find the trail surrogate*/
if(source < sourceLimit) {
@ -2154,38 +2076,32 @@ getTrail:
UChar trail=(UChar) *source;
if(UTF_IS_SECOND_SURROGATE(trail)) {
source++;
sourceChar=UTF16_GET_PAIR_VALUE(args->converter->fromUSurrogateLead, trail);
args->converter->fromUSurrogateLead=0x00;
sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
*err = U_INVALID_CHAR_FOUND;
reason =UCNV_UNASSIGNED;
/* convert this surrogate code point */
/* exit this condition tree */
} else {
/* this is an unmatched lead code unit (1st surrogate) */
/* callback(illegal) */
reason=UCNV_ILLEGAL;
*err=U_ILLEGAL_CHAR_FOUND;
}
} else {
/* no more input */
*err = U_ZERO_ERROR;
break;
}
} else {
/* this is an unmatched trail code unit (2nd surrogate) */
/* callback(illegal) */
reason=UCNV_ILLEGAL;
*err=U_ILLEGAL_CHAR_FOUND;
}
} else {
/* callback(unassigned) for a BMP code point */
*err = U_INVALID_CHAR_FOUND;
}
args->converter->fromUnicodeStatus = (int32_t)isTargetByteDBCS;
/* Call the callback function*/
fromUnicodeCallback(args,sourceChar,&source,&target,&offsets,reason,err);
isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
if (U_FAILURE (*err)){
break;
}
args->converter->fromUChar32=sourceChar;
args->converter->fromUnicodeStatus = (int32_t)isTargetByteDBCS;
break;
}
} /* end if(myTargetIndex<myTargetLength) */
else{
@ -2542,7 +2458,6 @@ UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args
int len =0; /*length of escSeq chars*/
uint32_t targetValue=0;
uint8_t planeVal=0;
UConverterCallbackReason reason;
UConverterSharedData* sharedData=NULL;
UBool useFallback;
@ -2575,7 +2490,7 @@ UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args
sharedData = (*currentConverter)->sharedData;
/* check if the last codepoint of previous buffer was a lead surrogate*/
if(args->converter->fromUSurrogateLead!=0 && target< targetLimit) {
if((sourceChar = args->converter->fromUChar32)!=0 && target< targetLimit) {
goto getTrail;
}
@ -2591,7 +2506,6 @@ UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args
/*check if the char is a First surrogate*/
if(UTF_IS_SURROGATE(sourceChar)) {
if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
args->converter->fromUSurrogateLead=(UChar)sourceChar;
getTrail:
/*look ahead to find the trail surrogate*/
if(source < sourceLimit) {
@ -2599,28 +2513,28 @@ getTrail:
UChar trail=(UChar) *source;
if(UTF_IS_SECOND_SURROGATE(trail)) {
source++;
/*(((args->converter->fromUSurrogateLead)<<10L)+(trail)-((0xd800<<10L)+0xdc00-0x10000))*/
sourceChar=UTF16_GET_PAIR_VALUE(args->converter->fromUSurrogateLead, trail);
args->converter->fromUSurrogateLead=0x00;
/* convert this surrogate code point */
sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
args->converter->fromUChar32=0x00;
/* convert this supplementary code point */
/* exit this condition tree */
} else {
/* this is an unmatched lead code unit (1st surrogate) */
/* callback(illegal) */
reason=UCNV_ILLEGAL;
*err=U_ILLEGAL_CHAR_FOUND;
goto callback;
args->converter->fromUChar32=sourceChar;
break;
}
} else {
/* no more input */
args->converter->fromUChar32=sourceChar;
break;
}
} else {
/* this is an unmatched trail code unit (2nd surrogate) */
/* callback(illegal) */
reason=UCNV_ILLEGAL;
*err=U_ILLEGAL_CHAR_FOUND;
goto callback;
args->converter->fromUChar32=sourceChar;
break;
}
}
@ -2755,20 +2669,12 @@ getTrail:
}
else{
/* if we cannot find the character after checking all codepages
* then this is an error
*/
reason = UCNV_UNASSIGNED;
*err = U_INVALID_CHAR_FOUND;
callback:
fromUnicodeCallback(args,sourceChar,&source,&target,&offsets,reason,err);
initIterState = *currentState;
if (U_FAILURE (*err)){
break;
}
args->converter->fromUChar32=sourceChar;
break;
}
} /* end if(myTargetIndex<myTargetLength) */
else{

View file

@ -148,7 +148,17 @@ struct UConverter {
uint32_t toUnicodeStatus; /* Used to internalize stream status information */
int32_t mode;
uint32_t fromUnicodeStatus;
UChar fromUSurrogateLead; /* similar to toUBytes; keeps the lead surrogate of the current character */
/*
* More fromUnicode() status. Serves 3 purposes:
* - keeps a lead surrogate between buffers (similar to toUBytes[])
* - keeps a lead surrogate at the end of the stream,
* which the framework handles as truncated input
* - if the fromUnicode() implementation returns to the framework
* (ucnv.c ucnv_fromUnicode()), then the framework calls the callback
* for this code point
*/
UChar32 fromUChar32;
int8_t subCharLen; /* length of the codepage specific character sequence */
int8_t invalidCharLength;

View file

@ -116,3 +116,43 @@ ucnv_getNonSurrogateUnicodeSet(const UConverter *cnv,
uset_addRange(set, 0, 0xd7ff);
uset_addRange(set, 0xe000, 0x10ffff);
}
/*
 * Write converter output bytes to the target buffer, spilling whatever
 * does not fit into the converter's overflow buffer.
 *
 * cnv         - converter whose charErrorBuffer receives the bytes that do
 *               not fit; may be NULL, in which case those bytes are dropped
 * bytes       - the bytes to write
 * length      - number of bytes to write
 * target      - in/out: current target position, advanced past the bytes
 *               that were written
 * targetLimit - end of the target buffer
 * offsets     - in/out: optional offsets position (the pointer itself or
 *               the pointer it refers to may be NULL); one sourceIndex entry
 *               is stored per byte written to the target
 * sourceIndex - the offset value recorded for every byte written
 * pErrorCode  - set to U_BUFFER_OVERFLOW_ERROR if not all bytes fit into
 *               the target; otherwise left untouched
 *
 * NOTE(review): the capacity of cnv->charErrorBuffer is not checked here;
 * the caller is presumably expected to keep the leftover within its size -
 * confirm.
 */
U_CFUNC void
ucnv_fromUWriteBytes(UConverter *cnv,
                     const char *bytes, int32_t length,
                     char **target, const char *targetLimit,
                     int32_t **offsets,
                     int32_t sourceIndex,
                     UErrorCode *pErrorCode) {
    char *dest=*target;
    int32_t *offsetCursor= offsets!=NULL ? *offsets : NULL;

    /* copy as many bytes as the target can take */
    if(offsetCursor!=NULL) {
        /* every output byte gets the same source index */
        for(; length>0 && dest<targetLimit; --length) {
            *dest++=*bytes++;
            *offsetCursor++=sourceIndex;
        }
        *offsets=offsetCursor;
    } else {
        for(; length>0 && dest<targetLimit; --length) {
            *dest++=*bytes++;
        }
    }
    *target=dest;

    /* everything fit: nothing more to do */
    if(length<=0) {
        return;
    }

    /* keep the leftover bytes in the converter for the next call */
    if(cnv!=NULL) {
        char *overflow=(char *)cnv->charErrorBuffer;

        cnv->charErrorBufferLength=(int8_t)length;
        do {
            *overflow++=(uint8_t)*bytes++;
        } while(--length>0);
    }
    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
}

View file

@ -229,4 +229,12 @@ ucnv_getNonSurrogateUnicodeSet(const UConverter *cnv,
UConverterUnicodeSet which,
UErrorCode *pErrorCode);
/*
 * Write "length" bytes from "bytes" to *target, stopping at targetLimit.
 * If offsets and *offsets are both non-NULL, sourceIndex is stored once
 * per byte written and *offsets is advanced; *target is always advanced
 * past the bytes written.
 * Bytes that do not fit are copied into cnv->charErrorBuffer (when cnv is
 * not NULL) and *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR.
 */
U_CFUNC void
ucnv_fromUWriteBytes(UConverter *cnv,
const char *bytes, int32_t length,
char **target, const char *targetLimit,
int32_t **offsets,
int32_t sourceIndex,
UErrorCode *pErrorCode);
#endif /* UCNV_CNV */

View file

@ -311,10 +311,10 @@ T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,
temp[0] = 0;
if (args->converter->fromUSurrogateLead)
if (args->converter->fromUChar32)
{
ch = args->converter->fromUSurrogateLead;
args->converter->fromUSurrogateLead = 0;
ch = args->converter->fromUChar32;
args->converter->fromUChar32 = 0;
goto lowsurogate;
}
@ -346,7 +346,7 @@ lowsurogate:
else if (!args->flush)
{
/* ran out of source */
args->converter->fromUSurrogateLead = (UChar)ch;
args->converter->fromUChar32 = ch;
break;
}
#endif
@ -396,10 +396,10 @@ T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
temp[0] = 0;
if (args->converter->fromUSurrogateLead)
if (args->converter->fromUChar32)
{
ch = args->converter->fromUSurrogateLead;
args->converter->fromUSurrogateLead = 0;
ch = args->converter->fromUChar32;
args->converter->fromUChar32 = 0;
goto lowsurogate;
}
@ -423,7 +423,7 @@ lowsurogate:
else if (!args->flush)
{
/* ran out of source */
args->converter->fromUSurrogateLead = (UChar)ch;
args->converter->fromUChar32 = ch;
break;
}
#endif
@ -790,10 +790,10 @@ T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,
temp[3] = 0;
if (args->converter->fromUSurrogateLead)
if (args->converter->fromUChar32)
{
ch = args->converter->fromUSurrogateLead;
args->converter->fromUSurrogateLead = 0;
ch = args->converter->fromUChar32;
args->converter->fromUChar32 = 0;
goto lowsurogate;
}
@ -817,7 +817,7 @@ lowsurogate:
else if (!args->flush)
{
/* ran out of source */
args->converter->fromUSurrogateLead = (UChar)ch;
args->converter->fromUChar32 = ch;
break;
}
#endif
@ -867,10 +867,10 @@ T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
temp[3] = 0;
if (args->converter->fromUSurrogateLead)
if (args->converter->fromUChar32)
{
ch = args->converter->fromUSurrogateLead;
args->converter->fromUSurrogateLead = 0;
ch = args->converter->fromUChar32;
args->converter->fromUChar32 = 0;
goto lowsurogate;
}
@ -894,7 +894,7 @@ lowsurogate:
else if (!args->flush)
{
/* ran out of source */
args->converter->fromUSurrogateLead = (UChar)ch;
args->converter->fromUChar32 = ch;
break;
}
#endif

View file

@ -443,14 +443,14 @@ U_CFUNC void T_UConverter_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
const UChar *sourceLimit = args->sourceLimit;
const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
UBool isCESU8 = (UBool)(args->converter->sharedData == &_CESU8Data);
uint32_t ch, ch2;
UChar32 ch, ch2;
int16_t indexToWrite;
char temp[4];
if (cnv->fromUSurrogateLead && myTarget < targetLimit)
if (cnv->fromUChar32 && myTarget < targetLimit)
{
ch = cnv->fromUSurrogateLead;
cnv->fromUSurrogateLead = 0;
ch = cnv->fromUChar32;
cnv->fromUChar32 = 0;
goto lowsurrogate;
}
@ -494,63 +494,21 @@ lowsurrogate:
} else {
/* this is an unmatched lead code unit (1st surrogate) */
/* callback(illegal) */
ch2 = ch;
cnv->fromUChar32 = ch;
*err = U_ILLEGAL_CHAR_FOUND;
break;
}
} else {
/* no more input */
cnv->fromUSurrogateLead = (UChar)ch;
cnv->fromUChar32 = ch;
break;
}
} else {
/* this is an unmatched trail code unit (2nd surrogate) */
/* callback(illegal) */
ch2 = ch;
}
if(ch2 != 0) {
/* call the callback function with all the preparations and post-processing */
cnv->fromUChar32 = ch;
*err = U_ILLEGAL_CHAR_FOUND;
/* update the arguments structure */
args->source=mySource;
args->target=(char *)myTarget;
/* write the code point as code units */
cnv->invalidUCharBuffer[0] = (UChar)ch2;
cnv->invalidUCharLength = 1;
/* call the callback function */
cnv->fromUCharErrorBehaviour(cnv->fromUContext, args, cnv->invalidUCharBuffer, 1, ch2, UCNV_ILLEGAL, err);
/* get the converter state from UConverter */
ch = cnv->fromUSurrogateLead;
cnv->fromUSurrogateLead = 0;
myTarget=(uint8_t *)args->target;
mySource=args->source;
/*
* If the callback overflowed the target, then we need to
* stop here with an overflow indication.
*/
if(*err==U_BUFFER_OVERFLOW_ERROR) {
break;
} else if(U_FAILURE(*err)) {
/* break on error */
break;
} else if(cnv->charErrorBufferLength>0) {
/* target is full */
*err=U_BUFFER_OVERFLOW_ERROR;
break;
/*
* } else if(ch != 0) { ...
* ### TODO 2002jul01 markus: It looks like this code (from ucnvmbcs.c)
* does not handle the case where the callback leaves ch=fromUSurrogateLead!=0 .
* We would have to check myTarget<targetLimit and goto lowsurrogate?!
*/
}
continue;
break;
}
}
@ -602,15 +560,15 @@ U_CFUNC void T_UConverter_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeA
const UChar *sourceLimit = args->sourceLimit;
const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
UBool isCESU8 = (UBool)(args->converter->sharedData == &_CESU8Data);
uint32_t ch, ch2;
UChar32 ch, ch2;
int32_t offsetNum, nextSourceIndex;
int16_t indexToWrite;
char temp[4];
if (cnv->fromUSurrogateLead && myTarget < targetLimit)
if (cnv->fromUChar32 && myTarget < targetLimit)
{
ch = cnv->fromUSurrogateLead;
cnv->fromUSurrogateLead = 0;
ch = cnv->fromUChar32;
cnv->fromUChar32 = 0;
offsetNum = -1;
nextSourceIndex = 0;
goto lowsurrogate;
@ -664,69 +622,21 @@ lowsurrogate:
} else {
/* this is an unmatched lead code unit (1st surrogate) */
/* callback(illegal) */
ch2 = ch;
cnv->fromUChar32 = ch;
*err = U_ILLEGAL_CHAR_FOUND;
break;
}
} else {
/* no more input */
cnv->fromUSurrogateLead = (UChar)ch;
cnv->fromUChar32 = ch;
break;
}
} else {
/* this is an unmatched trail code unit (2nd surrogate) */
/* callback(illegal) */
ch2 = ch;
}
if(ch2 != 0) {
/* call the callback function with all the preparations and post-processing */
cnv->fromUChar32 = ch;
*err = U_ILLEGAL_CHAR_FOUND;
/* update the arguments structure */
args->source=mySource;
args->target=(char *)myTarget;
args->offsets=myOffsets;
/* write the code point as code units */
cnv->invalidUCharBuffer[0] = (UChar)ch2;
cnv->invalidUCharLength = 1;
/* call the callback function */
cnv->fromUCharErrorBehaviour(cnv->fromUContext, args, cnv->invalidUCharBuffer, 1, ch2, UCNV_ILLEGAL, err);
/* get the converter state from UConverter */
ch = cnv->fromUSurrogateLead;
cnv->fromUSurrogateLead = 0;
/* update target and deal with offsets if necessary */
myOffsets=ucnv_updateCallbackOffsets(myOffsets, ((uint8_t *)args->target)-myTarget, offsetNum);
myTarget=(uint8_t *)args->target;
/* update the source pointer and index */
offsetNum=nextSourceIndex+(args->source-mySource);
mySource=args->source;
/*
* If the callback overflowed the target, then we need to
* stop here with an overflow indication.
*/
if(*err==U_BUFFER_OVERFLOW_ERROR) {
break;
} else if(U_FAILURE(*err)) {
/* break on error */
break;
} else if(cnv->charErrorBufferLength>0) {
/* target is full */
*err=U_BUFFER_OVERFLOW_ERROR;
break;
/*
* } else if(ch != 0) { ...
* ### TODO 2002jul01 markus: It looks like this code (from ucnvmbcs.c)
* does not handle the case where the callback leaves ch=fromUSurrogateLead!=0 .
* We would have to check myTarget<targetLimit and goto lowsurrogate?!
*/
}
continue;
break;
}
}

View file

@ -402,7 +402,7 @@ U_ALIGN_CODE(16)
offsets=pArgs->offsets;
/* get the converter state from UConverter */
c=cnv->fromUSurrogateLead;
c=cnv->fromUChar32;
prev=(int32_t)cnv->fromUnicodeStatus;
if(prev==0) {
prev=BOCU1_ASCII_PREV;
@ -667,7 +667,7 @@ getTrail:
}
/* set the converter state back into UConverter */
cnv->fromUSurrogateLead= c<0 ? (UChar)-c : 0;
cnv->fromUChar32= c<0 ? -c : 0;
cnv->fromUnicodeStatus=(uint32_t)prev;
/* write back the updated pointers */
@ -701,7 +701,7 @@ _Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs,
targetCapacity=pArgs->targetLimit-pArgs->target;
/* get the converter state from UConverter */
c=cnv->fromUSurrogateLead;
c=cnv->fromUChar32;
prev=(int32_t)cnv->fromUnicodeStatus;
if(prev==0) {
prev=BOCU1_ASCII_PREV;
@ -888,7 +888,7 @@ getTrail:
}
/* set the converter state back into UConverter */
cnv->fromUSurrogateLead= c<0 ? (UChar)-c : 0;
cnv->fromUChar32= c<0 ? -c : 0;
cnv->fromUnicodeStatus=(uint32_t)prev;
/* write back the updated pointers */

View file

@ -69,7 +69,7 @@ _HZOpen(UConverter *cnv, const char *name,const char *locale,uint32_t options, U
cnv->toUnicodeStatus = 0;
cnv->fromUnicodeStatus= 0;
cnv->mode=0;
cnv->fromUSurrogateLead=0x0000;
cnv->fromUChar32=0x0000;
cnv->extraInfo = uprv_malloc (sizeof (UConverterDataHZ));
if(cnv->extraInfo != NULL){
((UConverterDataHZ*)cnv->extraInfo)->gbConverter = ucnv_open("ibm-1386",errorCode);
@ -108,7 +108,7 @@ _HZReset(UConverter *cnv, UConverterResetChoice choice){
}
if(choice!=UCNV_RESET_TO_UNICODE) {
cnv->fromUnicodeStatus= 0;
cnv->fromUSurrogateLead=0x0000;
cnv->fromUChar32=0x0000;
if(cnv->extraInfo != NULL){
((UConverterDataHZ*)cnv->extraInfo)->isEscapeAppended = FALSE;
((UConverterDataHZ*)cnv->extraInfo)->targetIndex = 0;
@ -347,7 +347,6 @@ UConverter_fromUnicode_HZ_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
UConverterDataHZ *myConverterData=(UConverterDataHZ*)args->converter->extraInfo;
UBool isTargetUCharDBCS = (UBool) myConverterData->isTargetUCharDBCS;
UBool oldIsTargetUCharDBCS = isTargetUCharDBCS;
UConverterCallbackReason reason;
UBool isEscapeAppended =FALSE;
int len =0;
const char* escSeq=NULL;
@ -356,7 +355,7 @@ UConverter_fromUnicode_HZ_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
*err = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
if(args->converter->fromUSurrogateLead!=0 && myTargetIndex < targetLength) {
if(args->converter->fromUChar32!=0 && myTargetIndex < targetLength) {
goto getTrail;
}
/*writing the char to the output stream */
@ -440,16 +439,12 @@ UConverter_fromUnicode_HZ_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
}
else{
/* oops.. the code point is unassingned
* set the error and reason
*/
reason =UCNV_UNASSIGNED;
*err =U_INVALID_CHAR_FOUND;
/* oops.. the code point is unassigned */
/*Handle surrogates */
/*check if the char is a First surrogate*/
if(UTF_IS_SURROGATE(mySourceChar)) {
if(UTF_IS_SURROGATE_FIRST(mySourceChar)) {
args->converter->fromUSurrogateLead=(UChar)mySourceChar;
args->converter->fromUChar32=mySourceChar;
getTrail:
/*look ahead to find the trail surrogate*/
if(mySourceIndex < mySourceLength) {
@ -457,87 +452,32 @@ getTrail:
UChar trail=(UChar) args->source[mySourceIndex];
if(UTF_IS_SECOND_SURROGATE(trail)) {
++mySourceIndex;
mySourceChar=UTF16_GET_PAIR_VALUE(args->converter->fromUSurrogateLead, trail);
args->converter->fromUSurrogateLead=0x00;
mySourceChar=UTF16_GET_PAIR_VALUE(args->converter->fromUChar32, trail);
args->converter->fromUChar32=0x00;
/* there are no surrogates in GB2312*/
*err = U_INVALID_CHAR_FOUND;
reason=UCNV_UNASSIGNED;
/* exit this condition tree */
} else {
/* this is an unmatched lead code unit (1st surrogate) */
/* callback(illegal) */
reason=UCNV_ILLEGAL;
*err=U_ILLEGAL_CHAR_FOUND;
}
} else {
/* no more input */
*err = U_ZERO_ERROR;
break;
}
} else {
/* this is an unmatched trail code unit (2nd surrogate) */
/* callback(illegal) */
reason=UCNV_ILLEGAL;
*err=U_ILLEGAL_CHAR_FOUND;
}
} else {
/* callback(unassigned) for a BMP code point */
*err = U_INVALID_CHAR_FOUND;
}
{
int32_t saveIndex=0;
int32_t currentOffset = (args->offsets) ? *(offsets-1)+1:0;
char * saveTarget = args->target;
const UChar* saveSource = args->source;
int32_t *saveOffsets = args->offsets;
args->converter->invalidUCharLength = 0;
if(mySourceChar>0xffff){
args->converter->invalidUCharBuffer[args->converter->invalidUCharLength++] =(uint16_t)(((mySourceChar)>>10)+0xd7c0);
args->converter->invalidUCharBuffer[args->converter->invalidUCharLength++] =(uint16_t)(((mySourceChar)&0x3ff)|0xdc00);
}
else{
args->converter->invalidUCharBuffer[args->converter->invalidUCharLength++] =(UChar)mySourceChar;
}
myConverterData->isTargetUCharDBCS = (UBool)isTargetUCharDBCS;
args->target += myTargetIndex;
args->source += mySourceIndex;
args->offsets = args->offsets?offsets:0;
saveIndex = myTargetIndex;
/*copies current values for the ErrorFunctor to update */
/*Calls the ErrorFunctor */
args->converter->fromUCharErrorBehaviour ( args->converter->fromUContext,
args,
args->converter->invalidUCharBuffer,
args->converter->invalidUCharLength,
(UChar32) (mySourceChar),
reason,
err);
/*Update the local Indexes so that the conversion
*can restart at the right points
*/
myTargetIndex = (int32_t)(args->target - (char*)myTarget);
mySourceIndex = (int32_t)(args->source - mySource);
args->offsets = saveOffsets;
saveIndex = myTargetIndex - saveIndex;
if(args->offsets){
args->offsets = saveOffsets;
while(saveIndex-->0){
*offsets = currentOffset;
offsets++;
}
}
isTargetUCharDBCS=myConverterData->isTargetUCharDBCS;
args->source = saveSource;
args->target = saveTarget;
args->offsets = saveOffsets;
args->converter->fromUSurrogateLead=0x00;
if (U_FAILURE (*err))
break;
}
args->converter->fromUChar32=mySourceChar;
break;
}
}
else{

View file

@ -116,7 +116,7 @@ typedef struct{
MaskEnum currentMaskFromUnicode; /* mask for current state in toUnicode */
MaskEnum currentMaskToUnicode; /* mask for current state in toUnicode */
MaskEnum defMaskToUnicode; /* mask for default state in toUnicode */
UBool isFirstBuffer;
UBool isFirstBuffer; /* boolean for fromUnicode to see if we need to announce the first script */
char name[30];
}UConverterDataISCII;
@ -197,13 +197,12 @@ _ISCIIReset(UConverter *cnv, UConverterResetChoice choice){
data->contextCharToUnicode=NO_CHAR_MARKER;
}
if(choice!=UCNV_RESET_TO_UNICODE) {
cnv->fromUSurrogateLead=0x0000;
cnv->fromUChar32=0x0000;
data->contextCharFromUnicode=0x00;
data->currentMaskFromUnicode=data->defDeltaToUnicode;
data->currentDeltaFromUnicode=data->defDeltaToUnicode;
data->isFirstBuffer=TRUE;
}
data->isFirstBuffer=TRUE;
}
/**
@ -811,7 +810,6 @@ UConverter_fromUnicode_ISCII_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
int32_t* offsets = args->offsets;
uint32_t targetByteUnit = 0x0000;
UChar32 sourceChar = 0x0000;
UConverterCallbackReason reason;
UBool useFallback;
UConverterDataISCII *converterData;
uint16_t newDelta=0;
@ -828,7 +826,7 @@ UConverter_fromUnicode_ISCII_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
newDelta=converterData->currentDeltaFromUnicode;
range = (uint16_t)(newDelta/DELTA);
if(args->converter->fromUSurrogateLead!=0 && target <targetLimit) {
if((sourceChar = args->converter->fromUChar32)!=0) {
goto getTrail;
}
@ -946,16 +944,10 @@ UConverter_fromUnicode_ISCII_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
}
}
else{
/* oops.. the code point is unassingned
* set the error and reason
*/
reason =UCNV_UNASSIGNED;
*err =U_INVALID_CHAR_FOUND;
/* oops.. the code point is unassigned */
/*check if the char is a First surrogate*/
if(UTF_IS_SURROGATE(sourceChar)) {
if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
args->converter->fromUSurrogateLead=(UChar)sourceChar;
getTrail:
/*look ahead to find the trail surrogate*/
if(source < sourceLimit) {
@ -963,94 +955,32 @@ getTrail:
UChar trail= (*source);
if(UTF_IS_SECOND_SURROGATE(trail)) {
source++;
sourceChar=UTF16_GET_PAIR_VALUE(args->converter->fromUSurrogateLead, trail);
args->converter->fromUSurrogateLead=0x00;
reason =UCNV_UNASSIGNED;
sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
*err =U_INVALID_CHAR_FOUND;
/* convert this surrogate code point */
/* exit this condition tree */
} else {
/* this is an unmatched lead code unit (1st surrogate) */
/* callback(illegal) */
sourceChar = args->converter->fromUSurrogateLead;
reason=UCNV_ILLEGAL;
*err=U_ILLEGAL_CHAR_FOUND;
}
} else {
/* no more input */
*err = U_ZERO_ERROR;
break;
}
} else {
/* this is an unmatched trail code unit (2nd surrogate) */
/* callback(illegal) */
reason=UCNV_ILLEGAL;
*err=U_ILLEGAL_CHAR_FOUND;
}
} else {
/* callback(unassigned) for a BMP code point */
*err = U_INVALID_CHAR_FOUND;
}
{
/*variables for callback */
const UChar* saveSource =NULL;
char* saveTarget =NULL;
int32_t* saveOffsets =NULL;
int currentOffset =0;
int32_t saveIndex =0;
args->converter->invalidUCharLength = 0;
if(sourceChar>0xffff){
/* we have got a surrogate pair... dissable and populate the invalidUCharBuffer */
args->converter->invalidUCharBuffer[args->converter->invalidUCharLength++]
=(uint16_t)(((sourceChar)>>10)+0xd7c0);
args->converter->invalidUCharBuffer[args->converter->invalidUCharLength++]
=(uint16_t)(((sourceChar)&0x3ff)|0xdc00);
}
else{
args->converter->invalidUCharBuffer[args->converter->invalidUCharLength++]
=(UChar)sourceChar;
}
if(offsets){
currentOffset = *(offsets-1)+1;
}
saveSource = args->source;
saveTarget = args->target;
saveOffsets = args->offsets;
args->target = (char*)target;
args->source = source;
args->offsets = offsets;
/*copies current values for the ErrorFunctor to update */
/*Calls the ErrorFunctor */
args->converter->fromUCharErrorBehaviour ( args->converter->fromUContext,
args,
args->converter->invalidUCharBuffer,
args->converter->invalidUCharLength,
(UChar32) (sourceChar),
reason,
err);
saveIndex = (int32_t)(args->target - (char*)target);
if(args->offsets){
args->offsets = saveOffsets;
while(saveIndex-->0){
*offsets = currentOffset;
offsets++;
}
}
target = (unsigned char*)args->target;
args->source=saveSource;
args->target=saveTarget;
args->offsets=saveOffsets;
args->converter->fromUSurrogateLead=0x00;
if (U_FAILURE (*err)){
break;
}
}
args->converter->fromUChar32=sourceChar;
break;
}
}/* end while(mySourceIndex<mySourceLength) */
/*save the state and return */

View file

@ -171,7 +171,7 @@ _Latin1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
}
/* get the converter state from UConverter */
cp=cnv->fromUSurrogateLead;
cp=cnv->fromUChar32;
/* sourceIndex=-1 if the current character began in the previous buffer */
sourceIndex= cp==0 ? 0 : -1;
@ -299,7 +299,7 @@ getTrail:
}
} else {
/* no more input */
cnv->fromUSurrogateLead=(UChar)cp;
cnv->fromUChar32=cp;
break;
}
} else {
@ -308,14 +308,7 @@ getTrail:
}
*pErrorCode= U_IS_SURROGATE(cp) ? U_ILLEGAL_CHAR_FOUND : U_INVALID_CHAR_FOUND;
/* write the code point as code units */
{
int32_t i=0;
U16_APPEND_UNSAFE(cnv->invalidUCharBuffer, i, cp);
cnv->invalidUCharLength=(int8_t)i;
}
cnv->fromUChar32=cp;
break;
}

View file

@ -315,11 +315,14 @@ static void
_MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs,
UErrorCode *pErrorCode);
static void
fromUCallback(UConverter *cnv,
const void *context, UConverterFromUnicodeArgs *pArgs,
UChar32 codePoint,
UConverterCallbackReason reason, UErrorCode *pErrorCode);
static UChar32
_extFromU(UConverter *cnv, const UConverterSharedData *sharedData,
UChar32 cp,
const UChar **source, const UChar *sourceLimit,
char **target, const char *targetLimit,
int32_t **offsets, int32_t sourceIndex,
UBool useFallback, UBool flush,
UErrorCode *pErrorCode);
static void
toUCallback(UConverter *cnv,
@ -819,7 +822,7 @@ _MBCSOpen(UConverter *cnv,
cnv->toULength=0; /* byteIndex */
/* fromUnicode */
cnv->fromUSurrogateLead=0;
cnv->fromUChar32=0;
cnv->fromUnicodeStatus=1; /* prevLength */
#endif
}
@ -2141,7 +2144,6 @@ _MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
int32_t prevSourceIndex, sourceIndex, nextSourceIndex;
UConverterCallbackReason reason;
uint32_t stage2Entry;
uint32_t value;
int32_t length, prevLength;
@ -2178,7 +2180,7 @@ _MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
}
/* get the converter state from UConverter */
c=cnv->fromUSurrogateLead;
c=cnv->fromUChar32;
prevLength=cnv->fromUnicodeStatus;
/* sourceIndex=-1 if the current character began in the previous buffer */
@ -2246,9 +2248,8 @@ getTrail:
} else {
/* this is an unmatched lead code unit (1st surrogate) */
/* callback(illegal) */
reason=UCNV_ILLEGAL;
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
goto callback;
break;
}
} else {
/* no more input */
@ -2257,9 +2258,8 @@ getTrail:
} else {
/* this is an unmatched trail code unit (2nd surrogate) */
/* callback(illegal) */
reason=UCNV_ILLEGAL;
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
goto callback;
break;
}
}
@ -2422,8 +2422,32 @@ getTrail:
* There is no way with this data structure for fallback output
* for other than U+0000 to be a zero byte.
*/
/* callback(unassigned) */
goto unassigned;
unassigned:
/* try an extension mapping */
pArgs->source=source;
c=_extFromU(cnv, cnv->sharedData,
c, &source, sourceLimit,
(char **)&target, (char *)target+targetCapacity,
&offsets, sourceIndex,
(UBool)UCNV_FROM_U_USE_FALLBACK(cnv, c), pArgs->flush,
pErrorCode);
nextSourceIndex+=(int32_t)(source-pArgs->source);
prevLength=cnv->fromUnicodeStatus; /* restore SISO state */
if(U_FAILURE(*pErrorCode)) {
/* not mappable or buffer overflow */
break;
} else {
/* a mapping was written to the target, continue */
/* normal end of conversion: prepare for a new character */
if(offsets!=NULL) {
prevSourceIndex=sourceIndex;
sourceIndex=nextSourceIndex;
}
continue;
}
}
/* write the output character bytes from value and length */
@ -2529,69 +2553,6 @@ getTrail:
sourceIndex=nextSourceIndex;
}
continue;
/*
* This is the same ugly trick as in ToUnicode(), for the
* same reasons...
*/
unassigned:
reason=UCNV_UNASSIGNED;
*pErrorCode=U_INVALID_CHAR_FOUND;
callback:
/* call the callback function with all the preparations and post-processing */
/* update the arguments structure */
pArgs->source=source;
pArgs->target=(char *)target;
pArgs->offsets=offsets;
/* set the converter state in UConverter to deal with the next character */
cnv->fromUSurrogateLead=0;
/*
* Do not save the prevLength SISO state because prevLength is set for
* the character that is now not output because it is unassigned or it is
* a fallback that is not taken.
* The above branch for MBCS_OUTPUT_2_SISO has saved the previous state already.
* See comments there.
*/
prevSourceIndex=sourceIndex;
/* call the callback function */
fromUCallback(cnv, cnv->fromUContext, pArgs, c, reason, pErrorCode);
/* get the converter state from UConverter */
c=cnv->fromUSurrogateLead;
prevLength=cnv->fromUnicodeStatus;
/* update target and deal with offsets if necessary */
offsets=ucnv_updateCallbackOffsets(offsets, ((uint8_t *)pArgs->target)-target, sourceIndex);
target=(uint8_t *)pArgs->target;
/* update the source pointer and index */
sourceIndex=nextSourceIndex+(pArgs->source-source);
source=pArgs->source;
targetCapacity=(uint8_t *)pArgs->targetLimit-target;
/*
* If the callback overflowed the target, then we need to
* stop here with an overflow indication.
*/
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
break;
} else if(U_FAILURE(*pErrorCode)) {
/* break on error */
c=0;
break;
} else if(cnv->charErrorBufferLength>0) {
/* target is full */
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
break;
}
/*
* We do not need to repeat the statements from the normal
* end of the conversion because we already updated all the
* necessary variables.
*/
} else {
/* target is full */
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
@ -2630,7 +2591,7 @@ callback:
}
/* set the converter state back into UConverter */
cnv->fromUSurrogateLead=(UChar)c;
cnv->fromUChar32=c;
cnv->fromUnicodeStatus=prevLength;
/* write back the updated pointers */
@ -2656,7 +2617,6 @@ _MBCSDoubleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
int32_t sourceIndex, nextSourceIndex;
UConverterCallbackReason reason;
uint32_t stage2Entry;
uint32_t value;
int32_t length, prevLength;
@ -2681,7 +2641,7 @@ _MBCSDoubleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
}
/* get the converter state from UConverter */
c=cnv->fromUSurrogateLead;
c=cnv->fromUChar32;
prevLength=cnv->fromUnicodeStatus;
/* sourceIndex=-1 if the current character began in the previous buffer */
@ -2735,9 +2695,8 @@ getTrail:
} else {
/* this is an unmatched lead code unit (1st surrogate) */
/* callback(illegal) */
reason=UCNV_ILLEGAL;
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
goto callback;
break;
}
} else {
/* no more input */
@ -2746,9 +2705,8 @@ getTrail:
} else {
/* this is an unmatched trail code unit (2nd surrogate) */
/* callback(illegal) */
reason=UCNV_ILLEGAL;
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
goto callback;
break;
}
}
@ -2774,8 +2732,28 @@ getTrail:
* There is no way with this data structure for fallback output
* for other than U+0000 to be a zero byte.
*/
/* callback(unassigned) */
goto unassigned;
unassigned:
/* try an extension mapping */
pArgs->source=source;
c=_extFromU(cnv, cnv->sharedData,
c, &source, sourceLimit,
(char **)&target, (char *)target+targetCapacity,
&offsets, sourceIndex,
(UBool)UCNV_FROM_U_USE_FALLBACK(cnv, c), pArgs->flush,
pErrorCode);
nextSourceIndex+=(int32_t)(source-pArgs->source);
if(U_FAILURE(*pErrorCode)) {
/* not mappable or buffer overflow */
break;
} else {
/* a mapping was written to the target, continue */
/* normal end of conversion: prepare for a new character */
sourceIndex=nextSourceIndex;
continue;
}
}
/* write the output character bytes from value and length */
@ -2815,62 +2793,6 @@ getTrail:
c=0;
sourceIndex=nextSourceIndex;
continue;
/*
* This is the same ugly trick as in ToUnicode(), for the
* same reasons...
*/
unassigned:
reason=UCNV_UNASSIGNED;
*pErrorCode=U_INVALID_CHAR_FOUND;
callback:
/* call the callback function with all the preparations and post-processing */
/* update the arguments structure */
pArgs->source=source;
pArgs->target=(char *)target;
pArgs->offsets=offsets;
/* set the converter state in UConverter to deal with the next character */
cnv->fromUSurrogateLead=0;
cnv->fromUnicodeStatus=prevLength;
/* call the callback function */
fromUCallback(cnv, cnv->fromUContext, pArgs, c, reason, pErrorCode);
/* get the converter state from UConverter */
c=cnv->fromUSurrogateLead;
prevLength=cnv->fromUnicodeStatus;
/* update target and deal with offsets if necessary */
offsets=ucnv_updateCallbackOffsets(offsets, ((uint8_t *)pArgs->target)-target, sourceIndex);
target=(uint8_t *)pArgs->target;
/* update the source pointer and index */
sourceIndex=nextSourceIndex+(pArgs->source-source);
source=pArgs->source;
targetCapacity=(uint8_t *)pArgs->targetLimit-target;
/*
* If the callback overflowed the target, then we need to
* stop here with an overflow indication.
*/
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
break;
} else if(U_FAILURE(*pErrorCode)) {
/* break on error */
c=0;
break;
} else if(cnv->charErrorBufferLength>0) {
/* target is full */
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
break;
}
/*
* We do not need to repeat the statements from the normal
* end of the conversion because we already updated all the
* necessary variables.
*/
} else {
/* target is full */
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
@ -2879,7 +2801,7 @@ callback:
}
/* set the converter state back into UConverter */
cnv->fromUSurrogateLead=(UChar)c;
cnv->fromUChar32=c;
cnv->fromUnicodeStatus=prevLength;
/* write back the updated pointers */
@ -2905,7 +2827,6 @@ _MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
int32_t sourceIndex, nextSourceIndex;
UConverterCallbackReason reason;
uint16_t value, minValue;
UBool hasSupplementary;
@ -2934,7 +2855,7 @@ _MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
hasSupplementary=(UBool)(cnv->sharedData->table->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);
/* get the converter state from UConverter */
c=cnv->fromUSurrogateLead;
c=cnv->fromUChar32;
/* sourceIndex=-1 if the current character began in the previous buffer */
sourceIndex= c==0 ? 0 : -1;
@ -2982,9 +2903,8 @@ getTrail:
} else {
/* this is an unmatched lead code unit (1st surrogate) */
/* callback(illegal) */
reason=UCNV_ILLEGAL;
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
goto callback;
break;
}
} else {
/* no more input */
@ -2993,9 +2913,8 @@ getTrail:
} else {
/* this is an unmatched trail code unit (2nd surrogate) */
/* callback(illegal) */
reason=UCNV_ILLEGAL;
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
goto callback;
break;
}
}
@ -3016,65 +2935,28 @@ getTrail:
/* normal end of conversion: prepare for a new character */
c=0;
sourceIndex=nextSourceIndex;
continue;
} else { /* unassigned */
/*
* We allow a 0 byte output if the Unicode code point is
* U+0000 and also if the "assigned" bit is set for this entry.
* There is no way with this data structure for fallback output
* for other than U+0000 to be a zero byte.
*/
/* callback(unassigned) */
}
unassigned:
reason=UCNV_UNASSIGNED;
*pErrorCode=U_INVALID_CHAR_FOUND;
callback:
/* call the callback function with all the preparations and post-processing */
/* update the arguments structure */
pArgs->source=source;
pArgs->target=(char *)target;
pArgs->offsets=offsets;
/* try an extension mapping */
pArgs->source=source;
c=_extFromU(cnv, cnv->sharedData,
c, &source, sourceLimit,
(char **)&target, (char *)target+targetCapacity,
&offsets, sourceIndex,
(UBool)UCNV_FROM_U_USE_FALLBACK(cnv, c), pArgs->flush,
pErrorCode);
nextSourceIndex+=(int32_t)(source-pArgs->source);
/* set the converter state in UConverter to deal with the next character */
cnv->fromUSurrogateLead=0;
if(U_FAILURE(*pErrorCode)) {
/* not mappable or buffer overflow */
break;
} else {
/* a mapping was written to the target, continue */
/* call the callback function */
fromUCallback(cnv, cnv->fromUContext, pArgs, c, reason, pErrorCode);
/* get the converter state from UConverter */
c=cnv->fromUSurrogateLead;
/* update target and deal with offsets if necessary */
offsets=ucnv_updateCallbackOffsets(offsets, ((uint8_t *)pArgs->target)-target, sourceIndex);
target=(uint8_t *)pArgs->target;
/* update the source pointer and index */
sourceIndex=nextSourceIndex+(pArgs->source-source);
source=pArgs->source;
targetCapacity=(uint8_t *)pArgs->targetLimit-target;
/*
* If the callback overflowed the target, then we need to
* stop here with an overflow indication.
*/
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
break;
} else if(U_FAILURE(*pErrorCode)) {
/* break on error */
c=0;
break;
} else if(cnv->charErrorBufferLength>0) {
/* target is full */
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
break;
/* normal end of conversion: prepare for a new character */
sourceIndex=nextSourceIndex;
}
}
/*
* We do not need to repeat the statements from the normal
* end of the conversion because we already updated all the
* necessary variables.
*/
} else {
/* target is full */
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
@ -3083,7 +2965,7 @@ callback:
}
/* set the converter state back into UConverter */
cnv->fromUSurrogateLead=(UChar)c;
cnv->fromUChar32=c;
/* write back the updated pointers */
pArgs->source=source;
@ -3113,7 +2995,6 @@ _MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs,
int32_t sourceIndex;
UConverterCallbackReason reason;
uint16_t value, minValue;
/* set up the local pointers */
@ -3140,7 +3021,7 @@ _MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs,
}
/* get the converter state from UConverter */
c=cnv->fromUSurrogateLead;
c=cnv->fromUChar32;
/* sourceIndex=-1 if the current character began in the previous buffer */
sourceIndex= c==0 ? 0 : -1;
@ -3237,15 +3118,6 @@ unrolled:
continue;
} else if(!UTF_IS_SURROGATE(c)) {
/* normal, unassigned BMP character */
/*
* We allow a 0 byte output if the Unicode code point is
* U+0000 and also if the "assigned" bit is set for this entry.
* There is no way with this data structure for fallback output
* for other than U+0000 to be a zero byte.
*/
/* callback(unassigned) */
reason=UCNV_UNASSIGNED;
*pErrorCode=U_INVALID_CHAR_FOUND;
} else if(UTF_IS_SURROGATE_FIRST(c)) {
getTrail:
if(source<sourceLimit) {
@ -3256,13 +3128,11 @@ getTrail:
c=UTF16_GET_PAIR_VALUE(c, trail);
/* this codepage does not map supplementary code points */
/* callback(unassigned) */
reason=UCNV_UNASSIGNED;
*pErrorCode=U_INVALID_CHAR_FOUND;
} else {
/* this is an unmatched lead code unit (1st surrogate) */
/* callback(illegal) */
reason=UCNV_ILLEGAL;
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
break;
}
} else {
/* no more input */
@ -3271,69 +3141,45 @@ getTrail:
} else {
/* this is an unmatched trail code unit (2nd surrogate) */
/* callback(illegal) */
reason=UCNV_ILLEGAL;
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
break;
}
/* call the callback function with all the preparations and post-processing */
/* get the number of code units for c to correctly advance sourceIndex after the callback call */
length=UTF_CHAR_LENGTH(c);
/* c does not have a mapping */
/* set offsets since the start or the last callback */
/* get the number of code units for c to correctly advance sourceIndex */
length=U16_LENGTH(c);
/* set offsets since the start or the last extension */
if(offsets!=NULL) {
int32_t count=(int32_t)(source-lastSource);
/* do not set the offset for the callback-causing character */
/* do not set the offset for this character */
count-=length;
while(count>0) {
*offsets++=sourceIndex++;
--count;
}
/* offset and sourceIndex are now set for the current character */
/* offsets and sourceIndex are now set for the current character */
}
/* update the arguments structure */
pArgs->source=source;
pArgs->target=(char *)target;
pArgs->offsets=offsets;
/* try an extension mapping */
lastSource=source;
c=_extFromU(cnv, cnv->sharedData,
c, &source, sourceLimit,
(char **)&target, (char *)target+targetCapacity,
&offsets, sourceIndex,
(UBool)UCNV_FROM_U_USE_FALLBACK(cnv, c), pArgs->flush,
pErrorCode);
sourceIndex+=length+(int32_t)(source-lastSource);
lastSource=source;
/* set the converter state in UConverter to deal with the next character */
cnv->fromUSurrogateLead=0;
/* call the callback function */
fromUCallback(cnv, cnv->fromUContext, pArgs, c, reason, pErrorCode);
/* get the converter state from UConverter */
c=cnv->fromUSurrogateLead;
/* update target and deal with offsets if necessary */
offsets=ucnv_updateCallbackOffsets(offsets, ((uint8_t *)pArgs->target)-target, sourceIndex);
target=(uint8_t *)pArgs->target;
/* update the source pointer and index */
sourceIndex+=length+(pArgs->source-source);
source=lastSource=pArgs->source;
targetCapacity=(uint8_t *)pArgs->targetLimit-target;
length=sourceLimit-source;
if(length<targetCapacity) {
targetCapacity=length;
}
/*
* If the callback overflowed the target, then we need to
* stop here with an overflow indication.
*/
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
break;
} else if(U_FAILURE(*pErrorCode)) {
/* break on error */
c=0;
break;
} else if(cnv->charErrorBufferLength>0) {
/* target is full */
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
if(U_FAILURE(*pErrorCode)) {
/* not mappable or buffer overflow */
break;
} else {
/* a mapping was written to the target, continue */
}
#if MBCS_UNROLL_SINGLE_FROM_BMP
@ -3357,7 +3203,7 @@ getTrail:
}
/* set the converter state back into UConverter */
cnv->fromUSurrogateLead=(UChar)c;
cnv->fromUChar32=c;
/* write back the updated pointers */
pArgs->source=source;
@ -3672,35 +3518,53 @@ const UConverterSharedData _MBCSData={
0
};
/* GB 18030 special handling ------------------------------------------------ */
/* conversion extensions for input not in the main table -------------------- */
/* definition of LINEAR macros and gb18030Ranges see near the beginning of the file */
/*
* Hardcoded extension handling for GB 18030.
* For the definitions of the LINEAR macros and gb18030Ranges, see near the beginning of the file.
*
* In the future, conversion extensions may handle m:n mappings and delta tables,
* see http://oss.software.ibm.com/cvs/icu/~checkout~/icuhtml/design/conversion/conversion_extensions.html
*
* If an input character cannot be mapped, then these functions set an error
* code. The framework will then call the callback function.
*/
/* the callback functions handle GB 18030 specially */
static void
fromUCallback(UConverter *cnv,
const void *context, UConverterFromUnicodeArgs *pArgs,
UChar32 codePoint,
UConverterCallbackReason reason, UErrorCode *pErrorCode) {
int32_t i;
/*
* TODO when implementing real extensions, review whether the useFallback parameter
* should get cnv->useFallback or the full resolution considering cp as well
*/
if((cnv->options&_MBCS_OPTION_GB18030)!=0 && reason==UCNV_UNASSIGNED) {
/*
* @return the unmapped code point (to be stored in cnv->fromUChar32) if no mapping
* was found and *pErrorCode was set to U_INVALID_CHAR_FOUND;
* otherwise 0, after the output has been written to the target
*/
static UChar32
_extFromU(UConverter *cnv, const UConverterSharedData *sharedData,
UChar32 cp,
const UChar **source, const UChar *sourceLimit,
char **target, const char *targetLimit,
int32_t **offsets, int32_t sourceIndex,
UBool useFallback, UBool flush,
UErrorCode *pErrorCode) {
/* GB 18030 */
if(cnv!=NULL && (cnv->options&_MBCS_OPTION_GB18030)!=0) {
const uint32_t *range;
int32_t i;
range=gb18030Ranges[0];
for(i=0; i<sizeof(gb18030Ranges)/sizeof(gb18030Ranges[0]); range+=4, ++i) {
if(range[0]<=(uint32_t)codePoint && (uint32_t)codePoint<=range[1]) {
if(range[0]<=(uint32_t)cp && (uint32_t)cp<=range[1]) {
/* found the Unicode code point, output the four-byte sequence for it */
uint32_t linear;
char bytes[4];
/* found the Unicode code point, output the four-byte sequence for it */
*pErrorCode=U_ZERO_ERROR;
/* get the linear value of the first GB 18030 code in this range */
linear=range[2]-LINEAR_18030_BASE;
/* add the offset from the beginning of the range */
linear+=((uint32_t)codePoint-range[0]);
linear+=((uint32_t)cp-range[0]);
/* turn this into a four-byte sequence */
bytes[3]=(char)(0x30+linear%10); linear/=10;
@ -3709,21 +3573,21 @@ fromUCallback(UConverter *cnv,
bytes[0]=(char)(0x81+linear);
/* output this sequence */
ucnv_cbFromUWriteBytes(pArgs, bytes, 4, 0, pErrorCode);
return;
ucnv_fromUWriteBytes(cnv,
bytes, 4, target, targetLimit,
offsets, sourceIndex, pErrorCode);
return 0;
}
}
}
/* write the code point as code units */
i=0;
UTF_APPEND_CHAR_UNSAFE(cnv->invalidUCharBuffer, i, codePoint);
cnv->invalidUCharLength=(int8_t)i;
/* call the normal callback function */
cnv->fromUCharErrorBehaviour(context, pArgs, cnv->invalidUCharBuffer, i, codePoint, reason, pErrorCode);
/* no mapping */
*pErrorCode=U_INVALID_CHAR_FOUND;
return cp;
}
/* GB 18030 special handling ------------------------------------------------ */
static void
toUCallback(UConverter *cnv,
const void *context, UConverterToUnicodeArgs *pArgs,

View file

@ -181,7 +181,7 @@ _SCSUReset(UConverter *cnv, UConverterResetChoice choice) {
break;
}
cnv->fromUSurrogateLead=0;
cnv->fromUChar32=0;
}
}
@ -216,8 +216,6 @@ _SCSUClose(UConverter *cnv) {
/* SCSU-to-Unicode conversion functions ------------------------------------- */
/* ### TODO check operator precedence | << + < */
static void
_SCSUToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
UErrorCode *pErrorCode) {
@ -1059,7 +1057,7 @@ _SCSUFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
dynamicWindow=scsu->fromUDynamicWindow;
currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
c=cnv->fromUSurrogateLead;
c=cnv->fromUChar32;
/* sourceIndex=-1 if the current character began in the previous buffer */
sourceIndex= c==0 ? 0 : -1;
@ -1386,18 +1384,11 @@ getTrailUnicode:
}
endloop:
if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
/* c is an unpaired surrogate */
cnv->invalidUCharBuffer[0]=(UChar)c;
cnv->invalidUCharLength=1;
c=0;
}
/* set the converter state back into UConverter */
scsu->fromUIsSingleByteMode=isSingleByteMode;
scsu->fromUDynamicWindow=dynamicWindow;
cnv->fromUSurrogateLead=(UChar)c;
cnv->fromUChar32=c;
/* write back the updated pointers */
pArgs->source=source;
@ -1553,7 +1544,7 @@ _SCSUFromUnicode(UConverterFromUnicodeArgs *pArgs,
dynamicWindow=scsu->fromUDynamicWindow;
currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
c=cnv->fromUSurrogateLead;
c=cnv->fromUChar32;
/* similar conversion "loop" as in toUnicode */
loop:
@ -1851,18 +1842,11 @@ getTrailUnicode:
}
endloop:
if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
/* c is an unpaired surrogate */
cnv->invalidUCharBuffer[0]=(UChar)c;
cnv->invalidUCharLength=1;
c=0;
}
/* set the converter state back into UConverter */
scsu->fromUIsSingleByteMode=isSingleByteMode;
scsu->fromUDynamicWindow=dynamicWindow;
cnv->fromUSurrogateLead=(UChar)c;
cnv->fromUChar32=c;
/* write back the updated pointers */
pArgs->source=source;