mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-05 21:45:37 +00:00
ICU-5518 merge direct-from-UTF-8 conversion code from http://source.icu-project.org/repos/icu/icu/branches/markus/ucnvutf8 -r 20735:20990 to icu/trunk
X-SVN-Rev: 21010
This commit is contained in:
parent
464ae7d46f
commit
9acca77737
20 changed files with 3800 additions and 677 deletions
|
@ -889,20 +889,25 @@ _fromUnicodeWithCallback(UConverterFromUnicodeArgs *pArgs, UErrorCode *err) {
|
|||
* }
|
||||
*/
|
||||
for(;;) {
|
||||
/* convert */
|
||||
fromUnicode(pArgs, err);
|
||||
if(U_SUCCESS(*err)) {
|
||||
/* convert */
|
||||
fromUnicode(pArgs, err);
|
||||
|
||||
/*
|
||||
* set a flag for whether the converter
|
||||
* successfully processed the end of the input
|
||||
*
|
||||
* need not check cnv->preFromULength==0 because a replay (<0) will cause
|
||||
* s<sourceLimit before converterSawEndOfInput is checked
|
||||
*/
|
||||
converterSawEndOfInput=
|
||||
(UBool)(U_SUCCESS(*err) &&
|
||||
pArgs->flush && pArgs->source==pArgs->sourceLimit &&
|
||||
cnv->fromUChar32==0);
|
||||
/*
|
||||
* set a flag for whether the converter
|
||||
* successfully processed the end of the input
|
||||
*
|
||||
* need not check cnv->preFromULength==0 because a replay (<0) will cause
|
||||
* s<sourceLimit before converterSawEndOfInput is checked
|
||||
*/
|
||||
converterSawEndOfInput=
|
||||
(UBool)(U_SUCCESS(*err) &&
|
||||
pArgs->flush && pArgs->source==pArgs->sourceLimit &&
|
||||
cnv->fromUChar32==0);
|
||||
} else {
|
||||
/* handle error from ucnv_convertEx() */
|
||||
converterSawEndOfInput=FALSE;
|
||||
}
|
||||
|
||||
/* no callback called yet for this iteration */
|
||||
calledCallback=FALSE;
|
||||
|
@ -1093,6 +1098,64 @@ _fromUnicodeWithCallback(UConverterFromUnicodeArgs *pArgs, UErrorCode *err) {
|
|||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Output the fromUnicode overflow buffer.
|
||||
* Call this function if(cnv->charErrorBufferLength>0).
|
||||
* @return TRUE if overflow
|
||||
*/
|
||||
static UBool
|
||||
ucnv_outputOverflowFromUnicode(UConverter *cnv,
|
||||
char **target, const char *targetLimit,
|
||||
int32_t **pOffsets,
|
||||
UErrorCode *err) {
|
||||
int32_t *offsets;
|
||||
char *overflow, *t;
|
||||
int32_t i, length;
|
||||
|
||||
t=*target;
|
||||
if(pOffsets!=NULL) {
|
||||
offsets=*pOffsets;
|
||||
} else {
|
||||
offsets=NULL;
|
||||
}
|
||||
|
||||
overflow=(char *)cnv->charErrorBuffer;
|
||||
length=cnv->charErrorBufferLength;
|
||||
i=0;
|
||||
while(i<length) {
|
||||
if(t==targetLimit) {
|
||||
/* the overflow buffer contains too much, keep the rest */
|
||||
int32_t j=0;
|
||||
|
||||
do {
|
||||
overflow[j++]=overflow[i++];
|
||||
} while(i<length);
|
||||
|
||||
cnv->charErrorBufferLength=(int8_t)j;
|
||||
*target=t;
|
||||
if(offsets!=NULL) {
|
||||
*pOffsets=offsets;
|
||||
}
|
||||
*err=U_BUFFER_OVERFLOW_ERROR;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/* copy the overflow contents to the target */
|
||||
*t++=overflow[i++];
|
||||
if(offsets!=NULL) {
|
||||
*offsets++=-1; /* no source index available for old output */
|
||||
}
|
||||
}
|
||||
|
||||
/* the overflow buffer is completely copied to the target */
|
||||
cnv->charErrorBufferLength=0;
|
||||
*target=t;
|
||||
if(offsets!=NULL) {
|
||||
*pOffsets=offsets;
|
||||
}
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
ucnv_fromUnicode(UConverter *cnv,
|
||||
char **target, const char *targetLimit,
|
||||
|
@ -1145,43 +1208,17 @@ ucnv_fromUnicode(UConverter *cnv,
|
|||
return;
|
||||
}
|
||||
|
||||
/* flush the target overflow buffer */
|
||||
if(cnv->charErrorBufferLength>0) {
|
||||
char *overflow;
|
||||
int32_t i, length;
|
||||
|
||||
overflow=(char *)cnv->charErrorBuffer;
|
||||
length=cnv->charErrorBufferLength;
|
||||
i=0;
|
||||
do {
|
||||
if(t==targetLimit) {
|
||||
/* the overflow buffer contains too much, keep the rest */
|
||||
int32_t j=0;
|
||||
|
||||
do {
|
||||
overflow[j++]=overflow[i++];
|
||||
} while(i<length);
|
||||
|
||||
cnv->charErrorBufferLength=(int8_t)j;
|
||||
*target=t;
|
||||
*err=U_BUFFER_OVERFLOW_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
/* copy the overflow contents to the target */
|
||||
*t++=overflow[i++];
|
||||
if(offsets!=NULL) {
|
||||
*offsets++=-1; /* no source index available for old output */
|
||||
}
|
||||
} while(i<length);
|
||||
|
||||
/* the overflow buffer is completely copied to the target */
|
||||
cnv->charErrorBufferLength=0;
|
||||
/* output the target overflow buffer */
|
||||
if( cnv->charErrorBufferLength>0 &&
|
||||
ucnv_outputOverflowFromUnicode(cnv, target, targetLimit, &offsets, err)
|
||||
) {
|
||||
/* U_BUFFER_OVERFLOW_ERROR */
|
||||
return;
|
||||
}
|
||||
/* *target may have moved, therefore stop using t */
|
||||
|
||||
if(!flush && s==sourceLimit && cnv->preFromULength>=0) {
|
||||
/* the overflow buffer is emptied and there is no new input: we are done */
|
||||
*target=t;
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -1199,7 +1236,7 @@ ucnv_fromUnicode(UConverter *cnv,
|
|||
args.offsets=offsets;
|
||||
args.source=s;
|
||||
args.sourceLimit=sourceLimit;
|
||||
args.target=t;
|
||||
args.target=*target;
|
||||
args.targetLimit=targetLimit;
|
||||
args.size=sizeof(args);
|
||||
|
||||
|
@ -1304,7 +1341,7 @@ _toUnicodeWithCallback(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
|
|||
pArgs->flush && pArgs->source==pArgs->sourceLimit &&
|
||||
cnv->toULength==0);
|
||||
} else {
|
||||
/* handle error from getNextUChar() */
|
||||
/* handle error from getNextUChar() or ucnv_convertEx() */
|
||||
converterSawEndOfInput=FALSE;
|
||||
}
|
||||
|
||||
|
@ -1495,6 +1532,64 @@ _toUnicodeWithCallback(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
|
|||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Output the toUnicode overflow buffer.
|
||||
* Call this function if(cnv->UCharErrorBufferLength>0).
|
||||
* @return TRUE if overflow
|
||||
*/
|
||||
static UBool
|
||||
ucnv_outputOverflowToUnicode(UConverter *cnv,
|
||||
UChar **target, const UChar *targetLimit,
|
||||
int32_t **pOffsets,
|
||||
UErrorCode *err) {
|
||||
int32_t *offsets;
|
||||
UChar *overflow, *t;
|
||||
int32_t i, length;
|
||||
|
||||
t=*target;
|
||||
if(pOffsets!=NULL) {
|
||||
offsets=*pOffsets;
|
||||
} else {
|
||||
offsets=NULL;
|
||||
}
|
||||
|
||||
overflow=cnv->UCharErrorBuffer;
|
||||
length=cnv->UCharErrorBufferLength;
|
||||
i=0;
|
||||
while(i<length) {
|
||||
if(t==targetLimit) {
|
||||
/* the overflow buffer contains too much, keep the rest */
|
||||
int32_t j=0;
|
||||
|
||||
do {
|
||||
overflow[j++]=overflow[i++];
|
||||
} while(i<length);
|
||||
|
||||
cnv->UCharErrorBufferLength=(int8_t)j;
|
||||
*target=t;
|
||||
if(offsets!=NULL) {
|
||||
*pOffsets=offsets;
|
||||
}
|
||||
*err=U_BUFFER_OVERFLOW_ERROR;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/* copy the overflow contents to the target */
|
||||
*t++=overflow[i++];
|
||||
if(offsets!=NULL) {
|
||||
*offsets++=-1; /* no source index available for old output */
|
||||
}
|
||||
}
|
||||
|
||||
/* the overflow buffer is completely copied to the target */
|
||||
cnv->UCharErrorBufferLength=0;
|
||||
*target=t;
|
||||
if(offsets!=NULL) {
|
||||
*pOffsets=offsets;
|
||||
}
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
ucnv_toUnicode(UConverter *cnv,
|
||||
UChar **target, const UChar *targetLimit,
|
||||
|
@ -1547,43 +1642,17 @@ ucnv_toUnicode(UConverter *cnv,
|
|||
return;
|
||||
}
|
||||
|
||||
/* flush the target overflow buffer */
|
||||
if(cnv->UCharErrorBufferLength>0) {
|
||||
UChar *overflow;
|
||||
int32_t i, length;
|
||||
|
||||
overflow=cnv->UCharErrorBuffer;
|
||||
length=cnv->UCharErrorBufferLength;
|
||||
i=0;
|
||||
do {
|
||||
if(t==targetLimit) {
|
||||
/* the overflow buffer contains too much, keep the rest */
|
||||
int32_t j=0;
|
||||
|
||||
do {
|
||||
overflow[j++]=overflow[i++];
|
||||
} while(i<length);
|
||||
|
||||
cnv->UCharErrorBufferLength=(int8_t)j;
|
||||
*target=t;
|
||||
*err=U_BUFFER_OVERFLOW_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
/* copy the overflow contents to the target */
|
||||
*t++=overflow[i++];
|
||||
if(offsets!=NULL) {
|
||||
*offsets++=-1; /* no source index available for old output */
|
||||
}
|
||||
} while(i<length);
|
||||
|
||||
/* the overflow buffer is completely copied to the target */
|
||||
cnv->UCharErrorBufferLength=0;
|
||||
/* output the target overflow buffer */
|
||||
if( cnv->UCharErrorBufferLength>0 &&
|
||||
ucnv_outputOverflowToUnicode(cnv, target, targetLimit, &offsets, err)
|
||||
) {
|
||||
/* U_BUFFER_OVERFLOW_ERROR */
|
||||
return;
|
||||
}
|
||||
/* *target may have moved, therefore stop using t */
|
||||
|
||||
if(!flush && s==sourceLimit && cnv->preToULength>=0) {
|
||||
/* the overflow buffer is emptied and there is no new input: we are done */
|
||||
*target=t;
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -1601,7 +1670,7 @@ ucnv_toUnicode(UConverter *cnv,
|
|||
args.offsets=offsets;
|
||||
args.source=s;
|
||||
args.sourceLimit=sourceLimit;
|
||||
args.target=t;
|
||||
args.target=*target;
|
||||
args.targetLimit=targetLimit;
|
||||
args.size=sizeof(args);
|
||||
|
||||
|
@ -1951,7 +2020,14 @@ ucnv_convertEx(UConverter *targetCnv, UConverter *sourceCnv,
|
|||
UBool reset, UBool flush,
|
||||
UErrorCode *pErrorCode) {
|
||||
UChar pivotBuffer[CHUNK_SIZE];
|
||||
UChar *myPivotSource, *myPivotTarget;
|
||||
const UChar *myPivotSource;
|
||||
UChar *myPivotTarget;
|
||||
const char *s;
|
||||
char *t;
|
||||
|
||||
UConverterToUnicodeArgs toUArgs;
|
||||
UConverterFromUnicodeArgs fromUArgs;
|
||||
UConverterConvert convert;
|
||||
|
||||
/* error checking */
|
||||
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
|
||||
|
@ -1966,6 +2042,25 @@ ucnv_convertEx(UConverter *targetCnv, UConverter *sourceCnv,
|
|||
return;
|
||||
}
|
||||
|
||||
s=*source;
|
||||
t=*target;
|
||||
if((sourceLimit!=NULL && sourceLimit<s) || targetLimit<t) {
|
||||
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Make sure that the buffer sizes do not exceed the number range for
|
||||
* int32_t. See ucnv_toUnicode() for a more detailed comment.
|
||||
*/
|
||||
if(
|
||||
(sourceLimit!=NULL && ((size_t)(sourceLimit-s)>(size_t)0x7fffffff && sourceLimit>s)) ||
|
||||
((size_t)(targetLimit-t)>(size_t)0x7fffffff && targetLimit>t)
|
||||
) {
|
||||
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
if(pivotStart==NULL) {
|
||||
if(!flush) {
|
||||
/* streaming conversion requires an explicit pivot buffer */
|
||||
|
@ -1974,8 +2069,8 @@ ucnv_convertEx(UConverter *targetCnv, UConverter *sourceCnv,
|
|||
}
|
||||
|
||||
/* use the stack pivot buffer */
|
||||
pivotStart=myPivotSource=myPivotTarget=pivotBuffer;
|
||||
pivotSource=&myPivotSource;
|
||||
myPivotSource=myPivotTarget=pivotStart=pivotBuffer;
|
||||
pivotSource=(UChar **)&myPivotSource;
|
||||
pivotTarget=&myPivotTarget;
|
||||
pivotLimit=pivotBuffer+CHUNK_SIZE;
|
||||
} else if( pivotStart>=pivotLimit ||
|
||||
|
@ -1995,51 +2090,260 @@ ucnv_convertEx(UConverter *targetCnv, UConverter *sourceCnv,
|
|||
if(reset) {
|
||||
ucnv_resetToUnicode(sourceCnv);
|
||||
ucnv_resetFromUnicode(targetCnv);
|
||||
*pivotTarget=*pivotSource=pivotStart;
|
||||
*pivotSource=*pivotTarget=pivotStart;
|
||||
} else if(targetCnv->charErrorBufferLength>0) {
|
||||
/* output the targetCnv overflow buffer */
|
||||
if(ucnv_outputOverflowFromUnicode(targetCnv, target, targetLimit, NULL, pErrorCode)) {
|
||||
/* U_BUFFER_OVERFLOW_ERROR */
|
||||
return;
|
||||
}
|
||||
/* *target has moved, therefore stop using t */
|
||||
|
||||
if( !flush &&
|
||||
targetCnv->preFromULength>=0 && *pivotSource==*pivotTarget &&
|
||||
sourceCnv->UCharErrorBufferLength==0 && sourceCnv->preToULength>=0 && s==sourceLimit
|
||||
) {
|
||||
/* the fromUnicode overflow buffer is emptied and there is no new input: we are done */
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/* conversion loop */
|
||||
/* Is direct-UTF-8 conversion available? */
|
||||
if( sourceCnv->sharedData->staticData->conversionType==UCNV_UTF8 &&
|
||||
targetCnv->sharedData->impl->fromUTF8!=NULL
|
||||
) {
|
||||
convert=targetCnv->sharedData->impl->fromUTF8;
|
||||
} else if( targetCnv->sharedData->staticData->conversionType==UCNV_UTF8 &&
|
||||
sourceCnv->sharedData->impl->toUTF8!=NULL
|
||||
) {
|
||||
convert=sourceCnv->sharedData->impl->toUTF8;
|
||||
} else {
|
||||
convert=NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* If direct-UTF-8 conversion is available, then we use a smaller
|
||||
* pivot buffer for error handling and partial matches
|
||||
* so that we quickly return to direct conversion.
|
||||
*
|
||||
* 32 is large enough for UCNV_EXT_MAX_UCHARS and UCNV_ERROR_BUFFER_LENGTH.
|
||||
*
|
||||
* We could reduce the pivot buffer size further, at the cost of
|
||||
* buffer overflows from callbacks.
|
||||
* The pivot buffer should not be smaller than the maximum number of
|
||||
* fromUnicode extension table input UChars
|
||||
* (for m:n conversion, see
|
||||
* targetCnv->sharedData->mbcs.extIndexes[UCNV_EXT_COUNT_UCHARS])
|
||||
* or 2 for surrogate pairs.
|
||||
*
|
||||
* Too small a buffer can cause thrashing between pivoting and direct
|
||||
* conversion, with function call overhead outweighing the benefits
|
||||
* of direct conversion.
|
||||
*/
|
||||
if(convert!=NULL && (pivotLimit-pivotStart)>32) {
|
||||
pivotLimit=pivotStart+32;
|
||||
}
|
||||
|
||||
/* prepare the converter arguments */
|
||||
fromUArgs.converter=targetCnv;
|
||||
fromUArgs.flush=FALSE;
|
||||
fromUArgs.offsets=NULL;
|
||||
fromUArgs.target=*target;
|
||||
fromUArgs.targetLimit=targetLimit;
|
||||
fromUArgs.size=sizeof(fromUArgs);
|
||||
|
||||
toUArgs.converter=sourceCnv;
|
||||
toUArgs.flush=flush;
|
||||
toUArgs.offsets=NULL;
|
||||
toUArgs.source=s;
|
||||
toUArgs.sourceLimit=sourceLimit;
|
||||
toUArgs.targetLimit=pivotLimit;
|
||||
toUArgs.size=sizeof(toUArgs);
|
||||
|
||||
/*
|
||||
* TODO: Consider separating this function into two functions,
|
||||
* extracting exactly the conversion loop,
|
||||
* for readability and to reduce the set of visible variables.
|
||||
*
|
||||
* Otherwise stop using s and t from here on.
|
||||
*/
|
||||
s=t=NULL;
|
||||
|
||||
/*
|
||||
* conversion loop
|
||||
*
|
||||
* The sequence of steps in the loop may appear backward,
|
||||
* but the principle is simple:
|
||||
* In the chain of
|
||||
* source - sourceCnv overflow - pivot - targetCnv overflow - target
|
||||
* empty out later buffers before refilling them from earlier ones.
|
||||
*
|
||||
* The targetCnv overflow buffer is flushed out only once before the loop.
|
||||
*/
|
||||
for(;;) {
|
||||
if(reset) {
|
||||
/*
|
||||
* if we did a reset in this function, we know that there is nothing
|
||||
* to convert to the target yet, so we save a function call
|
||||
*/
|
||||
reset=FALSE;
|
||||
} else {
|
||||
/*
|
||||
* convert to the target first in case the pivot is filled at entry
|
||||
* or the targetCnv has some output bytes in its state
|
||||
*/
|
||||
ucnv_fromUnicode(targetCnv,
|
||||
target, targetLimit,
|
||||
(const UChar **)pivotSource, *pivotTarget,
|
||||
NULL,
|
||||
(UBool)(flush && *source==sourceLimit),
|
||||
pErrorCode);
|
||||
/*
|
||||
* if(pivot not empty or error or replay or flush fromUnicode) {
|
||||
* fromUnicode(pivot -> target);
|
||||
* }
|
||||
*
|
||||
* For pivoting conversion; and for direct conversion for
|
||||
* error callback handling and flushing the replay buffer.
|
||||
*/
|
||||
if( *pivotSource<*pivotTarget ||
|
||||
U_FAILURE(*pErrorCode) ||
|
||||
targetCnv->preFromULength<0 ||
|
||||
fromUArgs.flush
|
||||
) {
|
||||
fromUArgs.source=*pivotSource;
|
||||
fromUArgs.sourceLimit=*pivotTarget;
|
||||
_fromUnicodeWithCallback(&fromUArgs, pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
/* target overflow, or conversion error */
|
||||
*pivotSource=(UChar *)fromUArgs.source;
|
||||
break;
|
||||
}
|
||||
|
||||
/* ucnv_fromUnicode() must have consumed the pivot contents since it returned with U_SUCCESS() */
|
||||
*pivotSource=*pivotTarget=pivotStart;
|
||||
/*
|
||||
* _fromUnicodeWithCallback() must have consumed the pivot contents
|
||||
* (*pivotSource==*pivotTarget) since it returned with U_SUCCESS()
|
||||
*/
|
||||
}
|
||||
|
||||
/* convert from the source to the pivot */
|
||||
ucnv_toUnicode(sourceCnv,
|
||||
pivotTarget, pivotLimit,
|
||||
source, sourceLimit,
|
||||
NULL,
|
||||
flush,
|
||||
pErrorCode);
|
||||
/* The pivot buffer is empty; reset it so we start at pivotStart. */
|
||||
*pivotSource=*pivotTarget=pivotStart;
|
||||
|
||||
/*
|
||||
* if(sourceCnv overflow buffer not empty) {
|
||||
* move(sourceCnv overflow buffer -> pivot);
|
||||
* continue;
|
||||
* }
|
||||
*/
|
||||
/* output the sourceCnv overflow buffer */
|
||||
if(sourceCnv->UCharErrorBufferLength>0) {
|
||||
if(ucnv_outputOverflowToUnicode(sourceCnv, pivotTarget, pivotLimit, NULL, pErrorCode)) {
|
||||
/* U_BUFFER_OVERFLOW_ERROR */
|
||||
*pErrorCode=U_ZERO_ERROR;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* check for end of input and break if done
|
||||
*
|
||||
* Checking both flush and fromUArgs.flush ensures that the converters
|
||||
* have been called with the flush flag set if the ucnv_convertEx()
|
||||
* caller set it.
|
||||
*/
|
||||
if( toUArgs.source==sourceLimit &&
|
||||
sourceCnv->preToULength>=0 && sourceCnv->toULength==0 &&
|
||||
(!flush || fromUArgs.flush)
|
||||
) {
|
||||
/* done successfully */
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* use direct conversion if available
|
||||
* but not if continuing a partial match
|
||||
* or flushing the toUnicode replay buffer
|
||||
*/
|
||||
if(convert!=NULL && targetCnv->preFromUFirstCP<0 && sourceCnv->preToULength==0) {
|
||||
if(*pErrorCode==U_USING_DEFAULT_WARNING) {
|
||||
/* remove a warning that may be set by this function */
|
||||
*pErrorCode=U_ZERO_ERROR;
|
||||
}
|
||||
convert(&fromUArgs, &toUArgs, pErrorCode);
|
||||
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
|
||||
break;
|
||||
} else if(U_FAILURE(*pErrorCode)) {
|
||||
if(sourceCnv->toULength>0) {
|
||||
/*
|
||||
* Fall through to calling _toUnicodeWithCallback()
|
||||
* for callback handling.
|
||||
*
|
||||
* The pivot buffer will be reset with
|
||||
* *pivotSource=*pivotTarget=pivotStart;
|
||||
* which indicates a toUnicode error to the caller
|
||||
* (*pivotSource==pivotStart shows no pivot UChars consumed).
|
||||
*/
|
||||
} else {
|
||||
/*
|
||||
* Indicate a fromUnicode error to the caller
|
||||
* (*pivotSource>pivotStart shows some pivot UChars consumed).
|
||||
*/
|
||||
*pivotSource=*pivotTarget=pivotStart+1;
|
||||
/*
|
||||
* Loop around to calling _fromUnicodeWithCallback()
|
||||
* for callback handling.
|
||||
*/
|
||||
continue;
|
||||
}
|
||||
} else if(*pErrorCode==U_USING_DEFAULT_WARNING) {
|
||||
/*
|
||||
* No error, but the implementation requested to temporarily
|
||||
* fall back to pivoting.
|
||||
*/
|
||||
*pErrorCode=U_ZERO_ERROR;
|
||||
/*
|
||||
* The following else branches are almost identical to the end-of-input
|
||||
* handling in _toUnicodeWithCallback().
|
||||
* Avoid calling it just for the end of input.
|
||||
*/
|
||||
} else if(flush && sourceCnv->toULength>0) { /* flush==toUArgs.flush */
|
||||
/*
|
||||
* the entire input stream is consumed
|
||||
* and there is a partial, truncated input sequence left
|
||||
*/
|
||||
|
||||
/* inject an error and continue with callback handling */
|
||||
*pErrorCode=U_TRUNCATED_CHAR_FOUND;
|
||||
} else {
|
||||
/* input consumed */
|
||||
if(flush) {
|
||||
/* reset the converters without calling the callback functions */
|
||||
_reset(sourceCnv, UCNV_RESET_TO_UNICODE, FALSE);
|
||||
_reset(targetCnv, UCNV_RESET_FROM_UNICODE, FALSE);
|
||||
}
|
||||
|
||||
/* done successfully */
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* toUnicode(source -> pivot);
|
||||
*
|
||||
* For pivoting conversion; and for direct conversion for
|
||||
* error callback handling, continuing partial matches
|
||||
* and flushing the replay buffer.
|
||||
*
|
||||
* The pivot buffer is empty and reset.
|
||||
*/
|
||||
toUArgs.target=pivotStart; /* ==*pivotTarget */
|
||||
/* toUArgs.targetLimit=pivotLimit; already set before the loop */
|
||||
_toUnicodeWithCallback(&toUArgs, pErrorCode);
|
||||
*pivotTarget=toUArgs.target;
|
||||
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
|
||||
/* pivot overflow: continue with the conversion loop */
|
||||
*pErrorCode=U_ZERO_ERROR;
|
||||
} else if(U_FAILURE(*pErrorCode) || *pivotTarget==pivotStart) {
|
||||
} else if(U_FAILURE(*pErrorCode) || (!flush && *pivotTarget==pivotStart)) {
|
||||
/* conversion error, or there was nothing left to convert */
|
||||
break;
|
||||
}
|
||||
/* else ucnv_toUnicode() wrote into the pivot buffer: continue */
|
||||
/*
|
||||
* else:
|
||||
* _toUnicodeWithCallback() wrote into the pivot buffer,
|
||||
* continue with fromUnicode conversion.
|
||||
*
|
||||
* Set the fromUnicode flush flag if we flush and if toUnicode has
|
||||
* processed the end of the input.
|
||||
*/
|
||||
if( flush && toUArgs.source==sourceLimit &&
|
||||
sourceCnv->preToULength>=0 &&
|
||||
sourceCnv->UCharErrorBufferLength==0
|
||||
) {
|
||||
fromUArgs.flush=TRUE;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -2049,6 +2353,9 @@ ucnv_convertEx(UConverter *targetCnv, UConverter *sourceCnv,
|
|||
* - a conversion error occurred
|
||||
*/
|
||||
|
||||
*source=toUArgs.source;
|
||||
*target=fromUArgs.target;
|
||||
|
||||
/* terminate the target buffer if possible */
|
||||
if(flush && U_SUCCESS(*pErrorCode)) {
|
||||
if(*target!=targetLimit) {
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 1996-2006, International Business Machines Corporation and
|
||||
* Copyright (c) 1996-2007, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************
|
||||
*
|
||||
|
@ -1433,6 +1433,7 @@ ucnv_swap(const UDataSwapper *ds,
|
|||
outBytes+offset, pErrorCode);
|
||||
} else {
|
||||
/* otherwise: swap the stage tables separately */
|
||||
int32_t maxFastUChar;
|
||||
|
||||
/* stage 1 table: uint16_t[0x440 or 0x40] */
|
||||
if(inStaticData->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
|
||||
|
@ -1467,6 +1468,20 @@ ucnv_swap(const UDataSwapper *ds,
|
|||
/* just uint8_t[], nothing to swap */
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* utf8Friendly MBCS files (mbcsHeader.version 4.3)
|
||||
* contain an additional mbcsIndex table:
|
||||
* uint16_t[(maxFastUChar+1)>>6];
|
||||
* where maxFastUChar=((mbcsHeader.version[2]<<8)|0xff).
|
||||
*/
|
||||
if(mbcsHeader.version[1]>=3 && (maxFastUChar=mbcsHeader.version[2])!=0) {
|
||||
maxFastUChar=(maxFastUChar<<8)|0xff;
|
||||
offset+=count;
|
||||
count=((maxFastUChar+1)>>6)*2;
|
||||
ds->swapArray16(ds, inBytes+offset, (int32_t)count,
|
||||
outBytes+offset, pErrorCode);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -1,12 +1,11 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999-2004, International Business Machines
|
||||
* Copyright (C) 1999-2007, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*
|
||||
* uconv_cnv.h:
|
||||
* defines all the low level conversion functions
|
||||
* T_UnicodeConverter_{to,from}Unicode_$ConversionType
|
||||
* ucnv_cnv.h:
|
||||
* Definitions for converter implementations.
|
||||
*
|
||||
* Modification History:
|
||||
*
|
||||
|
@ -104,6 +103,23 @@ typedef void (*UConverterToUnicode) (UConverterToUnicodeArgs *, UErrorCode *);
|
|||
*/
|
||||
typedef void (*UConverterFromUnicode) (UConverterFromUnicodeArgs *, UErrorCode *);
|
||||
|
||||
/*
|
||||
* Converter implementation function for ucnv_convertEx(), for direct conversion
|
||||
* between two charsets without pivoting through UTF-16.
|
||||
* The rules are the same as for UConverterToUnicode and UConverterFromUnicode.
|
||||
* In addition,
|
||||
* - The toUnicode side must behave and keep state exactly like the
|
||||
* UConverterToUnicode implementation for the same source charset.
|
||||
* - A U_USING_DEFAULT_WARNING can be set to request to temporarily fall back
|
||||
* to pivoting. When this function is called, the conversion framework makes
|
||||
* sure that this warning is not set on input.
|
||||
* - Continuing a partial match and flushing the toUnicode replay buffer
|
||||
* are handled by pivoting, using the toUnicode and fromUnicode functions.
|
||||
*/
|
||||
typedef void (*UConverterConvert) (UConverterFromUnicodeArgs *pFromUArgs,
|
||||
UConverterToUnicodeArgs *pToUArgs,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/*
|
||||
* Converter implementation function for ucnv_getNextUChar().
|
||||
* If the function pointer is NULL, then the toUnicode function will be used.
|
||||
|
@ -214,6 +230,9 @@ struct UConverterImpl {
|
|||
UConverterWriteSub writeSub;
|
||||
UConverterSafeClone safeClone;
|
||||
UConverterGetUnicodeSet getUnicodeSet;
|
||||
|
||||
UConverterConvert toUTF8;
|
||||
UConverterConvert fromUTF8;
|
||||
};
|
||||
|
||||
extern const UConverterSharedData
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2002-2006, International Business Machines
|
||||
* Copyright (C) 2002-2007, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* file name: ucnv_u8.c
|
||||
|
@ -724,6 +724,263 @@ static UChar32 ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
|
|||
return 0xffff;
|
||||
}
|
||||
|
||||
/* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
|
||||
|
||||
/* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
|
||||
static const UChar32
|
||||
utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
|
||||
|
||||
/* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
|
||||
static const UChar32
|
||||
utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
|
||||
|
||||
/* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
|
||||
static void
|
||||
ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
|
||||
UConverterToUnicodeArgs *pToUArgs,
|
||||
UErrorCode *pErrorCode) {
|
||||
UConverter *utf8, *cnv;
|
||||
const uint8_t *source, *sourceLimit;
|
||||
uint8_t *target;
|
||||
int32_t targetCapacity;
|
||||
int32_t count;
|
||||
|
||||
int8_t oldToULength, toULength, toULimit;
|
||||
|
||||
UChar32 c;
|
||||
uint8_t b, t1, t2;
|
||||
|
||||
/* set up the local pointers */
|
||||
utf8=pToUArgs->converter;
|
||||
cnv=pFromUArgs->converter;
|
||||
source=(uint8_t *)pToUArgs->source;
|
||||
sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
|
||||
target=(uint8_t *)pFromUArgs->target;
|
||||
targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
|
||||
|
||||
/* get the converter state from the UTF-8 UConverter */
|
||||
c=(UChar32)utf8->toUnicodeStatus;
|
||||
if(c!=0) {
|
||||
toULength=oldToULength=utf8->toULength;
|
||||
toULimit=(int8_t)utf8->mode;
|
||||
} else {
|
||||
toULength=oldToULength=toULimit=0;
|
||||
}
|
||||
|
||||
count=(int32_t)(sourceLimit-source)+oldToULength;
|
||||
if(count<toULimit) {
|
||||
/*
|
||||
* Not enough input to complete the partial character.
|
||||
* Jump to moreBytes below - it will not output to target.
|
||||
*/
|
||||
} else if(targetCapacity<toULimit) {
|
||||
/*
|
||||
* Not enough target capacity to output the partial character.
|
||||
* Let the standard converter handle this.
|
||||
*/
|
||||
*pErrorCode=U_USING_DEFAULT_WARNING;
|
||||
return;
|
||||
} else {
|
||||
/*
|
||||
* Use a single counter for source and target, counting the minimum of
|
||||
* the source length and the target capacity.
|
||||
* As a result, the source length is checked only once per multi-byte
|
||||
* character instead of twice.
|
||||
*
|
||||
* Make sure that the last byte sequence is complete, or else
|
||||
* stop just before it.
|
||||
* (The longest legal byte sequence has 3 trail bytes.)
|
||||
* Count oldToULength (number of source bytes from a previous buffer)
|
||||
* into the source length but reduce the source index by toULimit
|
||||
* while going back over trail bytes in order to not go back into
|
||||
* the bytes that will be read for finishing a partial
|
||||
* sequence from the previous buffer.
|
||||
* Let the standard converter handle edge cases.
|
||||
*/
|
||||
int32_t i;
|
||||
|
||||
if(count>targetCapacity) {
|
||||
count=targetCapacity;
|
||||
}
|
||||
|
||||
i=0;
|
||||
while(i<3 && i<(count-toULimit)) {
|
||||
b=source[count-oldToULength-i-1];
|
||||
if(U8_IS_TRAIL(b)) {
|
||||
++i;
|
||||
} else {
|
||||
if(i<utf8_countTrailBytes[b]) {
|
||||
/* stop converting before the lead byte if there are not enough trail bytes for it */
|
||||
count-=i+1;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(c!=0) {
|
||||
utf8->toUnicodeStatus=0;
|
||||
utf8->toULength=0;
|
||||
goto moreBytes;
|
||||
/* See note in ucnv_SBCSFromUTF8() about this goto. */
|
||||
}
|
||||
|
||||
/* conversion loop */
|
||||
while(count>0) {
|
||||
b=*source++;
|
||||
if((int8_t)b>=0) {
|
||||
/* convert ASCII */
|
||||
*target++=b;
|
||||
--count;
|
||||
continue;
|
||||
} else {
|
||||
if(b>0xe0) {
|
||||
if( /* handle U+1000..U+D7FF inline */
|
||||
(t1=source[0]) >= 0x80 && ((b<0xed) && (t1 <= 0xbf) ||
|
||||
(b==0xed && (t1 <= 0x9f))) &&
|
||||
(t2=source[1]) >= 0x80 && t2 <= 0xbf
|
||||
) {
|
||||
source+=2;
|
||||
*target++=b;
|
||||
*target++=t1;
|
||||
*target++=t2;
|
||||
count-=3;
|
||||
continue;
|
||||
}
|
||||
} else if(b<0xe0) {
|
||||
if( /* handle U+0080..U+07FF inline */
|
||||
b>=0xc2 &&
|
||||
(t1=*source) >= 0x80 && t1 <= 0xbf
|
||||
) {
|
||||
++source;
|
||||
*target++=b;
|
||||
*target++=t1;
|
||||
count-=2;
|
||||
continue;
|
||||
}
|
||||
} else if(b==0xe0) {
|
||||
if( /* handle U+0800..U+0FFF inline */
|
||||
(t1=source[0]) >= 0xa0 && t1 <= 0xbf &&
|
||||
(t2=source[1]) >= 0x80 && t2 <= 0xbf
|
||||
) {
|
||||
source+=2;
|
||||
*target++=b;
|
||||
*target++=t1;
|
||||
*target++=t2;
|
||||
count-=3;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
/* handle "complicated" and error cases, and continuing partial characters */
|
||||
oldToULength=0;
|
||||
toULength=1;
|
||||
toULimit=utf8_countTrailBytes[b]+1;
|
||||
c=b;
|
||||
moreBytes:
|
||||
while(toULength<toULimit) {
|
||||
if(source<sourceLimit) {
|
||||
b=*source;
|
||||
if(U8_IS_TRAIL(b)) {
|
||||
++source;
|
||||
++toULength;
|
||||
c=(c<<6)+b;
|
||||
} else {
|
||||
break; /* sequence too short, stop with toULength<toULimit */
|
||||
}
|
||||
} else {
|
||||
/* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
|
||||
source-=(toULength-oldToULength);
|
||||
while(oldToULength<toULength) {
|
||||
utf8->toUBytes[oldToULength++]=*source++;
|
||||
}
|
||||
utf8->toUnicodeStatus=c;
|
||||
utf8->toULength=toULength;
|
||||
utf8->mode=toULimit;
|
||||
pToUArgs->source=(char *)source;
|
||||
pFromUArgs->target=(char *)target;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if( toULength==toULimit && /* consumed all trail bytes */
|
||||
(toULength==3 || toULength==2) && /* BMP */
|
||||
(c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
|
||||
(c<=0xd7ff || 0xe000<=c) /* not a surrogate */
|
||||
) {
|
||||
/* legal byte sequence for BMP code point */
|
||||
} else if(
|
||||
toULength==toULimit && toULength==4 &&
|
||||
(0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
|
||||
) {
|
||||
/* legal byte sequence for supplementary code point */
|
||||
} else {
|
||||
/* error handling: illegal UTF-8 byte sequence */
|
||||
source-=(toULength-oldToULength);
|
||||
while(oldToULength<toULength) {
|
||||
utf8->toUBytes[oldToULength++]=*source++;
|
||||
}
|
||||
utf8->toULength=toULength;
|
||||
pToUArgs->source=(char *)source;
|
||||
pFromUArgs->target=(char *)target;
|
||||
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
|
||||
return;
|
||||
}
|
||||
|
||||
/* copy the legal byte sequence to the target */
|
||||
{
|
||||
int8_t i;
|
||||
|
||||
for(i=0; i<oldToULength; ++i) {
|
||||
*target++=utf8->toUBytes[i];
|
||||
}
|
||||
source-=(toULength-oldToULength);
|
||||
for(; i<toULength; ++i) {
|
||||
*target++=*source++;
|
||||
}
|
||||
count-=toULength;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(U_SUCCESS(*pErrorCode) && source<sourceLimit) {
|
||||
if(target==(const uint8_t *)pFromUArgs->targetLimit) {
|
||||
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
|
||||
} else {
|
||||
b=*source;
|
||||
toULimit=utf8_countTrailBytes[b]+1;
|
||||
if(toULimit>(sourceLimit-source)) {
|
||||
/* collect a truncated byte sequence */
|
||||
toULength=0;
|
||||
c=b;
|
||||
for(;;) {
|
||||
utf8->toUBytes[toULength++]=b;
|
||||
if(++source==sourceLimit) {
|
||||
/* partial byte sequence at end of source */
|
||||
utf8->toUnicodeStatus=c;
|
||||
utf8->toULength=toULength;
|
||||
utf8->mode=toULimit;
|
||||
break;
|
||||
} else if(!U8_IS_TRAIL(b=*source)) {
|
||||
/* lead byte in trail byte position */
|
||||
utf8->toULength=toULength;
|
||||
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
|
||||
break;
|
||||
}
|
||||
c=(c<<6)+b;
|
||||
}
|
||||
} else {
|
||||
/* partial-sequence target overflow: fall back to the pivoting implementation */
|
||||
*pErrorCode=U_USING_DEFAULT_WARNING;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* write back the updated pointers */
|
||||
pToUArgs->source=(char *)source;
|
||||
pFromUArgs->target=(char *)target;
|
||||
}
|
||||
|
||||
/* UTF-8 converter data ----------------------------------------------------- */
|
||||
|
||||
static const UConverterImpl _UTF8Impl={
|
||||
|
@ -746,7 +1003,10 @@ static const UConverterImpl _UTF8Impl={
|
|||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
ucnv_getNonSurrogateUnicodeSet
|
||||
ucnv_getNonSurrogateUnicodeSet,
|
||||
|
||||
ucnv_UTF8FromUTF8,
|
||||
ucnv_UTF8FromUTF8
|
||||
};
|
||||
|
||||
/* The 1208 CCSID refers to any version of Unicode of UTF-8 */
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2000-2004, International Business Machines
|
||||
* Copyright (C) 2000-2007, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* file name: ucnvlat1.cpp
|
||||
|
@ -330,6 +330,105 @@ noMoreInput:
|
|||
pArgs->offsets=offsets;
|
||||
}
|
||||
|
||||
/* Convert UTF-8 to Latin-1. Adapted from ucnv_SBCSFromUTF8(). */
|
||||
static void
|
||||
ucnv_Latin1FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
|
||||
UConverterToUnicodeArgs *pToUArgs,
|
||||
UErrorCode *pErrorCode) {
|
||||
UConverter *utf8;
|
||||
const uint8_t *source, *sourceLimit;
|
||||
uint8_t *target;
|
||||
int32_t targetCapacity;
|
||||
|
||||
UChar32 c;
|
||||
uint8_t b, t1;
|
||||
|
||||
/* set up the local pointers */
|
||||
utf8=pToUArgs->converter;
|
||||
source=(uint8_t *)pToUArgs->source;
|
||||
sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
|
||||
target=(uint8_t *)pFromUArgs->target;
|
||||
targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
|
||||
|
||||
/* get the converter state from the UTF-8 UConverter */
|
||||
c=(UChar32)utf8->toUnicodeStatus;
|
||||
if(c!=0 && source<sourceLimit) {
|
||||
if(targetCapacity==0) {
|
||||
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
|
||||
return;
|
||||
} else if(c>=0xc2 && c<=0xc3 && (t1=(uint8_t)(*source-0x80)) <= 0x3f) {
|
||||
++source;
|
||||
*target++=(uint8_t)(((c&3)<<6)|t1);
|
||||
--targetCapacity;
|
||||
|
||||
utf8->toUnicodeStatus=0;
|
||||
utf8->toULength=0;
|
||||
} else {
|
||||
/* complicated, illegal or unmappable input: fall back to the pivoting implementation */
|
||||
*pErrorCode=U_USING_DEFAULT_WARNING;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Make sure that the last byte sequence before sourceLimit is complete
|
||||
* or runs into a lead byte.
|
||||
* In the conversion loop compare source with sourceLimit only once
|
||||
* per multi-byte character.
|
||||
* For Latin-1, adjust sourceLimit only for 1 trail byte because
|
||||
* the conversion loop handles at most 2-byte sequences.
|
||||
*/
|
||||
if(source<sourceLimit && U8_IS_LEAD(*(sourceLimit-1))) {
|
||||
--sourceLimit;
|
||||
}
|
||||
|
||||
/* conversion loop */
|
||||
while(source<sourceLimit) {
|
||||
if(targetCapacity>0) {
|
||||
b=*source++;
|
||||
if((int8_t)b>=0) {
|
||||
/* convert ASCII */
|
||||
*target++=(uint8_t)b;
|
||||
--targetCapacity;
|
||||
} else if( /* handle U+0080..U+00FF inline */
|
||||
b>=0xc2 && b<=0xc3 &&
|
||||
(t1=(uint8_t)(*source-0x80)) <= 0x3f
|
||||
) {
|
||||
++source;
|
||||
*target++=(uint8_t)(((b&3)<<6)|t1);
|
||||
--targetCapacity;
|
||||
} else {
|
||||
/* complicated, illegal or unmappable input: fall back to the pivoting implementation */
|
||||
pToUArgs->source=(char *)(source-1);
|
||||
pFromUArgs->target=(char *)target;
|
||||
*pErrorCode=U_USING_DEFAULT_WARNING;
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
/* target is full */
|
||||
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* The sourceLimit may have been adjusted before the conversion loop
|
||||
* to stop before a truncated sequence.
|
||||
* If so, then collect the truncated sequence now.
|
||||
* For Latin-1, there is at most exactly one lead byte because of the
|
||||
* smaller sourceLimit adjustment logic.
|
||||
*/
|
||||
if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
|
||||
utf8->toUnicodeStatus=utf8->toUBytes[0]=b=*source++;
|
||||
utf8->toULength=1;
|
||||
utf8->mode=utf8_countTrailBytes[b]+1;
|
||||
}
|
||||
|
||||
/* write back the updated pointers */
|
||||
pToUArgs->source=(char *)source;
|
||||
pFromUArgs->target=(char *)target;
|
||||
}
|
||||
|
||||
static void
|
||||
_Latin1GetUnicodeSet(const UConverter *cnv,
|
||||
const USetAdder *sa,
|
||||
|
@ -358,7 +457,10 @@ static const UConverterImpl _Latin1Impl={
|
|||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
_Latin1GetUnicodeSet
|
||||
_Latin1GetUnicodeSet,
|
||||
|
||||
NULL,
|
||||
ucnv_Latin1FromUTF8
|
||||
};
|
||||
|
||||
static const UConverterStaticData _Latin1StaticData={
|
||||
|
@ -532,6 +634,95 @@ _ASCIIGetNextUChar(UConverterToUnicodeArgs *pArgs,
|
|||
return 0xffff;
|
||||
}
|
||||
|
||||
/* "Convert" UTF-8 to US-ASCII: Validate and copy. */
|
||||
static void
|
||||
ucnv_ASCIIFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
|
||||
UConverterToUnicodeArgs *pToUArgs,
|
||||
UErrorCode *pErrorCode) {
|
||||
const uint8_t *source, *sourceLimit;
|
||||
uint8_t *target;
|
||||
int32_t targetCapacity, length;
|
||||
|
||||
uint8_t c;
|
||||
|
||||
if(pToUArgs->converter->toUnicodeStatus!=0) {
|
||||
/* no handling of partial UTF-8 characters here, fall back to pivoting */
|
||||
*pErrorCode=U_USING_DEFAULT_WARNING;
|
||||
return;
|
||||
}
|
||||
|
||||
/* set up the local pointers */
|
||||
source=(const uint8_t *)pToUArgs->source;
|
||||
sourceLimit=(const uint8_t *)pToUArgs->sourceLimit;
|
||||
target=(uint8_t *)pFromUArgs->target;
|
||||
targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
|
||||
|
||||
/*
|
||||
* since the conversion here is 1:1 uint8_t:uint8_t, we need only one counter
|
||||
* for the minimum of the sourceLength and targetCapacity
|
||||
*/
|
||||
length=(int32_t)(sourceLimit-source);
|
||||
if(length<targetCapacity) {
|
||||
targetCapacity=length;
|
||||
}
|
||||
|
||||
/* unroll the loop with the most common case */
|
||||
if(targetCapacity>=16) {
|
||||
int32_t count, loops;
|
||||
uint8_t oredChars;
|
||||
|
||||
loops=count=targetCapacity>>4;
|
||||
do {
|
||||
oredChars=*target++=*source++;
|
||||
oredChars|=*target++=*source++;
|
||||
oredChars|=*target++=*source++;
|
||||
oredChars|=*target++=*source++;
|
||||
oredChars|=*target++=*source++;
|
||||
oredChars|=*target++=*source++;
|
||||
oredChars|=*target++=*source++;
|
||||
oredChars|=*target++=*source++;
|
||||
oredChars|=*target++=*source++;
|
||||
oredChars|=*target++=*source++;
|
||||
oredChars|=*target++=*source++;
|
||||
oredChars|=*target++=*source++;
|
||||
oredChars|=*target++=*source++;
|
||||
oredChars|=*target++=*source++;
|
||||
oredChars|=*target++=*source++;
|
||||
oredChars|=*target++=*source++;
|
||||
|
||||
/* were all 16 entries really valid? */
|
||||
if(oredChars>0x7f) {
|
||||
/* no, return to the first of these 16 */
|
||||
source-=16;
|
||||
target-=16;
|
||||
break;
|
||||
}
|
||||
} while(--count>0);
|
||||
count=loops-count;
|
||||
targetCapacity-=16*count;
|
||||
}
|
||||
|
||||
/* conversion loop */
|
||||
c=0;
|
||||
while(targetCapacity>0 && (c=*source)<=0x7f) {
|
||||
++source;
|
||||
*target++=c;
|
||||
--targetCapacity;
|
||||
}
|
||||
|
||||
if(c>0x7f) {
|
||||
/* non-ASCII character, handle in standard converter */
|
||||
*pErrorCode=U_USING_DEFAULT_WARNING;
|
||||
} else if(source<sourceLimit && target>=(const uint8_t *)pFromUArgs->targetLimit) {
|
||||
/* target is full */
|
||||
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
|
||||
}
|
||||
|
||||
/* write back the updated pointers */
|
||||
pToUArgs->source=(const char *)source;
|
||||
pFromUArgs->target=(char *)target;
|
||||
}
|
||||
|
||||
static void
|
||||
_ASCIIGetUnicodeSet(const UConverter *cnv,
|
||||
const USetAdder *sa,
|
||||
|
@ -560,7 +751,10 @@ static const UConverterImpl _ASCIIImpl={
|
|||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
_ASCIIGetUnicodeSet
|
||||
_ASCIIGetUnicodeSet,
|
||||
|
||||
NULL,
|
||||
ucnv_ASCIIFromUTF8
|
||||
};
|
||||
|
||||
static const UConverterStaticData _ASCIIStaticData={
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2000-2004, International Business Machines
|
||||
* Copyright (C) 2000-2007, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
|
@ -41,6 +41,59 @@
|
|||
* the same toUnicode structures, while the fromUnicode structures for SBCS
|
||||
* differ from those for other MBCS-style converters.
|
||||
*
|
||||
* _MBCSHeader.version 4.3 optionally modifies the fromUnicode data structures
|
||||
* slightly and optionally adds a table for conversion to MBCS (non-SBCS)
|
||||
* charsets.
|
||||
*
|
||||
* The modifications are to make the data utf8Friendly. Not every 4.3 file
|
||||
* contains utf8Friendly data.
|
||||
* It is utf8Friendly if _MBCSHeader.version[2]!=0.
|
||||
* In this case, the data structures are utf8Friendly up to the code point
|
||||
* maxFastUChar=((_MBCSHeader.version[2]<<8)|0xff)
|
||||
*
|
||||
* A utf8Friendly file has fromUnicode stage 3 entries for code points up to
|
||||
* maxFastUChar allocated in blocks of 64 for indexing with the 6 bits from
|
||||
* a UTF-8 trail byte. ASCII is allocated linearly with 128 contiguous entries.
|
||||
*
|
||||
* In addition, a utf8Friendly MBCS file contains an additional
|
||||
* uint16_t mbcsIndex[(maxFastUChar+1)>>6];
|
||||
* which replaces the stage 1 and 2 tables for indexing with bits from the
|
||||
* UTF-8 lead byte and middle trail byte. Unlike the older MBCS stage 2 table,
|
||||
* the mbcsIndex does not contain roundtrip flags. Therefore, all fallbacks
|
||||
* from code points up to maxFastUChar (and roundtrips to 0x00) are moved to
|
||||
* the extension data structure. This also allows for faster roundtrip
|
||||
* conversion from UTF-16.
|
||||
*
|
||||
* SBCS files do not contain an additional sbcsIndex[] array because the
|
||||
* proportional size increase would be noticeable, but the runtime
|
||||
* code builds one for the code point range for which the runtime conversion
|
||||
* code is optimized.
|
||||
*
|
||||
* For SBCS, maxFastUChar should be at least U+0FFF. The initial makeconv
|
||||
* implementation sets it to U+1FFF. Because the sbcsIndex is not stored in
|
||||
* the file, a larger maxFastUChar only affects stage 3 block allocation size
|
||||
* and is free in empty blocks. (Larger blocks with sparse contents cause larger
|
||||
* files.) U+1FFF includes almost all of the small scripts.
|
||||
* U+0FFF covers UTF-8 two-byte sequences and three-byte sequences starting with
|
||||
* 0xe0. This includes most scripts with legacy SBCS charsets.
|
||||
* The initial runtime implementation using 4.3 files only builds an sbcsIndex
|
||||
* for code points up to U+0FFF.
|
||||
*
|
||||
* For MBCS, maxFastUChar should be at least U+D7FF (=initial value).
|
||||
* This boundary is convenient because practically all of the commonly used
|
||||
* characters are below it, and because it is the boundary to surrogate
|
||||
* code points, above which special handling is necessary anyway.
|
||||
* (Surrogate pair assembly for UTF-16, validity checking for UTF-8.)
|
||||
*
|
||||
* maxFastUChar could be up to U+FFFF to cover the whole BMP, which could be
|
||||
* useful especially for conversion from UTF-8 when the input can be assumed
|
||||
* to be valid, because the surrogate range would then not have to be
|
||||
* checked.
|
||||
* (With maxFastUChar=0xffff, makeconv would have to check for mbcsIndex value
|
||||
* overflow because with the all-unassigned block 0 and nearly full mappings
|
||||
* from the BMP it is theoretically possible that an index into stage 3
|
||||
* exceeds 16 bits.)
|
||||
*
|
||||
* _MBCSHeader.version 4.2 adds an optional conversion extension data structure.
|
||||
* If it is present, then an ICU version reading header versions 4.0 or 4.1
|
||||
* will be able to use the base table and ignore the extension.
|
||||
|
@ -60,7 +113,7 @@
|
|||
* struct _MBCSHeader (see the definition in this header file below)
|
||||
* contains 32-bit fields as follows:
|
||||
* 8 values:
|
||||
* 0 uint8_t[4] MBCS version in UVersionInfo format (currently 4.2.0.0)
|
||||
* 0 uint8_t[4] MBCS version in UVersionInfo format (currently 4.3.x.0)
|
||||
* 1 uint32_t countStates
|
||||
* 2 uint32_t countToUFallbacks
|
||||
* 3 uint32_t offsetToUCodeUnits
|
||||
|
@ -121,6 +174,15 @@
|
|||
* uint16_t fromUBytes[fromUBytesLength/2]; or
|
||||
* uint32_t fromUBytes[fromUBytesLength/4];
|
||||
* }
|
||||
*
|
||||
* -- optional utf8Friendly mbcsIndex -- _MBCSHeader.version 4.3 (ICU 3.8) and higher
|
||||
* if(outputType!=MBCS_OUTPUT_1 &&
|
||||
* _MBCSHeader.version[1]>=3 &&
|
||||
* (maxFastUChar=_MBCSHeader.version[2])!=0
|
||||
* ) {
|
||||
* maxFastUChar=(maxFastUChar<<8)|0xff;
|
||||
* uint16_t mbcsIndex[(maxFastUChar+1)>>6];
|
||||
* }
|
||||
* }
|
||||
*
|
||||
* -- extension table, details see ucnv_ext.h
|
||||
|
@ -180,9 +242,17 @@ enum {
|
|||
#define MBCS_ENTRY_FINAL_VALUE(entry) ((entry)&0xfffff)
|
||||
#define MBCS_ENTRY_FINAL_VALUE_16(entry) (uint16_t)(entry)
|
||||
|
||||
/*
 * Test bit ((b)>>2) of the asciiRoundtrips bit set, i.e. one flag per group
 * of 4 consecutive byte values. b must be below 0x80 so that the shift
 * count stays below 32.
 * The shifted 1 is a uint32_t: shifting a signed int 1 into bit 31
 * (b>=0x7c) would be undefined behavior.
 */
#define IS_ASCII_ROUNDTRIP(b, asciiRoundtrips) (((asciiRoundtrips) & ((uint32_t)1<<((b)>>2)))!=0)
|
||||
|
||||
/* single-byte fromUnicode: get the 16-bit result word */
|
||||
#define MBCS_SINGLE_RESULT_FROM_U(table, results, c) (results)[ (table)[ (table)[(c)>>10] +(((c)>>4)&0x3f) ] +((c)&0xf) ]
|
||||
|
||||
/* single-byte fromUnicode using the sbcsIndex */
|
||||
#define SBCS_RESULT_FROM_LOW_BMP(table, results, c) (results)[ (table)[(c)>>6] +((c)&0x3f) ]
|
||||
|
||||
/* single-byte fromUTF8 using the sbcsIndex; l and t must be masked externally; can be l=0 and t<=0x7f */
|
||||
#define SBCS_RESULT_FROM_UTF8(table, results, l, t) (results)[ (table)[l] +(t) ]
|
||||
|
||||
/* multi-byte fromUnicode: get the 32-bit stage 2 entry */
|
||||
#define MBCS_STAGE_2_FROM_U(table, c) ((const uint32_t *)(table))[ (table)[(c)>>10] +(((c)>>4)&0x3f) ]
|
||||
#define MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ( ((stage2Entry) & ((uint32_t)1<< (16+((c)&0xf)) )) !=0)
|
||||
|
@ -192,6 +262,12 @@ enum {
|
|||
|
||||
#define MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c) ((bytes)+(16*(uint32_t)(uint16_t)(stage2Entry)+((c)&0xf))*3)
|
||||
|
||||
/* double-byte fromUnicode using the mbcsIndex */
|
||||
#define DBCS_RESULT_FROM_MOST_BMP(table, results, c) (results)[ (table)[(c)>>6] +((c)&0x3f) ]
|
||||
|
||||
/* double-byte fromUTF8 using the mbcsIndex; l and t1 combined into lt1; lt1 and t2 must be masked externally */
|
||||
#define DBCS_RESULT_FROM_UTF8(table, results, lt1, t2) (results)[ (table)[lt1] +(t2) ]
|
||||
|
||||
|
||||
/**
|
||||
* MBCS output types for conversions from Unicode.
|
||||
|
@ -226,9 +302,19 @@ typedef struct {
|
|||
UChar32 codePoint;
|
||||
} _MBCSToUFallback;
|
||||
|
||||
/**
 * Code point boundaries of the fast, UTF-8-friendly runtime code paths.
 * Each *_LIMIT is the first code point outside the fast range,
 * each *_MAX is the last code point inside it.
 */
enum {
    /* SBCS: see makeconv SBCS_UTF8_MAX */
    SBCS_FAST_LIMIT=0x1000,
    SBCS_FAST_MAX=SBCS_FAST_LIMIT-1,    /* =0x0fff */

    /* MBCS: see makeconv MBCS_UTF8_MAX */
    MBCS_FAST_LIMIT=0xd800,
    MBCS_FAST_MAX=MBCS_FAST_LIMIT-1     /* =0xd7ff */
};
|
||||
|
||||
/**
|
||||
* This is the MBCS part of the UConverterTable union (a runtime data structure).
|
||||
* It keeps all the per-converter data and points into the loaded mapping tables.
|
||||
*
|
||||
* utf8Friendly data structures added with _MBCSHeader.version 4.3
|
||||
*/
|
||||
typedef struct UConverterMBCSTable {
|
||||
/* toUnicode */
|
||||
|
@ -242,10 +328,17 @@ typedef struct UConverterMBCSTable {
|
|||
|
||||
/* fromUnicode */
|
||||
const uint16_t *fromUnicodeTable;
|
||||
const uint16_t *mbcsIndex; /* for fast conversion from most of BMP to MBCS (utf8Friendly data) */
|
||||
uint16_t sbcsIndex[SBCS_FAST_LIMIT>>6]; /* for fast conversion from low BMP to SBCS (utf8Friendly data) */
|
||||
const uint8_t *fromUnicodeBytes;
|
||||
uint8_t *swapLFNLFromUnicodeBytes; /* for swaplfnl */
|
||||
uint8_t *swapLFNLFromUnicodeBytes; /* for swaplfnl */
|
||||
uint32_t fromUBytesLength;
|
||||
uint8_t outputType, unicodeMask;
|
||||
UBool utf8Friendly; /* for utf8Friendly data */
|
||||
UChar maxFastUChar; /* for utf8Friendly data */
|
||||
|
||||
/* roundtrips */
|
||||
uint32_t asciiRoundtrips;
|
||||
|
||||
/* converter name for swaplfnl */
|
||||
char *swapLFNLName;
|
||||
|
|
|
@ -20,6 +20,7 @@
|
|||
#include "unicode/ucnv.h"
|
||||
#include "unicode/ucnv_err.h"
|
||||
#include "unicode/putil.h"
|
||||
#include "unicode/uset.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "ucnv_bld.h" /* for sizeof(UConverter) */
|
||||
#include "cmemory.h" /* for UAlignedMemory */
|
||||
|
@ -108,6 +109,7 @@ static void TestConvertSafeCloneCallback(void);
|
|||
|
||||
static void TestEBCDICSwapLFNL(void);
|
||||
static void TestConvertEx(void);
|
||||
static void TestConvertExFromUTF8(void);
|
||||
static void TestConvertAlgorithmic(void);
|
||||
void TestDefaultConverterError(void); /* defined in cctest.c */
|
||||
static void TestToUCountPending(void);
|
||||
|
@ -136,6 +138,7 @@ void addTestConvert(TestNode** root)
|
|||
addTest(root, &TestLMBCSMaxChar, "tsconv/ccapitst/TestLMBCSMaxChar");
|
||||
addTest(root, &TestEBCDICSwapLFNL, "tsconv/ccapitst/TestEBCDICSwapLFNL");
|
||||
addTest(root, &TestConvertEx, "tsconv/ccapitst/TestConvertEx");
|
||||
addTest(root, &TestConvertExFromUTF8, "tsconv/ccapitst/TestConvertExFromUTF8");
|
||||
addTest(root, &TestConvertAlgorithmic, "tsconv/ccapitst/TestConvertAlgorithmic");
|
||||
addTest(root, &TestDefaultConverterError, "tsconv/ccapitst/TestDefaultConverterError");
|
||||
addTest(root, &TestToUCountPending, "tsconv/ccapitst/TestToUCountPending");
|
||||
|
@ -2205,6 +2208,11 @@ convertExStreaming(UConverter *srcCnv, UConverter *targetCnv,
|
|||
pivotBuffer, &pivotSource, &pivotTarget, pivotLimit,
|
||||
FALSE, flush, &errorCode);
|
||||
targetLength=(int32_t)(target-targetBuffer);
|
||||
if(target>targetLimit) {
|
||||
log_err("ucnv_convertEx(%s) chunk[%d] target %p exceeds targetLimit %p\n",
|
||||
testName, chunkSize, target, targetLimit);
|
||||
break; /* TODO: major problem! */
|
||||
}
|
||||
if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
|
||||
/* continue converting another chunk */
|
||||
errorCode=U_ZERO_ERROR;
|
||||
|
@ -2402,6 +2410,264 @@ static void TestConvertEx() {
|
|||
#endif
|
||||
}
|
||||
|
||||
/*
 * Test illegal UTF-8 input: Data and functions for TestConvertExFromUTF8().
 *
 * The first group holds truncated multi-byte sequences, ordered by length.
 * The second group holds complete sequences that are still illegal
 * (non-shortest forms, out of range, etc.).
 */
static const char *const badUTF8[]={
    /* truncated multi-byte sequences: lead byte only */
    "\xd0", "\xe0", "\xe1", "\xed", "\xee",
    "\xf0", "\xf1", "\xf4", "\xf8", "\xfc",

    /* truncated: lead byte + 1 trail byte */
    "\xe0\x80", "\xe0\xa0", "\xe1\x80", "\xed\x80", "\xed\xa0", "\xee\x80",
    "\xf0\x80", "\xf0\x90", "\xf1\x80", "\xf4\x80", "\xf4\x90",
    "\xf8\x80", "\xfc\x80",

    /* truncated: lead byte + 2 trail bytes */
    "\xf0\x80\x80", "\xf0\x90\x80", "\xf1\x80\x80", "\xf4\x80\x80", "\xf4\x90\x80",
    "\xf8\x80\x80", "\xfc\x80\x80",

    /* truncated: lead byte + 3 trail bytes */
    "\xf8\x80\x80\x80", "\xfc\x80\x80\x80",

    /* truncated: lead byte + 4 trail bytes */
    "\xfc\x80\x80\x80\x80",

    /* complete sequences but non-shortest forms or out of range etc. */
    "\xc0\x80",
    "\xe0\x80\x80",
    "\xed\xa0\x80",
    "\xf0\x80\x80\x80",
    "\xf4\x90\x80\x80",
    "\xf8\x80\x80\x80\x80",
    "\xfc\x80\x80\x80\x80\x80",
    "\xfe",
    "\xff"
};
|
||||
|
||||
/* get some character that can be converted and convert it */
|
||||
static UBool getTestChar(UConverter *cnv, const char *converterName,
|
||||
char charUTF8[4], int32_t *pCharUTF8Length,
|
||||
char char0[8], int32_t *pChar0Length,
|
||||
char char1[8], int32_t *pChar1Length) {
|
||||
UChar utf16[U16_MAX_LENGTH];
|
||||
int32_t utf16Length;
|
||||
|
||||
const UChar *utf16Source;
|
||||
char *target;
|
||||
|
||||
USet *set;
|
||||
UChar32 c;
|
||||
UErrorCode errorCode;
|
||||
|
||||
errorCode=U_ZERO_ERROR;
|
||||
set=uset_open(1, 0);
|
||||
ucnv_getUnicodeSet(cnv, set, UCNV_ROUNDTRIP_SET, &errorCode);
|
||||
c=uset_charAt(set, uset_size(set)/2);
|
||||
uset_close(set);
|
||||
|
||||
utf16Length=0;
|
||||
U16_APPEND_UNSAFE(utf16, utf16Length, c);
|
||||
*pCharUTF8Length=0;
|
||||
U8_APPEND_UNSAFE(charUTF8, *pCharUTF8Length, c);
|
||||
|
||||
utf16Source=utf16;
|
||||
target=char0;
|
||||
ucnv_fromUnicode(cnv,
|
||||
&target, char0+sizeof(char0),
|
||||
&utf16Source, utf16+utf16Length,
|
||||
NULL, FALSE, &errorCode);
|
||||
*pChar0Length=(int32_t)(target-char0);
|
||||
|
||||
utf16Source=utf16;
|
||||
target=char1;
|
||||
ucnv_fromUnicode(cnv,
|
||||
&target, char1+sizeof(char1),
|
||||
&utf16Source, utf16+utf16Length,
|
||||
NULL, FALSE, &errorCode);
|
||||
*pChar1Length=(int32_t)(target-char1);
|
||||
|
||||
if(U_FAILURE(errorCode)) {
|
||||
log_err("unable to get test character for %s - %s\n", converterName, u_errorName(errorCode));
|
||||
return FALSE;
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/*
 * For each truncated entry in badUTF8[], convert "test character + truncated
 * sequence" with flush=TRUE and the STOP callback on the UTF-8 converter.
 * Expect U_TRUNCATED_CHAR_FOUND with an untouched pivot source pointer,
 * and expect ucnv_getInvalidChars() to return exactly the truncated bytes.
 * char0/char1 and their lengths are not used here.
 */
static void testFromTruncatedUTF8(UConverter *utf8Cnv, UConverter *cnv, const char *converterName,
                                  char charUTF8[4], int32_t charUTF8Length,
                                  char char0[8], int32_t char0Length,
                                  char char1[8], int32_t char1Length) {
    char input[16];
    char output[16];

    char invalidChars[8];
    int8_t invalidLength;

    char *source;
    char *target;

    UChar pivotBuffer[8];
    UChar *pivotSource, *pivotTarget;

    UErrorCode errorCode;
    int32_t i;

    /* stop at the first toUnicode error so that it is reported to us */
    errorCode=U_ZERO_ERROR;
    ucnv_setToUCallBack(utf8Cnv, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &errorCode);

    /* every input starts with the same valid test character */
    memcpy(input, charUTF8, charUTF8Length);

    for(i=0; i<LENGTHOF(badUTF8); ++i) {
        const char *bad=badUTF8[i];
        int32_t badLength=strlen(bad);

        /* only truncated sequences are of interest here */
        if(badLength>=(1+U8_COUNT_TRAIL_BYTES(bad[0]))) {
            continue;
        }

        /* append the truncated sequence to the test character */
        memcpy(input+charUTF8Length, bad, badLength);

        /* convert */
        source=input;
        target=output;
        pivotSource=pivotTarget=pivotBuffer;
        errorCode=U_ZERO_ERROR;
        ucnv_convertEx(cnv, utf8Cnv,
                       &target, output+sizeof(output),
                       &source, input+charUTF8Length+badLength,
                       pivotBuffer, &pivotSource, &pivotTarget, pivotBuffer+LENGTHOF(pivotBuffer),
                       TRUE, TRUE, /* reset & flush */
                       &errorCode);
        if(errorCode!=U_TRUNCATED_CHAR_FOUND || pivotSource!=pivotBuffer) {
            log_err("unexpected error %s from %s badUTF8[%ld]\n", u_errorName(errorCode), converterName, (long)i);
            continue;
        }

        /* the invalid bytes must be exactly the truncated sequence */
        errorCode=U_ZERO_ERROR;
        invalidLength=(int8_t)sizeof(invalidChars);
        ucnv_getInvalidChars(utf8Cnv, invalidChars, &invalidLength, &errorCode);
        if(invalidLength!=badLength || 0!=memcmp(invalidChars, bad, badLength)) {
            log_err("wrong invalidChars from %s badUTF8[%ld]\n", converterName, (long)i);
        }
    }
}
|
||||
|
||||
/*
 * Convert one long UTF-8 string that interleaves the valid test character
 * with every entry of badUTF8[], using the SKIP callback on the UTF-8
 * converter. Expect that each bad sequence is detected and skipped, so the
 * output is char0 (first conversion) followed by one char1 per bad sequence.
 * The comparison is delegated to convertExMultiStreaming().
 */
static void testFromBadUTF8(UConverter *utf8Cnv, UConverter *cnv, const char *converterName,
                            char charUTF8[4], int32_t charUTF8Length,
                            char char0[8], int32_t char0Length,
                            char char1[8], int32_t char1Length) {
    char utf8[600], expect[600];
    int32_t utf8Length, expectLength;

    char testName[32];

    UErrorCode errorCode;
    int32_t i;

    errorCode=U_ZERO_ERROR;
    ucnv_setToUCallBack(utf8Cnv, UCNV_TO_U_CALLBACK_SKIP, NULL, NULL, NULL, &errorCode);

    /*
     * assemble an input string with the test character between each
     * bad sequence,
     * and an expected string with repeated test character output
     */
    memcpy(utf8, charUTF8, charUTF8Length);
    utf8Length=charUTF8Length;

    memcpy(expect, char0, char0Length);
    expectLength=char0Length;

    for(i=0; i<LENGTHOF(badUTF8); ++i) {
        int32_t length=(int32_t)strlen(badUTF8[i]);

        /* guard the fixed-size buffers in case badUTF8[] grows */
        if( utf8Length+length+charUTF8Length>(int32_t)sizeof(utf8) ||
            expectLength+char1Length>(int32_t)sizeof(expect)
        ) {
            log_err("test buffers too small for %s at badUTF8[%ld]\n", converterName, (long)i);
            return;
        }

        memcpy(utf8+utf8Length, badUTF8[i], length);
        utf8Length+=length;

        memcpy(utf8+utf8Length, charUTF8, charUTF8Length);
        utf8Length+=charUTF8Length;

        memcpy(expect+expectLength, char1, char1Length);
        expectLength+=char1Length;
    }

    /*
     * expect that each bad UTF-8 sequence is detected and skipped;
     * the converter name is truncated if needed so that testName cannot overflow
     */
    strcpy(testName, "from bad UTF-8 to ");
    strncat(testName, converterName, sizeof(testName)-strlen(testName)-1);

    convertExMultiStreaming(utf8Cnv, cnv,
                            utf8, utf8Length,
                            expect, expectLength,
                            testName,
                            U_ZERO_ERROR);
}
|
||||
|
||||
/* Test illegal UTF-8 input. */
|
||||
static void TestConvertExFromUTF8() {
|
||||
static const char *const converterNames[]={
|
||||
"windows-1252",
|
||||
"shift-jis",
|
||||
"us-ascii",
|
||||
"iso-8859-1",
|
||||
"utf-8"
|
||||
};
|
||||
|
||||
UConverter *utf8Cnv, *cnv;
|
||||
UErrorCode errorCode;
|
||||
int32_t i;
|
||||
|
||||
/* fromUnicode versions of some character, from initial state and later */
|
||||
char charUTF8[4], char0[8], char1[8];
|
||||
int32_t charUTF8Length, char0Length, char1Length;
|
||||
|
||||
errorCode=U_ZERO_ERROR;
|
||||
utf8Cnv=ucnv_open("UTF-8", &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
log_err("unable to open UTF-8 converter - %s\n", u_errorName(errorCode));
|
||||
return;
|
||||
}
|
||||
|
||||
for(i=0; i<LENGTHOF(converterNames); ++i) {
|
||||
errorCode=U_ZERO_ERROR;
|
||||
cnv=ucnv_open(converterNames[i], &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
log_err("unable to open %s converter - %s\n", converterNames[i], u_errorName(errorCode));
|
||||
continue;
|
||||
}
|
||||
if(!getTestChar(cnv, converterNames[i], charUTF8, &charUTF8Length, char0, &char0Length, char1, &char1Length)) {
|
||||
continue;
|
||||
}
|
||||
testFromTruncatedUTF8(utf8Cnv, cnv, converterNames[i], charUTF8, charUTF8Length, char0, char0Length, char1, char1Length);
|
||||
testFromBadUTF8(utf8Cnv, cnv, converterNames[i], charUTF8, charUTF8Length, char0, char0Length, char1, char1Length);
|
||||
ucnv_close(cnv);
|
||||
}
|
||||
ucnv_close(utf8Cnv);
|
||||
}
|
||||
|
||||
static void
|
||||
TestConvertAlgorithmic() {
|
||||
#if !UCONFIG_NO_LEGACY_CONVERSION
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2003-2006, International Business Machines
|
||||
* Copyright (C) 2003-2007, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -50,7 +50,18 @@ enum {
|
|||
ESC_CB='&'
|
||||
};
|
||||
|
||||
ConversionTest::~ConversionTest() {}
|
||||
// Open the UTF-8 converter that is shared by the tests of direct conversion
// from UTF-8, and make it stop on toUnicode errors.
ConversionTest::ConversionTest() {
    UErrorCode status=U_ZERO_ERROR;

    utf8Cnv=ucnv_open("UTF-8", &status);
    ucnv_setToUCallBack(utf8Cnv, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &status);

    // one check covers both calls
    if(U_FAILURE(status)) {
        errln("unable to open UTF-8 converter");
    }
}

// Release the shared UTF-8 converter.
ConversionTest::~ConversionTest() {
    ucnv_close(utf8Cnv);
}
|
||||
|
||||
void
|
||||
ConversionTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
|
||||
|
@ -948,6 +959,112 @@ ConversionTest::checkToUnicode(ConversionCase &cc, UConverter *cnv, const char *
|
|||
|
||||
// fromUnicode test worker functions --------------------------------------- ***
|
||||
|
||||
static int32_t
|
||||
stepFromUTF8(ConversionCase &cc,
|
||||
UConverter *utf8Cnv, UConverter *cnv,
|
||||
char *result, int32_t resultCapacity,
|
||||
int32_t step,
|
||||
UErrorCode *pErrorCode) {
|
||||
const char *source, *sourceLimit, *utf8Limit;
|
||||
UChar pivotBuffer[32];
|
||||
UChar *pivotSource, *pivotTarget, *pivotLimit;
|
||||
char *target, *targetLimit, *resultLimit;
|
||||
UBool flush;
|
||||
|
||||
source=cc.utf8;
|
||||
pivotSource=pivotTarget=pivotBuffer;
|
||||
target=result;
|
||||
utf8Limit=source+cc.utf8Length;
|
||||
resultLimit=result+resultCapacity;
|
||||
|
||||
// call ucnv_convertEx() with in/out buffers no larger than (step) at a time
|
||||
// move only one buffer (in vs. out) at a time to be extra mean
|
||||
// step==0 performs bulk conversion
|
||||
|
||||
// initialize the partial limits for the loop
|
||||
if(step==0) {
|
||||
// use the entire buffers
|
||||
sourceLimit=utf8Limit;
|
||||
targetLimit=resultLimit;
|
||||
flush=cc.finalFlush;
|
||||
|
||||
pivotLimit=pivotBuffer+LENGTHOF(pivotBuffer);
|
||||
} else {
|
||||
// start with empty partial buffers
|
||||
sourceLimit=source;
|
||||
targetLimit=target;
|
||||
flush=FALSE;
|
||||
|
||||
// empty pivot is not allowed, make it of length step
|
||||
pivotLimit=pivotBuffer+step;
|
||||
}
|
||||
|
||||
for(;;) {
|
||||
// resetting the opposite conversion direction must not affect this one
|
||||
ucnv_resetFromUnicode(utf8Cnv);
|
||||
ucnv_resetToUnicode(cnv);
|
||||
|
||||
// convert
|
||||
ucnv_convertEx(cnv, utf8Cnv,
|
||||
&target, targetLimit,
|
||||
&source, sourceLimit,
|
||||
pivotBuffer, &pivotSource, &pivotTarget, pivotLimit,
|
||||
FALSE, flush, pErrorCode);
|
||||
|
||||
// check pointers and errors
|
||||
if(source>sourceLimit || target>targetLimit) {
|
||||
*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
|
||||
break;
|
||||
} else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
|
||||
if(target!=targetLimit) {
|
||||
// buffer overflow must only be set when the target is filled
|
||||
*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
|
||||
break;
|
||||
} else if(targetLimit==resultLimit) {
|
||||
// not just a partial overflow
|
||||
break;
|
||||
}
|
||||
|
||||
// the partial target is filled, set a new limit, reset the error and continue
|
||||
targetLimit=(resultLimit-target)>=step ? target+step : resultLimit;
|
||||
*pErrorCode=U_ZERO_ERROR;
|
||||
} else if(U_FAILURE(*pErrorCode)) {
|
||||
if(pivotSource==pivotBuffer) {
|
||||
// toUnicode error, should not occur
|
||||
// toUnicode errors are tested in cintltst TestConvertExFromUTF8()
|
||||
break;
|
||||
} else {
|
||||
// fromUnicode error
|
||||
// some other error occurred, done
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
if(source!=sourceLimit) {
|
||||
// when no error occurs, then the input must be consumed
|
||||
*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
|
||||
break;
|
||||
}
|
||||
|
||||
if(sourceLimit==utf8Limit) {
|
||||
// we are done
|
||||
if(*pErrorCode==U_STRING_NOT_TERMINATED_WARNING) {
|
||||
// ucnv_convertEx() warns about not terminating the output
|
||||
// but ucnv_fromUnicode() does not and so
|
||||
// checkFromUnicode() does not expect it
|
||||
*pErrorCode=U_ZERO_ERROR;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
// the partial conversion succeeded, set a new limit and continue
|
||||
sourceLimit=(utf8Limit-source)>=step ? source+step : utf8Limit;
|
||||
flush=(UBool)(cc.finalFlush && sourceLimit==utf8Limit);
|
||||
}
|
||||
}
|
||||
|
||||
return (int32_t)(target-result);
|
||||
}
|
||||
|
||||
static int32_t
|
||||
stepFromUnicode(ConversionCase &cc, UConverter *cnv,
|
||||
char *result, int32_t resultCapacity,
|
||||
|
@ -1048,6 +1165,7 @@ ConversionTest::FromUnicodeCase(ConversionCase &cc, UConverterFromUCallback call
|
|||
cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
|
||||
return FALSE;
|
||||
}
|
||||
ucnv_resetToUnicode(utf8Cnv);
|
||||
|
||||
// set the callback
|
||||
if(callback!=NULL) {
|
||||
|
@ -1086,6 +1204,19 @@ ConversionTest::FromUnicodeCase(ConversionCase &cc, UConverterFromUCallback call
|
|||
}
|
||||
}
|
||||
|
||||
// convert unicode to utf8
|
||||
char utf8[200];
|
||||
cc.utf8=utf8;
|
||||
u_strToUTF8(utf8, LENGTHOF(utf8), &cc.utf8Length,
|
||||
cc.unicode, cc.unicodeLength,
|
||||
&errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
// skip UTF-8 testing of a string with an unpaired surrogate,
|
||||
// or of one that's too long
|
||||
// toUnicode errors are tested in cintltst TestConvertExFromUTF8()
|
||||
cc.utf8Length=-1;
|
||||
}
|
||||
|
||||
int32_t resultOffsets[200];
|
||||
char result[200];
|
||||
int32_t resultLength;
|
||||
|
@ -1093,22 +1224,18 @@ ConversionTest::FromUnicodeCase(ConversionCase &cc, UConverterFromUCallback call
|
|||
|
||||
static const struct {
|
||||
int32_t step;
|
||||
const char *name;
|
||||
const char *name, *utf8Name;
|
||||
} steps[]={
|
||||
{ 0, "bulk" }, // must be first for offsets to be checked
|
||||
{ 1, "step=1" },
|
||||
{ 3, "step=3" },
|
||||
{ 7, "step=7" }
|
||||
{ 0, "bulk", "utf8" }, // must be first for offsets to be checked
|
||||
{ 1, "step=1", "utf8 step=1" },
|
||||
{ 3, "step=3", "utf8 step=3" },
|
||||
{ 7, "step=7", "utf8 step=7" }
|
||||
};
|
||||
int32_t i, step;
|
||||
|
||||
ok=TRUE;
|
||||
for(i=0; i<LENGTHOF(steps) && ok; ++i) {
|
||||
step=steps[i].step;
|
||||
if(step!=0) {
|
||||
// bulk test is first, then offsets are not checked any more
|
||||
cc.offsets=NULL;
|
||||
}
|
||||
errorCode=U_ZERO_ERROR;
|
||||
resultLength=stepFromUnicode(cc, cnv,
|
||||
result, LENGTHOF(result),
|
||||
|
@ -1124,6 +1251,28 @@ ConversionTest::FromUnicodeCase(ConversionCase &cc, UConverterFromUCallback call
|
|||
// otherwise do nothing to make sure that flushing resets
|
||||
ucnv_resetFromUnicode(cnv);
|
||||
}
|
||||
|
||||
// bulk test is first, then offsets are not checked any more
|
||||
cc.offsets=NULL;
|
||||
|
||||
// test direct conversion from UTF-8
|
||||
if(cc.utf8Length>=0) {
|
||||
errorCode=U_ZERO_ERROR;
|
||||
resultLength=stepFromUTF8(cc, utf8Cnv, cnv,
|
||||
result, LENGTHOF(result),
|
||||
step, &errorCode);
|
||||
ok=checkFromUnicode(
|
||||
cc, cnv, steps[i].utf8Name,
|
||||
(uint8_t *)result, resultLength,
|
||||
NULL,
|
||||
errorCode);
|
||||
if(U_FAILURE(errorCode) || !cc.finalFlush) {
|
||||
// reset if an error occurred or we did not flush
|
||||
// otherwise do nothing to make sure that flushing resets
|
||||
ucnv_resetToUnicode(utf8Cnv);
|
||||
ucnv_resetFromUnicode(cnv);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// not a real loop, just a convenience for breaking out of the block
|
||||
|
|
|
@ -27,18 +27,25 @@
|
|||
#include "intltest.h"
|
||||
|
||||
struct ConversionCase {
|
||||
/* setup */
|
||||
int32_t caseNr;
|
||||
const char *charset, *cbopt, *name;
|
||||
UChar subString[16];
|
||||
char subchar[8];
|
||||
int8_t setSub;
|
||||
|
||||
/* input and expected output */
|
||||
const uint8_t *bytes;
|
||||
int32_t bytesLength;
|
||||
const UChar *unicode;
|
||||
int32_t unicodeLength;
|
||||
const int32_t *offsets;
|
||||
|
||||
/* UTF-8 version of unicode[unicodeLength] */
|
||||
const char *utf8;
|
||||
int32_t utf8Length;
|
||||
|
||||
/* options */
|
||||
UBool finalFlush;
|
||||
UBool fallbacks;
|
||||
UErrorCode outErrorCode;
|
||||
|
@ -46,6 +53,7 @@ struct ConversionCase {
|
|||
const UChar *invalidUChars;
|
||||
int32_t invalidLength;
|
||||
|
||||
/* actual output */
|
||||
uint8_t resultBytes[200];
|
||||
UChar resultUnicode[200];
|
||||
int32_t resultOffsets[200];
|
||||
|
@ -56,7 +64,7 @@ struct ConversionCase {
|
|||
|
||||
class ConversionTest : public IntlTest {
|
||||
public:
|
||||
ConversionTest() {}
|
||||
ConversionTest();
|
||||
virtual ~ConversionTest();
|
||||
|
||||
void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=0);
|
||||
|
@ -86,6 +94,9 @@ private:
|
|||
|
||||
UConverter *
|
||||
cnv_open(const char *name, UErrorCode &errorCode);
|
||||
|
||||
/* for testing direct UTF-8 conversion */
|
||||
UConverter *utf8Cnv;
|
||||
};
|
||||
|
||||
#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2002-2005, International Business Machines
|
||||
* Copyright (C) 2002-2006, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* file name: utfperf.cpp
|
||||
|
@ -16,45 +16,151 @@
|
|||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "unicode/uperf.h"
|
||||
#include "uoptions.h"
|
||||
|
||||
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
|
||||
|
||||
/* definitions and text buffers */
|
||||
|
||||
#define INPUT_CAPACITY (1024*1024)
|
||||
#define INTERMEDIATE_CAPACITY 4096
|
||||
#define INTERMEDIATE_SMALL_CAPACITY 20
|
||||
#define PIVOT_CAPACITY 1024
|
||||
#define OUTPUT_CAPACITY INPUT_CAPACITY
|
||||
|
||||
static UChar input[INPUT_CAPACITY];
|
||||
static char utf8[INPUT_CAPACITY];
|
||||
static UChar pivot[INTERMEDIATE_CAPACITY];
|
||||
|
||||
static UChar output[OUTPUT_CAPACITY];
|
||||
static char intermediate[INTERMEDIATE_CAPACITY];
|
||||
static char intermediate[OUTPUT_CAPACITY];
|
||||
|
||||
static int32_t inputLength, encodedLength, outputLength, countInputCodePoints;
|
||||
static int32_t utf8Length, encodedLength, outputLength, countInputCodePoints;
|
||||
|
||||
static int32_t fromUCallbackCount;
|
||||
|
||||
class Command : public UPerfFunction {
|
||||
private:
|
||||
Command(const char * name, int32_t buf_cap):name(name),buf_cap(buf_cap){
|
||||
errorCode=U_ZERO_ERROR;
|
||||
cnv=ucnv_open(name, &errorCode);
|
||||
}
|
||||
// Command-line options specific to utfperf.
|
||||
// Options do not have abbreviations: Force readable command lines.
|
||||
// (Using U+0001 for abbreviation characters.)
|
||||
enum {
|
||||
CHARSET,
|
||||
CHUNK_LENGTH,
|
||||
PIVOT_LENGTH,
|
||||
UTFPERF_OPTIONS_COUNT
|
||||
};
|
||||
|
||||
static UOption options[UTFPERF_OPTIONS_COUNT]={
|
||||
UOPTION_DEF("charset", '\x01', UOPT_REQUIRES_ARG),
|
||||
UOPTION_DEF("chunk", '\x01', UOPT_REQUIRES_ARG),
|
||||
UOPTION_DEF("pivot", '\x01', UOPT_REQUIRES_ARG)
|
||||
};
|
||||
|
||||
static const char *const utfperf_usage =
|
||||
"\t--charset Charset for which to test performance, e.g. windows-1251.\n"
|
||||
"\t Default: UTF-8\n"
|
||||
"\t--chunk Length (in bytes) of charset output chunks. [4096]\n"
|
||||
"\t--pivot Length (in UChars) of the UTF-16 pivot buffer, if applicable.\n"
|
||||
"\t [1024]\n";
|
||||
|
||||
// Test object.
|
||||
class UtfPerformanceTest : public UPerfTest{
|
||||
public:
|
||||
static UPerfFunction* get(const char * name, int32_t buf_cap){
|
||||
Command * t = new Command(name, buf_cap);
|
||||
if (U_SUCCESS(t->errorCode)){
|
||||
return t;
|
||||
} else {
|
||||
//fprintf(stderr, "error opening converter for \"%s\" - %s\n", name, u_errorName(errorCode));
|
||||
delete t;
|
||||
return NULL;
|
||||
UtfPerformanceTest(int32_t argc, const char *argv[], UErrorCode &status)
|
||||
: UPerfTest(argc, argv, options, LENGTHOF(options), utfperf_usage, status) {
|
||||
if (U_SUCCESS(status)) {
|
||||
charset = options[CHARSET].value;
|
||||
|
||||
chunkLength = atoi(options[CHUNK_LENGTH].value);
|
||||
if (chunkLength < 1 || OUTPUT_CAPACITY < chunkLength) {
|
||||
fprintf(stderr, "error: chunk length must be 1..%ld\n", (long)OUTPUT_CAPACITY);
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
}
|
||||
|
||||
pivotLength = atoi(options[PIVOT_LENGTH].value);
|
||||
if (pivotLength < 1 || PIVOT_CAPACITY < pivotLength) {
|
||||
fprintf(stderr, "error: pivot length must be 1..%ld\n", (long)PIVOT_CAPACITY);
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
}
|
||||
|
||||
int32_t inputLength;
|
||||
UPerfTest::getBuffer(inputLength, status);
|
||||
countInputCodePoints = u_countChar32(buffer, bufferLen);
|
||||
u_strToUTF8(utf8, (int32_t)sizeof(utf8), &utf8Length, buffer, bufferLen, &status);
|
||||
}
|
||||
}
|
||||
|
||||
virtual UPerfFunction* runIndexedTest(int32_t index, UBool exec, const char* &name, char* par = NULL);
|
||||
|
||||
const UChar *getBuffer() const { return buffer; }
|
||||
int32_t getBufferLen() const { return bufferLen; }
|
||||
|
||||
const char *charset;
|
||||
int32_t chunkLength, pivotLength;
|
||||
};
|
||||
|
||||
U_CDECL_BEGIN
|
||||
// Custom callback for counting callback calls.
|
||||
static void U_CALLCONV
|
||||
fromUCallback(const void *context,
|
||||
UConverterFromUnicodeArgs *fromUArgs,
|
||||
const UChar *codeUnits,
|
||||
int32_t length,
|
||||
UChar32 codePoint,
|
||||
UConverterCallbackReason reason,
|
||||
UErrorCode *pErrorCode) {
|
||||
if (reason <= UCNV_IRREGULAR) {
|
||||
++fromUCallbackCount;
|
||||
}
|
||||
UCNV_FROM_U_CALLBACK_SUBSTITUTE(context, fromUArgs, codeUnits, length, codePoint, reason, pErrorCode);
|
||||
}
|
||||
U_CDECL_END
|
||||
|
||||
// Base class for Roundtrip, FromUnicode and FromUTF8 with common setup.
|
||||
class Command : public UPerfFunction {
|
||||
protected:
|
||||
Command(const UtfPerformanceTest &testcase)
|
||||
: testcase(testcase),
|
||||
input(testcase.getBuffer()), inputLength(testcase.getBufferLen()),
|
||||
errorCode(U_ZERO_ERROR) {
|
||||
cnv=ucnv_open(testcase.charset, &errorCode);
|
||||
if (U_FAILURE(errorCode)) {
|
||||
fprintf(stderr, "error opening converter for \"%s\" - %s\n", testcase.charset, u_errorName(errorCode));
|
||||
}
|
||||
ucnv_setFromUCallBack(cnv, fromUCallback, NULL, NULL, NULL, &errorCode);
|
||||
}
|
||||
public:
|
||||
virtual ~Command(){
|
||||
if(U_SUCCESS(errorCode)) {
|
||||
ucnv_close(cnv);
|
||||
}
|
||||
}
|
||||
// virtual void call(UErrorCode* pErrorCode) { ... }
|
||||
virtual long getOperationsPerIteration(){
|
||||
return countInputCodePoints;
|
||||
}
|
||||
|
||||
const UtfPerformanceTest &testcase;
|
||||
const UChar *input;
|
||||
int32_t inputLength;
|
||||
UErrorCode errorCode;
|
||||
UConverter *cnv;
|
||||
};
|
||||
|
||||
// Test roundtrip UTF-16->encoding->UTF-16.
|
||||
class Roundtrip : public Command {
|
||||
protected:
|
||||
Roundtrip(const UtfPerformanceTest &testcase) : Command(testcase) {}
|
||||
public:
|
||||
static UPerfFunction* get(const UtfPerformanceTest &testcase) {
|
||||
Roundtrip * t = new Roundtrip(testcase);
|
||||
if (U_SUCCESS(t->errorCode)){
|
||||
return t;
|
||||
} else {
|
||||
delete t;
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
virtual void call(UErrorCode* pErrorCode){
|
||||
const UChar *pIn, *pInLimit;
|
||||
UChar *pOut, *pOutLimit;
|
||||
|
@ -63,6 +169,7 @@ public:
|
|||
UBool flush;
|
||||
|
||||
ucnv_reset(cnv);
|
||||
fromUCallbackCount=0;
|
||||
|
||||
pIn=input;
|
||||
pInLimit=input+inputLength;
|
||||
|
@ -70,24 +177,24 @@ public:
|
|||
pOut=output;
|
||||
pOutLimit=output+OUTPUT_CAPACITY;
|
||||
|
||||
pInterLimit=intermediate+buf_cap;
|
||||
pInterLimit=intermediate+testcase.chunkLength;
|
||||
|
||||
encodedLength=outputLength=0;
|
||||
flush=FALSE;
|
||||
|
||||
while(pIn<pInLimit || !flush) {
|
||||
do {
|
||||
/* convert a block of [pIn..pInLimit[ to the encoding in intermediate[] */
|
||||
pInter=intermediate;
|
||||
flush=(UBool)(pIn==pInLimit);
|
||||
ucnv_fromUnicode(cnv, &pInter, pInterLimit, &pIn, pInLimit, NULL, flush, pErrorCode);
|
||||
ucnv_fromUnicode(cnv, &pInter, pInterLimit, &pIn, pInLimit, NULL, TRUE, pErrorCode);
|
||||
encodedLength+=(int32_t)(pInter-intermediate);
|
||||
|
||||
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
|
||||
/* in case flush was TRUE make sure that we convert once more to really flush */
|
||||
flush=FALSE;
|
||||
/* make sure that we convert once more to really flush */
|
||||
*pErrorCode=U_ZERO_ERROR;
|
||||
} else if(U_FAILURE(*pErrorCode)) {
|
||||
return;
|
||||
} else if(pIn==pInLimit) {
|
||||
flush=TRUE;
|
||||
}
|
||||
|
||||
/* convert the block [intermediate..pInter[ back to UTF-16 */
|
||||
|
@ -97,7 +204,7 @@ public:
|
|||
return;
|
||||
}
|
||||
/* intermediate must have been consumed (p==pInter) because of the converter semantics */
|
||||
}
|
||||
} while(!flush);
|
||||
|
||||
outputLength=pOut-output;
|
||||
if(inputLength!=outputLength) {
|
||||
|
@ -105,45 +212,142 @@ public:
|
|||
*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
|
||||
}
|
||||
}
|
||||
virtual long getOperationsPerIteration(){
|
||||
return countInputCodePoints;
|
||||
}
|
||||
const char * name;
|
||||
int32_t buf_cap;
|
||||
UErrorCode errorCode;
|
||||
UConverter *cnv;
|
||||
};
|
||||
|
||||
class UtfPerformanceTest : public UPerfTest{
|
||||
// Test one-way conversion UTF-16->encoding.
|
||||
class FromUnicode : public Command {
|
||||
protected:
|
||||
FromUnicode(const UtfPerformanceTest &testcase) : Command(testcase) {}
|
||||
public:
|
||||
UtfPerformanceTest(int32_t argc, const char *argv[], UErrorCode &status) :UPerfTest(argc,argv,status){
|
||||
getBuffer(inputLength, status);
|
||||
u_strncpy(input, buffer, inputLength);
|
||||
countInputCodePoints = u_countChar32(input, inputLength);
|
||||
}
|
||||
|
||||
virtual UPerfFunction* runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL ){
|
||||
switch (index) {
|
||||
case 0: name = "UTF_8"; if (exec) return Command::get("UTF-8", INTERMEDIATE_CAPACITY); break;
|
||||
case 1: name = "UTF_8_SB"; if (exec) return Command::get("UTF-8",INTERMEDIATE_SMALL_CAPACITY); break;
|
||||
case 2: name = "SCSU"; if (exec) return Command::get("SCSU", INTERMEDIATE_CAPACITY); break;
|
||||
case 3: name = "SCSU_SB"; if (exec) return Command::get("SCSU", INTERMEDIATE_SMALL_CAPACITY); break;
|
||||
case 4: name = "BOCU_1"; if (exec) return Command::get("BOCU-1", INTERMEDIATE_CAPACITY); break;
|
||||
case 5: name = "BOCU_1_SB"; if (exec) return Command::get("BOCU-1",INTERMEDIATE_SMALL_CAPACITY); break;
|
||||
default: name = ""; break;
|
||||
static UPerfFunction* get(const UtfPerformanceTest &testcase) {
|
||||
FromUnicode * t = new FromUnicode(testcase);
|
||||
if (U_SUCCESS(t->errorCode)){
|
||||
return t;
|
||||
} else {
|
||||
delete t;
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
virtual void call(UErrorCode* pErrorCode){
|
||||
const UChar *pIn, *pInLimit;
|
||||
char *pInter, *pInterLimit;
|
||||
|
||||
ucnv_resetFromUnicode(cnv);
|
||||
fromUCallbackCount=0;
|
||||
|
||||
pIn=input;
|
||||
pInLimit=input+inputLength;
|
||||
|
||||
pInterLimit=intermediate+testcase.chunkLength;
|
||||
|
||||
encodedLength=0;
|
||||
|
||||
for(;;) {
|
||||
pInter=intermediate;
|
||||
ucnv_fromUnicode(cnv, &pInter, pInterLimit, &pIn, pInLimit, NULL, TRUE, pErrorCode);
|
||||
encodedLength+=(int32_t)(pInter-intermediate);
|
||||
|
||||
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
|
||||
/* make sure that we convert once more to really flush */
|
||||
*pErrorCode=U_ZERO_ERROR;
|
||||
} else if(U_FAILURE(*pErrorCode)) {
|
||||
return;
|
||||
} else {
|
||||
break; // all done
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
};
|
||||
|
||||
// Test one-way conversion UTF-8->encoding.
|
||||
class FromUTF8 : public Command {
|
||||
protected:
|
||||
FromUTF8(const UtfPerformanceTest &testcase)
|
||||
: Command(testcase),
|
||||
utf8Cnv(NULL),
|
||||
input8(utf8), input8Length(utf8Length) {
|
||||
utf8Cnv=ucnv_open("UTF-8", &errorCode);
|
||||
}
|
||||
public:
|
||||
static UPerfFunction* get(const UtfPerformanceTest &testcase) {
|
||||
FromUTF8 * t = new FromUTF8(testcase);
|
||||
if (U_SUCCESS(t->errorCode)){
|
||||
return t;
|
||||
} else {
|
||||
delete t;
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
~FromUTF8() {
|
||||
ucnv_close(utf8Cnv);
|
||||
}
|
||||
virtual void call(UErrorCode* pErrorCode){
|
||||
const char *pIn, *pInLimit;
|
||||
char *pInter, *pInterLimit;
|
||||
UChar *pivotSource, *pivotTarget, *pivotLimit;
|
||||
|
||||
ucnv_resetToUnicode(utf8Cnv);
|
||||
ucnv_resetFromUnicode(cnv);
|
||||
fromUCallbackCount=0;
|
||||
|
||||
pIn=input8;
|
||||
pInLimit=input8+input8Length;
|
||||
|
||||
pInterLimit=intermediate+testcase.chunkLength;
|
||||
|
||||
pivotSource=pivotTarget=pivot;
|
||||
pivotLimit=pivot+testcase.pivotLength;
|
||||
|
||||
encodedLength=0;
|
||||
|
||||
for(;;) {
|
||||
pInter=intermediate;
|
||||
ucnv_convertEx(cnv, utf8Cnv,
|
||||
&pInter, pInterLimit,
|
||||
&pIn, pInLimit,
|
||||
pivot, &pivotSource, &pivotTarget, pivotLimit,
|
||||
FALSE, TRUE, pErrorCode);
|
||||
encodedLength+=(int32_t)(pInter-intermediate);
|
||||
|
||||
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
|
||||
/* make sure that we convert once more to really flush */
|
||||
*pErrorCode=U_ZERO_ERROR;
|
||||
} else if(U_FAILURE(*pErrorCode)) {
|
||||
return;
|
||||
} else {
|
||||
break; // all done
|
||||
}
|
||||
}
|
||||
}
|
||||
protected:
|
||||
UConverter *utf8Cnv;
|
||||
const char *input8;
|
||||
int32_t input8Length;
|
||||
};
|
||||
|
||||
// Maps a test index to its name and, when exec is true, to a new test object.
// Sets name to "" for an index beyond the last test.
// Returns NULL when exec is false, when the index is unknown, or when the
// test's factory returns NULL.
UPerfFunction* UtfPerformanceTest::runIndexedTest(int32_t index, UBool exec, const char* &name, char* par) {
    if (index == 0) {
        name = "Roundtrip";
        if (exec) {
            return Roundtrip::get(*this);
        }
    } else if (index == 1) {
        name = "FromUnicode";
        if (exec) {
            return FromUnicode::get(*this);
        }
    } else if (index == 2) {
        name = "FromUTF8";
        if (exec) {
            return FromUTF8::get(*this);
        }
    } else {
        name = "";
    }
    return NULL;
}
|
||||
|
||||
int main(int argc, const char *argv[])
|
||||
{
|
||||
// Default values for command-line options.
|
||||
options[CHARSET].value = "UTF-8";
|
||||
options[CHUNK_LENGTH].value = "4096";
|
||||
options[PIVOT_LENGTH].value = "1024";
|
||||
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UtfPerformanceTest test(argc, argv, status);
|
||||
|
||||
if (U_FAILURE(status)){
|
||||
printf("The error is %s\n", u_errorName(status));
|
||||
test.usage();
|
||||
return status;
|
||||
}
|
||||
|
||||
|
@ -152,5 +356,10 @@ int main(int argc, const char *argv[])
|
|||
"arguments.\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (fromUCallbackCount > 0) {
|
||||
printf("Number of fromUnicode callback calls in the last iteration: %ld\n", (long)fromUCallbackCount);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
30
icu4c/source/test/testdata/conversion.txt
vendored
30
icu4c/source/test/testdata/conversion.txt
vendored
|
@ -1,6 +1,6 @@
|
|||
//*******************************************************************************
|
||||
//
|
||||
// Copyright (C) 2003-2006, International Business Machines
|
||||
// Copyright (C) 2003-2007, International Business Machines
|
||||
// Corporation and others. All Rights Reserved.
|
||||
//
|
||||
// file name: conversion.txt
|
||||
|
@ -474,6 +474,34 @@ conversion:table(nofallback) {
|
|||
fromUnicode {
|
||||
Headers { "charset", "unicode", "bytes", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidUChars" }
|
||||
Cases {
|
||||
// Code coverage for UTF-8->SBCS conversion (ucnv_convertEx()).
|
||||
// Test code path for non-roundtripping ASCII characters
|
||||
// (try EBCDIC SBCS, and IBM PC SBCS with control code rotation).
|
||||
{
|
||||
"ibm-37",
|
||||
"a\x85c",
|
||||
:bin{ 811583 },
|
||||
:intvector{ 0,1,2 },
|
||||
:int{1}, :int{0}, "", "?", ""
|
||||
}
|
||||
{
|
||||
"ibm-850",
|
||||
"a\x1ac",
|
||||
:bin{ 617f63 },
|
||||
:intvector{ 0,1,2 },
|
||||
:int{1}, :int{0}, "", "?", ""
|
||||
}
|
||||
// Code coverage for UTF-8->DBCS conversion (ucnv_convertEx()).
|
||||
// Test code path for non-roundtripping ASCII characters
|
||||
// (try IBM PC DBCS with control code rotation).
|
||||
{
|
||||
"ibm-943",
|
||||
"a\x1ac\u30a1\x7ff",
|
||||
:bin{ 617f6383401c66 },
|
||||
:intvector{ 0,1,2,3,3,4,5 },
|
||||
:int{1}, :int{0}, "", "?", ""
|
||||
}
|
||||
// SCSU regression test.
|
||||
{
|
||||
"SCSU",
|
||||
"1\U00010001\u0085\U000500022\ud8003\udc014\ue001",
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (c) 2002-2005, International Business Machines
|
||||
* Copyright (c) 2002-2006, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
**********************************************************************
|
||||
|
@ -16,6 +16,10 @@
|
|||
#include "unicode/utimer.h"
|
||||
#include "ucbuf.h"
|
||||
|
||||
// Forward declarations from uoptions.h.
|
||||
struct UOption;
|
||||
typedef struct UOption UOption;
|
||||
|
||||
#if !UCONFIG_NO_CONVERSION
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
@ -126,6 +130,14 @@ public:
|
|||
protected:
|
||||
UPerfTest(int32_t argc, const char* argv[], UErrorCode& status);
|
||||
|
||||
UPerfTest(int32_t argc, const char* argv[],
|
||||
UOption addOptions[], int32_t addOptionsCount,
|
||||
const char *addUsage,
|
||||
UErrorCode& status);
|
||||
|
||||
void init(UOption addOptions[], int32_t addOptionsCount,
|
||||
UErrorCode& status);
|
||||
|
||||
virtual UPerfFunction* runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL ); // overide !
|
||||
|
||||
virtual UBool runTestLoop( char* testname, char* par );
|
||||
|
@ -141,6 +153,7 @@ protected:
|
|||
int32_t iterations;
|
||||
int32_t passes;
|
||||
int32_t time;
|
||||
const char * _addUsage;
|
||||
const char** _argv;
|
||||
int32_t _argc;
|
||||
int32_t _remainingArgc;
|
||||
|
|
|
@ -26,11 +26,14 @@ const char UPerfTest::gUsageString[] =
|
|||
"\t-e or --encoding encoding of source files\n"
|
||||
"\t-u or --uselen perform timing analysis on non-null terminated buffer using length\n"
|
||||
"\t-f or --file-name file to be used as input data\n"
|
||||
"\t-p or --passes Number of passes to be performed. Requires Numeric argument. Cannot be used with --time\n"
|
||||
"\t-p or --passes Number of passes to be performed. Requires Numeric argument.\n"
|
||||
"\t Cannot be used with --time\n"
|
||||
"\t-i or --iterations Number of iterations to be performed. Requires Numeric argument\n"
|
||||
"\t-t or --time Threshold time for looping until in seconds. Requires Numeric argument.Cannot be used with --iterations\n"
|
||||
"\t-t or --time Threshold time for looping until in seconds. Requires Numeric argument.\n"
|
||||
"\t Cannot be used with --iterations\n"
|
||||
"\t-l or --line-mode The data file should be processed in line mode\n"
|
||||
"\t-b or --bulk-mode The data file should be processed in file based. Cannot be used with --line-mode\n"
|
||||
"\t-b or --bulk-mode The data file should be processed in file based.\n"
|
||||
"\t Cannot be used with --line-mode\n"
|
||||
"\t-L or --locale Locale for the test\n";
|
||||
|
||||
enum
|
||||
|
@ -47,11 +50,12 @@ enum
|
|||
TIME,
|
||||
LINE_MODE,
|
||||
BULK_MODE,
|
||||
LOCALE
|
||||
LOCALE,
|
||||
OPTIONS_COUNT
|
||||
};
|
||||
|
||||
|
||||
static UOption options[]={
|
||||
static UOption options[OPTIONS_COUNT+20]={
|
||||
UOPTION_HELP_H,
|
||||
UOPTION_HELP_QUESTION_MARK,
|
||||
UOPTION_VERBOSE,
|
||||
|
@ -67,32 +71,57 @@ static UOption options[]={
|
|||
UOPTION_DEF( "locale", 'L', UOPT_REQUIRES_ARG)
|
||||
};
|
||||
|
||||
UPerfTest::UPerfTest(int32_t argc, const char* argv[], UErrorCode& status){
|
||||
|
||||
_argc = argc;
|
||||
_argv = argv;
|
||||
ucharBuf = NULL;
|
||||
encoding = "";
|
||||
uselen = FALSE;
|
||||
fileName = NULL;
|
||||
sourceDir = ".";
|
||||
lines = NULL;
|
||||
numLines = 0;
|
||||
line_mode = TRUE;
|
||||
buffer = NULL;
|
||||
bufferLen = 0;
|
||||
verbose = FALSE;
|
||||
bulk_mode = FALSE;
|
||||
passes = iterations = time = 0;
|
||||
locale = NULL;
|
||||
|
||||
// Constructor for tests without tool-specific command-line options:
// sets every member to its default and parses the standard options.
UPerfTest::UPerfTest(int32_t argc, const char* argv[], UErrorCode& status)
        : _argc(argc),
          _argv(argv),
          _addUsage(NULL),
          ucharBuf(NULL),
          encoding(""),
          uselen(FALSE),
          fileName(NULL),
          sourceDir("."),
          lines(NULL),
          numLines(0),
          line_mode(TRUE),
          buffer(NULL),
          bufferLen(0),
          verbose(FALSE),
          bulk_mode(FALSE),
          passes(1),
          iterations(0),
          time(0),
          locale(NULL)
{
    // No additional options to merge into the option table.
    init(NULL, 0, status);
}
|
||||
|
||||
// Constructor for tests that bring their own command-line options:
// same defaults as the plain constructor, but remembers the extra usage
// text and hands the additional options to init() for parsing.
UPerfTest::UPerfTest(int32_t argc, const char* argv[],
                     UOption addOptions[], int32_t addOptionsCount,
                     const char *addUsage,
                     UErrorCode& status)
        : _argc(argc),
          _argv(argv),
          _addUsage(addUsage),
          ucharBuf(NULL),
          encoding(""),
          uselen(FALSE),
          fileName(NULL),
          sourceDir("."),
          lines(NULL),
          numLines(0),
          line_mode(TRUE),
          buffer(NULL),
          bufferLen(0),
          verbose(FALSE),
          bulk_mode(FALSE),
          passes(1),
          iterations(0),
          time(0),
          locale(NULL)
{
    init(addOptions, addOptionsCount, status);
}
|
||||
|
||||
void UPerfTest::init(UOption addOptions[], int32_t addOptionsCount,
|
||||
UErrorCode& status) {
|
||||
//initialize the argument list
|
||||
U_MAIN_INIT_ARGS(argc, argv);
|
||||
U_MAIN_INIT_ARGS(_argc, _argv);
|
||||
|
||||
// add specific options
|
||||
int32_t optionsCount = OPTIONS_COUNT;
|
||||
if (addOptionsCount > 0) {
|
||||
memcpy(options+optionsCount, addOptions, addOptionsCount*sizeof(UOption));
|
||||
optionsCount += addOptionsCount;
|
||||
}
|
||||
|
||||
//parse the arguments
|
||||
_remainingArgc = u_parseArgs(argc, (char**)argv, (int32_t)(sizeof(options)/sizeof(options[0])), options);
|
||||
_remainingArgc = u_parseArgs(_argc, (char**)_argv, optionsCount, options);
|
||||
|
||||
// copy back values for additional options
|
||||
if (addOptionsCount > 0) {
|
||||
memcpy(addOptions, options+OPTIONS_COUNT, addOptionsCount*sizeof(UOption));
|
||||
}
|
||||
|
||||
// Now setup the arguments
|
||||
if(argc==1 || options[HELP1].doesOccur || options[HELP2].doesOccur) {
|
||||
if(_argc==1 || options[HELP1].doesOccur || options[HELP2].doesOccur) {
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
|
@ -122,12 +151,16 @@ UPerfTest::UPerfTest(int32_t argc, const char* argv[], UErrorCode& status){
|
|||
}
|
||||
if(options[ITERATIONS].doesOccur) {
|
||||
iterations = atoi(options[ITERATIONS].value);
|
||||
}
|
||||
|
||||
if(options[TIME].doesOccur) {
|
||||
if(options[TIME].doesOccur) {
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
} else if(options[TIME].doesOccur) {
|
||||
time = atoi(options[TIME].value);
|
||||
} else {
|
||||
iterations = 1000; // some default
|
||||
}
|
||||
|
||||
|
||||
if(options[LINE_MODE].doesOccur) {
|
||||
line_mode = TRUE;
|
||||
bulk_mode = FALSE;
|
||||
|
@ -142,11 +175,6 @@ UPerfTest::UPerfTest(int32_t argc, const char* argv[], UErrorCode& status){
|
|||
locale = options[LOCALE].value;
|
||||
}
|
||||
|
||||
if(time > 0 && iterations >0){
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
int32_t len = 0;
|
||||
resolvedFileName = NULL;
|
||||
if(fileName!=NULL){
|
||||
|
@ -205,6 +233,9 @@ ULine* UPerfTest::getLines(UErrorCode& status){
|
|||
return lines;
|
||||
}
|
||||
const UChar* UPerfTest::getBuffer(int32_t& len, UErrorCode& status){
|
||||
if (U_FAILURE(status)) {
|
||||
return NULL;
|
||||
}
|
||||
len = ucbuf_size(ucharBuf);
|
||||
buffer = (UChar*) uprv_malloc(U_SIZEOF_UCHAR * (len+1));
|
||||
u_strncpy(buffer,ucbuf_getBuffer(ucharBuf,&bufferLen,&status),len);
|
||||
|
@ -421,6 +452,11 @@ UBool UPerfTest::runTestLoop( char* testname, char* par )
|
|||
*/
|
||||
void UPerfTest::usage( void )
|
||||
{
|
||||
puts(gUsageString);
|
||||
if (_addUsage != NULL) {
|
||||
puts(_addUsage);
|
||||
}
|
||||
|
||||
UBool save_verbose = verbose;
|
||||
verbose = TRUE;
|
||||
fprintf(stdout,"Test names:\n");
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2003-2006, International Business Machines
|
||||
* Copyright (C) 2003-2007, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -287,8 +287,10 @@ CnvExtWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
|
|||
/*
|
||||
* Remove fromUnicode fallbacks and SUB mappings which are irrelevant for
|
||||
* the toUnicode table.
|
||||
* This includes mappings with MBCS_FROM_U_EXT_FLAG which were suitable
|
||||
* for the base toUnicode table but not for the base fromUnicode table.
|
||||
* The table must be sorted.
|
||||
* Destroys previous data in the reverseMap.
|
||||
* Modifies previous data in the reverseMap.
|
||||
*/
|
||||
static int32_t
|
||||
reduceToUMappings(UCMTable *table) {
|
||||
|
@ -570,6 +572,7 @@ makeToUTable(CnvExtData *extData, UCMTable *table) {
|
|||
/*
|
||||
* Remove toUnicode fallbacks and non-<subchar1> SUB mappings
|
||||
* which are irrelevant for the fromUnicode extension table.
|
||||
* Remove MBCS_FROM_U_EXT_FLAG bits.
|
||||
* Overwrite the reverseMap with an index array to the relevant mappings.
|
||||
* Modify the code point sequences to a generator-friendly format where
|
||||
* the first code points remains unchanged but the following are recoded
|
||||
|
@ -596,6 +599,10 @@ prepareFromUMappings(UCMTable *table) {
|
|||
|
||||
for(i=j=0; i<count; ++m, ++i) {
|
||||
flag=m->f;
|
||||
if(flag>=0) {
|
||||
flag&=MBCS_FROM_U_EXT_MASK;
|
||||
m->f=flag;
|
||||
}
|
||||
if(flag==0 || flag==1 || (flag==2 && m->bLen==1)) {
|
||||
map[j++]=i;
|
||||
|
||||
|
@ -1065,4 +1072,3 @@ CnvExtAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *sta
|
|||
makeToUTable(extData, table) &&
|
||||
makeFromUTable(extData, table);
|
||||
}
|
||||
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2000-2006, International Business Machines
|
||||
* Copyright (C) 2000-2007, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -25,6 +25,10 @@
|
|||
#include "makeconv.h"
|
||||
#include "genmbcs.h"
|
||||
|
||||
/*
|
||||
* TODO: Split this file into toUnicode, SBCSFromUnicode and MBCSFromUnicode files.
|
||||
* Reduce tests for maxCharLength.
|
||||
*/
|
||||
|
||||
typedef struct MBCSData {
|
||||
NewConverter newConverter;
|
||||
|
@ -42,6 +46,11 @@ typedef struct MBCSData {
|
|||
uint32_t stage2[MBCS_STAGE_2_SIZE]; /* stage 2 for MBCS */
|
||||
uint8_t *fromUBytes;
|
||||
uint32_t stage2Top, stage3Top;
|
||||
|
||||
/* fromUTF8 */
|
||||
uint16_t stageUTF8[MBCS_UTF8_STAGE_SIZE];
|
||||
|
||||
UBool utf8Friendly;
|
||||
} MBCSData;
|
||||
|
||||
/* prototypes */
|
||||
|
@ -108,26 +117,14 @@ printBytes(char *buffer, const uint8_t *bytes, int32_t length) {
|
|||
|
||||
static void
|
||||
MBCSInit(MBCSData *mbcsData, UCMFile *ucm) {
|
||||
int32_t i, maxCharLength;
|
||||
|
||||
uprv_memset(mbcsData, 0, sizeof(MBCSData));
|
||||
|
||||
maxCharLength=ucm->states.maxCharLength;
|
||||
|
||||
mbcsData->ucm=ucm; /* aliased, not owned */
|
||||
|
||||
mbcsData->newConverter.close=MBCSClose;
|
||||
mbcsData->newConverter.isValid=MBCSIsValid;
|
||||
mbcsData->newConverter.addTable=MBCSAddTable;
|
||||
mbcsData->newConverter.write=MBCSWrite;
|
||||
|
||||
mbcsData->stage2Top=MBCS_STAGE_2_FIRST_ASSIGNED; /* after stage 1 and one all-unassigned stage 2 block */
|
||||
mbcsData->stage3Top=16*maxCharLength; /* after one all-unassigned stage 3 block */
|
||||
|
||||
/* point all entries in stage 1 to the "all-unassigned" first block in stage 2 */
|
||||
for(i=0; i<MBCS_STAGE_1_SIZE; ++i) {
|
||||
mbcsData->stage1[i]=MBCS_STAGE_2_ALL_UNASSIGNED_INDEX;
|
||||
}
|
||||
}
|
||||
|
||||
NewConverter *
|
||||
|
@ -139,19 +136,28 @@ MBCSOpen(UCMFile *ucm) {
|
|||
return &mbcsData->newConverter;
|
||||
}
|
||||
|
||||
static void
|
||||
MBCSDestruct(MBCSData *mbcsData) {
|
||||
uprv_free(mbcsData->unicodeCodeUnits);
|
||||
uprv_free(mbcsData->fromUBytes);
|
||||
}
|
||||
|
||||
static void
|
||||
MBCSClose(NewConverter *cnvData) {
|
||||
MBCSData *mbcsData=(MBCSData *)cnvData;
|
||||
if(mbcsData!=NULL) {
|
||||
uprv_free(mbcsData->unicodeCodeUnits);
|
||||
uprv_free(mbcsData->fromUBytes);
|
||||
MBCSDestruct(mbcsData);
|
||||
uprv_free(mbcsData);
|
||||
}
|
||||
}
|
||||
|
||||
static UBool
|
||||
MBCSStartMappings(MBCSData *mbcsData) {
|
||||
int32_t i, sum;
|
||||
int32_t i, sum, maxCharLength,
|
||||
stage2NullLength, stage2AllocLength,
|
||||
stage3NullLength, stage3AllocLength;
|
||||
|
||||
/* toUnicode */
|
||||
|
||||
/* allocate the code unit array and prefill it with "unassigned" values */
|
||||
sum=mbcsData->ucm->states.countToUCodeUnits;
|
||||
|
@ -171,21 +177,102 @@ MBCSStartMappings(MBCSData *mbcsData) {
|
|||
}
|
||||
}
|
||||
|
||||
/* fromUnicode */
|
||||
maxCharLength=mbcsData->ucm->states.maxCharLength;
|
||||
|
||||
/* allocate the codepage mappings and preset the first 16 characters to 0 */
|
||||
if(mbcsData->ucm->states.maxCharLength==1) {
|
||||
if(maxCharLength==1) {
|
||||
/* allocate 64k 16-bit results for single-byte codepages */
|
||||
sum=0x20000;
|
||||
} else {
|
||||
/* allocate 1M * maxCharLength bytes for at most 1M mappings */
|
||||
sum=0x100000*mbcsData->ucm->states.maxCharLength;
|
||||
sum=0x100000*maxCharLength;
|
||||
}
|
||||
mbcsData->fromUBytes=(uint8_t *)uprv_malloc(sum);
|
||||
if(mbcsData->fromUBytes==NULL) {
|
||||
fprintf(stderr, "error: out of memory allocating %ld B for target mappings\n", (long)sum);
|
||||
return FALSE;
|
||||
}
|
||||
/* initialize the all-unassigned first stage 3 block */
|
||||
uprv_memset(mbcsData->fromUBytes, 0, 64);
|
||||
uprv_memset(mbcsData->fromUBytes, 0, sum);
|
||||
|
||||
/*
|
||||
* UTF-8-friendly fromUnicode tries: allocate multiple blocks at a time.
|
||||
* See ucnvmbcs.h for details.
|
||||
*
|
||||
* There is code, for example in ucnv_MBCSGetUnicodeSetForUnicode(), which
|
||||
* assumes that the initial stage 2/3 blocks are the all-unassigned ones.
|
||||
* Therefore, we refine the data structure while maintaining this placement
|
||||
* even though it would be convenient to allocate the ASCII block at the
|
||||
* beginning of stage 3, for example.
|
||||
*
|
||||
* UTF-8-friendly fromUnicode tries work from sorted tables and are built
|
||||
* pre-compacted, overlapping adjacent stage 2/3 blocks.
|
||||
* This is necessary because the block allocation and compaction changes
|
||||
* at SBCS_UTF8_MAX or MBCS_UTF8_MAX, and for MBCS tables the additional
|
||||
* stage table uses direct indexes into stage 3, without a multiplier and
|
||||
* thus with a smaller reach.
|
||||
*
|
||||
* Non-UTF-8-friendly fromUnicode tries work from unsorted tables
|
||||
* (because implicit precision is used), and are compacted
|
||||
* in post-processing.
|
||||
*
|
||||
* Preallocation for UTF-8-friendly fromUnicode tries:
|
||||
*
|
||||
* Stage 3:
|
||||
* 64-entry all-unassigned first block followed by ASCII (128 entries).
|
||||
*
|
||||
* Stage 2:
|
||||
* 64-entry all-unassigned first block followed by preallocated
|
||||
* 64-block for ASCII.
|
||||
*/
|
||||
|
||||
/* Preallocate ASCII as a linear 128-entry stage 3 block. */
|
||||
stage2NullLength=MBCS_STAGE_2_BLOCK_SIZE;
|
||||
stage2AllocLength=MBCS_STAGE_2_BLOCK_SIZE;
|
||||
|
||||
stage3NullLength=MBCS_UTF8_STAGE_3_BLOCK_SIZE;
|
||||
stage3AllocLength=128; /* ASCII U+0000..U+007f */
|
||||
|
||||
/* Initialize stage 1 for the preallocated blocks. */
|
||||
sum=stage2NullLength;
|
||||
for(i=0; i<(stage2AllocLength>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT); ++i) {
|
||||
mbcsData->stage1[i]=sum;
|
||||
sum+=MBCS_STAGE_2_BLOCK_SIZE;
|
||||
}
|
||||
mbcsData->stage2Top=stage2NullLength+stage2AllocLength; /* ==sum */
|
||||
|
||||
/*
|
||||
* Stage 2 indexes count 16-blocks in stage 3 as follows:
|
||||
* SBCS: directly, indexes increment by 16
|
||||
* MBCS: indexes need to be multiplied by 16*maxCharLength, indexes increment by 1
|
||||
* MBCS UTF-8: directly, indexes increment by 16
|
||||
*/
|
||||
if(maxCharLength==1) {
|
||||
sum=stage3NullLength;
|
||||
for(i=0; i<(stage3AllocLength/MBCS_STAGE_3_BLOCK_SIZE); ++i) {
|
||||
mbcsData->stage2Single[mbcsData->stage1[0]+i]=sum;
|
||||
sum+=MBCS_STAGE_3_BLOCK_SIZE;
|
||||
}
|
||||
} else {
|
||||
sum=stage3NullLength/MBCS_STAGE_3_GRANULARITY;
|
||||
for(i=0; i<(stage3AllocLength/MBCS_STAGE_3_BLOCK_SIZE); ++i) {
|
||||
mbcsData->stage2[mbcsData->stage1[0]+i]=sum;
|
||||
sum+=MBCS_STAGE_3_BLOCK_SIZE/MBCS_STAGE_3_GRANULARITY;
|
||||
}
|
||||
}
|
||||
|
||||
sum=stage3NullLength;
|
||||
for(i=0; i<(stage3AllocLength/MBCS_UTF8_STAGE_3_BLOCK_SIZE); ++i) {
|
||||
mbcsData->stageUTF8[i]=sum;
|
||||
sum+=MBCS_UTF8_STAGE_3_BLOCK_SIZE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Allocate a 64-entry all-unassigned first stage 3 block,
|
||||
* for UTF-8-friendly lookup with a trail byte,
|
||||
* plus 128 entries for ASCII.
|
||||
*/
|
||||
mbcsData->stage3Top=(stage3NullLength+stage3AllocLength)*maxCharLength; /* ==sum*maxCharLength */
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
@ -437,11 +524,13 @@ MBCSSingleAddFromUnicode(MBCSData *mbcsData,
|
|||
const uint8_t *bytes, int32_t length,
|
||||
UChar32 c,
|
||||
int8_t flag) {
|
||||
uint16_t *p;
|
||||
uint16_t *stage3, *p;
|
||||
uint32_t index;
|
||||
uint16_t old;
|
||||
uint8_t b;
|
||||
|
||||
uint32_t blockSize, newTop, i, nextOffset, newBlock, min;
|
||||
|
||||
/* ignore |2 SUB mappings */
|
||||
if(flag==2) {
|
||||
return TRUE;
|
||||
|
@ -453,13 +542,28 @@ MBCSSingleAddFromUnicode(MBCSData *mbcsData,
|
|||
* Note that the first stage 2 and 3 blocks are reserved for all-unassigned mappings.
|
||||
* We assume that length<=maxCharLength and that c<=0x10ffff.
|
||||
*/
|
||||
stage3=(uint16_t *)mbcsData->fromUBytes;
|
||||
b=*bytes;
|
||||
|
||||
/* inspect stage 1 */
|
||||
index=c>>10;
|
||||
index=c>>MBCS_STAGE_1_SHIFT;
|
||||
if(mbcsData->utf8Friendly && c<=SBCS_UTF8_MAX) {
|
||||
nextOffset=(c>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK&~(MBCS_UTF8_STAGE_3_BLOCKS-1);
|
||||
} else {
|
||||
nextOffset=(c>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK;
|
||||
}
|
||||
if(mbcsData->stage1[index]==MBCS_STAGE_2_ALL_UNASSIGNED_INDEX) {
|
||||
/* allocate another block in stage 2 */
|
||||
if(mbcsData->stage2Top>=MBCS_MAX_STAGE_2_TOP) {
|
||||
newBlock=mbcsData->stage2Top;
|
||||
if(mbcsData->utf8Friendly) {
|
||||
min=newBlock-nextOffset; /* minimum block start with overlap */
|
||||
while(min<newBlock && mbcsData->stage2Single[newBlock-1]==0) {
|
||||
--newBlock;
|
||||
}
|
||||
}
|
||||
newTop=newBlock+MBCS_STAGE_2_BLOCK_SIZE;
|
||||
|
||||
if(newTop>MBCS_MAX_STAGE_2_TOP) {
|
||||
fprintf(stderr, "error: too many stage 2 entries at U+%04x<->0x%02x\n", (int)c, b);
|
||||
return FALSE;
|
||||
}
|
||||
|
@ -468,26 +572,46 @@ MBCSSingleAddFromUnicode(MBCSData *mbcsData,
|
|||
* each stage 2 block contains 64 16-bit words:
|
||||
* 6 code point bits 9..4 with 1 stage 3 index
|
||||
*/
|
||||
mbcsData->stage1[index]=(uint16_t)mbcsData->stage2Top;
|
||||
mbcsData->stage2Top+=MBCS_STAGE_2_BLOCK_SIZE;
|
||||
mbcsData->stage1[index]=(uint16_t)newBlock;
|
||||
mbcsData->stage2Top=newTop;
|
||||
}
|
||||
|
||||
/* inspect stage 2 */
|
||||
index=(uint32_t)mbcsData->stage1[index]+((c>>4)&0x3f);
|
||||
index=mbcsData->stage1[index]+nextOffset;
|
||||
if(mbcsData->utf8Friendly && c<=SBCS_UTF8_MAX) {
|
||||
/* allocate 64-entry blocks for UTF-8-friendly lookup */
|
||||
blockSize=MBCS_UTF8_STAGE_3_BLOCK_SIZE;
|
||||
nextOffset=c&MBCS_UTF8_STAGE_3_BLOCK_MASK;
|
||||
} else {
|
||||
blockSize=MBCS_STAGE_3_BLOCK_SIZE;
|
||||
nextOffset=c&MBCS_STAGE_3_BLOCK_MASK;
|
||||
}
|
||||
if(mbcsData->stage2Single[index]==0) {
|
||||
/* allocate another block in stage 3 */
|
||||
if(mbcsData->stage3Top>=0x10000) {
|
||||
newBlock=mbcsData->stage3Top;
|
||||
if(mbcsData->utf8Friendly) {
|
||||
min=newBlock-nextOffset; /* minimum block start with overlap */
|
||||
while(min<newBlock && stage3[newBlock-1]==0) {
|
||||
--newBlock;
|
||||
}
|
||||
}
|
||||
newTop=newBlock+blockSize;
|
||||
|
||||
if(newTop>MBCS_STAGE_3_SBCS_SIZE) {
|
||||
fprintf(stderr, "error: too many code points at U+%04x<->0x%02x\n", (int)c, b);
|
||||
return FALSE;
|
||||
}
|
||||
/* each block has 16 uint16_t entries */
|
||||
mbcsData->stage2Single[index]=(uint16_t)mbcsData->stage3Top;
|
||||
uprv_memset(mbcsData->fromUBytes+2*mbcsData->stage3Top, 0, 32);
|
||||
mbcsData->stage3Top+=16;
|
||||
i=index;
|
||||
while(newBlock<newTop) {
|
||||
mbcsData->stage2Single[i++]=(uint16_t)newBlock;
|
||||
newBlock+=MBCS_STAGE_3_BLOCK_SIZE;
|
||||
}
|
||||
mbcsData->stage3Top=newTop; /* ==newBlock */
|
||||
}
|
||||
|
||||
/* write the codepage entry into stage 3 and get the previous entry */
|
||||
p=(uint16_t *)mbcsData->fromUBytes+mbcsData->stage2Single[index]+(c&0xf);
|
||||
p=stage3+mbcsData->stage2Single[index]+nextOffset;
|
||||
old=*p;
|
||||
if(flag<=0) {
|
||||
*p=(uint16_t)(0xf00|b);
|
||||
|
@ -520,21 +644,14 @@ MBCSAddFromUnicode(MBCSData *mbcsData,
|
|||
int8_t flag) {
|
||||
char buffer[10];
|
||||
const uint8_t *pb;
|
||||
uint8_t *p;
|
||||
uint32_t index, b, old;
|
||||
uint8_t *stage3, *p;
|
||||
uint32_t index, b, old, stage3Index;
|
||||
int32_t maxCharLength;
|
||||
|
||||
/* ignore |2 SUB mappings */
|
||||
if(flag==2) {
|
||||
return TRUE;
|
||||
}
|
||||
uint32_t blockSize, newTop, i, nextOffset, newBlock, min, overlap, maxOverlap;
|
||||
|
||||
maxCharLength=mbcsData->ucm->states.maxCharLength;
|
||||
|
||||
if(maxCharLength==1) {
|
||||
return MBCSSingleAddFromUnicode(mbcsData, bytes, length, c, flag);
|
||||
}
|
||||
|
||||
if( mbcsData->ucm->states.outputType==MBCS_OUTPUT_2_SISO &&
|
||||
(*bytes==0xe || *bytes==0xf)
|
||||
) {
|
||||
|
@ -556,12 +673,27 @@ MBCSAddFromUnicode(MBCSData *mbcsData,
|
|||
* all-unassigned mappings.
|
||||
* We assume that length<=maxCharLength and that c<=0x10ffff.
|
||||
*/
|
||||
stage3=mbcsData->fromUBytes;
|
||||
|
||||
/* inspect stage 1 */
|
||||
index=c>>10;
|
||||
index=c>>MBCS_STAGE_1_SHIFT;
|
||||
if(mbcsData->utf8Friendly && c<=MBCS_UTF8_MAX) {
|
||||
nextOffset=(c>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK&~(MBCS_UTF8_STAGE_3_BLOCKS-1);
|
||||
} else {
|
||||
nextOffset=(c>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK;
|
||||
}
|
||||
if(mbcsData->stage1[index]==MBCS_STAGE_2_ALL_UNASSIGNED_INDEX) {
|
||||
/* allocate another block in stage 2 */
|
||||
if(mbcsData->stage2Top>=MBCS_MAX_STAGE_2_TOP) {
|
||||
newBlock=mbcsData->stage2Top;
|
||||
if(mbcsData->utf8Friendly) {
|
||||
min=newBlock-nextOffset; /* minimum block start with overlap */
|
||||
while(min<newBlock && mbcsData->stage2[newBlock-1]==0) {
|
||||
--newBlock;
|
||||
}
|
||||
}
|
||||
newTop=newBlock+MBCS_STAGE_2_BLOCK_SIZE;
|
||||
|
||||
if(newTop>MBCS_MAX_STAGE_2_TOP) {
|
||||
fprintf(stderr, "error: too many stage 2 entries at U+%04x<->0x%s\n",
|
||||
(int)c, printBytes(buffer, bytes, length));
|
||||
return FALSE;
|
||||
|
@ -571,23 +703,90 @@ MBCSAddFromUnicode(MBCSData *mbcsData,
|
|||
* each stage 2 block contains 64 32-bit words:
|
||||
* 6 code point bits 9..4 with value with bits 31..16 "assigned" flags and bits 15..0 stage 3 index
|
||||
*/
|
||||
mbcsData->stage1[index]=(uint16_t)mbcsData->stage2Top;
|
||||
mbcsData->stage2Top+=MBCS_STAGE_2_BLOCK_SIZE;
|
||||
i=index;
|
||||
while(newBlock<newTop) {
|
||||
mbcsData->stage1[i++]=(uint16_t)newBlock;
|
||||
newBlock+=MBCS_STAGE_2_BLOCK_SIZE;
|
||||
}
|
||||
mbcsData->stage2Top=newTop; /* ==newBlock */
|
||||
}
|
||||
|
||||
/* inspect stage 2 */
|
||||
index=mbcsData->stage1[index]+((c>>4)&0x3f);
|
||||
index=mbcsData->stage1[index]+nextOffset;
|
||||
if(mbcsData->utf8Friendly && c<=MBCS_UTF8_MAX) {
|
||||
/* allocate 64-entry blocks for UTF-8-friendly lookup */
|
||||
blockSize=MBCS_UTF8_STAGE_3_BLOCK_SIZE*maxCharLength;
|
||||
nextOffset=c&MBCS_UTF8_STAGE_3_BLOCK_MASK;
|
||||
} else {
|
||||
blockSize=MBCS_STAGE_3_BLOCK_SIZE*maxCharLength;
|
||||
nextOffset=c&MBCS_STAGE_3_BLOCK_MASK;
|
||||
}
|
||||
if(mbcsData->stage2[index]==0) {
|
||||
/* allocate another block in stage 3 */
|
||||
if(mbcsData->stage3Top>=0x100000*(uint32_t)maxCharLength) {
|
||||
newBlock=mbcsData->stage3Top;
|
||||
if(mbcsData->utf8Friendly && nextOffset>=MBCS_STAGE_3_GRANULARITY) {
|
||||
/*
|
||||
* Overlap stage 3 blocks only in multiples of 16-entry blocks
|
||||
* because of the indexing granularity in stage 2.
|
||||
*/
|
||||
maxOverlap=(nextOffset&~(MBCS_STAGE_3_GRANULARITY-1))*maxCharLength;
|
||||
for(overlap=0;
|
||||
overlap<maxOverlap && stage3[newBlock-overlap-1]==0;
|
||||
++overlap) {}
|
||||
|
||||
overlap=(overlap/MBCS_STAGE_3_GRANULARITY)/maxCharLength;
|
||||
overlap=(overlap*MBCS_STAGE_3_GRANULARITY)*maxCharLength;
|
||||
|
||||
newBlock-=overlap;
|
||||
}
|
||||
newTop=newBlock+blockSize;
|
||||
|
||||
if(newTop>MBCS_STAGE_3_MBCS_SIZE*(uint32_t)maxCharLength) {
|
||||
fprintf(stderr, "error: too many code points at U+%04x<->0x%s\n",
|
||||
(int)c, printBytes(buffer, bytes, length));
|
||||
return FALSE;
|
||||
}
|
||||
/* each block has 16*maxCharLength bytes */
|
||||
mbcsData->stage2[index]=(mbcsData->stage3Top/16)/maxCharLength;
|
||||
uprv_memset(mbcsData->fromUBytes+mbcsData->stage3Top, 0, 16*maxCharLength);
|
||||
mbcsData->stage3Top+=16*maxCharLength;
|
||||
i=index;
|
||||
while(newBlock<newTop) {
|
||||
mbcsData->stage2[i++]=(newBlock/MBCS_STAGE_3_GRANULARITY)/maxCharLength;
|
||||
newBlock+=MBCS_STAGE_3_BLOCK_SIZE*maxCharLength;
|
||||
}
|
||||
mbcsData->stage3Top=newTop; /* ==newBlock */
|
||||
}
|
||||
|
||||
stage3Index=MBCS_STAGE_3_GRANULARITY*(uint32_t)(uint16_t)mbcsData->stage2[index];
|
||||
|
||||
/* Build an alternate, UTF-8-friendly stage table as well. */
|
||||
if(mbcsData->utf8Friendly && c<=MBCS_UTF8_MAX) {
|
||||
/* Overflow for uint16_t entries in stageUTF8? */
|
||||
if(stage3Index>0xffff) {
|
||||
/*
|
||||
* This can occur only if the mapping table is nearly perfectly filled and if
|
||||
* MBCS_UTF8_MAX==0xffff.
|
||||
* (There is no known charset like this. GB 18030 does not map
|
||||
* surrogate code points and LMBCS does not map 256 PUA code points.)
|
||||
*
|
||||
* Otherwise, stage3Index<=MBCS_UTF8_LIMIT<0xffff
|
||||
* (stage3Index can at most reach exactly MBCS_UTF8_LIMIT)
|
||||
* because we have a sorted table and there are at most MBCS_UTF8_LIMIT
|
||||
* mappings with 0<=c<MBCS_UTF8_LIMIT, and apart from those there is only
|
||||
* the initial all-unassigned block in stage3.
|
||||
*
|
||||
* (See svn revision 20866 of the markus/ucnvutf8 feature branch for
|
||||
* code that causes MBCSAddTable() to rebuild the table not utf8Friendly
|
||||
* in case of overflow. That code was not tested.)
|
||||
*/
|
||||
fprintf(stderr, "too many stage 3 entries for UTF-8-friendly format, processing U+%04x<->0x%s\n",
|
||||
(int)c, printBytes(buffer, bytes, length));
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
/*
|
||||
* The stage 3 block has been assigned for the regular trie.
|
||||
* Just copy its index into stageUTF8[], without the granularity.
|
||||
*/
|
||||
mbcsData->stageUTF8[c>>MBCS_UTF8_STAGE_SHIFT]=(uint16_t)stage3Index;
|
||||
}
|
||||
|
||||
/* write the codepage bytes into stage 3 and get the previous bytes */
|
||||
|
@ -609,7 +808,7 @@ MBCSAddFromUnicode(MBCSData *mbcsData,
|
|||
}
|
||||
|
||||
old=0;
|
||||
p=mbcsData->fromUBytes+(16*(uint32_t)(uint16_t)mbcsData->stage2[index]+(c&0xf))*maxCharLength;
|
||||
p=stage3+(stage3Index+nextOffset)*maxCharLength;
|
||||
switch(maxCharLength) {
|
||||
case 2:
|
||||
old=*(uint16_t *)p;
|
||||
|
@ -633,7 +832,7 @@ MBCSAddFromUnicode(MBCSData *mbcsData,
|
|||
}
|
||||
|
||||
/* check that this Unicode code point was still unassigned */
|
||||
if((mbcsData->stage2[index]&(1UL<<(16+(c&0xf))))!=0 || old!=0) {
|
||||
if((mbcsData->stage2[index+(nextOffset>>MBCS_STAGE_2_SHIFT)]&(1UL<<(16+(c&0xf))))!=0 || old!=0) {
|
||||
if(flag>=0) {
|
||||
fprintf(stderr, "error: duplicate Unicode code point at U+%04x<->0x%s see 0x%02x\n",
|
||||
(int)c, printBytes(buffer, bytes, length), (int)old);
|
||||
|
@ -647,20 +846,57 @@ MBCSAddFromUnicode(MBCSData *mbcsData,
|
|||
}
|
||||
if(flag<=0) {
|
||||
/* set the roundtrip flag */
|
||||
mbcsData->stage2[index]|=(1UL<<(16+(c&0xf)));
|
||||
mbcsData->stage2[index+(nextOffset>>4)]|=(1UL<<(16+(c&0xf)));
|
||||
}
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
U_CFUNC UBool
|
||||
MBCSOkForBaseFromUnicode(UBool utf8Friendly,
|
||||
const uint8_t *bytes, int32_t length,
|
||||
UChar32 c, int8_t flag) {
|
||||
/*
|
||||
* A 1:1 mapping does not fit into the MBCS base table's fromUnicode table under
|
||||
* the following conditions:
|
||||
*
|
||||
* - a |2 SUB mapping for <subchar1> (no base table data structure for them)
|
||||
* - a |1 fallback to 0x00 (result value 0, indistinguishable from unmappable entry)
|
||||
* - a multi-byte mapping with leading 0x00 bytes (no explicit length field)
|
||||
*
|
||||
* Some of these tests are redundant with ucm_mappingType().
|
||||
*/
|
||||
if( (flag==2 && length==1) ||
|
||||
(flag==1 && bytes[0]==0) || /* testing length==1 would be redundant with the next test */
|
||||
(flag<=1 && length>1 && bytes[0]==0)
|
||||
) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Additional restrictions for UTF-8-friendly fromUnicode tables,
|
||||
* for code points up to the maximum optimized one:
|
||||
*
|
||||
* - any mapping to 0x00 (result value 0, indistinguishable from unmappable entry)
|
||||
* - any |1 fallback (no roundtrip flags in the optimized table)
|
||||
*/
|
||||
if(utf8Friendly && flag<=1 && c<=MBCS_UTF8_MAX && (bytes[0]==0 || flag==1)) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
/* All other mappings do fit into the base table. */
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/* we can assume that the table only contains 1:1 mappings with <=4 bytes each */
|
||||
static UBool
|
||||
MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData) {
|
||||
MBCSData *mbcsData;
|
||||
UCMapping *m;
|
||||
UChar32 c;
|
||||
int32_t i;
|
||||
UBool isOK;
|
||||
int32_t i, maxCharLength;
|
||||
int8_t f;
|
||||
UBool isOK, utf8Friendly;
|
||||
|
||||
staticData->unicodeMask=table->unicodeMask;
|
||||
if(staticData->unicodeMask==3) {
|
||||
|
@ -671,42 +907,74 @@ MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *stati
|
|||
staticData->conversionType=UCNV_MBCS;
|
||||
|
||||
mbcsData=(MBCSData *)cnvData;
|
||||
maxCharLength=mbcsData->ucm->states.maxCharLength;
|
||||
|
||||
/*
|
||||
* Generation of UTF-8-friendly data requires
|
||||
* a sorted table, which makeconv generates when explicit precision
|
||||
* indicators are used.
|
||||
*/
|
||||
mbcsData->utf8Friendly=utf8Friendly=(UBool)((table->flagsType&UCM_FLAGS_EXPLICIT)!=0);
|
||||
|
||||
if(!MBCSStartMappings(mbcsData)) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
staticData->hasFromUnicodeFallback=FALSE;
|
||||
staticData->hasToUnicodeFallback=FALSE;
|
||||
|
||||
isOK=TRUE;
|
||||
|
||||
m=table->mappings;
|
||||
for(i=0; i<table->mappingsLength; ++m, ++i) {
|
||||
c=m->u;
|
||||
f=m->f;
|
||||
|
||||
switch(m->f) {
|
||||
switch(f) {
|
||||
case -1:
|
||||
/* there was no precision/fallback indicator */
|
||||
/* fall through to set the mappings */
|
||||
case 0:
|
||||
/* set roundtrip mappings */
|
||||
isOK&=MBCSAddToUnicode(mbcsData, m->b.bytes, m->bLen, c, m->f) &&
|
||||
MBCSAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, m->f);
|
||||
isOK&=MBCSAddToUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
|
||||
|
||||
if(maxCharLength==1) {
|
||||
isOK&=MBCSSingleAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
|
||||
} else if(MBCSOkForBaseFromUnicode(utf8Friendly, m->b.bytes, m->bLen, c, f)) {
|
||||
isOK&=MBCSAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
|
||||
} else {
|
||||
m->f|=MBCS_FROM_U_EXT_FLAG;
|
||||
m->moveFlag=UCM_MOVE_TO_EXT;
|
||||
}
|
||||
break;
|
||||
case 1:
|
||||
/* set only a fallback mapping from Unicode to codepage */
|
||||
staticData->hasFromUnicodeFallback=TRUE;
|
||||
isOK&=MBCSAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, m->f);
|
||||
if(maxCharLength==1) {
|
||||
staticData->hasFromUnicodeFallback=TRUE;
|
||||
isOK&=MBCSSingleAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
|
||||
} else if(MBCSOkForBaseFromUnicode(utf8Friendly, m->b.bytes, m->bLen, c, f)) {
|
||||
staticData->hasFromUnicodeFallback=TRUE;
|
||||
isOK&=MBCSAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
|
||||
} else {
|
||||
m->f|=MBCS_FROM_U_EXT_FLAG;
|
||||
m->moveFlag=UCM_MOVE_TO_EXT;
|
||||
}
|
||||
break;
|
||||
case 2:
|
||||
/* ignore |2 SUB mappings */
|
||||
/* ignore |2 SUB mappings, except to move <subchar1> mappings to the extension table */
|
||||
if(maxCharLength>1 && !MBCSOkForBaseFromUnicode(utf8Friendly, m->b.bytes, m->bLen, c, f)) {
|
||||
m->f|=MBCS_FROM_U_EXT_FLAG;
|
||||
m->moveFlag=UCM_MOVE_TO_EXT;
|
||||
}
|
||||
break;
|
||||
case 3:
|
||||
/* set only a fallback mapping from codepage to Unicode */
|
||||
staticData->hasToUnicodeFallback=TRUE;
|
||||
isOK&=MBCSAddToUnicode(mbcsData, m->b.bytes, m->bLen, c, m->f);
|
||||
isOK&=MBCSAddToUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
|
||||
break;
|
||||
default:
|
||||
/* will not occur because the parser checked it already */
|
||||
fprintf(stderr, "error: illegal fallback indicator %d\n", m->f);
|
||||
fprintf(stderr, "error: illegal fallback indicator %d\n", f);
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
|
@ -979,17 +1247,10 @@ compactStage2(MBCSData *mbcsData) {
|
|||
static void
|
||||
MBCSPostprocess(MBCSData *mbcsData, const UConverterStaticData *staticData) {
|
||||
UCMStates *states;
|
||||
int32_t maxCharLength;
|
||||
int32_t maxCharLength, stage3Width;
|
||||
|
||||
states=&mbcsData->ucm->states;
|
||||
maxCharLength=states->maxCharLength;
|
||||
|
||||
/* this needs to be printed before the EUC transformation because later maxCharLength might not be correct */
|
||||
if(VERBOSE) {
|
||||
printf("number of codepage characters in 16-blocks: 0x%lx=%lu\n",
|
||||
(unsigned long)mbcsData->stage3Top/maxCharLength,
|
||||
(unsigned long)mbcsData->stage3Top/maxCharLength);
|
||||
}
|
||||
stage3Width=maxCharLength=states->maxCharLength;
|
||||
|
||||
ucm_optimizeStates(states,
|
||||
&mbcsData->unicodeCodeUnits,
|
||||
|
@ -997,12 +1258,67 @@ MBCSPostprocess(MBCSData *mbcsData, const UConverterStaticData *staticData) {
|
|||
VERBOSE);
|
||||
|
||||
/* try to compact the fromUnicode tables */
|
||||
transformEUC(mbcsData);
|
||||
if(maxCharLength==1) {
|
||||
singleCompactStage3(mbcsData);
|
||||
singleCompactStage2(mbcsData);
|
||||
} else {
|
||||
compactStage2(mbcsData);
|
||||
if(transformEUC(mbcsData)) {
|
||||
--stage3Width;
|
||||
}
|
||||
|
||||
/*
|
||||
* UTF-8-friendly tries are built precompacted, to cope with variable
|
||||
* stage 3 allocation block sizes.
|
||||
*
|
||||
* Tables without precision indicators cannot be built that way,
|
||||
* because if a block was overlapped with a previous one, then a smaller
|
||||
* code point for the same block would not fit.
|
||||
* Therefore, such tables are not marked UTF-8-friendly and must be
|
||||
* compacted after all mappings are entered.
|
||||
*/
|
||||
if(!mbcsData->utf8Friendly) {
|
||||
if(maxCharLength==1) {
|
||||
singleCompactStage3(mbcsData);
|
||||
singleCompactStage2(mbcsData);
|
||||
} else {
|
||||
compactStage2(mbcsData);
|
||||
}
|
||||
}
|
||||
|
||||
if(VERBOSE) {
|
||||
/*uint32_t c, i1, i2, i2Limit, i3;*/
|
||||
|
||||
printf("fromUnicode number of uint%s_t in stage 2: 0x%lx=%lu\n",
|
||||
maxCharLength==1 ? "16" : "32",
|
||||
(unsigned long)mbcsData->stage2Top,
|
||||
(unsigned long)mbcsData->stage2Top);
|
||||
printf("fromUnicode number of %d-byte stage 3 mapping entries: 0x%lx=%lu\n",
|
||||
(int)stage3Width,
|
||||
(unsigned long)mbcsData->stage3Top/stage3Width,
|
||||
(unsigned long)mbcsData->stage3Top/stage3Width);
|
||||
#if 0
|
||||
c=0;
|
||||
for(i1=0; i1<MBCS_STAGE_1_SIZE; ++i1) {
|
||||
i2=mbcsData->stage1[i1];
|
||||
if(i2==0) {
|
||||
c+=MBCS_STAGE_2_BLOCK_SIZE*MBCS_STAGE_3_BLOCK_SIZE;
|
||||
continue;
|
||||
}
|
||||
for(i2Limit=i2+MBCS_STAGE_2_BLOCK_SIZE; i2<i2Limit; ++i2) {
|
||||
if(maxCharLength==1) {
|
||||
i3=mbcsData->stage2Single[i2];
|
||||
} else {
|
||||
i3=(uint16_t)mbcsData->stage2[i2];
|
||||
}
|
||||
if(i3==0) {
|
||||
c+=MBCS_STAGE_3_BLOCK_SIZE;
|
||||
continue;
|
||||
}
|
||||
printf("U+%04lx i1=0x%02lx i2=0x%04lx i3=0x%04lx\n",
|
||||
(unsigned long)c,
|
||||
(unsigned long)i1,
|
||||
(unsigned long)i2,
|
||||
(unsigned long)i3);
|
||||
c+=MBCS_STAGE_3_BLOCK_SIZE;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1010,7 +1326,7 @@ static uint32_t
|
|||
MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
|
||||
UNewDataMemory *pData, int32_t tableType) {
|
||||
MBCSData *mbcsData=(MBCSData *)cnvData;
|
||||
uint32_t top;
|
||||
uint32_t top, stageUTF8Length=0;
|
||||
int32_t i, stage1Top;
|
||||
|
||||
_MBCSHeader header={ { 0, 0, 0, 0 }, 0, 0, 0, 0, 0, 0, 0 };
|
||||
|
@ -1031,6 +1347,10 @@ MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
|
|||
|
||||
/* stage3Top has counted 16-bit results, now we need to count bytes */
|
||||
mbcsData->stage3Top*=2;
|
||||
|
||||
if(mbcsData->utf8Friendly) {
|
||||
header.version[2]=(uint8_t)(SBCS_UTF8_MAX>>8); /* store 0x1f for max==0x1fff */
|
||||
}
|
||||
} else {
|
||||
if(staticData->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
|
||||
stage1Top=MBCS_STAGE_1_SIZE; /* 0x440==1088 */
|
||||
|
@ -1044,6 +1364,11 @@ MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
|
|||
/* stage2Top has counted 32-bit results, now we need to count bytes */
|
||||
mbcsData->stage2Top*=4;
|
||||
|
||||
if(mbcsData->utf8Friendly) {
|
||||
stageUTF8Length=MBCS_UTF8_STAGE_SIZE;
|
||||
header.version[2]=(uint8_t)(MBCS_UTF8_MAX>>8); /* store 0xd7 for max==0xd7ff */
|
||||
}
|
||||
|
||||
/* stage3Top has already counted bytes */
|
||||
}
|
||||
|
||||
|
@ -1053,7 +1378,9 @@ MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
|
|||
|
||||
/* fill the header */
|
||||
header.version[0]=4;
|
||||
header.version[1]=2;
|
||||
header.version[1]=3;
|
||||
/* header.version[2] set above for utf8Friendly data */
|
||||
|
||||
header.countStates=mbcsData->ucm->states.countStates;
|
||||
header.countToUFallbacks=mbcsData->countToUFallbacks;
|
||||
|
||||
|
@ -1070,7 +1397,7 @@ MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
|
|||
mbcsData->stage2Top;
|
||||
header.fromUBytesLength=mbcsData->stage3Top;
|
||||
|
||||
top=header.offsetFromUBytes+header.fromUBytesLength;
|
||||
top=header.offsetFromUBytes+header.fromUBytesLength+stageUTF8Length*2;
|
||||
|
||||
header.flags=(uint8_t)(mbcsData->ucm->states.outputType);
|
||||
|
||||
|
@ -1096,7 +1423,10 @@ MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
|
|||
}
|
||||
udata_writeBlock(pData, mbcsData->fromUBytes, mbcsData->stage3Top);
|
||||
|
||||
/* return the number of bytes that should have been written */
|
||||
return header.offsetFromUBytes+header.fromUBytesLength;
|
||||
}
|
||||
if(stageUTF8Length>0) {
|
||||
udata_writeBlock(pData, mbcsData->stageUTF8, stageUTF8Length*2);
|
||||
}
|
||||
|
||||
/* return the number of bytes that should have been written */
|
||||
return top;
|
||||
}
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2000-2006, International Business Machines
|
||||
* Copyright (C) 2000-2007, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -20,25 +20,93 @@
|
|||
#include "makeconv.h"
|
||||
|
||||
/*
 * Sizes, shifts and masks for the fromUnicode stage tables that makeconv
 * builds for MBCS/SBCS converters, plus the limits of the UTF-8-friendly
 * variants of those tables. Each constant is defined exactly once.
 */
enum {
    /*
     * TODO: Consider using ucnvmbcs.h constants.
     * However, not all values need to be exactly the same, for example
     * the xxx_UTF8_MAX values may be different. (Especially SBCS_UTF8_MAX
     * may be higher in makeconv than in the runtime code because that
     * affects only a small number of .cnv files [if any] but all
     * runtime UConverterSharedData objects.)
     */
    MBCS_STAGE_2_SHIFT=4,
    MBCS_STAGE_2_BLOCK_SIZE=0x40,       /* =64=1<<6 for 6 bits in stage 2 */
    MBCS_STAGE_2_BLOCK_SIZE_SHIFT=6,    /* log2(MBCS_STAGE_2_BLOCK_SIZE) */
    MBCS_STAGE_2_BLOCK_MASK=0x3f,       /* for after shifting by MBCS_STAGE_2_SHIFT */
    MBCS_STAGE_1_SHIFT=10,
    MBCS_STAGE_1_BMP_SIZE=0x40, /* 0x10000>>MBCS_STAGE_1_SHIFT, or 64 for one entry per 1k code points on the BMP */
    MBCS_STAGE_1_SIZE=0x440,    /* 0x110000>>MBCS_STAGE_1_SHIFT, or 17*64 for one entry per 1k code points */
    MBCS_STAGE_2_SIZE=0xfbc0,   /* 0x10000-MBCS_STAGE_1_SIZE: stages 1 & 2 share a 16-bit-indexed array */
    MBCS_MAX_STAGE_2_TOP=MBCS_STAGE_2_SIZE,
    MBCS_STAGE_2_MAX_BLOCKS=MBCS_STAGE_2_SIZE>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT,

    MBCS_STAGE_2_ALL_UNASSIGNED_INDEX=0, /* stage 1 entry for the all-unassigned stage 2 block */
    MBCS_STAGE_2_FIRST_ASSIGNED=MBCS_STAGE_2_BLOCK_SIZE, /* start of the first stage 2 block after the all-unassigned one */

    MBCS_STAGE_3_BLOCK_SIZE=16,         /* =16=1<<4 for 4 bits in stage 3 */
    MBCS_STAGE_3_BLOCK_MASK=0xf,
    MBCS_STAGE_3_FIRST_ASSIGNED=MBCS_STAGE_3_BLOCK_SIZE, /* start of the first stage 3 block after the all-unassigned one */

    MBCS_STAGE_3_GRANULARITY=16,        /* =1<<4: MBCS stage 2 indexes are shifted left 4 */
    MBCS_STAGE_3_SBCS_SIZE=0x10000,     /* max 64k mappings for SBCS */
    MBCS_STAGE_3_MBCS_SIZE=0x10000*MBCS_STAGE_3_GRANULARITY, /* max mappings for MBCS */

    /*
     * SBCS_UTF8_MAX: Maximum code point with UTF-8-friendly SBCS data structures.
     * Possible values are 0x01ff..0xffff, in steps of 0x100.
     *
     * Unlike for MBCS, this constant only affects the stage 3 block allocation size;
     * there is no additional stage 1/2 table stored in the .cnv file.
     * The max value should be at least 0x7ff to cover 2-byte UTF-8.
     * 0xfff also covers a number of other small scripts which have legacy charsets
     * (like Thai).
     * Higher values up to 0x1fff are harmless and potentially useful because
     * that covers small-script blocks which usually have either dense mappings
     * or no mappings at all.
     * Starting at U+2000, there are mostly symbols and format characters
     * with a low density of SBCS mappings, which would result in more wasted
     * stage 3 entries with the larger block size.
     */
    SBCS_UTF8_MAX=0x1fff,

    /*
     * MBCS_UTF8_MAX: Maximum code point with UTF-8-friendly MBCS data structures.
     * Possible values are 0x01ff..0xffff, in steps of 0x100.
     *
     * Note that with 0xffff, MBCSAddFromUnicode() may overflow the additional UTF-8 stage table
     * with extreme input data. The function checks for this overflow.
     *
     * 0xd7ff is chosen for the majority of common characters including Unihan and Hangul.
     * At U+d800 there are mostly surrogates, private use codes, compatibility characters, etc.
     * Larger values cause slightly larger MBCS .cnv files.
     */
    MBCS_UTF8_MAX=0xd7ff,
    MBCS_UTF8_LIMIT=MBCS_UTF8_MAX+1,    /* =0xd800 */

    MBCS_UTF8_STAGE_SHIFT=6,
    MBCS_UTF8_STAGE_3_BLOCK_SIZE=0x40,  /* =64=1<<6 for 6 bits from last trail byte */
    MBCS_UTF8_STAGE_3_BLOCK_MASK=0x3f,

    /* size of the single-stage table for up to U+d7ff (used instead of stage1/2) */
    MBCS_UTF8_STAGE_SIZE=MBCS_UTF8_LIMIT>>MBCS_UTF8_STAGE_SHIFT, /* =0x360 */

    MBCS_FROM_U_EXT_FLAG=0x10,          /* UCMapping.f bit for base table mappings that fit into the base toU table */
    MBCS_FROM_U_EXT_MASK=0x0f,          /* but need to go into the extension fromU table */

    /* =4 number of regular stage 3 blocks for final UTF-8 trail byte */
    MBCS_UTF8_STAGE_3_BLOCKS=MBCS_UTF8_STAGE_3_BLOCK_SIZE/MBCS_STAGE_3_BLOCK_SIZE,

    MBCS_MAX_FALLBACK_COUNT=8192
};
|
||||
|
||||
U_CFUNC NewConverter *
|
||||
MBCSOpen(UCMFile *ucm);
|
||||
|
||||
/* Test if a 1:1 mapping fits into the MBCS base table's fromUnicode structure. */
|
||||
U_CFUNC UBool
|
||||
MBCSOkForBaseFromUnicode(UBool utf8Friendly,
|
||||
const uint8_t *bytes, int32_t length,
|
||||
UChar32 c, int8_t flag);
|
||||
|
||||
U_CFUNC NewConverter *
|
||||
CnvExtOpen(UCMFile *ucm);
|
||||
|
||||
|
|
|
@ -36,7 +36,6 @@
|
|||
|
||||
#define DEBUG 0
|
||||
|
||||
|
||||
typedef struct ConvData {
|
||||
UCMFile *ucm;
|
||||
NewConverter *cnvData, *extData;
|
||||
|
@ -137,7 +136,7 @@ writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErr
|
|||
|
||||
if(VERBOSE)
|
||||
{
|
||||
fprintf(stderr, "- Opened udata %s.%s\n", cnvName, "cnv");
|
||||
printf("- Opened udata %s.%s\n", cnvName, "cnv");
|
||||
}
|
||||
|
||||
|
||||
|
@ -160,7 +159,7 @@ writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErr
|
|||
}
|
||||
if(VERBOSE)
|
||||
{
|
||||
fprintf(stderr, "- Wrote %u bytes to the udata.\n", (int)sz2);
|
||||
printf("- Wrote %u bytes to the udata.\n", (int)sz2);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -602,6 +601,10 @@ createConverter(ConvData *data, const char *converterName, UErrorCode *pErrorCod
|
|||
states=&data->ucm->states;
|
||||
|
||||
if(dataIsBase) {
|
||||
/*
|
||||
* Build a normal .cnv file with a base table
|
||||
* and an optional extension table.
|
||||
*/
|
||||
data->cnvData=MBCSOpen(data->ucm);
|
||||
if(data->cnvData==NULL) {
|
||||
*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
|
||||
|
@ -618,27 +621,50 @@ createConverter(ConvData *data, const char *converterName, UErrorCode *pErrorCod
|
|||
fprintf(stderr, " the subchar1 byte is illegal in this codepage structure!\n");
|
||||
*pErrorCode=U_INVALID_TABLE_FORMAT;
|
||||
|
||||
} else if(data->ucm->ext->mappingsLength>0) {
|
||||
/* prepare the extension table, if there is one */
|
||||
data->extData=CnvExtOpen(data->ucm);
|
||||
if(data->extData==NULL) {
|
||||
*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
|
||||
|
||||
} else if(
|
||||
!ucm_checkBaseExt(states, data->ucm->base, data->ucm->ext, data->ucm->ext, FALSE) ||
|
||||
!data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)
|
||||
) {
|
||||
*pErrorCode=U_INVALID_TABLE_FORMAT;
|
||||
}
|
||||
}
|
||||
|
||||
/* add the base table after ucm_checkBaseExt()! */
|
||||
if( U_SUCCESS(*pErrorCode) &&
|
||||
!data->cnvData->addTable(data->cnvData, data->ucm->base, &data->staticData)
|
||||
} else if(
|
||||
data->ucm->ext->mappingsLength>0 &&
|
||||
!ucm_checkBaseExt(states, data->ucm->base, data->ucm->ext, data->ucm->ext, FALSE)
|
||||
) {
|
||||
*pErrorCode=U_INVALID_TABLE_FORMAT;
|
||||
} else if(data->ucm->base->flagsType&UCM_FLAGS_EXPLICIT) {
|
||||
/* sort the table so that it can be turned into UTF-8-friendly data */
|
||||
ucm_sortTable(data->ucm->base);
|
||||
}
|
||||
|
||||
if(U_SUCCESS(*pErrorCode)) {
|
||||
if(
|
||||
/* add the base table after ucm_checkBaseExt()! */
|
||||
!data->cnvData->addTable(data->cnvData, data->ucm->base, &data->staticData)
|
||||
) {
|
||||
*pErrorCode=U_INVALID_TABLE_FORMAT;
|
||||
} else {
|
||||
/*
|
||||
* addTable() may have requested moving more mappings to the extension table
|
||||
* if they fit into the base toUnicode table but not into the
|
||||
* base fromUnicode table.
|
||||
* (Especially for UTF-8-friendly fromUnicode tables.)
|
||||
* Such mappings will have the MBCS_FROM_U_EXT_FLAG set, which causes them
|
||||
* to be excluded from the extension toUnicode data.
|
||||
* See MBCSOkForBaseFromUnicode() for which mappings do not fit into
|
||||
* the base fromUnicode table.
|
||||
*/
|
||||
ucm_moveMappings(data->ucm->base, data->ucm->ext);
|
||||
ucm_sortTable(data->ucm->ext);
|
||||
if(data->ucm->ext->mappingsLength>0) {
|
||||
/* prepare the extension table, if there is one */
|
||||
data->extData=CnvExtOpen(data->ucm);
|
||||
if(data->extData==NULL) {
|
||||
*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
|
||||
} else if(
|
||||
!data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)
|
||||
) {
|
||||
*pErrorCode=U_INVALID_TABLE_FORMAT;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* Build an extension-only .cnv file. */
|
||||
char baseFilename[500];
|
||||
char *basename;
|
||||
|
||||
|
@ -662,7 +688,6 @@ createConverter(ConvData *data, const char *converterName, UErrorCode *pErrorCod
|
|||
data->extData=CnvExtOpen(data->ucm);
|
||||
if(data->extData==NULL) {
|
||||
*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
|
||||
|
||||
} else {
|
||||
/* fill in gaps in extension file header fields */
|
||||
UCMapping *m, *mLimit;
|
||||
|
@ -700,16 +725,6 @@ createConverter(ConvData *data, const char *converterName, UErrorCode *pErrorCod
|
|||
fallbackFlags|=2;
|
||||
}
|
||||
}
|
||||
for(m=data->ucm->base->mappings, mLimit=m+data->ucm->base->mappingsLength;
|
||||
m<mLimit && fallbackFlags!=3;
|
||||
++m
|
||||
) {
|
||||
if(m->f==1) {
|
||||
fallbackFlags|=1;
|
||||
} else if(m->f==3) {
|
||||
fallbackFlags|=2;
|
||||
}
|
||||
}
|
||||
|
||||
if(fallbackFlags&1) {
|
||||
staticData->hasFromUnicodeFallback=TRUE;
|
||||
|
@ -728,10 +743,52 @@ createConverter(ConvData *data, const char *converterName, UErrorCode *pErrorCod
|
|||
|
||||
} else if(
|
||||
!ucm_checkValidity(data->ucm->ext, baseStates) ||
|
||||
!ucm_checkBaseExt(baseStates, baseData.ucm->base, data->ucm->ext, data->ucm->ext, FALSE) ||
|
||||
!data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)
|
||||
!ucm_checkBaseExt(baseStates, baseData.ucm->base, data->ucm->ext, data->ucm->ext, FALSE)
|
||||
) {
|
||||
*pErrorCode=U_INVALID_TABLE_FORMAT;
|
||||
} else {
|
||||
if(states->maxCharLength>1) {
|
||||
/*
|
||||
* When building a normal .cnv file with a base table
|
||||
* for an MBCS (not SBCS) table with explicit precision flags,
|
||||
* the MBCSAddTable() function marks some mappings for moving
|
||||
* to the extension table.
|
||||
* They fit into the base toUnicode table but not into the
|
||||
* base fromUnicode table.
|
||||
* (Note: We do have explicit precision flags because they are
|
||||
* required for extension table generation, and
|
||||
* ucm_checkBaseExt() verified it.)
|
||||
*
|
||||
* We do not call MBCSAddTable() here (we probably could)
|
||||
* so we need to do the analysis before building the extension table.
|
||||
* We assume the "worst case" of a UTF-8-friendly table, even if
|
||||
* MBCSAddTable() might revert to a regular table due to some overflow.
|
||||
* Redundant mappings in the extension table are ok except they cost some size.
|
||||
* Overflows in MBCSAddTable() should be very rare.
|
||||
* TODO: Change "worst case" comment if the MBCSAddTable() loop goes away.
|
||||
*
|
||||
* Do this after ucm_checkBaseExt().
|
||||
*/
|
||||
int32_t needsMove=0;
|
||||
for(m=baseData.ucm->base->mappings, mLimit=m+baseData.ucm->base->mappingsLength;
|
||||
m<mLimit;
|
||||
++m
|
||||
) {
|
||||
if(!MBCSOkForBaseFromUnicode(TRUE, m->b.bytes, m->bLen, m->u, m->f)) {
|
||||
m->f|=MBCS_FROM_U_EXT_FLAG;
|
||||
m->moveFlag=UCM_MOVE_TO_EXT;
|
||||
++needsMove;
|
||||
}
|
||||
}
|
||||
|
||||
if(needsMove!=0) {
|
||||
ucm_moveMappings(baseData.ucm->base, data->ucm->ext);
|
||||
ucm_sortTable(data->ucm->ext);
|
||||
}
|
||||
}
|
||||
if(!data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)) {
|
||||
*pErrorCode=U_INVALID_TABLE_FORMAT;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2003-2005, International Business Machines
|
||||
* Copyright (C) 2003-2007, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -238,7 +238,7 @@ ucm_sortTable(UCMTable *t) {
|
|||
* allocate mappingsCapacity instead of mappingsLength so that
|
||||
* if mappings are added, the reverseMap need not be
|
||||
* reallocated each time
|
||||
* (see moveMappings() and ucm_addMapping())
|
||||
* (see ucm_moveMappings() and ucm_addMapping())
|
||||
*/
|
||||
t->reverseMap=(int32_t *)uprv_malloc(t->mappingsCapacity*sizeof(int32_t));
|
||||
if(t->reverseMap==NULL) {
|
||||
|
@ -264,20 +264,12 @@ ucm_sortTable(UCMTable *t) {
|
|||
t->isSorted=TRUE;
|
||||
}
|
||||
|
||||
enum {
|
||||
MOVE_TO_EXT=1,
|
||||
REMOVE_MAPPING=2
|
||||
};
|
||||
|
||||
/*
|
||||
* move mappings with their move flag set from the base table
|
||||
* and optionally to the extension table
|
||||
*
|
||||
* works only with explicit precision flags because it uses some of the
|
||||
* flags bits
|
||||
* remove mappings with their move flag set from the base table
|
||||
* and move some of them (with UCM_MOVE_TO_EXT) to the extension table
|
||||
*/
|
||||
static void
|
||||
moveMappings(UCMTable *base, UCMTable *ext) {
|
||||
U_CAPI void U_EXPORT2
|
||||
ucm_moveMappings(UCMTable *base, UCMTable *ext) {
|
||||
UCMapping *mb, *mbLimit;
|
||||
int8_t flag;
|
||||
|
||||
|
@ -290,12 +282,12 @@ moveMappings(UCMTable *base, UCMTable *ext) {
|
|||
/* reset the move flag */
|
||||
mb->moveFlag=0;
|
||||
|
||||
if(ext!=NULL && (flag&MOVE_TO_EXT)) {
|
||||
if(ext!=NULL && (flag&UCM_MOVE_TO_EXT)) {
|
||||
/* add the mapping to the extension table */
|
||||
ucm_addMapping(ext, mb, UCM_GET_CODE_POINTS(base, mb), UCM_GET_BYTES(base, mb));
|
||||
}
|
||||
|
||||
/* move the last base mapping down and overwrite the current one */
|
||||
/* remove this mapping: move the last base mapping down and overwrite the current one */
|
||||
if(mb<(mbLimit-1)) {
|
||||
uprv_memcpy(mb, mbLimit-1, sizeof(UCMapping));
|
||||
}
|
||||
|
@ -364,7 +356,7 @@ checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
|
|||
* if ext is DBCS, move DBCS mappings here
|
||||
* and check SBCS ones for Unicode prefix below
|
||||
*/
|
||||
mb->moveFlag|=MOVE_TO_EXT;
|
||||
mb->moveFlag|=UCM_MOVE_TO_EXT;
|
||||
result|=NEEDS_MOVE;
|
||||
|
||||
/* does mb map from an input sequence that is a prefix of me's? */
|
||||
|
@ -373,7 +365,7 @@ checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
|
|||
) {
|
||||
if(moveToExt) {
|
||||
/* mark this mapping to be moved to the extension table */
|
||||
mb->moveFlag|=MOVE_TO_EXT;
|
||||
mb->moveFlag|=UCM_MOVE_TO_EXT;
|
||||
result|=NEEDS_MOVE;
|
||||
} else {
|
||||
fprintf(stderr,
|
||||
|
@ -394,11 +386,11 @@ checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
|
|||
if( mb->f==me->f && mb->bLen==me->bLen &&
|
||||
0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
|
||||
) {
|
||||
me->moveFlag|=REMOVE_MAPPING;
|
||||
me->moveFlag|=UCM_REMOVE_MAPPING;
|
||||
result|=NEEDS_MOVE;
|
||||
} else if(intersectBase) {
|
||||
/* mapping in base but not in ext, move it */
|
||||
mb->moveFlag|=MOVE_TO_EXT;
|
||||
mb->moveFlag|=UCM_MOVE_TO_EXT;
|
||||
result|=NEEDS_MOVE;
|
||||
} else {
|
||||
fprintf(stderr,
|
||||
|
@ -476,7 +468,7 @@ checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
|
|||
if(cmp<0) {
|
||||
if(intersectBase) {
|
||||
/* mapping in base but not in ext, move it */
|
||||
mb->moveFlag|=MOVE_TO_EXT;
|
||||
mb->moveFlag|=UCM_MOVE_TO_EXT;
|
||||
result|=NEEDS_MOVE;
|
||||
|
||||
/*
|
||||
|
@ -490,7 +482,7 @@ checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
|
|||
) {
|
||||
if(moveToExt) {
|
||||
/* mark this mapping to be moved to the extension table */
|
||||
mb->moveFlag|=MOVE_TO_EXT;
|
||||
mb->moveFlag|=UCM_MOVE_TO_EXT;
|
||||
result|=NEEDS_MOVE;
|
||||
} else {
|
||||
fprintf(stderr,
|
||||
|
@ -511,11 +503,11 @@ checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
|
|||
if( mb->f==me->f && mb->uLen==me->uLen &&
|
||||
0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
|
||||
) {
|
||||
me->moveFlag|=REMOVE_MAPPING;
|
||||
me->moveFlag|=UCM_REMOVE_MAPPING;
|
||||
result|=NEEDS_MOVE;
|
||||
} else if(intersectBase) {
|
||||
/* mapping in base but not in ext, move it */
|
||||
mb->moveFlag|=MOVE_TO_EXT;
|
||||
mb->moveFlag|=UCM_MOVE_TO_EXT;
|
||||
result|=NEEDS_MOVE;
|
||||
} else {
|
||||
fprintf(stderr,
|
||||
|
@ -586,8 +578,8 @@ ucm_checkBaseExt(UCMStates *baseStates,
|
|||
}
|
||||
|
||||
if(result&NEEDS_MOVE) {
|
||||
moveMappings(ext, NULL);
|
||||
moveMappings(base, moveTarget);
|
||||
ucm_moveMappings(ext, NULL);
|
||||
ucm_moveMappings(base, moveTarget);
|
||||
ucm_sortTable(base);
|
||||
ucm_sortTable(ext);
|
||||
if(moveTarget!=NULL) {
|
||||
|
@ -715,7 +707,7 @@ ucm_separateMappings(UCMFile *ucm, UBool isSISO) {
|
|||
if(isSISO && m->bLen==1 && (m->b.bytes[0]==0xe || m->b.bytes[0]==0xf)) {
|
||||
fprintf(stderr, "warning: removing illegal mapping from an SI/SO-stateful table\n");
|
||||
ucm_printMapping(table, m, stderr);
|
||||
m->moveFlag|=REMOVE_MAPPING;
|
||||
m->moveFlag|=UCM_REMOVE_MAPPING;
|
||||
needsMove=TRUE;
|
||||
continue;
|
||||
}
|
||||
|
@ -728,7 +720,7 @@ ucm_separateMappings(UCMFile *ucm, UBool isSISO) {
|
|||
printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), stderr);
|
||||
isOK=FALSE;
|
||||
} else if(type>0) {
|
||||
m->moveFlag|=MOVE_TO_EXT;
|
||||
m->moveFlag|=UCM_MOVE_TO_EXT;
|
||||
needsMove=TRUE;
|
||||
}
|
||||
}
|
||||
|
@ -737,7 +729,7 @@ ucm_separateMappings(UCMFile *ucm, UBool isSISO) {
|
|||
return FALSE;
|
||||
}
|
||||
if(needsMove) {
|
||||
moveMappings(ucm->base, ucm->ext);
|
||||
ucm_moveMappings(ucm->base, ucm->ext);
|
||||
return ucm_checkBaseExt(&ucm->states, ucm->base, ucm->ext, ucm->ext, FALSE);
|
||||
} else {
|
||||
ucm_sortTable(ucm->base);
|
||||
|
@ -1058,15 +1050,31 @@ ucm_mappingType(UCMStates *baseStates,
|
|||
|
||||
/*
|
||||
* Suitable for an ICU conversion base table means:
|
||||
* - a 1:1 mapping
|
||||
* - not a |2 SUB mappings for <subchar1>
|
||||
* - not a |1 fallback to 0x00
|
||||
* - no leading 0x00 bytes
|
||||
* - a 1:1 mapping (1 Unicode code point : 1 byte sequence)
|
||||
* - SBCS: any 1:1 mapping
|
||||
* (the table stores additional bits to distinguish mapping types)
|
||||
* - MBCS: not a |2 SUB mapping for <subchar1>
|
||||
* - MBCS: not a |1 fallback to 0x00
|
||||
* - MBCS: not a multi-byte mapping with leading 0x00 bytes
|
||||
*
|
||||
* Further restrictions for fromUnicode tables
|
||||
* are enforced in makeconv (MBCSOkForBaseFromUnicode()).
|
||||
*
|
||||
* All of the MBCS fromUnicode specific tests could be removed from here,
|
||||
* but the ones above are for unusual mappings, and removing the tests
|
||||
* from here would change canonucm output which seems gratuitous.
|
||||
* (Markus Scherer 2006-nov-28)
|
||||
*
|
||||
* Exception: All implicit mappings (f<0) that need to be moved
|
||||
* because of fromUnicode restrictions _must_ be moved here because
|
||||
* makeconv uses a hack for moving mappings only for the fromUnicode table
|
||||
* that only works with non-negative values of f.
|
||||
*/
|
||||
if( m->uLen==1 && count==1 &&
|
||||
!((m->f==2 && m->bLen==1 && baseStates->maxCharLength>1) ||
|
||||
(m->f==1 && m->bLen==1 && bytes[0]==0) ||
|
||||
(m->bLen>1 && bytes[0]==0))
|
||||
(baseStates->maxCharLength==1 ||
|
||||
!((m->f==2 && m->bLen==1) ||
|
||||
(m->f==1 && bytes[0]==0) ||
|
||||
(m->f<=1 && m->bLen>1 && bytes[0]==0)))
|
||||
) {
|
||||
return 0; /* suitable for a base table */
|
||||
} else {
|
||||
|
@ -1178,4 +1186,3 @@ ucm_readTable(UCMFile *ucm, FileStream* convFile,
|
|||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2003-2005, International Business Machines
|
||||
* Copyright (C) 2003-2006, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -29,6 +29,12 @@
|
|||
|
||||
U_CDECL_BEGIN
|
||||
|
||||
/* bit flags for UCMapping.moveFlag; they are combined with |= and tested with & */
|
||||
enum {
|
||||
UCM_MOVE_TO_EXT=1,
|
||||
UCM_REMOVE_MAPPING=2
|
||||
};
|
||||
|
||||
/*
|
||||
* Per-mapping data structure
|
||||
*
|
||||
|
@ -52,6 +58,7 @@ typedef struct UCMapping {
|
|||
int8_t uLen, bLen, f, moveFlag;
|
||||
} UCMapping;
|
||||
|
||||
/* constants for UCMTable.flagsType */
|
||||
enum {
|
||||
UCM_FLAGS_INITIAL, /* no mappings parsed yet */
|
||||
UCM_FLAGS_EXPLICIT, /* .ucm file has mappings with | fallback indicators */
|
||||
|
@ -150,6 +157,13 @@ ucm_resetTable(UCMTable *table);
|
|||
U_CAPI void U_EXPORT2
|
||||
ucm_sortTable(UCMTable *t);
|
||||
|
||||
/*
|
||||
* Remove mappings with their move flag set from the base table
|
||||
* and move some of them (with UCM_MOVE_TO_EXT) to the extension table.
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
ucm_moveMappings(UCMTable *base, UCMTable *ext);
|
||||
|
||||
/**
|
||||
* Read a table from a .ucm file, from after the CHARMAP line to
|
||||
* including the END CHARMAP line.
|
||||
|
@ -186,7 +200,7 @@ ucm_checkValidity(UCMTable *ext, UCMStates *baseStates);
|
|||
*
|
||||
* For both tables in the same file, the extension table is automatically
|
||||
* built.
|
||||
* For separate files, the extension file can use a complete mapping table,
|
||||
* For separate files, the extension file can use a complete mapping table (.ucm file),
|
||||
* so that common mappings need not be stripped out manually.
|
||||
*
|
||||
*
|
||||
|
|
Loading…
Add table
Reference in a new issue