ICU-5518 merge direct-from-UTF-8 conversion code from http://source.icu-project.org/repos/icu/icu/branches/markus/ucnvutf8 -r 20735:20990 to icu/trunk

X-SVN-Rev: 21010
This commit is contained in:
Markus Scherer 2007-02-06 05:24:14 +00:00
parent 464ae7d46f
commit 9acca77737
20 changed files with 3800 additions and 677 deletions

View file

@ -889,20 +889,25 @@ _fromUnicodeWithCallback(UConverterFromUnicodeArgs *pArgs, UErrorCode *err) {
* }
*/
for(;;) {
/* convert */
fromUnicode(pArgs, err);
if(U_SUCCESS(*err)) {
/* convert */
fromUnicode(pArgs, err);
/*
* set a flag for whether the converter
* successfully processed the end of the input
*
* need not check cnv->preFromULength==0 because a replay (<0) will cause
* s<sourceLimit before converterSawEndOfInput is checked
*/
converterSawEndOfInput=
(UBool)(U_SUCCESS(*err) &&
pArgs->flush && pArgs->source==pArgs->sourceLimit &&
cnv->fromUChar32==0);
/*
* set a flag for whether the converter
* successfully processed the end of the input
*
* need not check cnv->preFromULength==0 because a replay (<0) will cause
* s<sourceLimit before converterSawEndOfInput is checked
*/
converterSawEndOfInput=
(UBool)(U_SUCCESS(*err) &&
pArgs->flush && pArgs->source==pArgs->sourceLimit &&
cnv->fromUChar32==0);
} else {
/* handle error from ucnv_convertEx() */
converterSawEndOfInput=FALSE;
}
/* no callback called yet for this iteration */
calledCallback=FALSE;
@ -1093,6 +1098,64 @@ _fromUnicodeWithCallback(UConverterFromUnicodeArgs *pArgs, UErrorCode *err) {
}
}
/*
 * Drain the fromUnicode overflow buffer (cnv->charErrorBuffer) into the target.
 * Intended to be called only when cnv->charErrorBufferLength>0.
 *
 * Bytes are moved one at a time until either the overflow buffer is empty
 * or the target is full. Whatever does not fit is shifted to the front of
 * the overflow buffer and kept for the next call.
 * If pOffsets and *pOffsets are both non-NULL, one offset of -1 is written
 * per output byte because no source index is known for old output.
 *
 * *target (and *pOffsets, when used) are advanced past what was written.
 *
 * @return TRUE if the target filled up before the overflow buffer was empty;
 *         *err is then set to U_BUFFER_OVERFLOW_ERROR. FALSE otherwise.
 */
static UBool
ucnv_outputOverflowFromUnicode(UConverter *cnv,
                               char **target, const char *targetLimit,
                               int32_t **pOffsets,
                               UErrorCode *err) {
    char *pending=(char *)cnv->charErrorBuffer;
    int32_t pendingLength=cnv->charErrorBufferLength;
    int32_t *offsetOut=(pOffsets!=NULL) ? *pOffsets : NULL;
    char *out=*target;
    int32_t consumed=0;
    int32_t kept=0;
    UBool overflowed=FALSE;

    /* move pending bytes to the target for as long as there is room */
    while(consumed<pendingLength && out!=targetLimit) {
        *out++=pending[consumed++];
        if(offsetOut!=NULL) {
            *offsetOut++=-1; /* no source index available for old output */
        }
    }

    if(consumed<pendingLength) {
        /* the target is full: compact the unwritten tail to the buffer start */
        for(; consumed<pendingLength; ++consumed) {
            pending[kept++]=pending[consumed];
        }
        *err=U_BUFFER_OVERFLOW_ERROR;
        overflowed=TRUE;
    }

    /* kept is 0 when everything was written */
    cnv->charErrorBufferLength=(int8_t)kept;
    *target=out;
    if(offsetOut!=NULL) {
        *pOffsets=offsetOut;
    }
    return overflowed;
}
U_CAPI void U_EXPORT2
ucnv_fromUnicode(UConverter *cnv,
char **target, const char *targetLimit,
@ -1145,43 +1208,17 @@ ucnv_fromUnicode(UConverter *cnv,
return;
}
/* flush the target overflow buffer */
if(cnv->charErrorBufferLength>0) {
char *overflow;
int32_t i, length;
overflow=(char *)cnv->charErrorBuffer;
length=cnv->charErrorBufferLength;
i=0;
do {
if(t==targetLimit) {
/* the overflow buffer contains too much, keep the rest */
int32_t j=0;
do {
overflow[j++]=overflow[i++];
} while(i<length);
cnv->charErrorBufferLength=(int8_t)j;
*target=t;
*err=U_BUFFER_OVERFLOW_ERROR;
return;
}
/* copy the overflow contents to the target */
*t++=overflow[i++];
if(offsets!=NULL) {
*offsets++=-1; /* no source index available for old output */
}
} while(i<length);
/* the overflow buffer is completely copied to the target */
cnv->charErrorBufferLength=0;
/* output the target overflow buffer */
if( cnv->charErrorBufferLength>0 &&
ucnv_outputOverflowFromUnicode(cnv, target, targetLimit, &offsets, err)
) {
/* U_BUFFER_OVERFLOW_ERROR */
return;
}
/* *target may have moved, therefore stop using t */
if(!flush && s==sourceLimit && cnv->preFromULength>=0) {
/* the overflow buffer is emptied and there is no new input: we are done */
*target=t;
return;
}
@ -1199,7 +1236,7 @@ ucnv_fromUnicode(UConverter *cnv,
args.offsets=offsets;
args.source=s;
args.sourceLimit=sourceLimit;
args.target=t;
args.target=*target;
args.targetLimit=targetLimit;
args.size=sizeof(args);
@ -1304,7 +1341,7 @@ _toUnicodeWithCallback(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
pArgs->flush && pArgs->source==pArgs->sourceLimit &&
cnv->toULength==0);
} else {
/* handle error from getNextUChar() */
/* handle error from getNextUChar() or ucnv_convertEx() */
converterSawEndOfInput=FALSE;
}
@ -1495,6 +1532,64 @@ _toUnicodeWithCallback(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
}
}
/*
 * Drain the toUnicode overflow buffer (cnv->UCharErrorBuffer) into the target.
 * Intended to be called only when cnv->UCharErrorBufferLength>0.
 *
 * UChars are moved one at a time until either the overflow buffer is empty
 * or the target is full. Whatever does not fit is shifted to the front of
 * the overflow buffer and kept for the next call.
 * If pOffsets and *pOffsets are both non-NULL, one offset of -1 is written
 * per output UChar because no source index is known for old output.
 *
 * *target (and *pOffsets, when used) are advanced past what was written.
 *
 * @return TRUE if the target filled up before the overflow buffer was empty;
 *         *err is then set to U_BUFFER_OVERFLOW_ERROR. FALSE otherwise.
 */
static UBool
ucnv_outputOverflowToUnicode(UConverter *cnv,
                             UChar **target, const UChar *targetLimit,
                             int32_t **pOffsets,
                             UErrorCode *err) {
    UChar *pending=cnv->UCharErrorBuffer;
    int32_t pendingLength=cnv->UCharErrorBufferLength;
    int32_t *offsetOut=(pOffsets!=NULL) ? *pOffsets : NULL;
    UChar *out=*target;
    int32_t consumed=0;
    int32_t kept=0;
    UBool overflowed=FALSE;

    /* move pending UChars to the target for as long as there is room */
    while(consumed<pendingLength && out!=targetLimit) {
        *out++=pending[consumed++];
        if(offsetOut!=NULL) {
            *offsetOut++=-1; /* no source index available for old output */
        }
    }

    if(consumed<pendingLength) {
        /* the target is full: compact the unwritten tail to the buffer start */
        for(; consumed<pendingLength; ++consumed) {
            pending[kept++]=pending[consumed];
        }
        *err=U_BUFFER_OVERFLOW_ERROR;
        overflowed=TRUE;
    }

    /* kept is 0 when everything was written */
    cnv->UCharErrorBufferLength=(int8_t)kept;
    *target=out;
    if(offsetOut!=NULL) {
        *pOffsets=offsetOut;
    }
    return overflowed;
}
U_CAPI void U_EXPORT2
ucnv_toUnicode(UConverter *cnv,
UChar **target, const UChar *targetLimit,
@ -1547,43 +1642,17 @@ ucnv_toUnicode(UConverter *cnv,
return;
}
/* flush the target overflow buffer */
if(cnv->UCharErrorBufferLength>0) {
UChar *overflow;
int32_t i, length;
overflow=cnv->UCharErrorBuffer;
length=cnv->UCharErrorBufferLength;
i=0;
do {
if(t==targetLimit) {
/* the overflow buffer contains too much, keep the rest */
int32_t j=0;
do {
overflow[j++]=overflow[i++];
} while(i<length);
cnv->UCharErrorBufferLength=(int8_t)j;
*target=t;
*err=U_BUFFER_OVERFLOW_ERROR;
return;
}
/* copy the overflow contents to the target */
*t++=overflow[i++];
if(offsets!=NULL) {
*offsets++=-1; /* no source index available for old output */
}
} while(i<length);
/* the overflow buffer is completely copied to the target */
cnv->UCharErrorBufferLength=0;
/* output the target overflow buffer */
if( cnv->UCharErrorBufferLength>0 &&
ucnv_outputOverflowToUnicode(cnv, target, targetLimit, &offsets, err)
) {
/* U_BUFFER_OVERFLOW_ERROR */
return;
}
/* *target may have moved, therefore stop using t */
if(!flush && s==sourceLimit && cnv->preToULength>=0) {
/* the overflow buffer is emptied and there is no new input: we are done */
*target=t;
return;
}
@ -1601,7 +1670,7 @@ ucnv_toUnicode(UConverter *cnv,
args.offsets=offsets;
args.source=s;
args.sourceLimit=sourceLimit;
args.target=t;
args.target=*target;
args.targetLimit=targetLimit;
args.size=sizeof(args);
@ -1951,7 +2020,14 @@ ucnv_convertEx(UConverter *targetCnv, UConverter *sourceCnv,
UBool reset, UBool flush,
UErrorCode *pErrorCode) {
UChar pivotBuffer[CHUNK_SIZE];
UChar *myPivotSource, *myPivotTarget;
const UChar *myPivotSource;
UChar *myPivotTarget;
const char *s;
char *t;
UConverterToUnicodeArgs toUArgs;
UConverterFromUnicodeArgs fromUArgs;
UConverterConvert convert;
/* error checking */
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
@ -1966,6 +2042,25 @@ ucnv_convertEx(UConverter *targetCnv, UConverter *sourceCnv,
return;
}
s=*source;
t=*target;
if((sourceLimit!=NULL && sourceLimit<s) || targetLimit<t) {
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
return;
}
/*
* Make sure that the buffer sizes do not exceed the number range for
* int32_t. See ucnv_toUnicode() for a more detailed comment.
*/
if(
(sourceLimit!=NULL && ((size_t)(sourceLimit-s)>(size_t)0x7fffffff && sourceLimit>s)) ||
((size_t)(targetLimit-t)>(size_t)0x7fffffff && targetLimit>t)
) {
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
return;
}
if(pivotStart==NULL) {
if(!flush) {
/* streaming conversion requires an explicit pivot buffer */
@ -1974,8 +2069,8 @@ ucnv_convertEx(UConverter *targetCnv, UConverter *sourceCnv,
}
/* use the stack pivot buffer */
pivotStart=myPivotSource=myPivotTarget=pivotBuffer;
pivotSource=&myPivotSource;
myPivotSource=myPivotTarget=pivotStart=pivotBuffer;
pivotSource=(UChar **)&myPivotSource;
pivotTarget=&myPivotTarget;
pivotLimit=pivotBuffer+CHUNK_SIZE;
} else if( pivotStart>=pivotLimit ||
@ -1995,51 +2090,260 @@ ucnv_convertEx(UConverter *targetCnv, UConverter *sourceCnv,
if(reset) {
ucnv_resetToUnicode(sourceCnv);
ucnv_resetFromUnicode(targetCnv);
*pivotTarget=*pivotSource=pivotStart;
*pivotSource=*pivotTarget=pivotStart;
} else if(targetCnv->charErrorBufferLength>0) {
/* output the targetCnv overflow buffer */
if(ucnv_outputOverflowFromUnicode(targetCnv, target, targetLimit, NULL, pErrorCode)) {
/* U_BUFFER_OVERFLOW_ERROR */
return;
}
/* *target has moved, therefore stop using t */
if( !flush &&
targetCnv->preFromULength>=0 && *pivotSource==*pivotTarget &&
sourceCnv->UCharErrorBufferLength==0 && sourceCnv->preToULength>=0 && s==sourceLimit
) {
/* the fromUnicode overflow buffer is emptied and there is no new input: we are done */
return;
}
}
/* conversion loop */
/* Is direct-UTF-8 conversion available? */
if( sourceCnv->sharedData->staticData->conversionType==UCNV_UTF8 &&
targetCnv->sharedData->impl->fromUTF8!=NULL
) {
convert=targetCnv->sharedData->impl->fromUTF8;
} else if( targetCnv->sharedData->staticData->conversionType==UCNV_UTF8 &&
sourceCnv->sharedData->impl->toUTF8!=NULL
) {
convert=sourceCnv->sharedData->impl->toUTF8;
} else {
convert=NULL;
}
/*
* If direct-UTF-8 conversion is available, then we use a smaller
* pivot buffer for error handling and partial matches
* so that we quickly return to direct conversion.
*
* 32 is large enough for UCNV_EXT_MAX_UCHARS and UCNV_ERROR_BUFFER_LENGTH.
*
* We could reduce the pivot buffer size further, at the cost of
* buffer overflows from callbacks.
* The pivot buffer should not be smaller than the maximum number of
* fromUnicode extension table input UChars
* (for m:n conversion, see
* targetCnv->sharedData->mbcs.extIndexes[UCNV_EXT_COUNT_UCHARS])
* or 2 for surrogate pairs.
*
* Too small a buffer can cause thrashing between pivoting and direct
* conversion, with function call overhead outweighing the benefits
* of direct conversion.
*/
if(convert!=NULL && (pivotLimit-pivotStart)>32) {
pivotLimit=pivotStart+32;
}
/* prepare the converter arguments */
fromUArgs.converter=targetCnv;
fromUArgs.flush=FALSE;
fromUArgs.offsets=NULL;
fromUArgs.target=*target;
fromUArgs.targetLimit=targetLimit;
fromUArgs.size=sizeof(fromUArgs);
toUArgs.converter=sourceCnv;
toUArgs.flush=flush;
toUArgs.offsets=NULL;
toUArgs.source=s;
toUArgs.sourceLimit=sourceLimit;
toUArgs.targetLimit=pivotLimit;
toUArgs.size=sizeof(toUArgs);
/*
* TODO: Consider separating this function into two functions,
* extracting exactly the conversion loop,
* for readability and to reduce the set of visible variables.
*
* Otherwise stop using s and t from here on.
*/
s=t=NULL;
/*
* conversion loop
*
* The sequence of steps in the loop may appear backward,
* but the principle is simple:
* In the chain of
* source - sourceCnv overflow - pivot - targetCnv overflow - target
* empty out later buffers before refilling them from earlier ones.
*
* The targetCnv overflow buffer is flushed out only once before the loop.
*/
for(;;) {
if(reset) {
/*
* if we did a reset in this function, we know that there is nothing
* to convert to the target yet, so we save a function call
*/
reset=FALSE;
} else {
/*
* convert to the target first in case the pivot is filled at entry
* or the targetCnv has some output bytes in its state
*/
ucnv_fromUnicode(targetCnv,
target, targetLimit,
(const UChar **)pivotSource, *pivotTarget,
NULL,
(UBool)(flush && *source==sourceLimit),
pErrorCode);
/*
* if(pivot not empty or error or replay or flush fromUnicode) {
* fromUnicode(pivot -> target);
* }
*
* For pivoting conversion; and for direct conversion for
* error callback handling and flushing the replay buffer.
*/
if( *pivotSource<*pivotTarget ||
U_FAILURE(*pErrorCode) ||
targetCnv->preFromULength<0 ||
fromUArgs.flush
) {
fromUArgs.source=*pivotSource;
fromUArgs.sourceLimit=*pivotTarget;
_fromUnicodeWithCallback(&fromUArgs, pErrorCode);
if(U_FAILURE(*pErrorCode)) {
/* target overflow, or conversion error */
*pivotSource=(UChar *)fromUArgs.source;
break;
}
/* ucnv_fromUnicode() must have consumed the pivot contents since it returned with U_SUCCESS() */
*pivotSource=*pivotTarget=pivotStart;
/*
* _fromUnicodeWithCallback() must have consumed the pivot contents
* (*pivotSource==*pivotTarget) since it returned with U_SUCCESS()
*/
}
/* convert from the source to the pivot */
ucnv_toUnicode(sourceCnv,
pivotTarget, pivotLimit,
source, sourceLimit,
NULL,
flush,
pErrorCode);
/* The pivot buffer is empty; reset it so we start at pivotStart. */
*pivotSource=*pivotTarget=pivotStart;
/*
* if(sourceCnv overflow buffer not empty) {
* move(sourceCnv overflow buffer -> pivot);
* continue;
* }
*/
/* output the sourceCnv overflow buffer */
if(sourceCnv->UCharErrorBufferLength>0) {
if(ucnv_outputOverflowToUnicode(sourceCnv, pivotTarget, pivotLimit, NULL, pErrorCode)) {
/* U_BUFFER_OVERFLOW_ERROR */
*pErrorCode=U_ZERO_ERROR;
}
continue;
}
/*
* check for end of input and break if done
*
* Checking both flush and fromUArgs.flush ensures that the converters
* have been called with the flush flag set if the ucnv_convertEx()
* caller set it.
*/
if( toUArgs.source==sourceLimit &&
sourceCnv->preToULength>=0 && sourceCnv->toULength==0 &&
(!flush || fromUArgs.flush)
) {
/* done successfully */
break;
}
/*
* use direct conversion if available
* but not if continuing a partial match
* or flushing the toUnicode replay buffer
*/
if(convert!=NULL && targetCnv->preFromUFirstCP<0 && sourceCnv->preToULength==0) {
if(*pErrorCode==U_USING_DEFAULT_WARNING) {
/* remove a warning that may be set by this function */
*pErrorCode=U_ZERO_ERROR;
}
convert(&fromUArgs, &toUArgs, pErrorCode);
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
break;
} else if(U_FAILURE(*pErrorCode)) {
if(sourceCnv->toULength>0) {
/*
* Fall through to calling _toUnicodeWithCallback()
* for callback handling.
*
* The pivot buffer will be reset with
* *pivotSource=*pivotTarget=pivotStart;
* which indicates a toUnicode error to the caller
* (*pivotSource==pivotStart shows no pivot UChars consumed).
*/
} else {
/*
* Indicate a fromUnicode error to the caller
* (*pivotSource>pivotStart shows some pivot UChars consumed).
*/
*pivotSource=*pivotTarget=pivotStart+1;
/*
* Loop around to calling _fromUnicodeWithCallback()
* for callback handling.
*/
continue;
}
} else if(*pErrorCode==U_USING_DEFAULT_WARNING) {
/*
* No error, but the implementation requested to temporarily
* fall back to pivoting.
*/
*pErrorCode=U_ZERO_ERROR;
/*
* The following else branches are almost identical to the end-of-input
* handling in _toUnicodeWithCallback().
* Avoid calling it just for the end of input.
*/
} else if(flush && sourceCnv->toULength>0) { /* flush==toUArgs.flush */
/*
* the entire input stream is consumed
* and there is a partial, truncated input sequence left
*/
/* inject an error and continue with callback handling */
*pErrorCode=U_TRUNCATED_CHAR_FOUND;
} else {
/* input consumed */
if(flush) {
/* reset the converters without calling the callback functions */
_reset(sourceCnv, UCNV_RESET_TO_UNICODE, FALSE);
_reset(targetCnv, UCNV_RESET_FROM_UNICODE, FALSE);
}
/* done successfully */
break;
}
}
/*
* toUnicode(source -> pivot);
*
* For pivoting conversion; and for direct conversion for
* error callback handling, continuing partial matches
* and flushing the replay buffer.
*
* The pivot buffer is empty and reset.
*/
toUArgs.target=pivotStart; /* ==*pivotTarget */
/* toUArgs.targetLimit=pivotLimit; already set before the loop */
_toUnicodeWithCallback(&toUArgs, pErrorCode);
*pivotTarget=toUArgs.target;
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
/* pivot overflow: continue with the conversion loop */
*pErrorCode=U_ZERO_ERROR;
} else if(U_FAILURE(*pErrorCode) || *pivotTarget==pivotStart) {
} else if(U_FAILURE(*pErrorCode) || (!flush && *pivotTarget==pivotStart)) {
/* conversion error, or there was nothing left to convert */
break;
}
/* else ucnv_toUnicode() wrote into the pivot buffer: continue */
/*
* else:
* _toUnicodeWithCallback() wrote into the pivot buffer,
* continue with fromUnicode conversion.
*
* Set the fromUnicode flush flag if we flush and if toUnicode has
* processed the end of the input.
*/
if( flush && toUArgs.source==sourceLimit &&
sourceCnv->preToULength>=0 &&
sourceCnv->UCharErrorBufferLength==0
) {
fromUArgs.flush=TRUE;
}
}
/*
@ -2049,6 +2353,9 @@ ucnv_convertEx(UConverter *targetCnv, UConverter *sourceCnv,
* - a conversion error occurred
*/
*source=toUArgs.source;
*target=fromUArgs.target;
/* terminate the target buffer if possible */
if(flush && U_SUCCESS(*pErrorCode)) {
if(*target!=targetLimit) {

View file

@ -1,7 +1,7 @@
/*
********************************************************************
* COPYRIGHT:
* Copyright (c) 1996-2006, International Business Machines Corporation and
* Copyright (c) 1996-2007, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************
*
@ -1433,6 +1433,7 @@ ucnv_swap(const UDataSwapper *ds,
outBytes+offset, pErrorCode);
} else {
/* otherwise: swap the stage tables separately */
int32_t maxFastUChar;
/* stage 1 table: uint16_t[0x440 or 0x40] */
if(inStaticData->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
@ -1467,6 +1468,20 @@ ucnv_swap(const UDataSwapper *ds,
/* just uint8_t[], nothing to swap */
break;
}
/*
* utf8Friendly MBCS files (mbcsHeader.version 4.3)
* contain an additional mbcsIndex table:
* uint16_t[(maxFastUChar+1)>>6];
* where maxFastUChar=((mbcsHeader.version[2]<<8)|0xff).
*/
if(mbcsHeader.version[1]>=3 && (maxFastUChar=mbcsHeader.version[2])!=0) {
maxFastUChar=(maxFastUChar<<8)|0xff;
offset+=count;
count=((maxFastUChar+1)>>6)*2;
ds->swapArray16(ds, inBytes+offset, (int32_t)count,
outBytes+offset, pErrorCode);
}
}
}

View file

@ -1,12 +1,11 @@
/*
**********************************************************************
* Copyright (C) 1999-2004, International Business Machines
* Copyright (C) 1999-2007, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*
* uconv_cnv.h:
* defines all the low level conversion functions
* T_UnicodeConverter_{to,from}Unicode_$ConversionType
* ucnv_cnv.h:
* Definitions for converter implementations.
*
* Modification History:
*
@ -104,6 +103,23 @@ typedef void (*UConverterToUnicode) (UConverterToUnicodeArgs *, UErrorCode *);
*/
typedef void (*UConverterFromUnicode) (UConverterFromUnicodeArgs *, UErrorCode *);
/*
* Converter implementation function for ucnv_convertEx(), for direct conversion
* between two charsets without pivoting through UTF-16.
* The rules are the same as for UConverterToUnicode and UConverterFromUnicode.
* In addition,
* - The toUnicode side must behave and keep state exactly like the
* UConverterToUnicode implementation for the same source charset.
* - A U_USING_DEFAULT_WARNING can be set to request to temporarily fall back
* to pivoting. When this function is called, the conversion framework makes
* sure that this warning is not set on input.
* - Continuing a partial match and flushing the toUnicode replay buffer
* are handled by pivoting, using the toUnicode and fromUnicode functions.
*
* Arguments, as filled in by ucnv_convertEx():
* - pFromUArgs describes the output side: converter is the target converter,
* target/targetLimit delimit the output bytes. The implementation advances
* pFromUArgs->target past the bytes it wrote.
* - pToUArgs describes the input side: converter is the source converter,
* source/sourceLimit delimit the input bytes. The implementation advances
* pToUArgs->source past the bytes it consumed.
* - pErrorCode receives U_BUFFER_OVERFLOW_ERROR when the target is full,
* a conversion error code, or the U_USING_DEFAULT_WARNING described above.
*/
typedef void (*UConverterConvert) (UConverterFromUnicodeArgs *pFromUArgs,
UConverterToUnicodeArgs *pToUArgs,
UErrorCode *pErrorCode);
/*
* Converter implementation function for ucnv_getNextUChar().
* If the function pointer is NULL, then the toUnicode function will be used.
@ -214,6 +230,9 @@ struct UConverterImpl {
UConverterWriteSub writeSub;
UConverterSafeClone safeClone;
UConverterGetUnicodeSet getUnicodeSet;
UConverterConvert toUTF8;
UConverterConvert fromUTF8;
};
extern const UConverterSharedData

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2002-2006, International Business Machines
* Copyright (C) 2002-2007, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: ucnv_u8.c
@ -724,6 +724,263 @@ static UChar32 ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
return 0xffff;
}
/* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
/*
* minimum code point values for n-byte UTF-8 sequences, n=0..4
* ucnv_UTF8FromUTF8() compares a decoded 2- or 3-byte value against
* utf8_minLegal[n] to reject values that are too small for their length.
*/
static const UChar32
utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
/*
* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail...
* Subtracting utf8_offsets[n] from a value accumulated that way removes the
* contribution of the smallest lead byte and of the 0x80 marker in each trail byte:
*   n=2: 0x3080    == (0xC0<<6)+0x80
*   n=3: 0xE2080   == (0xE0<<12)+(0x80<<6)+0x80
*   n=4: 0x3C82080 == (0xF0<<18)+(0x80<<12)+(0x80<<6)+0x80
* The array is declared with 7 elements but only indexes 0..4 have
* initializers; the remaining elements are implicitly 0.
*/
static const UChar32
utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
/* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
/*
* Direct UTF-8 to UTF-8 "conversion": validates the input byte sequences and
* copies them to the target without going through a UTF-16 pivot.
*
* pFromUArgs: output side; target/targetLimit are read, target is advanced.
* pToUArgs:   input side; source/sourceLimit are read, source is advanced.
*             The converter in pToUArgs carries the partial-character state
*             (toUnicodeStatus, toULength, mode, toUBytes) between calls.
* pErrorCode: set to
*   - U_ILLEGAL_CHAR_FOUND for an illegal byte sequence (its bytes are stored
*     in toUBytes/toULength of the pToUArgs converter),
*   - U_BUFFER_OVERFLOW_ERROR when the target is full and input remains,
*   - U_USING_DEFAULT_WARNING to ask the caller to fall back to pivoting
*     for a case that is not handled here.
*
* Sequences for U+0080..U+D7FF are handled inline in the loop. Everything
* else (other 3-byte sequences, 4-byte sequences, errors, and a character
* continued from a previous buffer) goes through the moreBytes section.
*/
static void
ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
UConverterToUnicodeArgs *pToUArgs,
UErrorCode *pErrorCode) {
UConverter *utf8, *cnv;
const uint8_t *source, *sourceLimit;
uint8_t *target;
int32_t targetCapacity;
int32_t count;
int8_t oldToULength, toULength, toULimit;
UChar32 c;
uint8_t b, t1, t2;
/* set up the local pointers */
utf8=pToUArgs->converter;
/* NOTE(review): cnv is assigned here but not referenced again in this function */
cnv=pFromUArgs->converter;
source=(uint8_t *)pToUArgs->source;
sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
target=(uint8_t *)pFromUArgs->target;
targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
/* get the converter state from the UTF-8 UConverter */
/*
* A non-zero toUnicodeStatus means a partial character is pending:
* c holds its accumulated value, toULength the number of bytes seen so far,
* and mode the value that was stored as toULimit when the state was saved.
*/
c=(UChar32)utf8->toUnicodeStatus;
if(c!=0) {
toULength=oldToULength=utf8->toULength;
toULimit=(int8_t)utf8->mode;
} else {
toULength=oldToULength=toULimit=0;
}
/* bytes available for this run, counting the bytes already consumed for a pending character */
count=(int32_t)(sourceLimit-source)+oldToULength;
if(count<toULimit) {
/*
* Not enough input to complete the partial character.
* Jump to moreBytes below - it will not output to target.
*/
} else if(targetCapacity<toULimit) {
/*
* Not enough target capacity to output the partial character.
* Let the standard converter handle this.
*/
*pErrorCode=U_USING_DEFAULT_WARNING;
return;
} else {
/*
* Use a single counter for source and target, counting the minimum of
* the source length and the target capacity.
* As a result, the source length is checked only once per multi-byte
* character instead of twice.
*
* Make sure that the last byte sequence is complete, or else
* stop just before it.
* (The longest legal byte sequence has 3 trail bytes.)
* Count oldToULength (number of source bytes from a previous buffer)
* into the source length but reduce the source index by toULimit
* while going back over trail bytes in order to not go back into
* the bytes that will be read for finishing a partial
* sequence from the previous buffer.
* Let the standard converter handle edge cases.
*/
int32_t i;
if(count>targetCapacity) {
count=targetCapacity;
}
/* walk backward over up to 3 trail bytes at the end of the counted range */
i=0;
while(i<3 && i<(count-toULimit)) {
b=source[count-oldToULength-i-1];
if(U8_IS_TRAIL(b)) {
++i;
} else {
if(i<utf8_countTrailBytes[b]) {
/* stop converting before the lead byte if there are not enough trail bytes for it */
count-=i+1;
}
break;
}
}
}
/*
* A partial character is pending: clear the stored state and resume
* collecting its trail bytes at the moreBytes label inside the loop body.
*/
if(c!=0) {
utf8->toUnicodeStatus=0;
utf8->toULength=0;
goto moreBytes;
/* See note in ucnv_SBCSFromUTF8() about this goto. */
}
/* conversion loop */
while(count>0) {
b=*source++;
if((int8_t)b>=0) {
/* convert ASCII */
*target++=b;
--count;
continue;
} else {
if(b>0xe0) {
/* lead byte 0xed restricts the first trail byte to 0x80..0x9f, which excludes surrogates */
if( /* handle U+1000..U+D7FF inline */
(t1=source[0]) >= 0x80 && ((b<0xed) && (t1 <= 0xbf) ||
(b==0xed && (t1 <= 0x9f))) &&
(t2=source[1]) >= 0x80 && t2 <= 0xbf
) {
source+=2;
*target++=b;
*target++=t1;
*target++=t2;
count-=3;
continue;
}
} else if(b<0xe0) {
/* b>=0xc2 excludes the lead bytes 0xc0 and 0xc1 as well as stray trail bytes */
if( /* handle U+0080..U+07FF inline */
b>=0xc2 &&
(t1=*source) >= 0x80 && t1 <= 0xbf
) {
++source;
*target++=b;
*target++=t1;
count-=2;
continue;
}
} else if(b==0xe0) {
/* the first trail byte must be at least 0xa0 after the lead byte 0xe0 */
if( /* handle U+0800..U+0FFF inline */
(t1=source[0]) >= 0xa0 && t1 <= 0xbf &&
(t2=source[1]) >= 0x80 && t2 <= 0xbf
) {
source+=2;
*target++=b;
*target++=t1;
*target++=t2;
count-=3;
continue;
}
}
/* handle "complicated" and error cases, and continuing partial characters */
/* start a new sequence with the lead byte b that was just read */
oldToULength=0;
toULength=1;
toULimit=utf8_countTrailBytes[b]+1;
c=b;
moreBytes:
/* collect trail bytes until toULimit bytes have been seen in total */
while(toULength<toULimit) {
if(source<sourceLimit) {
b=*source;
if(U8_IS_TRAIL(b)) {
++source;
++toULength;
c=(c<<6)+b;
} else {
break; /* sequence too short, stop with toULength<toULimit */
}
} else {
/* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
/*
* Rewind to the first byte of this sequence that came from the
* current buffer and append those bytes to toUBytes, after the
* oldToULength bytes counted from a previous buffer.
*/
source-=(toULength-oldToULength);
while(oldToULength<toULength) {
utf8->toUBytes[oldToULength++]=*source++;
}
utf8->toUnicodeStatus=c;
utf8->toULength=toULength;
utf8->mode=toULimit;
pToUArgs->source=(char *)source;
pFromUArgs->target=(char *)target;
return;
}
}
/*
* Validate the collected sequence. c is reduced by utf8_offsets[]
* as a side effect of the conditions below.
*/
if( toULength==toULimit && /* consumed all trail bytes */
(toULength==3 || toULength==2) && /* BMP */
(c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
(c<=0xd7ff || 0xe000<=c) /* not a surrogate */
) {
/* legal byte sequence for BMP code point */
} else if(
toULength==toULimit && toULength==4 &&
(0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
) {
/* legal byte sequence for supplementary code point */
} else {
/* error handling: illegal UTF-8 byte sequence */
/* save the bytes from the current buffer in toUBytes, as in the partial-character case above */
source-=(toULength-oldToULength);
while(oldToULength<toULength) {
utf8->toUBytes[oldToULength++]=*source++;
}
utf8->toULength=toULength;
pToUArgs->source=(char *)source;
pFromUArgs->target=(char *)target;
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
return;
}
/* copy the legal byte sequence to the target */
{
int8_t i;
/* first the bytes kept in toUBytes from a previous buffer ... */
for(i=0; i<oldToULength; ++i) {
*target++=utf8->toUBytes[i];
}
/* ... then the bytes of this sequence that were read from the current source */
source-=(toULength-oldToULength);
for(; i<toULength; ++i) {
*target++=*source++;
}
count-=toULength;
}
}
}
/*
* The loop ended with input left over: either the target is full,
* or the rest of the input was excluded from count above.
*/
if(U_SUCCESS(*pErrorCode) && source<sourceLimit) {
if(target==(const uint8_t *)pFromUArgs->targetLimit) {
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
} else {
b=*source;
toULimit=utf8_countTrailBytes[b]+1;
if(toULimit>(sourceLimit-source)) {
/* collect a truncated byte sequence */
toULength=0;
c=b;
for(;;) {
utf8->toUBytes[toULength++]=b;
if(++source==sourceLimit) {
/* partial byte sequence at end of source */
utf8->toUnicodeStatus=c;
utf8->toULength=toULength;
utf8->mode=toULimit;
break;
} else if(!U8_IS_TRAIL(b=*source)) {
/* lead byte in trail byte position */
utf8->toULength=toULength;
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
break;
}
c=(c<<6)+b;
}
} else {
/* partial-sequence target overflow: fall back to the pivoting implementation */
*pErrorCode=U_USING_DEFAULT_WARNING;
}
}
}
/* write back the updated pointers */
pToUArgs->source=(char *)source;
pFromUArgs->target=(char *)target;
}
/* UTF-8 converter data ----------------------------------------------------- */
static const UConverterImpl _UTF8Impl={
@ -746,7 +1003,10 @@ static const UConverterImpl _UTF8Impl={
NULL,
NULL,
NULL,
ucnv_getNonSurrogateUnicodeSet
ucnv_getNonSurrogateUnicodeSet,
ucnv_UTF8FromUTF8,
ucnv_UTF8FromUTF8
};
/* The 1208 CCSID refers to any version of Unicode of UTF-8 */

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2000-2004, International Business Machines
* Copyright (C) 2000-2007, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: ucnvlat1.cpp
@ -330,6 +330,105 @@ noMoreInput:
pArgs->offsets=offsets;
}
/*
 * Convert UTF-8 to Latin-1 directly, without a UTF-16 pivot.
 * Adapted from ucnv_SBCSFromUTF8().
 *
 * Only ASCII bytes and the two-byte sequences for U+0080..U+00FF
 * (lead byte 0xc2 or 0xc3) are handled here. For any other input the
 * function sets U_USING_DEFAULT_WARNING so that the caller falls back to
 * pivoting. U_BUFFER_OVERFLOW_ERROR is set when the target fills up.
 *
 * A single lead byte left at the very end of the input is stored in the
 * UTF-8 converter (pToUArgs->converter) as a partial character.
 * pToUArgs->source and pFromUArgs->target are advanced, except on the
 * early returns that happen before anything was consumed.
 */
static void
ucnv_Latin1FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
                    UConverterToUnicodeArgs *pToUArgs,
                    UErrorCode *pErrorCode) {
    UConverter *utf8Cnv=pToUArgs->converter;
    const uint8_t *in=(uint8_t *)pToUArgs->source;
    const uint8_t *inLimit=(uint8_t *)pToUArgs->sourceLimit;
    uint8_t *out=(uint8_t *)pFromUArgs->target;
    int32_t room=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
    UChar32 pendingLead=(UChar32)utf8Cnv->toUnicodeStatus;
    uint8_t lead, trail;

    /* finish a character whose lead byte was stored by a previous call */
    if(pendingLead!=0 && in<inLimit) {
        if(room==0) {
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            return;
        }
        if( pendingLead<0xc2 || pendingLead>0xc3 ||
            (trail=(uint8_t)(*in-0x80))>0x3f
        ) {
            /* complicated, illegal or unmappable input: fall back to the pivoting implementation */
            *pErrorCode=U_USING_DEFAULT_WARNING;
            return;
        }
        ++in;
        *out++=(uint8_t)(((pendingLead&3)<<6)|trail);
        --room;

        utf8Cnv->toUnicodeStatus=0;
        utf8Cnv->toULength=0;
    }

    /*
     * If the input ends with a lead byte, stop the main loop just before it
     * so that the loop can read a trail byte without a separate limit check.
     * One byte is enough because the loop handles at most 2-byte sequences.
     */
    if(in<inLimit && U8_IS_LEAD(*(inLimit-1))) {
        --inLimit;
    }

    /* conversion loop */
    for(;;) {
        if(in>=inLimit) {
            break;
        }
        if(room<=0) {
            /* target is full */
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            break;
        }

        lead=*in++;
        if((int8_t)lead>=0) {
            /* convert ASCII */
            *out++=(uint8_t)lead;
            --room;
            continue;
        }
        if( lead>=0xc2 && lead<=0xc3 &&
            (trail=(uint8_t)(*in-0x80))<=0x3f
        ) {
            /* U+0080..U+00FF */
            ++in;
            *out++=(uint8_t)(((lead&3)<<6)|trail);
            --room;
            continue;
        }

        /* complicated, illegal or unmappable input: fall back to the pivoting implementation */
        pToUArgs->source=(char *)(in-1);
        pFromUArgs->target=(char *)out;
        *pErrorCode=U_USING_DEFAULT_WARNING;
        return;
    }

    /*
     * If the limit was pulled back above, the one remaining byte is a lead
     * byte of a truncated sequence: store it as a partial character.
     */
    if(U_SUCCESS(*pErrorCode)) {
        inLimit=(uint8_t *)pToUArgs->sourceLimit;
        if(in<inLimit) {
            lead=*in++;
            utf8Cnv->toUBytes[0]=lead;
            utf8Cnv->toUnicodeStatus=lead;
            utf8Cnv->toULength=1;
            utf8Cnv->mode=utf8_countTrailBytes[lead]+1;
        }
    }

    /* write back the updated pointers */
    pToUArgs->source=(char *)in;
    pFromUArgs->target=(char *)out;
}
static void
_Latin1GetUnicodeSet(const UConverter *cnv,
const USetAdder *sa,
@ -358,7 +457,10 @@ static const UConverterImpl _Latin1Impl={
NULL,
NULL,
NULL,
_Latin1GetUnicodeSet
_Latin1GetUnicodeSet,
NULL,
ucnv_Latin1FromUTF8
};
static const UConverterStaticData _Latin1StaticData={
@ -532,6 +634,95 @@ _ASCIIGetNextUChar(UConverterToUnicodeArgs *pArgs,
return 0xffff;
}
/* "Convert" UTF-8 to US-ASCII: Validate and copy. */
/*
 * Copies bytes from the UTF-8 source to the US-ASCII target for as long as
 * they are ASCII (<=0x7f) and there is room in the target.
 *
 * Sets U_USING_DEFAULT_WARNING (fall back to pivoting) if a partial UTF-8
 * character is pending in pToUArgs->converter, or when a non-ASCII byte is
 * reached. Sets U_BUFFER_OVERFLOW_ERROR when the target is full and input
 * remains. pToUArgs->source and pFromUArgs->target are advanced past the
 * copied bytes, except on the early return for a pending partial character.
 */
static void
ucnv_ASCIIFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
                   UConverterToUnicodeArgs *pToUArgs,
                   UErrorCode *pErrorCode) {
    const uint8_t *in, *inLimit;
    uint8_t *out;
    int32_t budget, available;
    uint8_t last;

    if(pToUArgs->converter->toUnicodeStatus!=0) {
        /* no handling of partial UTF-8 characters here, fall back to pivoting */
        *pErrorCode=U_USING_DEFAULT_WARNING;
        return;
    }

    /* set up the local pointers */
    in=(const uint8_t *)pToUArgs->source;
    inLimit=(const uint8_t *)pToUArgs->sourceLimit;
    out=(uint8_t *)pFromUArgs->target;

    /*
     * The copy is 1:1 byte for byte, so one counter holding the smaller of
     * the source length and the target capacity is enough.
     */
    budget=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
    available=(int32_t)(inLimit-in);
    if(available<budget) {
        budget=available;
    }

    /* fast path: copy 16 bytes at a time and test them together */
    if(budget>=16) {
        int32_t blocksLeft=budget>>4;

        do {
            uint8_t seen=0;
            int32_t k;

            for(k=0; k<16; ++k) {
                seen|=(out[k]=in[k]);
            }
            if(seen>0x7f) {
                /* a non-ASCII byte is in this group: redo it byte by byte below */
                break;
            }
            in+=16;
            out+=16;
            budget-=16;
        } while(--blocksLeft>0);
    }

    /* byte-by-byte loop for the remainder */
    last=0;
    while(budget>0) {
        last=*in;
        if(last>0x7f) {
            break;
        }
        ++in;
        *out++=last;
        --budget;
    }

    if(last>0x7f) {
        /* non-ASCII character, handle in standard converter */
        *pErrorCode=U_USING_DEFAULT_WARNING;
    } else if(in<inLimit && out>=(const uint8_t *)pFromUArgs->targetLimit) {
        /* target is full */
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    }

    /* write back the updated pointers */
    pToUArgs->source=(const char *)in;
    pFromUArgs->target=(char *)out;
}
static void
_ASCIIGetUnicodeSet(const UConverter *cnv,
const USetAdder *sa,
@ -560,7 +751,10 @@ static const UConverterImpl _ASCIIImpl={
NULL,
NULL,
NULL,
_ASCIIGetUnicodeSet
_ASCIIGetUnicodeSet,
NULL,
ucnv_ASCIIFromUTF8
};
static const UConverterStaticData _ASCIIStaticData={

File diff suppressed because it is too large Load diff

View file

@ -1,7 +1,7 @@
/*
******************************************************************************
*
* Copyright (C) 2000-2004, International Business Machines
* Copyright (C) 2000-2007, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
@ -41,6 +41,59 @@
* the same toUnicode structures, while the fromUnicode structures for SBCS
* differ from those for other MBCS-style converters.
*
* _MBCSHeader.version 4.3 optionally modifies the fromUnicode data structures
* slightly and optionally adds a table for conversion to MBCS (non-SBCS)
* charsets.
*
* The modifications are to make the data utf8Friendly. Not every 4.3 file
* contains utf8Friendly data.
* It is utf8Friendly if _MBCSHeader.version[2]!=0.
* In this case, the data structures are utf8Friendly up to the code point
* maxFastUChar=((_MBCSHeader.version[2]<<8)|0xff)
*
* A utf8Friendly file has fromUnicode stage 3 entries for code points up to
* maxFastUChar allocated in blocks of 64 for indexing with the 6 bits from
* a UTF-8 trail byte. ASCII is allocated linearly with 128 contiguous entries.
*
* In addition, a utf8Friendly MBCS file contains an additional
* uint16_t mbcsIndex[(maxFastUChar+1)>>6];
* which replaces the stage 1 and 2 tables for indexing with bits from the
* UTF-8 lead byte and middle trail byte. Unlike the older MBCS stage 2 table,
* the mbcsIndex does not contain roundtrip flags. Therefore, all fallbacks
* from code points up to maxFastUChar (and roundtrips to 0x00) are moved to
* the extension data structure. This also allows for faster roundtrip
* conversion from UTF-16.
*
* SBCS files do not contain an additional sbcsIndex[] array because the
* proportional size increase would be noticeable, but the runtime
* code builds one for the code point range for which the runtime conversion
* code is optimized.
*
* For SBCS, maxFastUChar should be at least U+0FFF. The initial makeconv
* implementation sets it to U+1FFF. Because the sbcsIndex is not stored in
* the file, a larger maxFastUChar only affects stage 3 block allocation size
* and is free in empty blocks. (Larger blocks with sparse contents cause larger
* files.) U+1FFF includes almost all of the small scripts.
* U+0FFF covers UTF-8 two-byte sequences and three-byte sequences starting with
* 0xe0. This includes most scripts with legacy SBCS charsets.
* The initial runtime implementation using 4.3 files only builds an sbcsIndex
* for code points up to U+0FFF.
*
* For MBCS, maxFastUChar should be at least U+D7FF (=initial value).
* This boundary is convenient because practically all of the commonly used
* characters are below it, and because it is the boundary to surrogate
* code points, above which special handling is necessary anyway.
* (Surrogate pair assembly for UTF-16, validity checking for UTF-8.)
*
* maxFastUChar could be up to U+FFFF to cover the whole BMP, which could be
* useful especially for conversion from UTF-8 when the input can be assumed
* to be valid, because the surrogate range would then not have to be
* checked.
* (With maxFastUChar=0xffff, makeconv would have to check for mbcsIndex value
* overflow because with the all-unassigned block 0 and nearly full mappings
* from the BMP it is theoretically possible that an index into stage 3
* exceeds 16 bits.)
*
* _MBCSHeader.version 4.2 adds an optional conversion extension data structure.
* If it is present, then an ICU version reading header versions 4.0 or 4.1
* will be able to use the base table and ignore the extension.
@ -60,7 +113,7 @@
* struct _MBCSHeader (see the definition in this header file below)
* contains 32-bit fields as follows:
* 8 values:
* 0 uint8_t[4] MBCS version in UVersionInfo format (currently 4.2.0.0)
* 0 uint8_t[4] MBCS version in UVersionInfo format (currently 4.3.x.0)
* 1 uint32_t countStates
* 2 uint32_t countToUFallbacks
* 3 uint32_t offsetToUCodeUnits
@ -121,6 +174,15 @@
* uint16_t fromUBytes[fromUBytesLength/2]; or
* uint32_t fromUBytes[fromUBytesLength/4];
* }
*
* -- optional utf8Friendly mbcsIndex -- _MBCSHeader.version 4.3 (ICU 3.8) and higher
* if(outputType!=MBCS_OUTPUT_1 &&
* _MBCSHeader.version[1]>=3 &&
* (maxFastUChar=_MBCSHeader.version[2])!=0
* ) {
* maxFastUChar=(maxFastUChar<<8)|0xff;
* uint16_t mbcsIndex[(maxFastUChar+1)>>6];
* }
* }
*
* -- extension table, details see ucnv_ext.h
@ -180,9 +242,17 @@ enum {
#define MBCS_ENTRY_FINAL_VALUE(entry) ((entry)&0xfffff)
#define MBCS_ENTRY_FINAL_VALUE_16(entry) (uint16_t)(entry)
#define IS_ASCII_ROUNDTRIP(b, asciiRoundtrips) (((asciiRoundtrips) & (1<<((b)>>2)))!=0)
/* single-byte fromUnicode: get the 16-bit result word */
#define MBCS_SINGLE_RESULT_FROM_U(table, results, c) (results)[ (table)[ (table)[(c)>>10] +(((c)>>4)&0x3f) ] +((c)&0xf) ]
/* single-byte fromUnicode using the sbcsIndex */
#define SBCS_RESULT_FROM_LOW_BMP(table, results, c) (results)[ (table)[(c)>>6] +((c)&0x3f) ]
/* single-byte fromUTF8 using the sbcsIndex; l and t must be masked externally; can be l=0 and t<=0x7f */
#define SBCS_RESULT_FROM_UTF8(table, results, l, t) (results)[ (table)[l] +(t) ]
/* multi-byte fromUnicode: get the 32-bit stage 2 entry */
#define MBCS_STAGE_2_FROM_U(table, c) ((const uint32_t *)(table))[ (table)[(c)>>10] +(((c)>>4)&0x3f) ]
#define MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ( ((stage2Entry) & ((uint32_t)1<< (16+((c)&0xf)) )) !=0)
@ -192,6 +262,12 @@ enum {
#define MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c) ((bytes)+(16*(uint32_t)(uint16_t)(stage2Entry)+((c)&0xf))*3)
/* double-byte fromUnicode using the mbcsIndex */
#define DBCS_RESULT_FROM_MOST_BMP(table, results, c) (results)[ (table)[(c)>>6] +((c)&0x3f) ]
/* double-byte fromUTF8 using the mbcsIndex; l and t1 combined into lt1; lt1 and t2 must be masked externally */
#define DBCS_RESULT_FROM_UTF8(table, results, lt1, t2) (results)[ (table)[lt1] +(t2) ]
/**
* MBCS output types for conversions from Unicode.
@ -226,9 +302,19 @@ typedef struct {
UChar32 codePoint;
} _MBCSToUFallback;
/**
 * Constants for fast and UTF-8-friendly conversion.
 * Each *_MAX value is the highest code point covered by the corresponding
 * fast runtime code; each *_LIMIT value is the exclusive upper bound (MAX+1).
 */
enum {
SBCS_FAST_MAX=0x0fff, /* maximum code point with UTF-8-friendly SBCS runtime code, see makeconv SBCS_UTF8_MAX */
SBCS_FAST_LIMIT=SBCS_FAST_MAX+1, /* =0x1000 */
MBCS_FAST_MAX=0xd7ff, /* maximum code point with UTF-8-friendly MBCS runtime code, see makeconv MBCS_UTF8_MAX */
MBCS_FAST_LIMIT=MBCS_FAST_MAX+1 /* =0xd800 */
};
/**
* This is the MBCS part of the UConverterTable union (a runtime data structure).
* It keeps all the per-converter data and points into the loaded mapping tables.
*
* utf8Friendly data structures added with _MBCSHeader.version 4.3
*/
typedef struct UConverterMBCSTable {
/* toUnicode */
@ -242,10 +328,17 @@ typedef struct UConverterMBCSTable {
/* fromUnicode */
const uint16_t *fromUnicodeTable;
const uint16_t *mbcsIndex; /* for fast conversion from most of BMP to MBCS (utf8Friendly data) */
uint16_t sbcsIndex[SBCS_FAST_LIMIT>>6]; /* for fast conversion from low BMP to SBCS (utf8Friendly data) */
const uint8_t *fromUnicodeBytes;
uint8_t *swapLFNLFromUnicodeBytes; /* for swaplfnl */
uint8_t *swapLFNLFromUnicodeBytes; /* for swaplfnl */
uint32_t fromUBytesLength;
uint8_t outputType, unicodeMask;
UBool utf8Friendly; /* for utf8Friendly data */
UChar maxFastUChar; /* for utf8Friendly data */
/* roundtrips */
uint32_t asciiRoundtrips;
/* converter name for swaplfnl */
char *swapLFNLName;

View file

@ -20,6 +20,7 @@
#include "unicode/ucnv.h"
#include "unicode/ucnv_err.h"
#include "unicode/putil.h"
#include "unicode/uset.h"
#include "unicode/ustring.h"
#include "ucnv_bld.h" /* for sizeof(UConverter) */
#include "cmemory.h" /* for UAlignedMemory */
@ -108,6 +109,7 @@ static void TestConvertSafeCloneCallback(void);
static void TestEBCDICSwapLFNL(void);
static void TestConvertEx(void);
static void TestConvertExFromUTF8(void);
static void TestConvertAlgorithmic(void);
void TestDefaultConverterError(void); /* defined in cctest.c */
static void TestToUCountPending(void);
@ -136,6 +138,7 @@ void addTestConvert(TestNode** root)
addTest(root, &TestLMBCSMaxChar, "tsconv/ccapitst/TestLMBCSMaxChar");
addTest(root, &TestEBCDICSwapLFNL, "tsconv/ccapitst/TestEBCDICSwapLFNL");
addTest(root, &TestConvertEx, "tsconv/ccapitst/TestConvertEx");
addTest(root, &TestConvertExFromUTF8, "tsconv/ccapitst/TestConvertExFromUTF8");
addTest(root, &TestConvertAlgorithmic, "tsconv/ccapitst/TestConvertAlgorithmic");
addTest(root, &TestDefaultConverterError, "tsconv/ccapitst/TestDefaultConverterError");
addTest(root, &TestToUCountPending, "tsconv/ccapitst/TestToUCountPending");
@ -2205,6 +2208,11 @@ convertExStreaming(UConverter *srcCnv, UConverter *targetCnv,
pivotBuffer, &pivotSource, &pivotTarget, pivotLimit,
FALSE, flush, &errorCode);
targetLength=(int32_t)(target-targetBuffer);
if(target>targetLimit) {
log_err("ucnv_convertEx(%s) chunk[%d] target %p exceeds targetLimit %p\n",
testName, chunkSize, target, targetLimit);
break; /* TODO: major problem! */
}
if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
/* continue converting another chunk */
errorCode=U_ZERO_ERROR;
@ -2402,6 +2410,264 @@ static void TestConvertEx() {
#endif
}
/* Test illegal UTF-8 input: Data and functions for TestConvertExFromUTF8(). */
static const char *const badUTF8[]={
/* truncated multi-byte sequences */
"\xd0",
"\xe0",
"\xe1",
"\xed",
"\xee",
"\xf0",
"\xf1",
"\xf4",
"\xf8",
"\xfc",
"\xe0\x80",
"\xe0\xa0",
"\xe1\x80",
"\xed\x80",
"\xed\xa0",
"\xee\x80",
"\xf0\x80",
"\xf0\x90",
"\xf1\x80",
"\xf4\x80",
"\xf4\x90",
"\xf8\x80",
"\xfc\x80",
"\xf0\x80\x80",
"\xf0\x90\x80",
"\xf1\x80\x80",
"\xf4\x80\x80",
"\xf4\x90\x80",
"\xf8\x80\x80",
"\xfc\x80\x80",
"\xf8\x80\x80\x80",
"\xfc\x80\x80\x80",
"\xfc\x80\x80\x80\x80",
/* complete sequences but non-shortest forms or out of range etc. */
"\xc0\x80",
"\xe0\x80\x80",
"\xed\xa0\x80",
"\xf0\x80\x80\x80",
"\xf4\x90\x80\x80",
"\xf8\x80\x80\x80\x80",
"\xfc\x80\x80\x80\x80\x80",
"\xfe",
"\xff"
};
/*
 * Get some character that the converter can roundtrip, and convert it.
 *
 * The character is taken from the middle of the converter's roundtrip set.
 * It is written as UTF-8 to charUTF8[] and converted twice in a row with
 * ucnv_fromUnicode() (without resetting in between): the first output goes
 * to char0[] (conversion from the initial state), the second to char1[]
 * (conversion from whatever state the first call left behind).
 *
 * @param cnv             converter to pick the character for
 * @param converterName   name used in error messages
 * @param charUTF8        receives the UTF-8 form (capacity 4)
 * @param pCharUTF8Length receives the UTF-8 length
 * @param char0           receives the first charset output (capacity 8)
 * @param pChar0Length    receives the length of char0[]
 * @param char1           receives the second charset output (capacity 8)
 * @param pChar1Length    receives the length of char1[]
 * @return TRUE if a test character was found and converted without error
 */
static UBool getTestChar(UConverter *cnv, const char *converterName,
                         char charUTF8[4], int32_t *pCharUTF8Length,
                         char char0[8], int32_t *pChar0Length,
                         char char1[8], int32_t *pChar1Length) {
    /*
     * char0 and char1 are array *parameters*, i.e. plain pointers:
     * sizeof(char0) would be the pointer size, not the caller's array size.
     * Use the declared capacity explicitly.
     */
    enum { CHAR_CAPACITY=8 };

    UChar utf16[U16_MAX_LENGTH];
    int32_t utf16Length;

    const UChar *utf16Source;
    char *target;

    USet *set;
    int32_t setSize;
    UChar32 c;
    UErrorCode errorCode;

    /* define the outputs even when we fail early */
    *pCharUTF8Length=0;
    *pChar0Length=0;
    *pChar1Length=0;

    /* pick the middle character of the roundtrip set */
    errorCode=U_ZERO_ERROR;
    set=uset_open(1, 0);
    ucnv_getUnicodeSet(cnv, set, UCNV_ROUNDTRIP_SET, &errorCode);
    setSize=uset_size(set);
    c= setSize>0 ? uset_charAt(set, setSize/2) : (UChar32)-1;
    uset_close(set);

    if(U_FAILURE(errorCode)) {
        log_err("unable to get test character for %s - %s\n", converterName, u_errorName(errorCode));
        return FALSE;
    }
    if(c<0) {
        /* do not feed an invalid code point to the *_APPEND_UNSAFE macros */
        log_err("unable to get test character for %s - empty roundtrip set\n", converterName);
        return FALSE;
    }

    utf16Length=0;
    U16_APPEND_UNSAFE(utf16, utf16Length, c);
    U8_APPEND_UNSAFE(charUTF8, *pCharUTF8Length, c);

    /* first conversion: from the converter's current (initial) state */
    utf16Source=utf16;
    target=char0;
    ucnv_fromUnicode(cnv,
                     &target, char0+CHAR_CAPACITY,
                     &utf16Source, utf16+utf16Length,
                     NULL, FALSE, &errorCode);
    *pChar0Length=(int32_t)(target-char0);

    /* second conversion: from the state after the first one */
    utf16Source=utf16;
    target=char1;
    ucnv_fromUnicode(cnv,
                     &target, char1+CHAR_CAPACITY,
                     &utf16Source, utf16+utf16Length,
                     NULL, FALSE, &errorCode);
    *pChar1Length=(int32_t)(target-char1);

    if(U_FAILURE(errorCode)) {
        log_err("unable to get test character for %s - %s\n", converterName, u_errorName(errorCode));
        return FALSE;
    }
    return TRUE;
}
/*
 * Test conversion from UTF-8 input that ends with a truncated multi-byte
 * sequence.
 *
 * For each entry of badUTF8[] that is shorter than its lead byte requires,
 * the input is the test character followed by that truncated sequence.
 * ucnv_convertEx() is called with reset and flush; it is expected to stop
 * with U_TRUNCATED_CHAR_FOUND without having moved the pivot source, and
 * ucnv_getInvalidChars() on the UTF-8 converter must report exactly the
 * truncated bytes.
 *
 * @param utf8Cnv        UTF-8 source converter; its toUnicode callback is set to STOP
 * @param cnv            target converter
 * @param converterName  name of cnv for error messages
 * @param charUTF8       UTF-8 form of the test character
 * @param charUTF8Length length of charUTF8[]
 * (char0/char1 and their lengths are not used by this function.)
 */
static void testFromTruncatedUTF8(UConverter *utf8Cnv, UConverter *cnv, const char *converterName,
                                  char charUTF8[4], int32_t charUTF8Length,
                                  char char0[8], int32_t char0Length,
                                  char char1[8], int32_t char1Length) {
    char utf8[16];
    int32_t utf8Length;

    char output[16];

    char invalidChars[8];
    int8_t invalidLength;

    /* ucnv_convertEx() takes a const char **source */
    const char *source;
    char *target;

    UChar pivotBuffer[8];
    UChar *pivotSource, *pivotTarget;

    UErrorCode errorCode;
    int32_t i;

    /* test truncated sequences */
    errorCode=U_ZERO_ERROR;
    ucnv_setToUCallBack(utf8Cnv, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &errorCode);

    /* the test character is a constant prefix of every input string */
    memcpy(utf8, charUTF8, charUTF8Length);

    for(i=0; i<LENGTHOF(badUTF8); ++i) {
        /* truncated sequence? */
        int32_t length=(int32_t)strlen(badUTF8[i]);
        if(length>=(1+U8_COUNT_TRAIL_BYTES(badUTF8[i][0]))) {
            continue;
        }

        /* assemble a string with the test character and the truncated sequence */
        memcpy(utf8+charUTF8Length, badUTF8[i], length);
        utf8Length=charUTF8Length+length;

        /* convert and check the invalidChars */
        source=utf8;
        target=output;
        pivotSource=pivotTarget=pivotBuffer;
        errorCode=U_ZERO_ERROR;
        ucnv_convertEx(cnv, utf8Cnv,
                       &target, output+sizeof(output),
                       &source, utf8+utf8Length,
                       pivotBuffer, &pivotSource, &pivotTarget, pivotBuffer+LENGTHOF(pivotBuffer),
                       TRUE, TRUE, /* reset & flush */
                       &errorCode);
        if(errorCode!=U_TRUNCATED_CHAR_FOUND || pivotSource!=pivotBuffer) {
            log_err("unexpected error %s from %s badUTF8[%ld]\n", u_errorName(errorCode), converterName, (long)i);
            continue;
        }

        errorCode=U_ZERO_ERROR;
        invalidLength=(int8_t)sizeof(invalidChars);
        ucnv_getInvalidChars(utf8Cnv, invalidChars, &invalidLength, &errorCode);
        /* a failed query must not pass as a match */
        if(U_FAILURE(errorCode) || invalidLength!=length || 0!=memcmp(invalidChars, badUTF8[i], length)) {
            log_err("wrong invalidChars from %s badUTF8[%ld]\n", converterName, (long)i);
        }
    }
}
/*
 * Test that every sequence in badUTF8[] is detected and skipped.
 *
 * Input:    testChar badUTF8[0] testChar badUTF8[1] testChar ...
 * Expected: char0 char1 char1 ...
 * (char0 is the test character's output from the initial state,
 * char1 its output in any later position.)
 * The UTF-8 converter's toUnicode callback is set to SKIP and the
 * strings are handed to convertExMultiStreaming() for comparison.
 */
static void testFromBadUTF8(UConverter *utf8Cnv, UConverter *cnv, const char *converterName,
                            char charUTF8[4], int32_t charUTF8Length,
                            char char0[8], int32_t char0Length,
                            char char1[8], int32_t char1Length) {
    char utf8[600], expect[600];
    char testName[32];
    char *in, *out; /* write cursors into utf8[] and expect[] */
    UErrorCode status;
    int32_t k;

    status=U_ZERO_ERROR;
    ucnv_setToUCallBack(utf8Cnv, UCNV_TO_U_CALLBACK_SKIP, NULL, NULL, NULL, &status);

    /* both strings start with one copy of the test character */
    memcpy(utf8, charUTF8, charUTF8Length);
    in=utf8+charUTF8Length;

    memcpy(expect, char0, char0Length);
    out=expect+char0Length;

    /* then append each bad sequence followed by the test character again */
    for(k=0; k<LENGTHOF(badUTF8); ++k) {
        int32_t badLength=(int32_t)strlen(badUTF8[k]);

        memcpy(in, badUTF8[k], badLength);
        in+=badLength;

        memcpy(in, charUTF8, charUTF8Length);
        in+=charUTF8Length;

        memcpy(out, char1, char1Length);
        out+=char1Length;
    }

    /* expect that each bad UTF-8 sequence is detected and skipped */
    strcpy(testName, "from bad UTF-8 to ");
    strcat(testName, converterName);

    convertExMultiStreaming(utf8Cnv, cnv,
                            utf8, (int32_t)(in-utf8),
                            expect, (int32_t)(out-expect),
                            testName,
                            U_ZERO_ERROR);
}
/*
 * Test illegal UTF-8 input.
 *
 * For each target converter in converterNames[], pick a test character
 * and run the truncated-sequence and bad-sequence tests with
 * ucnv_convertEx() from UTF-8 to that converter.
 */
static void TestConvertExFromUTF8() {
    static const char *const converterNames[]={
        "windows-1252",
        "shift-jis",
        "us-ascii",
        "iso-8859-1",
        "utf-8"
    };

    UConverter *utf8Cnv, *cnv;
    UErrorCode errorCode;
    int32_t i;

    /* fromUnicode versions of some character, from initial state and later */
    char charUTF8[4], char0[8], char1[8];
    int32_t charUTF8Length, char0Length, char1Length;

    errorCode=U_ZERO_ERROR;
    utf8Cnv=ucnv_open("UTF-8", &errorCode);
    if(U_FAILURE(errorCode)) {
        log_err("unable to open UTF-8 converter - %s\n", u_errorName(errorCode));
        return;
    }

    for(i=0; i<LENGTHOF(converterNames); ++i) {
        errorCode=U_ZERO_ERROR;
        cnv=ucnv_open(converterNames[i], &errorCode);
        if(U_FAILURE(errorCode)) {
            log_err("unable to open %s converter - %s\n", converterNames[i], u_errorName(errorCode));
            continue;
        }

        /* getTestChar() logs its own error; skip the tests but still close cnv */
        if(getTestChar(cnv, converterNames[i], charUTF8, &charUTF8Length, char0, &char0Length, char1, &char1Length)) {
            testFromTruncatedUTF8(utf8Cnv, cnv, converterNames[i], charUTF8, charUTF8Length, char0, char0Length, char1, char1Length);
            testFromBadUTF8(utf8Cnv, cnv, converterNames[i], charUTF8, charUTF8Length, char0, char0Length, char1, char1Length);
        }

        ucnv_close(cnv);
    }

    ucnv_close(utf8Cnv);
}
static void
TestConvertAlgorithmic() {
#if !UCONFIG_NO_LEGACY_CONVERSION

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2003-2006, International Business Machines
* Copyright (C) 2003-2007, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -50,7 +50,18 @@ enum {
ESC_CB='&'
};
ConversionTest::~ConversionTest() {}
// Open the shared UTF-8 converter used for testing direct conversion from UTF-8,
// and make it stop on toUnicode errors.
ConversionTest::ConversionTest()
        : utf8Cnv(NULL) {
    UErrorCode status=U_ZERO_ERROR;

    utf8Cnv=ucnv_open("UTF-8", &status);
    ucnv_setToUCallBack(utf8Cnv, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &status);

    // one check covers both calls: status stays a failure once set
    if(U_FAILURE(status)) {
        errln("unable to open UTF-8 converter");
    }
}
// Release the UTF-8 converter opened by the constructor (if any).
ConversionTest::~ConversionTest() {
    if(utf8Cnv!=NULL) {
        ucnv_close(utf8Cnv);
    }
}
void
ConversionTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
@ -948,6 +959,112 @@ ConversionTest::checkToUnicode(ConversionCase &cc, UConverter *cnv, const char *
// fromUnicode test worker functions --------------------------------------- ***
// Convert cc.utf8[cc.utf8Length] with ucnv_convertEx(), using utf8Cnv as the
// source converter, cnv as the target converter and a local UTF-16 pivot buffer.
//
// step==0: one bulk call with the full source, target and pivot buffers.
// step>0:  the source limit and the target limit are advanced alternately,
//          by at most step bytes each time; the pivot is limited to step UChars.
//          NOTE(review): assumes step<=LENGTHOF(pivotBuffer) (32) - callers
//          visible here use 0, 1, 3 and 7.
//
// *pErrorCode receives the conversion result; it is set to
// U_INTERNAL_PROGRAM_ERROR when one of the pointer/consumption sanity checks fails.
// Returns the number of bytes written to result[].
static int32_t
stepFromUTF8(ConversionCase &cc,
UConverter *utf8Cnv, UConverter *cnv,
char *result, int32_t resultCapacity,
int32_t step,
UErrorCode *pErrorCode) {
const char *source, *sourceLimit, *utf8Limit;
UChar pivotBuffer[32];
UChar *pivotSource, *pivotTarget, *pivotLimit;
char *target, *targetLimit, *resultLimit;
UBool flush;
source=cc.utf8;
pivotSource=pivotTarget=pivotBuffer;
target=result;
// utf8Limit/resultLimit are the real ends of the buffers;
// sourceLimit/targetLimit are the partial limits handed to each call
utf8Limit=source+cc.utf8Length;
resultLimit=result+resultCapacity;
// call ucnv_convertEx() with in/out buffers no larger than (step) at a time
// move only one buffer (in vs. out) at a time to be extra mean
// step==0 performs bulk conversion
// initialize the partial limits for the loop
if(step==0) {
// use the entire buffers
sourceLimit=utf8Limit;
targetLimit=resultLimit;
flush=cc.finalFlush;
pivotLimit=pivotBuffer+LENGTHOF(pivotBuffer);
} else {
// start with empty partial buffers
sourceLimit=source;
targetLimit=target;
flush=FALSE;
// empty pivot is not allowed, make it of length step
pivotLimit=pivotBuffer+step;
}
for(;;) {
// resetting the opposite conversion direction must not affect this one
ucnv_resetFromUnicode(utf8Cnv);
ucnv_resetToUnicode(cnv);
// convert
// (reset=FALSE: converter and pivot state carry over between partial calls)
ucnv_convertEx(cnv, utf8Cnv,
&target, targetLimit,
&source, sourceLimit,
pivotBuffer, &pivotSource, &pivotTarget, pivotLimit,
FALSE, flush, pErrorCode);
// check pointers and errors
if(source>sourceLimit || target>targetLimit) {
// the converter moved a pointer past the limit it was given
*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
break;
} else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
if(target!=targetLimit) {
// buffer overflow must only be set when the target is filled
*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
break;
} else if(targetLimit==resultLimit) {
// not just a partial overflow
break;
}
// the partial target is filled, set a new limit, reset the error and continue
targetLimit=(resultLimit-target)>=step ? target+step : resultLimit;
*pErrorCode=U_ZERO_ERROR;
} else if(U_FAILURE(*pErrorCode)) {
// both branches stop; they only document which side of the pivot failed
if(pivotSource==pivotBuffer) {
// toUnicode error, should not occur
// toUnicode errors are tested in cintltst TestConvertExFromUTF8()
break;
} else {
// fromUnicode error
// some other error occurred, done
break;
}
} else {
if(source!=sourceLimit) {
// when no error occurs, then the input must be consumed
*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
break;
}
if(sourceLimit==utf8Limit) {
// we are done
if(*pErrorCode==U_STRING_NOT_TERMINATED_WARNING) {
// ucnv_convertEx() warns about not terminating the output
// but ucnv_fromUnicode() does not and so
// checkFromUnicode() does not expect it
*pErrorCode=U_ZERO_ERROR;
}
break;
}
// the partial conversion succeeded, set a new limit and continue
sourceLimit=(utf8Limit-source)>=step ? source+step : utf8Limit;
// flush only on the call that sees the real end of the input
flush=(UBool)(cc.finalFlush && sourceLimit==utf8Limit);
}
}
return (int32_t)(target-result);
}
static int32_t
stepFromUnicode(ConversionCase &cc, UConverter *cnv,
char *result, int32_t resultCapacity,
@ -1048,6 +1165,7 @@ ConversionTest::FromUnicodeCase(ConversionCase &cc, UConverterFromUCallback call
cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode));
return FALSE;
}
ucnv_resetToUnicode(utf8Cnv);
// set the callback
if(callback!=NULL) {
@ -1086,6 +1204,19 @@ ConversionTest::FromUnicodeCase(ConversionCase &cc, UConverterFromUCallback call
}
}
// convert unicode to utf8
char utf8[200];
cc.utf8=utf8;
u_strToUTF8(utf8, LENGTHOF(utf8), &cc.utf8Length,
cc.unicode, cc.unicodeLength,
&errorCode);
if(U_FAILURE(errorCode)) {
// skip UTF-8 testing of a string with an unpaired surrogate,
// or of one that's too long
// toUnicode errors are tested in cintltst TestConvertExFromUTF8()
cc.utf8Length=-1;
}
int32_t resultOffsets[200];
char result[200];
int32_t resultLength;
@ -1093,22 +1224,18 @@ ConversionTest::FromUnicodeCase(ConversionCase &cc, UConverterFromUCallback call
static const struct {
int32_t step;
const char *name;
const char *name, *utf8Name;
} steps[]={
{ 0, "bulk" }, // must be first for offsets to be checked
{ 1, "step=1" },
{ 3, "step=3" },
{ 7, "step=7" }
{ 0, "bulk", "utf8" }, // must be first for offsets to be checked
{ 1, "step=1", "utf8 step=1" },
{ 3, "step=3", "utf8 step=3" },
{ 7, "step=7", "utf8 step=7" }
};
int32_t i, step;
ok=TRUE;
for(i=0; i<LENGTHOF(steps) && ok; ++i) {
step=steps[i].step;
if(step!=0) {
// bulk test is first, then offsets are not checked any more
cc.offsets=NULL;
}
errorCode=U_ZERO_ERROR;
resultLength=stepFromUnicode(cc, cnv,
result, LENGTHOF(result),
@ -1124,6 +1251,28 @@ ConversionTest::FromUnicodeCase(ConversionCase &cc, UConverterFromUCallback call
// otherwise do nothing to make sure that flushing resets
ucnv_resetFromUnicode(cnv);
}
// bulk test is first, then offsets are not checked any more
cc.offsets=NULL;
// test direct conversion from UTF-8
if(cc.utf8Length>=0) {
errorCode=U_ZERO_ERROR;
resultLength=stepFromUTF8(cc, utf8Cnv, cnv,
result, LENGTHOF(result),
step, &errorCode);
ok=checkFromUnicode(
cc, cnv, steps[i].utf8Name,
(uint8_t *)result, resultLength,
NULL,
errorCode);
if(U_FAILURE(errorCode) || !cc.finalFlush) {
// reset if an error occurred or we did not flush
// otherwise do nothing to make sure that flushing resets
ucnv_resetToUnicode(utf8Cnv);
ucnv_resetFromUnicode(cnv);
}
}
}
// not a real loop, just a convenience for breaking out of the block

View file

@ -27,18 +27,25 @@
#include "intltest.h"
struct ConversionCase {
/* setup */
int32_t caseNr;
const char *charset, *cbopt, *name;
UChar subString[16];
char subchar[8];
int8_t setSub;
/* input and expected output */
const uint8_t *bytes;
int32_t bytesLength;
const UChar *unicode;
int32_t unicodeLength;
const int32_t *offsets;
/* UTF-8 version of unicode[unicodeLength] */
const char *utf8;
int32_t utf8Length;
/* options */
UBool finalFlush;
UBool fallbacks;
UErrorCode outErrorCode;
@ -46,6 +53,7 @@ struct ConversionCase {
const UChar *invalidUChars;
int32_t invalidLength;
/* actual output */
uint8_t resultBytes[200];
UChar resultUnicode[200];
int32_t resultOffsets[200];
@ -56,7 +64,7 @@ struct ConversionCase {
class ConversionTest : public IntlTest {
public:
ConversionTest() {}
ConversionTest();
virtual ~ConversionTest();
void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=0);
@ -86,6 +94,9 @@ private:
UConverter *
cnv_open(const char *name, UErrorCode &errorCode);
/* for testing direct UTF-8 conversion */
UConverter *utf8Cnv;
};
#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2002-2005, International Business Machines
* Copyright (C) 2002-2006, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: utfperf.cpp
@ -16,45 +16,151 @@
*/
#include <stdio.h>
#include <stdlib.h>
#include "unicode/uperf.h"
#include "uoptions.h"
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
/* definitions and text buffers */
#define INPUT_CAPACITY (1024*1024)
#define INTERMEDIATE_CAPACITY 4096
#define INTERMEDIATE_SMALL_CAPACITY 20
#define PIVOT_CAPACITY 1024
#define OUTPUT_CAPACITY INPUT_CAPACITY
static UChar input[INPUT_CAPACITY];
static char utf8[INPUT_CAPACITY];
static UChar pivot[INTERMEDIATE_CAPACITY];
static UChar output[OUTPUT_CAPACITY];
static char intermediate[INTERMEDIATE_CAPACITY];
static char intermediate[OUTPUT_CAPACITY];
static int32_t inputLength, encodedLength, outputLength, countInputCodePoints;
static int32_t utf8Length, encodedLength, outputLength, countInputCodePoints;
static int32_t fromUCallbackCount;
class Command : public UPerfFunction {
private:
Command(const char * name, int32_t buf_cap):name(name),buf_cap(buf_cap){
errorCode=U_ZERO_ERROR;
cnv=ucnv_open(name, &errorCode);
}
// Command-line options specific to utfperf.
// Options do not have abbreviations: Force readable command lines.
// (Using U+0001 for abbreviation characters.)
enum {
CHARSET,
CHUNK_LENGTH,
PIVOT_LENGTH,
UTFPERF_OPTIONS_COUNT
};
static UOption options[UTFPERF_OPTIONS_COUNT]={
UOPTION_DEF("charset", '\x01', UOPT_REQUIRES_ARG),
UOPTION_DEF("chunk", '\x01', UOPT_REQUIRES_ARG),
UOPTION_DEF("pivot", '\x01', UOPT_REQUIRES_ARG)
};
static const char *const utfperf_usage =
"\t--charset Charset for which to test performance, e.g. windows-1251.\n"
"\t Default: UTF-8\n"
"\t--chunk Length (in bytes) of charset output chunks. [4096]\n"
"\t--pivot Length (in UChars) of the UTF-16 pivot buffer, if applicable.\n"
"\t [1024]\n";
// Test object.
class UtfPerformanceTest : public UPerfTest{
public:
static UPerfFunction* get(const char * name, int32_t buf_cap){
Command * t = new Command(name, buf_cap);
if (U_SUCCESS(t->errorCode)){
return t;
} else {
//fprintf(stderr, "error opening converter for \"%s\" - %s\n", name, u_errorName(errorCode));
delete t;
return NULL;
UtfPerformanceTest(int32_t argc, const char *argv[], UErrorCode &status)
: UPerfTest(argc, argv, options, LENGTHOF(options), utfperf_usage, status) {
if (U_SUCCESS(status)) {
charset = options[CHARSET].value;
chunkLength = atoi(options[CHUNK_LENGTH].value);
if (chunkLength < 1 || OUTPUT_CAPACITY < chunkLength) {
fprintf(stderr, "error: chunk length must be 1..%ld\n", (long)OUTPUT_CAPACITY);
status = U_ILLEGAL_ARGUMENT_ERROR;
}
pivotLength = atoi(options[PIVOT_LENGTH].value);
if (pivotLength < 1 || PIVOT_CAPACITY < pivotLength) {
fprintf(stderr, "error: pivot length must be 1..%ld\n", (long)PIVOT_CAPACITY);
status = U_ILLEGAL_ARGUMENT_ERROR;
}
int32_t inputLength;
UPerfTest::getBuffer(inputLength, status);
countInputCodePoints = u_countChar32(buffer, bufferLen);
u_strToUTF8(utf8, (int32_t)sizeof(utf8), &utf8Length, buffer, bufferLen, &status);
}
}
virtual UPerfFunction* runIndexedTest(int32_t index, UBool exec, const char* &name, char* par = NULL);
const UChar *getBuffer() const { return buffer; }
int32_t getBufferLen() const { return bufferLen; }
const char *charset;
int32_t chunkLength, pivotLength;
};
U_CDECL_BEGIN
// Custom callback for counting callback calls.
static void U_CALLCONV
fromUCallback(const void *context,
UConverterFromUnicodeArgs *fromUArgs,
const UChar *codeUnits,
int32_t length,
UChar32 codePoint,
UConverterCallbackReason reason,
UErrorCode *pErrorCode) {
if (reason <= UCNV_IRREGULAR) {
++fromUCallbackCount;
}
UCNV_FROM_U_CALLBACK_SUBSTITUTE(context, fromUArgs, codeUnits, length, codePoint, reason, pErrorCode);
}
U_CDECL_END
// Base class for Roundtrip, FromUnicode and FromUTF8 with common setup.
class Command : public UPerfFunction {
protected:
Command(const UtfPerformanceTest &testcase)
: testcase(testcase),
input(testcase.getBuffer()), inputLength(testcase.getBufferLen()),
errorCode(U_ZERO_ERROR) {
cnv=ucnv_open(testcase.charset, &errorCode);
if (U_FAILURE(errorCode)) {
fprintf(stderr, "error opening converter for \"%s\" - %s\n", testcase.charset, u_errorName(errorCode));
}
ucnv_setFromUCallBack(cnv, fromUCallback, NULL, NULL, NULL, &errorCode);
}
public:
virtual ~Command(){
if(U_SUCCESS(errorCode)) {
ucnv_close(cnv);
}
}
// virtual void call(UErrorCode* pErrorCode) { ... }
virtual long getOperationsPerIteration(){
return countInputCodePoints;
}
const UtfPerformanceTest &testcase;
const UChar *input;
int32_t inputLength;
UErrorCode errorCode;
UConverter *cnv;
};
// Test roundtrip UTF-16->encoding->UTF-16.
class Roundtrip : public Command {
protected:
Roundtrip(const UtfPerformanceTest &testcase) : Command(testcase) {}
public:
static UPerfFunction* get(const UtfPerformanceTest &testcase) {
Roundtrip * t = new Roundtrip(testcase);
if (U_SUCCESS(t->errorCode)){
return t;
} else {
delete t;
return NULL;
}
}
virtual void call(UErrorCode* pErrorCode){
const UChar *pIn, *pInLimit;
UChar *pOut, *pOutLimit;
@ -63,6 +169,7 @@ public:
UBool flush;
ucnv_reset(cnv);
fromUCallbackCount=0;
pIn=input;
pInLimit=input+inputLength;
@ -70,24 +177,24 @@ public:
pOut=output;
pOutLimit=output+OUTPUT_CAPACITY;
pInterLimit=intermediate+buf_cap;
pInterLimit=intermediate+testcase.chunkLength;
encodedLength=outputLength=0;
flush=FALSE;
while(pIn<pInLimit || !flush) {
do {
/* convert a block of [pIn..pInLimit[ to the encoding in intermediate[] */
pInter=intermediate;
flush=(UBool)(pIn==pInLimit);
ucnv_fromUnicode(cnv, &pInter, pInterLimit, &pIn, pInLimit, NULL, flush, pErrorCode);
ucnv_fromUnicode(cnv, &pInter, pInterLimit, &pIn, pInLimit, NULL, TRUE, pErrorCode);
encodedLength+=(int32_t)(pInter-intermediate);
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
/* in case flush was TRUE make sure that we convert once more to really flush */
flush=FALSE;
/* make sure that we convert once more to really flush */
*pErrorCode=U_ZERO_ERROR;
} else if(U_FAILURE(*pErrorCode)) {
return;
} else if(pIn==pInLimit) {
flush=TRUE;
}
/* convert the block [intermediate..pInter[ back to UTF-16 */
@ -97,7 +204,7 @@ public:
return;
}
/* intermediate must have been consumed (p==pInter) because of the converter semantics */
}
} while(!flush);
outputLength=pOut-output;
if(inputLength!=outputLength) {
@ -105,45 +212,142 @@ public:
*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
}
}
virtual long getOperationsPerIteration(){
return countInputCodePoints;
}
const char * name;
int32_t buf_cap;
UErrorCode errorCode;
UConverter *cnv;
};
class UtfPerformanceTest : public UPerfTest{
// Test one-way conversion UTF-16->encoding.
class FromUnicode : public Command {
protected:
FromUnicode(const UtfPerformanceTest &testcase) : Command(testcase) {}
public:
UtfPerformanceTest(int32_t argc, const char *argv[], UErrorCode &status) :UPerfTest(argc,argv,status){
getBuffer(inputLength, status);
u_strncpy(input, buffer, inputLength);
countInputCodePoints = u_countChar32(input, inputLength);
}
virtual UPerfFunction* runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL ){
switch (index) {
case 0: name = "UTF_8"; if (exec) return Command::get("UTF-8", INTERMEDIATE_CAPACITY); break;
case 1: name = "UTF_8_SB"; if (exec) return Command::get("UTF-8",INTERMEDIATE_SMALL_CAPACITY); break;
case 2: name = "SCSU"; if (exec) return Command::get("SCSU", INTERMEDIATE_CAPACITY); break;
case 3: name = "SCSU_SB"; if (exec) return Command::get("SCSU", INTERMEDIATE_SMALL_CAPACITY); break;
case 4: name = "BOCU_1"; if (exec) return Command::get("BOCU-1", INTERMEDIATE_CAPACITY); break;
case 5: name = "BOCU_1_SB"; if (exec) return Command::get("BOCU-1",INTERMEDIATE_SMALL_CAPACITY); break;
default: name = ""; break;
static UPerfFunction* get(const UtfPerformanceTest &testcase) {
FromUnicode * t = new FromUnicode(testcase);
if (U_SUCCESS(t->errorCode)){
return t;
} else {
delete t;
return NULL;
}
}
virtual void call(UErrorCode* pErrorCode){
const UChar *pIn, *pInLimit;
char *pInter, *pInterLimit;
ucnv_resetFromUnicode(cnv);
fromUCallbackCount=0;
pIn=input;
pInLimit=input+inputLength;
pInterLimit=intermediate+testcase.chunkLength;
encodedLength=0;
for(;;) {
pInter=intermediate;
ucnv_fromUnicode(cnv, &pInter, pInterLimit, &pIn, pInLimit, NULL, TRUE, pErrorCode);
encodedLength+=(int32_t)(pInter-intermediate);
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
/* make sure that we convert once more to really flush */
*pErrorCode=U_ZERO_ERROR;
} else if(U_FAILURE(*pErrorCode)) {
return;
} else {
break; // all done
}
}
return NULL;
}
};
// Test one-way conversion UTF-8->encoding.
class FromUTF8 : public Command {
protected:
FromUTF8(const UtfPerformanceTest &testcase)
: Command(testcase),
utf8Cnv(NULL),
input8(utf8), input8Length(utf8Length) {
utf8Cnv=ucnv_open("UTF-8", &errorCode);
}
public:
static UPerfFunction* get(const UtfPerformanceTest &testcase) {
FromUTF8 * t = new FromUTF8(testcase);
if (U_SUCCESS(t->errorCode)){
return t;
} else {
delete t;
return NULL;
}
}
~FromUTF8() {
ucnv_close(utf8Cnv);
}
virtual void call(UErrorCode* pErrorCode){
const char *pIn, *pInLimit;
char *pInter, *pInterLimit;
UChar *pivotSource, *pivotTarget, *pivotLimit;
ucnv_resetToUnicode(utf8Cnv);
ucnv_resetFromUnicode(cnv);
fromUCallbackCount=0;
pIn=input8;
pInLimit=input8+input8Length;
pInterLimit=intermediate+testcase.chunkLength;
pivotSource=pivotTarget=pivot;
pivotLimit=pivot+testcase.pivotLength;
encodedLength=0;
for(;;) {
pInter=intermediate;
ucnv_convertEx(cnv, utf8Cnv,
&pInter, pInterLimit,
&pIn, pInLimit,
pivot, &pivotSource, &pivotTarget, pivotLimit,
FALSE, TRUE, pErrorCode);
encodedLength+=(int32_t)(pInter-intermediate);
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
/* make sure that we convert once more to really flush */
*pErrorCode=U_ZERO_ERROR;
} else if(U_FAILURE(*pErrorCode)) {
return;
} else {
break; // all done
}
}
}
protected:
UConverter *utf8Cnv;
const char *input8;
int32_t input8Length;
};
// Maps a test index to its name and, when exec is set, to a new test function.
// name is always set ("" for an unknown index); the return value is NULL when
// exec is false or the index is unknown.
UPerfFunction* UtfPerformanceTest::runIndexedTest(int32_t index, UBool exec, const char* &name, char* par) {
    switch (index) {
    case 0:
        name = "Roundtrip";
        return exec ? Roundtrip::get(*this) : NULL;
    case 1:
        name = "FromUnicode";
        return exec ? FromUnicode::get(*this) : NULL;
    case 2:
        name = "FromUTF8";
        return exec ? FromUTF8::get(*this) : NULL;
    default:
        name = "";
        return NULL;
    }
}
int main(int argc, const char *argv[])
{
// Default values for command-line options.
options[CHARSET].value = "UTF-8";
options[CHUNK_LENGTH].value = "4096";
options[PIVOT_LENGTH].value = "1024";
UErrorCode status = U_ZERO_ERROR;
UtfPerformanceTest test(argc, argv, status);
if (U_FAILURE(status)){
printf("The error is %s\n", u_errorName(status));
test.usage();
return status;
}
@ -152,5 +356,10 @@ int main(int argc, const char *argv[])
"arguments.\n");
return -1;
}
if (fromUCallbackCount > 0) {
printf("Number of fromUnicode callback calls in the last iteration: %ld\n", (long)fromUCallbackCount);
}
return 0;
}

View file

@ -1,6 +1,6 @@
//*******************************************************************************
//
// Copyright (C) 2003-2006, International Business Machines
// Copyright (C) 2003-2007, International Business Machines
// Corporation and others. All Rights Reserved.
//
// file name: conversion.txt
@ -474,6 +474,34 @@ conversion:table(nofallback) {
fromUnicode {
Headers { "charset", "unicode", "bytes", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidUChars" }
Cases {
// Code coverage for UTF-8->SBCS conversion (ucnv_convertEx()).
// Test code path for non-roundtripping ASCII characters
// (try EBCDIC SBCS, and IBM PC SBCS with control code rotation).
{
"ibm-37",
"a\x85c",
:bin{ 811583 },
:intvector{ 0,1,2 },
:int{1}, :int{0}, "", "?", ""
}
{
"ibm-850",
"a\x1ac",
:bin{ 617f63 },
:intvector{ 0,1,2 },
:int{1}, :int{0}, "", "?", ""
}
// Code coverage for UTF-8->DBCS conversion (ucnv_convertEx()).
// Test code path for non-roundtripping ASCII characters
// (try IBM PC DBCS with control code rotation).
{
"ibm-943",
"a\x1ac\u30a1\x7ff",
:bin{ 617f6383401c66 },
:intvector{ 0,1,2,3,3,4,5 },
:int{1}, :int{0}, "", "?", ""
}
// SCSU regression test.
{
"SCSU",
"1\U00010001\u0085\U000500022\ud8003\udc014\ue001",

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (c) 2002-2005, International Business Machines
* Copyright (c) 2002-2006, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
**********************************************************************
@ -16,6 +16,10 @@
#include "unicode/utimer.h"
#include "ucbuf.h"
// Forward declarations from uoptions.h.
struct UOption;
typedef struct UOption UOption;
#if !UCONFIG_NO_CONVERSION
U_NAMESPACE_USE
@ -126,6 +130,14 @@ public:
protected:
UPerfTest(int32_t argc, const char* argv[], UErrorCode& status);
UPerfTest(int32_t argc, const char* argv[],
UOption addOptions[], int32_t addOptionsCount,
const char *addUsage,
UErrorCode& status);
void init(UOption addOptions[], int32_t addOptionsCount,
UErrorCode& status);
virtual UPerfFunction* runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL ); // override!
virtual UBool runTestLoop( char* testname, char* par );
@ -141,6 +153,7 @@ protected:
int32_t iterations;
int32_t passes;
int32_t time;
const char * _addUsage;
const char** _argv;
int32_t _argc;
int32_t _remainingArgc;

View file

@ -26,11 +26,14 @@ const char UPerfTest::gUsageString[] =
"\t-e or --encoding encoding of source files\n"
"\t-u or --uselen perform timing analysis on non-null terminated buffer using length\n"
"\t-f or --file-name file to be used as input data\n"
"\t-p or --passes Number of passes to be performed. Requires Numeric argument. Cannot be used with --time\n"
"\t-p or --passes Number of passes to be performed. Requires Numeric argument.\n"
"\t Cannot be used with --time\n"
"\t-i or --iterations Number of iterations to be performed. Requires Numeric argument\n"
"\t-t or --time Threshold time for looping until in seconds. Requires Numeric argument.Cannot be used with --iterations\n"
"\t-t or --time Threshold time for looping until in seconds. Requires Numeric argument.\n"
"\t Cannot be used with --iterations\n"
"\t-l or --line-mode The data file should be processed in line mode\n"
"\t-b or --bulk-mode The data file should be processed in file based. Cannot be used with --line-mode\n"
"\t-b or --bulk-mode The data file should be processed in file based.\n"
"\t Cannot be used with --line-mode\n"
"\t-L or --locale Locale for the test\n";
enum
@ -47,11 +50,12 @@ enum
TIME,
LINE_MODE,
BULK_MODE,
LOCALE
LOCALE,
OPTIONS_COUNT
};
static UOption options[]={
static UOption options[OPTIONS_COUNT+20]={
UOPTION_HELP_H,
UOPTION_HELP_QUESTION_MARK,
UOPTION_VERBOSE,
@ -67,32 +71,57 @@ static UOption options[]={
UOPTION_DEF( "locale", 'L', UOPT_REQUIRES_ARG)
};
UPerfTest::UPerfTest(int32_t argc, const char* argv[], UErrorCode& status){
_argc = argc;
_argv = argv;
ucharBuf = NULL;
encoding = "";
uselen = FALSE;
fileName = NULL;
sourceDir = ".";
lines = NULL;
numLines = 0;
line_mode = TRUE;
buffer = NULL;
bufferLen = 0;
verbose = FALSE;
bulk_mode = FALSE;
passes = iterations = time = 0;
locale = NULL;
// Constructs a performance test that understands only the common options.
// All members get their defaults here (passes starts at 1, iterations and
// time at 0); init() then parses the command line.
UPerfTest::UPerfTest(int32_t argc, const char* argv[], UErrorCode& status)
        : _argc(argc),
          _argv(argv),
          _addUsage(NULL),
          ucharBuf(NULL),
          encoding(""),
          uselen(FALSE),
          fileName(NULL),
          sourceDir("."),
          lines(NULL),
          numLines(0),
          line_mode(TRUE),
          buffer(NULL),
          bufferLen(0),
          verbose(FALSE),
          bulk_mode(FALSE),
          passes(1),
          iterations(0),
          time(0),
          locale(NULL)
{
    // No test-specific options to merge in.
    init(NULL, 0, status);
}
// Constructs a performance test that accepts test-specific options on top of
// the common ones. addOptions/addOptionsCount are forwarded to init();
// addUsage is stored in _addUsage.
UPerfTest::UPerfTest(int32_t argc, const char* argv[],
                     UOption addOptions[], int32_t addOptionsCount,
                     const char *addUsage,
                     UErrorCode& status)
        : _argc(argc),
          _argv(argv),
          _addUsage(addUsage),
          ucharBuf(NULL),
          encoding(""),
          uselen(FALSE),
          fileName(NULL),
          sourceDir("."),
          lines(NULL),
          numLines(0),
          line_mode(TRUE),
          buffer(NULL),
          bufferLen(0),
          verbose(FALSE),
          bulk_mode(FALSE),
          passes(1),
          iterations(0),
          time(0),
          locale(NULL)
{
    init(addOptions, addOptionsCount, status);
}
void UPerfTest::init(UOption addOptions[], int32_t addOptionsCount,
UErrorCode& status) {
//initialize the argument list
U_MAIN_INIT_ARGS(argc, argv);
U_MAIN_INIT_ARGS(_argc, _argv);
// add specific options
int32_t optionsCount = OPTIONS_COUNT;
if (addOptionsCount > 0) {
memcpy(options+optionsCount, addOptions, addOptionsCount*sizeof(UOption));
optionsCount += addOptionsCount;
}
//parse the arguments
_remainingArgc = u_parseArgs(argc, (char**)argv, (int32_t)(sizeof(options)/sizeof(options[0])), options);
_remainingArgc = u_parseArgs(_argc, (char**)_argv, optionsCount, options);
// copy back values for additional options
if (addOptionsCount > 0) {
memcpy(addOptions, options+OPTIONS_COUNT, addOptionsCount*sizeof(UOption));
}
// Now setup the arguments
if(argc==1 || options[HELP1].doesOccur || options[HELP2].doesOccur) {
if(_argc==1 || options[HELP1].doesOccur || options[HELP2].doesOccur) {
status = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
@ -122,12 +151,16 @@ UPerfTest::UPerfTest(int32_t argc, const char* argv[], UErrorCode& status){
}
if(options[ITERATIONS].doesOccur) {
iterations = atoi(options[ITERATIONS].value);
}
if(options[TIME].doesOccur) {
if(options[TIME].doesOccur) {
status = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
} else if(options[TIME].doesOccur) {
time = atoi(options[TIME].value);
} else {
iterations = 1000; // some default
}
if(options[LINE_MODE].doesOccur) {
line_mode = TRUE;
bulk_mode = FALSE;
@ -142,11 +175,6 @@ UPerfTest::UPerfTest(int32_t argc, const char* argv[], UErrorCode& status){
locale = options[LOCALE].value;
}
if(time > 0 && iterations >0){
status = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
int32_t len = 0;
resolvedFileName = NULL;
if(fileName!=NULL){
@ -205,6 +233,9 @@ ULine* UPerfTest::getLines(UErrorCode& status){
return lines;
}
const UChar* UPerfTest::getBuffer(int32_t& len, UErrorCode& status){
if (U_FAILURE(status)) {
return NULL;
}
len = ucbuf_size(ucharBuf);
buffer = (UChar*) uprv_malloc(U_SIZEOF_UCHAR * (len+1));
u_strncpy(buffer,ucbuf_getBuffer(ucharBuf,&bufferLen,&status),len);
@ -421,6 +452,11 @@ UBool UPerfTest::runTestLoop( char* testname, char* par )
*/
void UPerfTest::usage( void )
{
puts(gUsageString);
if (_addUsage != NULL) {
puts(_addUsage);
}
UBool save_verbose = verbose;
verbose = TRUE;
fprintf(stdout,"Test names:\n");

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2003-2006, International Business Machines
* Copyright (C) 2003-2007, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -287,8 +287,10 @@ CnvExtWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
/*
* Remove fromUnicode fallbacks and SUB mappings which are irrelevant for
* the toUnicode table.
* This includes mappings with MBCS_FROM_U_EXT_FLAG which were suitable
* for the base toUnicode table but not for the base fromUnicode table.
* The table must be sorted.
* Destroys previous data in the reverseMap.
* Modifies previous data in the reverseMap.
*/
static int32_t
reduceToUMappings(UCMTable *table) {
@ -570,6 +572,7 @@ makeToUTable(CnvExtData *extData, UCMTable *table) {
/*
* Remove toUnicode fallbacks and non-<subchar1> SUB mappings
* which are irrelevant for the fromUnicode extension table.
* Remove MBCS_FROM_U_EXT_FLAG bits.
* Overwrite the reverseMap with an index array to the relevant mappings.
* Modify the code point sequences to a generator-friendly format where
* the first code point remains unchanged but the following are recoded
@ -596,6 +599,10 @@ prepareFromUMappings(UCMTable *table) {
for(i=j=0; i<count; ++m, ++i) {
flag=m->f;
if(flag>=0) {
flag&=MBCS_FROM_U_EXT_MASK;
m->f=flag;
}
if(flag==0 || flag==1 || (flag==2 && m->bLen==1)) {
map[j++]=i;
@ -1065,4 +1072,3 @@ CnvExtAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *sta
makeToUTable(extData, table) &&
makeFromUTable(extData, table);
}

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2000-2006, International Business Machines
* Copyright (C) 2000-2007, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -25,6 +25,10 @@
#include "makeconv.h"
#include "genmbcs.h"
/*
* TODO: Split this file into toUnicode, SBCSFromUnicode and MBCSFromUnicode files.
* Reduce tests for maxCharLength.
*/
typedef struct MBCSData {
NewConverter newConverter;
@ -42,6 +46,11 @@ typedef struct MBCSData {
uint32_t stage2[MBCS_STAGE_2_SIZE]; /* stage 2 for MBCS */
uint8_t *fromUBytes;
uint32_t stage2Top, stage3Top;
/* fromUTF8 */
uint16_t stageUTF8[MBCS_UTF8_STAGE_SIZE];
UBool utf8Friendly;
} MBCSData;
/* prototypes */
@ -108,26 +117,14 @@ printBytes(char *buffer, const uint8_t *bytes, int32_t length) {
/*
 * Initialize *mbcsData: clear the structure, remember the UCM file,
 * install the NewConverter callbacks and set up the initial fromUnicode
 * stage tops and stage 1 entries.
 *
 * NOTE(review): this span appears to interleave lines from before and after
 * a change (a diff rendering); confirm which statements belong to the
 * current version before relying on the stage setup below.
 */
static void
MBCSInit(MBCSData *mbcsData, UCMFile *ucm) {
int32_t i, maxCharLength;
/* start from an all-zero structure */
uprv_memset(mbcsData, 0, sizeof(MBCSData));
maxCharLength=ucm->states.maxCharLength;
mbcsData->ucm=ucm; /* aliased, not owned */
/* hook up the NewConverter function pointers */
mbcsData->newConverter.close=MBCSClose;
mbcsData->newConverter.isValid=MBCSIsValid;
mbcsData->newConverter.addTable=MBCSAddTable;
mbcsData->newConverter.write=MBCSWrite;
mbcsData->stage2Top=MBCS_STAGE_2_FIRST_ASSIGNED; /* after stage 1 and one all-unassigned stage 2 block */
mbcsData->stage3Top=16*maxCharLength; /* after one all-unassigned stage 3 block */
/* point all entries in stage 1 to the "all-unassigned" first block in stage 2 */
for(i=0; i<MBCS_STAGE_1_SIZE; ++i) {
mbcsData->stage1[i]=MBCS_STAGE_2_ALL_UNASSIGNED_INDEX;
}
}
NewConverter *
@ -139,19 +136,28 @@ MBCSOpen(UCMFile *ucm) {
return &mbcsData->newConverter;
}
/*
 * Release the arrays owned by *mbcsData: the toUnicode code unit array
 * and the fromUnicode codepage bytes.
 * Does not free mbcsData itself and does not reset the pointers.
 */
static void
MBCSDestruct(MBCSData *mbcsData) {
uprv_free(mbcsData->unicodeCodeUnits);
uprv_free(mbcsData->fromUBytes);
}
/*
 * NewConverter "close" function: release the MBCSData object and
 * everything it owns. A NULL cnvData is accepted and ignored.
 */
static void
MBCSClose(NewConverter *cnvData) {
    MBCSData *mbcsData=(MBCSData *)cnvData;
    if(mbcsData!=NULL) {
        /*
         * MBCSDestruct() frees unicodeCodeUnits and fromUBytes.
         * Do not free them here as well: the pointers are not reset,
         * so a second uprv_free() would release the same memory twice.
         */
        MBCSDestruct(mbcsData);
        uprv_free(mbcsData);
    }
}
static UBool
MBCSStartMappings(MBCSData *mbcsData) {
int32_t i, sum;
int32_t i, sum, maxCharLength,
stage2NullLength, stage2AllocLength,
stage3NullLength, stage3AllocLength;
/* toUnicode */
/* allocate the code unit array and prefill it with "unassigned" values */
sum=mbcsData->ucm->states.countToUCodeUnits;
@ -171,21 +177,102 @@ MBCSStartMappings(MBCSData *mbcsData) {
}
}
/* fromUnicode */
maxCharLength=mbcsData->ucm->states.maxCharLength;
/* allocate the codepage mappings and preset the first 16 characters to 0 */
if(mbcsData->ucm->states.maxCharLength==1) {
if(maxCharLength==1) {
/* allocate 64k 16-bit results for single-byte codepages */
sum=0x20000;
} else {
/* allocate 1M * maxCharLength bytes for at most 1M mappings */
sum=0x100000*mbcsData->ucm->states.maxCharLength;
sum=0x100000*maxCharLength;
}
mbcsData->fromUBytes=(uint8_t *)uprv_malloc(sum);
if(mbcsData->fromUBytes==NULL) {
fprintf(stderr, "error: out of memory allocating %ld B for target mappings\n", (long)sum);
return FALSE;
}
/* initialize the all-unassigned first stage 3 block */
uprv_memset(mbcsData->fromUBytes, 0, 64);
uprv_memset(mbcsData->fromUBytes, 0, sum);
/*
* UTF-8-friendly fromUnicode tries: allocate multiple blocks at a time.
* See ucnvmbcs.h for details.
*
* There is code, for example in ucnv_MBCSGetUnicodeSetForUnicode(), which
* assumes that the initial stage 2/3 blocks are the all-unassigned ones.
* Therefore, we refine the data structure while maintaining this placement
* even though it would be convenient to allocate the ASCII block at the
* beginning of stage 3, for example.
*
* UTF-8-friendly fromUnicode tries work from sorted tables and are built
* pre-compacted, overlapping adjacent stage 2/3 blocks.
* This is necessary because the block allocation and compaction changes
* at SBCS_UTF8_MAX or MBCS_UTF8_MAX, and for MBCS tables the additional
* stage table uses direct indexes into stage 3, without a multiplier and
* thus with a smaller reach.
*
* Non-UTF-8-friendly fromUnicode tries work from unsorted tables
* (because implicit precision is used), and are compacted
* in post-processing.
*
* Preallocation for UTF-8-friendly fromUnicode tries:
*
* Stage 3:
* 64-entry all-unassigned first block followed by ASCII (128 entries).
*
* Stage 2:
* 64-entry all-unassigned first block followed by preallocated
* 64-block for ASCII.
*/
/* Preallocate ASCII as a linear 128-entry stage 3 block. */
stage2NullLength=MBCS_STAGE_2_BLOCK_SIZE;
stage2AllocLength=MBCS_STAGE_2_BLOCK_SIZE;
stage3NullLength=MBCS_UTF8_STAGE_3_BLOCK_SIZE;
stage3AllocLength=128; /* ASCII U+0000..U+007f */
/* Initialize stage 1 for the preallocated blocks. */
sum=stage2NullLength;
for(i=0; i<(stage2AllocLength>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT); ++i) {
mbcsData->stage1[i]=sum;
sum+=MBCS_STAGE_2_BLOCK_SIZE;
}
mbcsData->stage2Top=stage2NullLength+stage2AllocLength; /* ==sum */
/*
* Stage 2 indexes count 16-blocks in stage 3 as follows:
* SBCS: directly, indexes increment by 16
* MBCS: indexes need to be multiplied by 16*maxCharLength, indexes increment by 1
* MBCS UTF-8: directly, indexes increment by 16
*/
if(maxCharLength==1) {
sum=stage3NullLength;
for(i=0; i<(stage3AllocLength/MBCS_STAGE_3_BLOCK_SIZE); ++i) {
mbcsData->stage2Single[mbcsData->stage1[0]+i]=sum;
sum+=MBCS_STAGE_3_BLOCK_SIZE;
}
} else {
sum=stage3NullLength/MBCS_STAGE_3_GRANULARITY;
for(i=0; i<(stage3AllocLength/MBCS_STAGE_3_BLOCK_SIZE); ++i) {
mbcsData->stage2[mbcsData->stage1[0]+i]=sum;
sum+=MBCS_STAGE_3_BLOCK_SIZE/MBCS_STAGE_3_GRANULARITY;
}
}
sum=stage3NullLength;
for(i=0; i<(stage3AllocLength/MBCS_UTF8_STAGE_3_BLOCK_SIZE); ++i) {
mbcsData->stageUTF8[i]=sum;
sum+=MBCS_UTF8_STAGE_3_BLOCK_SIZE;
}
/*
* Allocate a 64-entry all-unassigned first stage 3 block,
* for UTF-8-friendly lookup with a trail byte,
* plus 128 entries for ASCII.
*/
mbcsData->stage3Top=(stage3NullLength+stage3AllocLength)*maxCharLength; /* ==sum*maxCharLength */
return TRUE;
}
@ -437,11 +524,13 @@ MBCSSingleAddFromUnicode(MBCSData *mbcsData,
const uint8_t *bytes, int32_t length,
UChar32 c,
int8_t flag) {
uint16_t *p;
uint16_t *stage3, *p;
uint32_t index;
uint16_t old;
uint8_t b;
uint32_t blockSize, newTop, i, nextOffset, newBlock, min;
/* ignore |2 SUB mappings */
if(flag==2) {
return TRUE;
@ -453,13 +542,28 @@ MBCSSingleAddFromUnicode(MBCSData *mbcsData,
* Note that the first stage 2 and 3 blocks are reserved for all-unassigned mappings.
* We assume that length<=maxCharLength and that c<=0x10ffff.
*/
stage3=(uint16_t *)mbcsData->fromUBytes;
b=*bytes;
/* inspect stage 1 */
index=c>>10;
index=c>>MBCS_STAGE_1_SHIFT;
if(mbcsData->utf8Friendly && c<=SBCS_UTF8_MAX) {
nextOffset=(c>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK&~(MBCS_UTF8_STAGE_3_BLOCKS-1);
} else {
nextOffset=(c>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK;
}
if(mbcsData->stage1[index]==MBCS_STAGE_2_ALL_UNASSIGNED_INDEX) {
/* allocate another block in stage 2 */
if(mbcsData->stage2Top>=MBCS_MAX_STAGE_2_TOP) {
newBlock=mbcsData->stage2Top;
if(mbcsData->utf8Friendly) {
min=newBlock-nextOffset; /* minimum block start with overlap */
while(min<newBlock && mbcsData->stage2Single[newBlock-1]==0) {
--newBlock;
}
}
newTop=newBlock+MBCS_STAGE_2_BLOCK_SIZE;
if(newTop>MBCS_MAX_STAGE_2_TOP) {
fprintf(stderr, "error: too many stage 2 entries at U+%04x<->0x%02x\n", (int)c, b);
return FALSE;
}
@ -468,26 +572,46 @@ MBCSSingleAddFromUnicode(MBCSData *mbcsData,
* each stage 2 block contains 64 16-bit words:
* 6 code point bits 9..4 with 1 stage 3 index
*/
mbcsData->stage1[index]=(uint16_t)mbcsData->stage2Top;
mbcsData->stage2Top+=MBCS_STAGE_2_BLOCK_SIZE;
mbcsData->stage1[index]=(uint16_t)newBlock;
mbcsData->stage2Top=newTop;
}
/* inspect stage 2 */
index=(uint32_t)mbcsData->stage1[index]+((c>>4)&0x3f);
index=mbcsData->stage1[index]+nextOffset;
if(mbcsData->utf8Friendly && c<=SBCS_UTF8_MAX) {
/* allocate 64-entry blocks for UTF-8-friendly lookup */
blockSize=MBCS_UTF8_STAGE_3_BLOCK_SIZE;
nextOffset=c&MBCS_UTF8_STAGE_3_BLOCK_MASK;
} else {
blockSize=MBCS_STAGE_3_BLOCK_SIZE;
nextOffset=c&MBCS_STAGE_3_BLOCK_MASK;
}
if(mbcsData->stage2Single[index]==0) {
/* allocate another block in stage 3 */
if(mbcsData->stage3Top>=0x10000) {
newBlock=mbcsData->stage3Top;
if(mbcsData->utf8Friendly) {
min=newBlock-nextOffset; /* minimum block start with overlap */
while(min<newBlock && stage3[newBlock-1]==0) {
--newBlock;
}
}
newTop=newBlock+blockSize;
if(newTop>MBCS_STAGE_3_SBCS_SIZE) {
fprintf(stderr, "error: too many code points at U+%04x<->0x%02x\n", (int)c, b);
return FALSE;
}
/* each block has 16 uint16_t entries */
mbcsData->stage2Single[index]=(uint16_t)mbcsData->stage3Top;
uprv_memset(mbcsData->fromUBytes+2*mbcsData->stage3Top, 0, 32);
mbcsData->stage3Top+=16;
i=index;
while(newBlock<newTop) {
mbcsData->stage2Single[i++]=(uint16_t)newBlock;
newBlock+=MBCS_STAGE_3_BLOCK_SIZE;
}
mbcsData->stage3Top=newTop; /* ==newBlock */
}
/* write the codepage entry into stage 3 and get the previous entry */
p=(uint16_t *)mbcsData->fromUBytes+mbcsData->stage2Single[index]+(c&0xf);
p=stage3+mbcsData->stage2Single[index]+nextOffset;
old=*p;
if(flag<=0) {
*p=(uint16_t)(0xf00|b);
@ -520,21 +644,14 @@ MBCSAddFromUnicode(MBCSData *mbcsData,
int8_t flag) {
char buffer[10];
const uint8_t *pb;
uint8_t *p;
uint32_t index, b, old;
uint8_t *stage3, *p;
uint32_t index, b, old, stage3Index;
int32_t maxCharLength;
/* ignore |2 SUB mappings */
if(flag==2) {
return TRUE;
}
uint32_t blockSize, newTop, i, nextOffset, newBlock, min, overlap, maxOverlap;
maxCharLength=mbcsData->ucm->states.maxCharLength;
if(maxCharLength==1) {
return MBCSSingleAddFromUnicode(mbcsData, bytes, length, c, flag);
}
if( mbcsData->ucm->states.outputType==MBCS_OUTPUT_2_SISO &&
(*bytes==0xe || *bytes==0xf)
) {
@ -556,12 +673,27 @@ MBCSAddFromUnicode(MBCSData *mbcsData,
* all-unassigned mappings.
* We assume that length<=maxCharLength and that c<=0x10ffff.
*/
stage3=mbcsData->fromUBytes;
/* inspect stage 1 */
index=c>>10;
index=c>>MBCS_STAGE_1_SHIFT;
if(mbcsData->utf8Friendly && c<=MBCS_UTF8_MAX) {
nextOffset=(c>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK&~(MBCS_UTF8_STAGE_3_BLOCKS-1);
} else {
nextOffset=(c>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK;
}
if(mbcsData->stage1[index]==MBCS_STAGE_2_ALL_UNASSIGNED_INDEX) {
/* allocate another block in stage 2 */
if(mbcsData->stage2Top>=MBCS_MAX_STAGE_2_TOP) {
newBlock=mbcsData->stage2Top;
if(mbcsData->utf8Friendly) {
min=newBlock-nextOffset; /* minimum block start with overlap */
while(min<newBlock && mbcsData->stage2[newBlock-1]==0) {
--newBlock;
}
}
newTop=newBlock+MBCS_STAGE_2_BLOCK_SIZE;
if(newTop>MBCS_MAX_STAGE_2_TOP) {
fprintf(stderr, "error: too many stage 2 entries at U+%04x<->0x%s\n",
(int)c, printBytes(buffer, bytes, length));
return FALSE;
@ -571,23 +703,90 @@ MBCSAddFromUnicode(MBCSData *mbcsData,
* each stage 2 block contains 64 32-bit words:
* 6 code point bits 9..4 with value with bits 31..16 "assigned" flags and bits 15..0 stage 3 index
*/
mbcsData->stage1[index]=(uint16_t)mbcsData->stage2Top;
mbcsData->stage2Top+=MBCS_STAGE_2_BLOCK_SIZE;
i=index;
while(newBlock<newTop) {
mbcsData->stage1[i++]=(uint16_t)newBlock;
newBlock+=MBCS_STAGE_2_BLOCK_SIZE;
}
mbcsData->stage2Top=newTop; /* ==newBlock */
}
/* inspect stage 2 */
index=mbcsData->stage1[index]+((c>>4)&0x3f);
index=mbcsData->stage1[index]+nextOffset;
if(mbcsData->utf8Friendly && c<=MBCS_UTF8_MAX) {
/* allocate 64-entry blocks for UTF-8-friendly lookup */
blockSize=MBCS_UTF8_STAGE_3_BLOCK_SIZE*maxCharLength;
nextOffset=c&MBCS_UTF8_STAGE_3_BLOCK_MASK;
} else {
blockSize=MBCS_STAGE_3_BLOCK_SIZE*maxCharLength;
nextOffset=c&MBCS_STAGE_3_BLOCK_MASK;
}
if(mbcsData->stage2[index]==0) {
/* allocate another block in stage 3 */
if(mbcsData->stage3Top>=0x100000*(uint32_t)maxCharLength) {
newBlock=mbcsData->stage3Top;
if(mbcsData->utf8Friendly && nextOffset>=MBCS_STAGE_3_GRANULARITY) {
/*
* Overlap stage 3 blocks only in multiples of 16-entry blocks
* because of the indexing granularity in stage 2.
*/
maxOverlap=(nextOffset&~(MBCS_STAGE_3_GRANULARITY-1))*maxCharLength;
for(overlap=0;
overlap<maxOverlap && stage3[newBlock-overlap-1]==0;
++overlap) {}
overlap=(overlap/MBCS_STAGE_3_GRANULARITY)/maxCharLength;
overlap=(overlap*MBCS_STAGE_3_GRANULARITY)*maxCharLength;
newBlock-=overlap;
}
newTop=newBlock+blockSize;
if(newTop>MBCS_STAGE_3_MBCS_SIZE*(uint32_t)maxCharLength) {
fprintf(stderr, "error: too many code points at U+%04x<->0x%s\n",
(int)c, printBytes(buffer, bytes, length));
return FALSE;
}
/* each block has 16*maxCharLength bytes */
mbcsData->stage2[index]=(mbcsData->stage3Top/16)/maxCharLength;
uprv_memset(mbcsData->fromUBytes+mbcsData->stage3Top, 0, 16*maxCharLength);
mbcsData->stage3Top+=16*maxCharLength;
i=index;
while(newBlock<newTop) {
mbcsData->stage2[i++]=(newBlock/MBCS_STAGE_3_GRANULARITY)/maxCharLength;
newBlock+=MBCS_STAGE_3_BLOCK_SIZE*maxCharLength;
}
mbcsData->stage3Top=newTop; /* ==newBlock */
}
stage3Index=MBCS_STAGE_3_GRANULARITY*(uint32_t)(uint16_t)mbcsData->stage2[index];
/* Build an alternate, UTF-8-friendly stage table as well. */
if(mbcsData->utf8Friendly && c<=MBCS_UTF8_MAX) {
/* Overflow for uint16_t entries in stageUTF8? */
if(stage3Index>0xffff) {
/*
* This can occur only if the mapping table is nearly perfectly filled and if
* MBCS_UTF8_MAX==0xffff.
* (There is no known charset like this. GB 18030 does not map
* surrogate code points and LMBCS does not map 256 PUA code points.)
*
* Otherwise, stage3Index<=MBCS_UTF8_LIMIT<0xffff
* (stage3Index can at most reach exactly MBCS_UTF8_LIMIT)
* because we have a sorted table and there are at most MBCS_UTF8_LIMIT
* mappings with 0<=c<MBCS_UTF8_LIMIT, and there is only also
* the initial all-unassigned block in stage3.
*
* (See svn revision 20866 of the markus/ucnvutf8 feature branch for
* code that causes MBCSAddTable() to rebuild the table not utf8Friendly
* in case of overflow. That code was not tested.)
*/
fprintf(stderr, "too many stage 3 entries for UTF-8-friendly format, processing U+%04x<->0x%s\n",
(int)c, printBytes(buffer, bytes, length));
return FALSE;
}
/*
* The stage 3 block has been assigned for the regular trie.
* Just copy its index into stageUTF8[], without the granularity.
*/
mbcsData->stageUTF8[c>>MBCS_UTF8_STAGE_SHIFT]=(uint16_t)stage3Index;
}
/* write the codepage bytes into stage 3 and get the previous bytes */
@ -609,7 +808,7 @@ MBCSAddFromUnicode(MBCSData *mbcsData,
}
old=0;
p=mbcsData->fromUBytes+(16*(uint32_t)(uint16_t)mbcsData->stage2[index]+(c&0xf))*maxCharLength;
p=stage3+(stage3Index+nextOffset)*maxCharLength;
switch(maxCharLength) {
case 2:
old=*(uint16_t *)p;
@ -633,7 +832,7 @@ MBCSAddFromUnicode(MBCSData *mbcsData,
}
/* check that this Unicode code point was still unassigned */
if((mbcsData->stage2[index]&(1UL<<(16+(c&0xf))))!=0 || old!=0) {
if((mbcsData->stage2[index+(nextOffset>>MBCS_STAGE_2_SHIFT)]&(1UL<<(16+(c&0xf))))!=0 || old!=0) {
if(flag>=0) {
fprintf(stderr, "error: duplicate Unicode code point at U+%04x<->0x%s see 0x%02x\n",
(int)c, printBytes(buffer, bytes, length), (int)old);
@ -647,20 +846,57 @@ MBCSAddFromUnicode(MBCSData *mbcsData,
}
if(flag<=0) {
/* set the roundtrip flag */
mbcsData->stage2[index]|=(1UL<<(16+(c&0xf)));
mbcsData->stage2[index+(nextOffset>>4)]|=(1UL<<(16+(c&0xf)));
}
return TRUE;
}
/*
 * Decide whether a 1:1 mapping fits into the MBCS base table's
 * fromUnicode data structure.
 *
 * @param utf8Friendly TRUE if the additional restrictions for
 *                     UTF-8-friendly fromUnicode tables apply
 * @param bytes        codepage bytes of the mapping
 * @param length       number of codepage bytes
 * @param c            Unicode code point of the mapping
 * @param flag         precision/fallback indicator of the mapping
 * @return TRUE if the mapping fits into the base table, FALSE if not
 *
 * Some of these tests are redundant with ucm_mappingType().
 */
U_CFUNC UBool
MBCSOkForBaseFromUnicode(UBool utf8Friendly,
                         const uint8_t *bytes, int32_t length,
                         UChar32 c, int8_t flag) {
    if(flag==2) {
        /*
         * |2 SUB mapping: only the single-byte <subchar1> form has no
         * base table data structure.
         */
        if(length==1) {
            return FALSE;
        }
        return TRUE;
    }
    if(flag>2) {
        /* no restrictions are checked here for other indicators */
        return TRUE;
    }

    /* From here on flag<=1: roundtrip or |1 fallback. */
    if(bytes[0]==0) {
        /*
         * - a |1 fallback to 0x00 yields result value 0, which is
         *   indistinguishable from an unmappable entry
         * - a multi-byte mapping with a leading 0x00 byte cannot be stored
         *   because there is no explicit length field
         */
        if(flag==1 || length>1) {
            return FALSE;
        }
        /*
         * UTF-8-friendly table, code point within the optimized range:
         * any mapping to 0x00 is indistinguishable from an unmappable entry.
         */
        if(utf8Friendly && c<=MBCS_UTF8_MAX) {
            return FALSE;
        }
        return TRUE;
    }

    /*
     * UTF-8-friendly table, code point within the optimized range:
     * no |1 fallbacks, because the optimized table has no roundtrip flags.
     */
    if(flag==1 && utf8Friendly && c<=MBCS_UTF8_MAX) {
        return FALSE;
    }

    /* All other mappings do fit into the base table. */
    return TRUE;
}
/* we can assume that the table only contains 1:1 mappings with <=4 bytes each */
static UBool
MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData) {
MBCSData *mbcsData;
UCMapping *m;
UChar32 c;
int32_t i;
UBool isOK;
int32_t i, maxCharLength;
int8_t f;
UBool isOK, utf8Friendly;
staticData->unicodeMask=table->unicodeMask;
if(staticData->unicodeMask==3) {
@ -671,42 +907,74 @@ MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *stati
staticData->conversionType=UCNV_MBCS;
mbcsData=(MBCSData *)cnvData;
maxCharLength=mbcsData->ucm->states.maxCharLength;
/*
* Generation of UTF-8-friendly data requires
* a sorted table, which makeconv generates when explicit precision
* indicators are used.
*/
mbcsData->utf8Friendly=utf8Friendly=(UBool)((table->flagsType&UCM_FLAGS_EXPLICIT)!=0);
if(!MBCSStartMappings(mbcsData)) {
return FALSE;
}
staticData->hasFromUnicodeFallback=FALSE;
staticData->hasToUnicodeFallback=FALSE;
isOK=TRUE;
m=table->mappings;
for(i=0; i<table->mappingsLength; ++m, ++i) {
c=m->u;
f=m->f;
switch(m->f) {
switch(f) {
case -1:
/* there was no precision/fallback indicator */
/* fall through to set the mappings */
case 0:
/* set roundtrip mappings */
isOK&=MBCSAddToUnicode(mbcsData, m->b.bytes, m->bLen, c, m->f) &&
MBCSAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, m->f);
isOK&=MBCSAddToUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
if(maxCharLength==1) {
isOK&=MBCSSingleAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
} else if(MBCSOkForBaseFromUnicode(utf8Friendly, m->b.bytes, m->bLen, c, f)) {
isOK&=MBCSAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
} else {
m->f|=MBCS_FROM_U_EXT_FLAG;
m->moveFlag=UCM_MOVE_TO_EXT;
}
break;
case 1:
/* set only a fallback mapping from Unicode to codepage */
staticData->hasFromUnicodeFallback=TRUE;
isOK&=MBCSAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, m->f);
if(maxCharLength==1) {
staticData->hasFromUnicodeFallback=TRUE;
isOK&=MBCSSingleAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
} else if(MBCSOkForBaseFromUnicode(utf8Friendly, m->b.bytes, m->bLen, c, f)) {
staticData->hasFromUnicodeFallback=TRUE;
isOK&=MBCSAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
} else {
m->f|=MBCS_FROM_U_EXT_FLAG;
m->moveFlag=UCM_MOVE_TO_EXT;
}
break;
case 2:
/* ignore |2 SUB mappings */
/* ignore |2 SUB mappings, except to move <subchar1> mappings to the extension table */
if(maxCharLength>1 && !MBCSOkForBaseFromUnicode(utf8Friendly, m->b.bytes, m->bLen, c, f)) {
m->f|=MBCS_FROM_U_EXT_FLAG;
m->moveFlag=UCM_MOVE_TO_EXT;
}
break;
case 3:
/* set only a fallback mapping from codepage to Unicode */
staticData->hasToUnicodeFallback=TRUE;
isOK&=MBCSAddToUnicode(mbcsData, m->b.bytes, m->bLen, c, m->f);
isOK&=MBCSAddToUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
break;
default:
/* will not occur because the parser checked it already */
fprintf(stderr, "error: illegal fallback indicator %d\n", m->f);
fprintf(stderr, "error: illegal fallback indicator %d\n", f);
return FALSE;
}
}
@ -979,17 +1247,10 @@ compactStage2(MBCSData *mbcsData) {
static void
MBCSPostprocess(MBCSData *mbcsData, const UConverterStaticData *staticData) {
UCMStates *states;
int32_t maxCharLength;
int32_t maxCharLength, stage3Width;
states=&mbcsData->ucm->states;
maxCharLength=states->maxCharLength;
/* this needs to be printed before the EUC transformation because later maxCharLength might not be correct */
if(VERBOSE) {
printf("number of codepage characters in 16-blocks: 0x%lx=%lu\n",
(unsigned long)mbcsData->stage3Top/maxCharLength,
(unsigned long)mbcsData->stage3Top/maxCharLength);
}
stage3Width=maxCharLength=states->maxCharLength;
ucm_optimizeStates(states,
&mbcsData->unicodeCodeUnits,
@ -997,12 +1258,67 @@ MBCSPostprocess(MBCSData *mbcsData, const UConverterStaticData *staticData) {
VERBOSE);
/* try to compact the fromUnicode tables */
transformEUC(mbcsData);
if(maxCharLength==1) {
singleCompactStage3(mbcsData);
singleCompactStage2(mbcsData);
} else {
compactStage2(mbcsData);
if(transformEUC(mbcsData)) {
--stage3Width;
}
/*
* UTF-8-friendly tries are built precompacted, to cope with variable
* stage 3 allocation block sizes.
*
* Tables without precision indicators cannot be built that way,
* because if a block was overlapped with a previous one, then a smaller
* code point for the same block would not fit.
* Therefore, such tables are not marked UTF-8-friendly and must be
* compacted after all mappings are entered.
*/
if(!mbcsData->utf8Friendly) {
if(maxCharLength==1) {
singleCompactStage3(mbcsData);
singleCompactStage2(mbcsData);
} else {
compactStage2(mbcsData);
}
}
if(VERBOSE) {
/*uint32_t c, i1, i2, i2Limit, i3;*/
printf("fromUnicode number of uint%s_t in stage 2: 0x%lx=%lu\n",
maxCharLength==1 ? "16" : "32",
(unsigned long)mbcsData->stage2Top,
(unsigned long)mbcsData->stage2Top);
printf("fromUnicode number of %d-byte stage 3 mapping entries: 0x%lx=%lu\n",
(int)stage3Width,
(unsigned long)mbcsData->stage3Top/stage3Width,
(unsigned long)mbcsData->stage3Top/stage3Width);
#if 0
c=0;
for(i1=0; i1<MBCS_STAGE_1_SIZE; ++i1) {
i2=mbcsData->stage1[i1];
if(i2==0) {
c+=MBCS_STAGE_2_BLOCK_SIZE*MBCS_STAGE_3_BLOCK_SIZE;
continue;
}
for(i2Limit=i2+MBCS_STAGE_2_BLOCK_SIZE; i2<i2Limit; ++i2) {
if(maxCharLength==1) {
i3=mbcsData->stage2Single[i2];
} else {
i3=(uint16_t)mbcsData->stage2[i2];
}
if(i3==0) {
c+=MBCS_STAGE_3_BLOCK_SIZE;
continue;
}
printf("U+%04lx i1=0x%02lx i2=0x%04lx i3=0x%04lx\n",
(unsigned long)c,
(unsigned long)i1,
(unsigned long)i2,
(unsigned long)i3);
c+=MBCS_STAGE_3_BLOCK_SIZE;
}
}
#endif
}
}
@ -1010,7 +1326,7 @@ static uint32_t
MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
UNewDataMemory *pData, int32_t tableType) {
MBCSData *mbcsData=(MBCSData *)cnvData;
uint32_t top;
uint32_t top, stageUTF8Length=0;
int32_t i, stage1Top;
_MBCSHeader header={ { 0, 0, 0, 0 }, 0, 0, 0, 0, 0, 0, 0 };
@ -1031,6 +1347,10 @@ MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
/* stage3Top has counted 16-bit results, now we need to count bytes */
mbcsData->stage3Top*=2;
if(mbcsData->utf8Friendly) {
header.version[2]=(uint8_t)(SBCS_UTF8_MAX>>8); /* store 0x1f for max==0x1fff */
}
} else {
if(staticData->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
stage1Top=MBCS_STAGE_1_SIZE; /* 0x440==1088 */
@ -1044,6 +1364,11 @@ MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
/* stage2Top has counted 32-bit results, now we need to count bytes */
mbcsData->stage2Top*=4;
if(mbcsData->utf8Friendly) {
stageUTF8Length=MBCS_UTF8_STAGE_SIZE;
header.version[2]=(uint8_t)(MBCS_UTF8_MAX>>8); /* store 0xd7 for max==0xd7ff */
}
/* stage3Top has already counted bytes */
}
@ -1053,7 +1378,9 @@ MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
/* fill the header */
header.version[0]=4;
header.version[1]=2;
header.version[1]=3;
/* header.version[2] set above for utf8Friendly data */
header.countStates=mbcsData->ucm->states.countStates;
header.countToUFallbacks=mbcsData->countToUFallbacks;
@ -1070,7 +1397,7 @@ MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
mbcsData->stage2Top;
header.fromUBytesLength=mbcsData->stage3Top;
top=header.offsetFromUBytes+header.fromUBytesLength;
top=header.offsetFromUBytes+header.fromUBytesLength+stageUTF8Length*2;
header.flags=(uint8_t)(mbcsData->ucm->states.outputType);
@ -1096,7 +1423,10 @@ MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
}
udata_writeBlock(pData, mbcsData->fromUBytes, mbcsData->stage3Top);
/* return the number of bytes that should have been written */
return header.offsetFromUBytes+header.fromUBytesLength;
}
if(stageUTF8Length>0) {
udata_writeBlock(pData, mbcsData->stageUTF8, stageUTF8Length*2);
}
/* return the number of bytes that should have been written */
return top;
}

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2000-2006, International Business Machines
* Copyright (C) 2000-2007, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -20,25 +20,93 @@
#include "makeconv.h"
/* Size, shift and mask constants for the MBCS stage 1/2/3 tables and their UTF-8-friendly variants. */
enum {
MBCS_STAGE_2_BLOCK_SIZE=0x40, /* 64; 64=1<<6 for 6 bits in stage 2 */
MBCS_STAGE_2_BLOCK_SIZE_SHIFT=6, /* log2(MBCS_STAGE_2_BLOCK_SIZE) */
MBCS_STAGE_1_SIZE=0x440, /* 0x110000>>10, or 17*64 for one entry per 1k code points */
MBCS_STAGE_2_SIZE=0xfbc0, /* 0x10000-MBCS_STAGE_1_SIZE */
/*
* TODO: Consider using ucnvmbcs.h constants.
* However, not all values need to be exactly the same, for example
* the xxx_UTF8_MAX values may be different. (Especially SBCS_UTF8_MAX
* may be higher in makeconv than in the runtime code because that
* affects only a small number of .cnv files [if any] but all
* runtime UConverterSharedData objects.)
*/
MBCS_STAGE_2_SHIFT=4,
MBCS_STAGE_2_BLOCK_SIZE=0x40, /* =64=1<<6 for 6 bits in stage 2 */
MBCS_STAGE_2_BLOCK_SIZE_SHIFT=6, /* log2(MBCS_STAGE_2_BLOCK_SIZE) */
MBCS_STAGE_2_BLOCK_MASK=0x3f, /* for after shifting by MBCS_STAGE_2_SHIFT */
MBCS_STAGE_1_SHIFT=10,
MBCS_STAGE_1_BMP_SIZE=0x40, /* 0x10000>>MBCS_STAGE_1_SHIFT, or 64 for one entry per 1k code points on the BMP */
MBCS_STAGE_1_SIZE=0x440, /* 0x110000>>MBCS_STAGE_1_SHIFT, or 17*64 for one entry per 1k code points */
MBCS_STAGE_2_SIZE=0xfbc0, /* 0x10000-MBCS_STAGE_1_SIZE: stages 1 & 2 share a 16-bit-indexed array */
MBCS_MAX_STAGE_2_TOP=MBCS_STAGE_2_SIZE,
MBCS_STAGE_2_MAX_BLOCKS=MBCS_STAGE_2_SIZE>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT,
MBCS_STAGE_2_ALL_UNASSIGNED_INDEX=0, /* stage 1 entry for the all-unassigned stage 2 block */
MBCS_STAGE_2_FIRST_ASSIGNED=MBCS_STAGE_2_BLOCK_SIZE, /* start of the first stage 2 block after the all-unassigned one */
MBCS_STAGE_3_BLOCK_SIZE=16, /* 16; 16=1<<4 for 4 bits in stage 3 */
MBCS_STAGE_3_BLOCK_SIZE=16, /* =16=1<<4 for 4 bits in stage 3 */
MBCS_STAGE_3_BLOCK_MASK=0xf,
MBCS_STAGE_3_FIRST_ASSIGNED=MBCS_STAGE_3_BLOCK_SIZE, /* start of the first stage 3 block after the all-unassigned one */
MBCS_STAGE_3_GRANULARITY=16, /* =1<<4: MBCS stage 2 indexes are shifted left 4 */
MBCS_STAGE_3_SBCS_SIZE=0x10000, /* max 64k mappings for SBCS */
MBCS_STAGE_3_MBCS_SIZE=0x10000*MBCS_STAGE_3_GRANULARITY, /* max mappings for MBCS */
/*
* SBCS_UTF8_MAX: Maximum code point with UTF-8-friendly SBCS data structures.
* Possible values are 0x01ff..0xffff, in steps of 0x100.
*
* Unlike for MBCS, this constant only affects the stage 3 block allocation size;
* there is no additional stage 1/2 table stored in the .cnv file.
* The max value should be at least 0x7ff to cover 2-byte UTF-8.
* 0xfff also covers a number of other small scripts which have legacy charsets
* (like Thai).
* Higher values up to 0x1fff are harmless and potentially useful because
* that covers small-script blocks which usually have either dense mappings
* or no mappings at all.
* Starting at U+2000, there are mostly symbols and format characters
* with a low density of SBCS mappings, which would result in more wasted
* stage 3 entries with the larger block size.
*/
SBCS_UTF8_MAX=0x1fff,
/*
* MBCS_UTF8_MAX: Maximum code point with UTF-8-friendly MBCS data structures.
* Possible values are 0x01ff..0xffff, in steps of 0x100.
*
* Note that with 0xffff, MBCSAddFromUnicode() may overflow the additional UTF-8 stage table
* with extreme input data. The function checks for this overflow.
*
* 0xd7ff is chosen for the majority of common characters including Unihan and Hangul.
* At U+d800 there are mostly surrogates, private use codes, compatibility characters, etc.
* Larger values cause slightly larger MBCS .cnv files.
*/
MBCS_UTF8_MAX=0xd7ff,
MBCS_UTF8_LIMIT=MBCS_UTF8_MAX+1, /* =0xd800 */
MBCS_UTF8_STAGE_SHIFT=6,
MBCS_UTF8_STAGE_3_BLOCK_SIZE=0x40, /* =64=1<<6 for 6 bits from last trail byte */
MBCS_UTF8_STAGE_3_BLOCK_MASK=0x3f,
/* size of the single-stage table for up to U+d7ff (used instead of stage1/2) */
MBCS_UTF8_STAGE_SIZE=MBCS_UTF8_LIMIT>>MBCS_UTF8_STAGE_SHIFT, /* =0x360 */
MBCS_FROM_U_EXT_FLAG=0x10, /* UCMapping.f bit for base table mappings that fit into the base toU table */
MBCS_FROM_U_EXT_MASK=0x0f, /* but need to go into the extension fromU table */
/* =4 number of regular stage 3 blocks for final UTF-8 trail byte */
MBCS_UTF8_STAGE_3_BLOCKS=MBCS_UTF8_STAGE_3_BLOCK_SIZE/MBCS_STAGE_3_BLOCK_SIZE,
MBCS_MAX_FALLBACK_COUNT=8192
};
U_CFUNC NewConverter *
MBCSOpen(UCMFile *ucm);
/* Test if a 1:1 mapping fits into the MBCS base table's fromUnicode structure. */
U_CFUNC UBool
MBCSOkForBaseFromUnicode(UBool utf8Friendly,
const uint8_t *bytes, int32_t length,
UChar32 c, int8_t flag);
U_CFUNC NewConverter *
CnvExtOpen(UCMFile *ucm);

View file

@ -36,7 +36,6 @@
#define DEBUG 0
typedef struct ConvData {
UCMFile *ucm;
NewConverter *cnvData, *extData;
@ -137,7 +136,7 @@ writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErr
if(VERBOSE)
{
fprintf(stderr, "- Opened udata %s.%s\n", cnvName, "cnv");
printf("- Opened udata %s.%s\n", cnvName, "cnv");
}
@ -160,7 +159,7 @@ writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErr
}
if(VERBOSE)
{
fprintf(stderr, "- Wrote %u bytes to the udata.\n", (int)sz2);
printf("- Wrote %u bytes to the udata.\n", (int)sz2);
}
}
@ -602,6 +601,10 @@ createConverter(ConvData *data, const char *converterName, UErrorCode *pErrorCod
states=&data->ucm->states;
if(dataIsBase) {
/*
* Build a normal .cnv file with a base table
* and an optional extension table.
*/
data->cnvData=MBCSOpen(data->ucm);
if(data->cnvData==NULL) {
*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
@ -618,27 +621,50 @@ createConverter(ConvData *data, const char *converterName, UErrorCode *pErrorCod
fprintf(stderr, " the subchar1 byte is illegal in this codepage structure!\n");
*pErrorCode=U_INVALID_TABLE_FORMAT;
} else if(data->ucm->ext->mappingsLength>0) {
/* prepare the extension table, if there is one */
data->extData=CnvExtOpen(data->ucm);
if(data->extData==NULL) {
*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
} else if(
!ucm_checkBaseExt(states, data->ucm->base, data->ucm->ext, data->ucm->ext, FALSE) ||
!data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)
) {
*pErrorCode=U_INVALID_TABLE_FORMAT;
}
}
/* add the base table after ucm_checkBaseExt()! */
if( U_SUCCESS(*pErrorCode) &&
!data->cnvData->addTable(data->cnvData, data->ucm->base, &data->staticData)
} else if(
data->ucm->ext->mappingsLength>0 &&
!ucm_checkBaseExt(states, data->ucm->base, data->ucm->ext, data->ucm->ext, FALSE)
) {
*pErrorCode=U_INVALID_TABLE_FORMAT;
} else if(data->ucm->base->flagsType&UCM_FLAGS_EXPLICIT) {
/* sort the table so that it can be turned into UTF-8-friendly data */
ucm_sortTable(data->ucm->base);
}
if(U_SUCCESS(*pErrorCode)) {
if(
/* add the base table after ucm_checkBaseExt()! */
!data->cnvData->addTable(data->cnvData, data->ucm->base, &data->staticData)
) {
*pErrorCode=U_INVALID_TABLE_FORMAT;
} else {
/*
* addTable() may have requested moving more mappings to the extension table
* if they fit into the base toUnicode table but not into the
* base fromUnicode table.
* (Especially for UTF-8-friendly fromUnicode tables.)
* Such mappings will have the MBCS_FROM_U_EXT_FLAG set, which causes them
* to be excluded from the extension toUnicode data.
* See MBCSOkForBaseFromUnicode() for which mappings do not fit into
* the base fromUnicode table.
*/
ucm_moveMappings(data->ucm->base, data->ucm->ext);
ucm_sortTable(data->ucm->ext);
if(data->ucm->ext->mappingsLength>0) {
/* prepare the extension table, if there is one */
data->extData=CnvExtOpen(data->ucm);
if(data->extData==NULL) {
*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
} else if(
!data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)
) {
*pErrorCode=U_INVALID_TABLE_FORMAT;
}
}
}
}
} else {
/* Build an extension-only .cnv file. */
char baseFilename[500];
char *basename;
@ -662,7 +688,6 @@ createConverter(ConvData *data, const char *converterName, UErrorCode *pErrorCod
data->extData=CnvExtOpen(data->ucm);
if(data->extData==NULL) {
*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
} else {
/* fill in gaps in extension file header fields */
UCMapping *m, *mLimit;
@ -700,16 +725,6 @@ createConverter(ConvData *data, const char *converterName, UErrorCode *pErrorCod
fallbackFlags|=2;
}
}
for(m=data->ucm->base->mappings, mLimit=m+data->ucm->base->mappingsLength;
m<mLimit && fallbackFlags!=3;
++m
) {
if(m->f==1) {
fallbackFlags|=1;
} else if(m->f==3) {
fallbackFlags|=2;
}
}
if(fallbackFlags&1) {
staticData->hasFromUnicodeFallback=TRUE;
@ -728,10 +743,52 @@ createConverter(ConvData *data, const char *converterName, UErrorCode *pErrorCod
} else if(
!ucm_checkValidity(data->ucm->ext, baseStates) ||
!ucm_checkBaseExt(baseStates, baseData.ucm->base, data->ucm->ext, data->ucm->ext, FALSE) ||
!data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)
!ucm_checkBaseExt(baseStates, baseData.ucm->base, data->ucm->ext, data->ucm->ext, FALSE)
) {
*pErrorCode=U_INVALID_TABLE_FORMAT;
} else {
if(states->maxCharLength>1) {
/*
* When building a normal .cnv file with a base table
* for an MBCS (not SBCS) table with explicit precision flags,
* the MBCSAddTable() function marks some mappings for moving
* to the extension table.
* They fit into the base toUnicode table but not into the
* base fromUnicode table.
* (Note: We do have explicit precision flags because they are
* required for extension table generation, and
* ucm_checkBaseExt() verified it.)
*
* We do not call MBCSAddTable() here (we probably could)
* so we need to do the analysis before building the extension table.
* We assume the "worst case" of a UTF-8-friendly table, even if
* MBCSAddTable() might revert to a regular table due to some overflow.
* Redundant mappings in the extension table are ok except they cost some size.
* Overflows in MBCSAddTable() should be very rare.
* TODO: Change "worst case" comment if the MBCSAddTable() loop goes away.
*
* Do this after ucm_checkBaseExt().
*/
int32_t needsMove=0;
for(m=baseData.ucm->base->mappings, mLimit=m+baseData.ucm->base->mappingsLength;
m<mLimit;
++m
) {
if(!MBCSOkForBaseFromUnicode(TRUE, m->b.bytes, m->bLen, m->u, m->f)) {
m->f|=MBCS_FROM_U_EXT_FLAG;
m->moveFlag=UCM_MOVE_TO_EXT;
++needsMove;
}
}
if(needsMove!=0) {
ucm_moveMappings(baseData.ucm->base, data->ucm->ext);
ucm_sortTable(data->ucm->ext);
}
}
if(!data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)) {
*pErrorCode=U_INVALID_TABLE_FORMAT;
}
}
}
}

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2003-2005, International Business Machines
* Copyright (C) 2003-2007, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -238,7 +238,7 @@ ucm_sortTable(UCMTable *t) {
* allocate mappingsCapacity instead of mappingsLength so that
* if mappings are added, the reverseMap need not be
* reallocated each time
* (see moveMappings() and ucm_addMapping())
* (see ucm_moveMappings() and ucm_addMapping())
*/
t->reverseMap=(int32_t *)uprv_malloc(t->mappingsCapacity*sizeof(int32_t));
if(t->reverseMap==NULL) {
@ -264,20 +264,12 @@ ucm_sortTable(UCMTable *t) {
t->isSorted=TRUE;
}
enum {
MOVE_TO_EXT=1,
REMOVE_MAPPING=2
};
/*
* move mappings with their move flag set from the base table
* and optionally to the extension table
*
* works only with explicit precision flags because it uses some of the
* flags bits
* remove mappings with their move flag set from the base table
* and move some of them (with UCM_MOVE_TO_EXT) to the extension table
*/
static void
moveMappings(UCMTable *base, UCMTable *ext) {
U_CAPI void U_EXPORT2
ucm_moveMappings(UCMTable *base, UCMTable *ext) {
UCMapping *mb, *mbLimit;
int8_t flag;
@ -290,12 +282,12 @@ moveMappings(UCMTable *base, UCMTable *ext) {
/* reset the move flag */
mb->moveFlag=0;
if(ext!=NULL && (flag&MOVE_TO_EXT)) {
if(ext!=NULL && (flag&UCM_MOVE_TO_EXT)) {
/* add the mapping to the extension table */
ucm_addMapping(ext, mb, UCM_GET_CODE_POINTS(base, mb), UCM_GET_BYTES(base, mb));
}
/* move the last base mapping down and overwrite the current one */
/* remove this mapping: move the last base mapping down and overwrite the current one */
if(mb<(mbLimit-1)) {
uprv_memcpy(mb, mbLimit-1, sizeof(UCMapping));
}
@ -364,7 +356,7 @@ checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
* if ext is DBCS, move DBCS mappings here
* and check SBCS ones for Unicode prefix below
*/
mb->moveFlag|=MOVE_TO_EXT;
mb->moveFlag|=UCM_MOVE_TO_EXT;
result|=NEEDS_MOVE;
/* does mb map from an input sequence that is a prefix of me's? */
@ -373,7 +365,7 @@ checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
) {
if(moveToExt) {
/* mark this mapping to be moved to the extension table */
mb->moveFlag|=MOVE_TO_EXT;
mb->moveFlag|=UCM_MOVE_TO_EXT;
result|=NEEDS_MOVE;
} else {
fprintf(stderr,
@ -394,11 +386,11 @@ checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
if( mb->f==me->f && mb->bLen==me->bLen &&
0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
) {
me->moveFlag|=REMOVE_MAPPING;
me->moveFlag|=UCM_REMOVE_MAPPING;
result|=NEEDS_MOVE;
} else if(intersectBase) {
/* mapping in base but not in ext, move it */
mb->moveFlag|=MOVE_TO_EXT;
mb->moveFlag|=UCM_MOVE_TO_EXT;
result|=NEEDS_MOVE;
} else {
fprintf(stderr,
@ -476,7 +468,7 @@ checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
if(cmp<0) {
if(intersectBase) {
/* mapping in base but not in ext, move it */
mb->moveFlag|=MOVE_TO_EXT;
mb->moveFlag|=UCM_MOVE_TO_EXT;
result|=NEEDS_MOVE;
/*
@ -490,7 +482,7 @@ checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
) {
if(moveToExt) {
/* mark this mapping to be moved to the extension table */
mb->moveFlag|=MOVE_TO_EXT;
mb->moveFlag|=UCM_MOVE_TO_EXT;
result|=NEEDS_MOVE;
} else {
fprintf(stderr,
@ -511,11 +503,11 @@ checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
if( mb->f==me->f && mb->uLen==me->uLen &&
0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
) {
me->moveFlag|=REMOVE_MAPPING;
me->moveFlag|=UCM_REMOVE_MAPPING;
result|=NEEDS_MOVE;
} else if(intersectBase) {
/* mapping in base but not in ext, move it */
mb->moveFlag|=MOVE_TO_EXT;
mb->moveFlag|=UCM_MOVE_TO_EXT;
result|=NEEDS_MOVE;
} else {
fprintf(stderr,
@ -586,8 +578,8 @@ ucm_checkBaseExt(UCMStates *baseStates,
}
if(result&NEEDS_MOVE) {
moveMappings(ext, NULL);
moveMappings(base, moveTarget);
ucm_moveMappings(ext, NULL);
ucm_moveMappings(base, moveTarget);
ucm_sortTable(base);
ucm_sortTable(ext);
if(moveTarget!=NULL) {
@ -715,7 +707,7 @@ ucm_separateMappings(UCMFile *ucm, UBool isSISO) {
if(isSISO && m->bLen==1 && (m->b.bytes[0]==0xe || m->b.bytes[0]==0xf)) {
fprintf(stderr, "warning: removing illegal mapping from an SI/SO-stateful table\n");
ucm_printMapping(table, m, stderr);
m->moveFlag|=REMOVE_MAPPING;
m->moveFlag|=UCM_REMOVE_MAPPING;
needsMove=TRUE;
continue;
}
@ -728,7 +720,7 @@ ucm_separateMappings(UCMFile *ucm, UBool isSISO) {
printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), stderr);
isOK=FALSE;
} else if(type>0) {
m->moveFlag|=MOVE_TO_EXT;
m->moveFlag|=UCM_MOVE_TO_EXT;
needsMove=TRUE;
}
}
@ -737,7 +729,7 @@ ucm_separateMappings(UCMFile *ucm, UBool isSISO) {
return FALSE;
}
if(needsMove) {
moveMappings(ucm->base, ucm->ext);
ucm_moveMappings(ucm->base, ucm->ext);
return ucm_checkBaseExt(&ucm->states, ucm->base, ucm->ext, ucm->ext, FALSE);
} else {
ucm_sortTable(ucm->base);
@ -1058,15 +1050,31 @@ ucm_mappingType(UCMStates *baseStates,
/*
* Suitable for an ICU conversion base table means:
* - a 1:1 mapping
* - not a |2 SUB mappings for <subchar1>
* - not a |1 fallback to 0x00
* - no leading 0x00 bytes
* - a 1:1 mapping (1 Unicode code point : 1 byte sequence)
* - SBCS: any 1:1 mapping
* (the table stores additional bits to distinguish mapping types)
* - MBCS: not a |2 SUB mapping for <subchar1>
* - MBCS: not a |1 fallback to 0x00
* - MBCS: not a multi-byte mapping with leading 0x00 bytes
*
* Further restrictions for fromUnicode tables
* are enforced in makeconv (MBCSOkForBaseFromUnicode()).
*
* All of the MBCS fromUnicode specific tests could be removed from here,
* but the ones above are for unusual mappings, and removing the tests
* from here would change canonucm output which seems gratuitous.
* (Markus Scherer 2006-nov-28)
*
* Exception: All implicit mappings (f<0) that need to be moved
* because of fromUnicode restrictions _must_ be moved here because
* makeconv uses a hack for moving mappings only for the fromUnicode table
* that only works with non-negative values of f.
*/
if( m->uLen==1 && count==1 &&
!((m->f==2 && m->bLen==1 && baseStates->maxCharLength>1) ||
(m->f==1 && m->bLen==1 && bytes[0]==0) ||
(m->bLen>1 && bytes[0]==0))
(baseStates->maxCharLength==1 ||
!((m->f==2 && m->bLen==1) ||
(m->f==1 && bytes[0]==0) ||
(m->f<=1 && m->bLen>1 && bytes[0]==0)))
) {
return 0; /* suitable for a base table */
} else {
@ -1178,4 +1186,3 @@ ucm_readTable(UCMFile *ucm, FileStream* convFile,
}
}
#endif

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2003-2005, International Business Machines
* Copyright (C) 2003-2006, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -29,6 +29,12 @@
U_CDECL_BEGIN
/*
 * Bit flags stored in UCMapping.moveFlag (combined with |=).
 * ucm_moveMappings() removes flagged mappings from the table it is given;
 * a mapping carrying UCM_MOVE_TO_EXT is first added to the extension table
 * when one is passed in.
 */
enum {
    UCM_MOVE_TO_EXT=1<<0,
    UCM_REMOVE_MAPPING=1<<1
};
/*
* Per-mapping data structure
*
@ -52,6 +58,7 @@ typedef struct UCMapping {
int8_t uLen, bLen, f, moveFlag;
} UCMapping;
/* constants for UCMTable.flagsType */
enum {
UCM_FLAGS_INITIAL, /* no mappings parsed yet */
UCM_FLAGS_EXPLICIT, /* .ucm file has mappings with | fallback indicators */
@ -150,6 +157,13 @@ ucm_resetTable(UCMTable *table);
U_CAPI void U_EXPORT2
ucm_sortTable(UCMTable *t);
/*
* Remove mappings with their move flag set from the base table
* and move some of them (with UCM_MOVE_TO_EXT) to the extension table.
*/
U_CAPI void U_EXPORT2
ucm_moveMappings(UCMTable *base, UCMTable *ext);
/**
* Read a table from a .ucm file, from after the CHARMAP line to
* including the END CHARMAP line.
@ -186,7 +200,7 @@ ucm_checkValidity(UCMTable *ext, UCMStates *baseStates);
*
* For both tables in the same file, the extension table is automatically
* built.
* For separate files, the extension file can use a complete mapping table,
* For separate files, the extension file can use a complete mapping table (.ucm file),
* so that common mappings need not be stripped out manually.
*
*