mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 22:15:31 +00:00
ICU-2404 first code for m:n conversion extensions
X-SVN-Rev: 13490
This commit is contained in:
parent
ff0a9c0244
commit
cea34629f2
35 changed files with 6193 additions and 2125 deletions
|
@ -61,7 +61,7 @@ OBJECTS = putil.o uobject.o cmemory.o umutex.o \
|
|||
udata.o ucmndata.o udatamem.o udataswp.o umapfile.o ucol_swp.o \
|
||||
uresbund.o uresdata.o resbund.o ucat.o locmap.o uloc.o locid.o \
|
||||
uhash.o uhash_us.o \
|
||||
ucnv.o ucnv_bld.o ucnv_cb.o ucnv_cnv.o ucnv_err.o ucnv_io.o ucnvlat1.o \
|
||||
ucnv.o ucnv_bld.o ucnv_cb.o ucnv_cnv.o ucnv_err.o ucnv_ext.o ucnv_io.o ucnvlat1.o \
|
||||
ucnv_u7.o ucnv_u8.o ucnv_u16.o ucnv_u32.o ucnvscsu.o ucnvbocu.o \
|
||||
ucnvmbcs.o ucnv2022.o ucnvhz.o ucnv_lmb.o ucnvisci.o \
|
||||
unistr.o utf_impl.o ustring.o ustrcase.o cstring.o ustrfmt.o ustrtrns.o \
|
||||
|
|
|
@ -1347,6 +1347,14 @@ InputPath=.\unicode\ucnv_err.h
|
|||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\ucnv_ext.c
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\ucnv_ext.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\ucnv_imp.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
|
|
@ -730,6 +730,12 @@
|
|||
Outputs="..\..\include\unicode\$(InputName).h"/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\ucnv_ext.c">
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\ucnv_ext.h">
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\ucnv_imp.h">
|
||||
</File>
|
||||
|
|
|
@ -608,11 +608,14 @@ static void _reset(UConverter *converter, UConverterResetChoice choice,
|
|||
converter->mode = 0;
|
||||
converter->toULength = 0;
|
||||
converter->invalidCharLength = converter->UCharErrorBufferLength = 0;
|
||||
converter->preToULength = 0;
|
||||
}
|
||||
if(choice!=UCNV_RESET_TO_UNICODE) {
|
||||
converter->fromUnicodeStatus = 0;
|
||||
converter->fromUChar32 = 0;
|
||||
converter->invalidUCharLength = converter->charErrorBufferLength = 0;
|
||||
converter->preFromUFirstCP = U_SENTINEL;
|
||||
converter->preFromULength = 0;
|
||||
}
|
||||
|
||||
if (converter->sharedData->impl->reset != NULL) {
|
||||
|
@ -811,6 +814,28 @@ _updateOffsets(int32_t *offsets, int32_t length,
|
|||
|
||||
/* ucnv_fromUnicode --------------------------------------------------------- */
|
||||
|
||||
/*
|
||||
* Implementation note for m:n conversions
|
||||
*
|
||||
* While collecting source units to find the longest match for m:n conversion,
|
||||
* some source units may need to be stored for a partial match.
|
||||
* When a second buffer does not yield a match on all of the previously stored
|
||||
* source units, then they must be "replayed", i.e., fed back into the converter.
|
||||
*
|
||||
* The code relies on the fact that replaying will not nest -
|
||||
* converting a replay buffer will not result in a replay.
|
||||
* This is because a replay is necessary only after the _continuation_ of a
|
||||
* partial match failed, but a replay buffer is converted as a whole.
|
||||
* It may result in some of its units being stored again for a partial match,
|
||||
* but there will not be a continuation _during_ the replay which could fail.
|
||||
*
|
||||
* It is conceivable that a callback function could call the converter
|
||||
* recursively in a way that causes another replay to be stored, but that
|
||||
* would be an error in the callback function.
|
||||
* Such violations will cause assertion failures in a debug build,
|
||||
* and wrong output, but they will not cause a crash.
|
||||
*/
|
||||
|
||||
static void
|
||||
_fromUnicodeWithCallback(UConverterFromUnicodeArgs *pArgs, UErrorCode *err) {
|
||||
UConverterFromUnicode fromUnicode;
|
||||
|
@ -822,6 +847,12 @@ _fromUnicodeWithCallback(UConverterFromUnicodeArgs *pArgs, UErrorCode *err) {
|
|||
int32_t errorInputLength;
|
||||
UBool converterSawEndOfInput, calledCallback;
|
||||
|
||||
/* variables for m:n conversion */
|
||||
UChar replay[UCNV_EXT_MAX_UCHARS];
|
||||
const UChar *realSource, *realSourceLimit;
|
||||
int32_t realSourceIndex;
|
||||
UBool realFlush;
|
||||
|
||||
cnv=pArgs->converter;
|
||||
s=pArgs->source;
|
||||
t=pArgs->target;
|
||||
|
@ -841,6 +872,29 @@ _fromUnicodeWithCallback(UConverterFromUnicodeArgs *pArgs, UErrorCode *err) {
|
|||
}
|
||||
}
|
||||
|
||||
if(cnv->preFromULength>=0) {
|
||||
/* normal mode */
|
||||
realSource=NULL;
|
||||
} else {
|
||||
/*
|
||||
* Previous m:n conversion stored source units from a partial match
|
||||
* and failed to consume all of them.
|
||||
* We need to "replay" them from a temporary buffer and convert them first.
|
||||
*/
|
||||
realSource=pArgs->source;
|
||||
realSourceLimit=pArgs->sourceLimit;
|
||||
realFlush=pArgs->flush;
|
||||
realSourceIndex=sourceIndex;
|
||||
|
||||
uprv_memcpy(replay, cnv->preFromU, -cnv->preFromULength*U_SIZEOF_UCHAR);
|
||||
pArgs->source=replay;
|
||||
pArgs->sourceLimit=replay-cnv->preFromULength;
|
||||
pArgs->flush=FALSE;
|
||||
sourceIndex=-1;
|
||||
|
||||
cnv->preFromULength=0;
|
||||
}
|
||||
|
||||
/*
|
||||
* loop for conversion and error handling
|
||||
*
|
||||
|
@ -897,7 +951,36 @@ _fromUnicodeWithCallback(UConverterFromUnicodeArgs *pArgs, UErrorCode *err) {
|
|||
pArgs->offsets=offsets+=length;
|
||||
}
|
||||
|
||||
sourceIndex+=(int32_t)(pArgs->source-s);
|
||||
if(sourceIndex>=0) {
|
||||
sourceIndex+=(int32_t)(pArgs->source-s);
|
||||
}
|
||||
}
|
||||
|
||||
if(cnv->preFromULength<0) {
|
||||
/*
|
||||
* switch the source to new replay units (cannot occur while replaying)
|
||||
* after offset handling and before end-of-input and callback handling
|
||||
*/
|
||||
if(realSource==NULL) {
|
||||
realSource=pArgs->source;
|
||||
realSourceLimit=pArgs->sourceLimit;
|
||||
realFlush=pArgs->flush;
|
||||
realSourceIndex=sourceIndex;
|
||||
|
||||
uprv_memcpy(replay, cnv->preFromU, -cnv->preFromULength*U_SIZEOF_UCHAR);
|
||||
pArgs->source=replay;
|
||||
pArgs->sourceLimit=replay-cnv->preFromULength;
|
||||
pArgs->flush=FALSE;
|
||||
if((sourceIndex+=cnv->preFromULength)<0) {
|
||||
sourceIndex=-1;
|
||||
}
|
||||
|
||||
cnv->preFromULength=0;
|
||||
} else {
|
||||
/* see implementation note before _fromUnicodeWithCallback() */
|
||||
U_ASSERT(realSource==NULL);
|
||||
*err=U_INTERNAL_PROGRAM_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
/* update pointers */
|
||||
|
@ -911,6 +994,15 @@ _fromUnicodeWithCallback(UConverterFromUnicodeArgs *pArgs, UErrorCode *err) {
|
|||
* (continue converting by breaking out of only the inner loop)
|
||||
*/
|
||||
break;
|
||||
} else if(realSource!=NULL) {
|
||||
/* switch back from replaying to the real source and continue */
|
||||
pArgs->source=realSource;
|
||||
pArgs->sourceLimit=realSourceLimit;
|
||||
pArgs->flush=realFlush;
|
||||
sourceIndex=realSourceIndex;
|
||||
|
||||
realSource=NULL;
|
||||
break;
|
||||
} else if(pArgs->flush && cnv->fromUChar32!=0) {
|
||||
/*
|
||||
* the entire input stream is consumed
|
||||
|
@ -960,7 +1052,27 @@ _fromUnicodeWithCallback(UConverterFromUnicodeArgs *pArgs, UErrorCode *err) {
|
|||
* the check for buffer overflow is redundant but it is
|
||||
* a high-runner case and hopefully documents the intent
|
||||
* well
|
||||
*
|
||||
* if we were replaying, then the replay buffer must be
|
||||
* copied back into the UConverter
|
||||
* and the real arguments must be restored
|
||||
*/
|
||||
if(realSource!=NULL) {
|
||||
int32_t length;
|
||||
|
||||
U_ASSERT(cnv->preFromULength==0);
|
||||
|
||||
length=(int32_t)(pArgs->sourceLimit-pArgs->source);
|
||||
if(length>0) {
|
||||
uprv_memcpy(cnv->preFromU, pArgs->source, length*U_SIZEOF_UCHAR);
|
||||
cnv->preFromULength=(int8_t)-length;
|
||||
}
|
||||
|
||||
pArgs->source=realSource;
|
||||
pArgs->sourceLimit=realSourceLimit;
|
||||
pArgs->flush=realFlush;
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
@ -1079,7 +1191,7 @@ ucnv_fromUnicode(UConverter *cnv,
|
|||
cnv->charErrorBufferLength=0;
|
||||
}
|
||||
|
||||
if(!flush && s==sourceLimit) {
|
||||
if(!flush && s==sourceLimit && cnv->preFromULength>=0) {
|
||||
/* the overflow buffer is emptied and there is no new input: we are done */
|
||||
*target=t;
|
||||
return;
|
||||
|
@ -1122,6 +1234,12 @@ _toUnicodeWithCallback(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
|
|||
int32_t errorInputLength;
|
||||
UBool converterSawEndOfInput, calledCallback;
|
||||
|
||||
/* variables for m:n conversion */
|
||||
char replay[UCNV_EXT_MAX_BYTES];
|
||||
const char *realSource, *realSourceLimit;
|
||||
int32_t realSourceIndex;
|
||||
UBool realFlush;
|
||||
|
||||
cnv=pArgs->converter;
|
||||
s=pArgs->source;
|
||||
t=pArgs->target;
|
||||
|
@ -1141,6 +1259,29 @@ _toUnicodeWithCallback(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
|
|||
}
|
||||
}
|
||||
|
||||
if(cnv->preToULength>=0) {
|
||||
/* normal mode */
|
||||
realSource=NULL;
|
||||
} else {
|
||||
/*
|
||||
* Previous m:n conversion stored source units from a partial match
|
||||
* and failed to consume all of them.
|
||||
* We need to "replay" them from a temporary buffer and convert them first.
|
||||
*/
|
||||
realSource=pArgs->source;
|
||||
realSourceLimit=pArgs->sourceLimit;
|
||||
realFlush=pArgs->flush;
|
||||
realSourceIndex=sourceIndex;
|
||||
|
||||
uprv_memcpy(replay, cnv->preToU, -cnv->preToULength);
|
||||
pArgs->source=replay;
|
||||
pArgs->sourceLimit=replay-cnv->preToULength;
|
||||
pArgs->flush=FALSE;
|
||||
sourceIndex=-1;
|
||||
|
||||
cnv->preToULength=0;
|
||||
}
|
||||
|
||||
/*
|
||||
* loop for conversion and error handling
|
||||
*
|
||||
|
@ -1202,7 +1343,36 @@ _toUnicodeWithCallback(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
|
|||
pArgs->offsets=offsets+=length;
|
||||
}
|
||||
|
||||
sourceIndex+=(int32_t)(pArgs->source-s);
|
||||
if(sourceIndex>=0) {
|
||||
sourceIndex+=(int32_t)(pArgs->source-s);
|
||||
}
|
||||
}
|
||||
|
||||
if(cnv->preToULength<0) {
|
||||
/*
|
||||
* switch the source to new replay units (cannot occur while replaying)
|
||||
* after offset handling and before end-of-input and callback handling
|
||||
*/
|
||||
if(realSource==NULL) {
|
||||
realSource=pArgs->source;
|
||||
realSourceLimit=pArgs->sourceLimit;
|
||||
realFlush=pArgs->flush;
|
||||
realSourceIndex=sourceIndex;
|
||||
|
||||
uprv_memcpy(replay, cnv->preToU, -cnv->preToULength);
|
||||
pArgs->source=replay;
|
||||
pArgs->sourceLimit=replay-cnv->preToULength;
|
||||
pArgs->flush=FALSE;
|
||||
if((sourceIndex+=cnv->preToULength)<0) {
|
||||
sourceIndex=-1;
|
||||
}
|
||||
|
||||
cnv->preToULength=0;
|
||||
} else {
|
||||
/* see implementation note before _fromUnicodeWithCallback() */
|
||||
U_ASSERT(realSource==NULL);
|
||||
*err=U_INTERNAL_PROGRAM_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
/* update pointers */
|
||||
|
@ -1216,6 +1386,15 @@ _toUnicodeWithCallback(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
|
|||
* (continue converting by breaking out of only the inner loop)
|
||||
*/
|
||||
break;
|
||||
} else if(realSource!=NULL) {
|
||||
/* switch back from replaying to the real source and continue */
|
||||
pArgs->source=realSource;
|
||||
pArgs->sourceLimit=realSourceLimit;
|
||||
pArgs->flush=realFlush;
|
||||
sourceIndex=realSourceIndex;
|
||||
|
||||
realSource=NULL;
|
||||
break;
|
||||
} else if(pArgs->flush && cnv->toULength>0) {
|
||||
/*
|
||||
* the entire input stream is consumed
|
||||
|
@ -1265,7 +1444,27 @@ _toUnicodeWithCallback(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
|
|||
* the check for buffer overflow is redundant but it is
|
||||
* a high-runner case and hopefully documents the intent
|
||||
* well
|
||||
*
|
||||
* if we were replaying, then the replay buffer must be
|
||||
* copied back into the UConverter
|
||||
* and the real arguments must be restored
|
||||
*/
|
||||
if(realSource!=NULL) {
|
||||
int32_t length;
|
||||
|
||||
U_ASSERT(cnv->preToULength==0);
|
||||
|
||||
length=(int32_t)(pArgs->sourceLimit-pArgs->source);
|
||||
if(length>0) {
|
||||
uprv_memcpy(cnv->preToU, pArgs->source, length);
|
||||
cnv->preToULength=(int8_t)-length;
|
||||
}
|
||||
|
||||
pArgs->source=realSource;
|
||||
pArgs->sourceLimit=realSourceLimit;
|
||||
pArgs->flush=realFlush;
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
@ -1379,7 +1578,7 @@ ucnv_toUnicode(UConverter *cnv,
|
|||
cnv->UCharErrorBufferLength=0;
|
||||
}
|
||||
|
||||
if(!flush && s==sourceLimit) {
|
||||
if(!flush && s==sourceLimit && cnv->preToULength>=0) {
|
||||
/* the overflow buffer is emptied and there is no new input: we are done */
|
||||
*target=t;
|
||||
return;
|
||||
|
|
|
@ -776,6 +776,7 @@ ucnv_createConverterFromSharedData(UConverter *myUConverter,
|
|||
myUConverter->subChar1 = myUConverter->sharedData->staticData->subChar1;
|
||||
myUConverter->subCharLen = myUConverter->sharedData->staticData->subCharLen;
|
||||
uprv_memcpy (myUConverter->subChar, myUConverter->sharedData->staticData->subChar, myUConverter->subCharLen);
|
||||
myUConverter->preFromUFirstCP = U_SENTINEL;
|
||||
|
||||
if(myUConverter != NULL && myUConverter->sharedData->impl->open != NULL) {
|
||||
myUConverter->sharedData->impl->open(myUConverter, realName, locale,options, err);
|
||||
|
|
|
@ -20,6 +20,7 @@
|
|||
#include "unicode/utypes.h"
|
||||
#include "unicode/ucnv.h"
|
||||
#include "unicode/ucnv_err.h"
|
||||
#include "ucnv_ext.h"
|
||||
#include "udataswp.h"
|
||||
|
||||
/* size of the overflow buffers in UConverter, enough for escaping callbacks */
|
||||
|
@ -168,12 +169,22 @@ struct UConverter {
|
|||
int8_t UCharErrorBufferLength; /* number of valid UChars in charErrorBuffer */
|
||||
|
||||
uint8_t subChar1; /* single-byte substitution character if different from subChar */
|
||||
UBool useSubChar1;
|
||||
uint8_t subChar[UCNV_MAX_SUBCHAR_LEN]; /* codepage specific character sequence */
|
||||
char invalidCharBuffer[UCNV_MAX_CHAR_LEN]; /* bytes from last error/callback situation */
|
||||
uint8_t charErrorBuffer[UCNV_ERROR_BUFFER_LENGTH]; /* codepage output from Error functions */
|
||||
|
||||
UChar invalidUCharBuffer[U16_MAX_LENGTH]; /* UChars from last error/callback situation */
|
||||
UChar UCharErrorBuffer[UCNV_ERROR_BUFFER_LENGTH]; /* unicode output from Error functions */
|
||||
|
||||
/* fields for conversion extension */
|
||||
|
||||
/* store previous UChars/chars to continue partial matches */
|
||||
UChar32 preFromUFirstCP; /* >=0: partial match */
|
||||
UChar preFromU[UCNV_EXT_MAX_UCHARS];
|
||||
char preToU[UCNV_EXT_MAX_BYTES];
|
||||
int8_t preFromULength, preToULength; /* negative: replay */
|
||||
int8_t preToUFirstLength; /* length of first character */
|
||||
};
|
||||
|
||||
U_CDECL_END /* end of UConverter */
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2000-2001, International Business Machines
|
||||
* Copyright (C) 2000-2003, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* ucnv_cb.c:
|
||||
|
@ -35,50 +35,16 @@ ucnv_cbFromUWriteBytes (UConverterFromUnicodeArgs *args,
|
|||
int32_t offsetIndex,
|
||||
UErrorCode * err)
|
||||
{
|
||||
int32_t togo;
|
||||
int8_t toerr;
|
||||
int32_t i;
|
||||
|
||||
if((args->targetLimit - args->target) >= length) /* If the buffer fits.. */
|
||||
{
|
||||
uprv_memcpy(args->target, source, length);
|
||||
args->target += length;
|
||||
if(args->offsets) /* set all the offsets to the same # */
|
||||
{
|
||||
for(i=0;i<length;i++)
|
||||
{
|
||||
*(args->offsets++) = offsetIndex;
|
||||
}
|
||||
}
|
||||
if(U_FAILURE(*err)) {
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
togo = (int32_t)(args->targetLimit - args->target);
|
||||
|
||||
uprv_memcpy(args->target, source, togo);
|
||||
args->target += togo;
|
||||
|
||||
if(args->offsets)
|
||||
{
|
||||
for(i=0;i<togo;i++)
|
||||
{
|
||||
*(args->offsets++) = offsetIndex;
|
||||
}
|
||||
}
|
||||
|
||||
/* Now, copy the remainder into the errbuff */
|
||||
source += togo;
|
||||
toerr = (int8_t)(length - togo);
|
||||
|
||||
uprv_memcpy(args->converter->charErrorBuffer +
|
||||
args->converter->charErrorBufferLength,
|
||||
source,
|
||||
toerr * sizeof(source[0]));
|
||||
args->converter->charErrorBufferLength += toerr;
|
||||
|
||||
*err = U_BUFFER_OVERFLOW_ERROR;
|
||||
|
||||
}
|
||||
ucnv_fromUWriteBytes(
|
||||
args->converter,
|
||||
source, length,
|
||||
&args->target, args->targetLimit,
|
||||
&args->offsets, offsetIndex,
|
||||
err);
|
||||
}
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
|
@ -232,55 +198,16 @@ ucnv_cbToUWriteUChars (UConverterToUnicodeArgs *args,
|
|||
int32_t offsetIndex,
|
||||
UErrorCode * err)
|
||||
{
|
||||
int32_t togo;
|
||||
int8_t toerr;
|
||||
int32_t i;
|
||||
|
||||
if(U_FAILURE(*err))
|
||||
{
|
||||
if(U_FAILURE(*err)) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
if((args->targetLimit - args->target) >= length) /* If the buffer fits.. */
|
||||
{
|
||||
uprv_memcpy(args->target, source, length * sizeof(args->target[0]) );
|
||||
args->target += length;
|
||||
if(args->offsets) /* set all the offsets to the same # */
|
||||
{
|
||||
for(i=0;i<length;i++)
|
||||
{
|
||||
*(args->offsets++) = offsetIndex;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
togo = (int32_t)(args->targetLimit - args->target);
|
||||
|
||||
uprv_memcpy(args->target, source, togo * sizeof(args->target[0]) );
|
||||
args->target += togo;
|
||||
|
||||
if(args->offsets)
|
||||
{
|
||||
for(i=0;i<togo;i++)
|
||||
{
|
||||
*(args->offsets++) = offsetIndex;
|
||||
}
|
||||
}
|
||||
|
||||
/* Now, copy the remainder into the errbuff */
|
||||
source += togo;
|
||||
toerr = (int8_t)(length - togo);
|
||||
|
||||
uprv_memcpy(args->converter->UCharErrorBuffer +
|
||||
args->converter->UCharErrorBufferLength,
|
||||
source,
|
||||
toerr * sizeof(source[0]));
|
||||
args->converter->UCharErrorBufferLength += toerr;
|
||||
|
||||
*err = U_BUFFER_OVERFLOW_ERROR;
|
||||
}
|
||||
ucnv_toUWriteUChars(
|
||||
args->converter,
|
||||
source, length,
|
||||
&args->target, args->targetLimit,
|
||||
&args->offsets, offsetIndex,
|
||||
err);
|
||||
}
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
|
|
|
@ -79,6 +79,46 @@ ucnv_fromUWriteBytes(UConverter *cnv,
|
|||
}
|
||||
}
|
||||
|
||||
U_CFUNC void
|
||||
ucnv_toUWriteUChars(UConverter *cnv,
|
||||
const UChar *uchars, int32_t length,
|
||||
UChar **target, const UChar *targetLimit,
|
||||
int32_t **offsets,
|
||||
int32_t sourceIndex,
|
||||
UErrorCode *pErrorCode) {
|
||||
UChar *t=*target;
|
||||
int32_t *o;
|
||||
|
||||
/* write UChars */
|
||||
if(offsets==NULL || (o=*offsets)==NULL) {
|
||||
while(length>0 && t<targetLimit) {
|
||||
*t++=*uchars++;
|
||||
--length;
|
||||
}
|
||||
} else {
|
||||
/* output with offsets */
|
||||
while(length>0 && t<targetLimit) {
|
||||
*t++=*uchars++;
|
||||
*o++=sourceIndex;
|
||||
--length;
|
||||
}
|
||||
*offsets=o;
|
||||
}
|
||||
*target=t;
|
||||
|
||||
/* write overflow */
|
||||
if(length>0) {
|
||||
if(cnv!=NULL) {
|
||||
t=cnv->UCharErrorBuffer;
|
||||
cnv->UCharErrorBufferLength=(int8_t)length;
|
||||
do {
|
||||
*t++=*uchars++;
|
||||
} while(--length>0);
|
||||
}
|
||||
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
U_CFUNC void
|
||||
ucnv_toUWriteCodePoint(UConverter *cnv,
|
||||
UChar32 c,
|
||||
|
|
|
@ -251,6 +251,13 @@ ucnv_fromUWriteBytes(UConverter *cnv,
|
|||
int32_t **offsets,
|
||||
int32_t sourceIndex,
|
||||
UErrorCode *pErrorCode);
|
||||
U_CFUNC void
|
||||
ucnv_toUWriteUChars(UConverter *cnv,
|
||||
const UChar *uchars, int32_t length,
|
||||
UChar **target, const UChar *targetLimit,
|
||||
int32_t **offsets,
|
||||
int32_t sourceIndex,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
U_CFUNC void
|
||||
ucnv_toUWriteCodePoint(UConverter *cnv,
|
||||
|
|
921
icu4c/source/common/ucnv_ext.c
Normal file
921
icu4c/source/common/ucnv_ext.c
Normal file
|
@ -0,0 +1,921 @@
|
|||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2003, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
* file name: ucnv_ext.c
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2003jun13
|
||||
* created by: Markus W. Scherer
|
||||
*
|
||||
* Conversion extensions
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_LEGACY_CONVERSION
|
||||
|
||||
#include "ucnv_bld.h"
|
||||
#include "ucnv_cnv.h"
|
||||
#include "ucnv_ext.h"
|
||||
#include "cmemory.h"
|
||||
|
||||
/*
|
||||
* ### TODO
|
||||
*
|
||||
* implement getUnicodeSet for the extension table
|
||||
* implement data swapping for it
|
||||
*/
|
||||
|
||||
/*
|
||||
* ### TODO: probably need pointer to baseTableSharedData
|
||||
* and also copy the base table's pointers for the base table arrays etc.
|
||||
* into this sharedData
|
||||
*/
|
||||
|
||||
/* to Unicode --------------------------------------------------------------- */
|
||||
|
||||
/*
|
||||
* @return lookup value for the byte, if found; else 0
|
||||
*/
|
||||
static U_INLINE uint32_t
|
||||
ucnv_extFindToU(const uint32_t *toUSection, int32_t length, uint8_t byte) {
|
||||
uint32_t word;
|
||||
int32_t i, start, limit;
|
||||
|
||||
/* check the input byte against the lowest and highest section bytes */
|
||||
start=(int32_t)UCNV_EXT_TO_U_GET_BYTE(toUSection[0]);
|
||||
limit=(int32_t)UCNV_EXT_TO_U_GET_BYTE(toUSection[length-1]);
|
||||
if(byte<start || limit<byte) {
|
||||
return 0; /* the byte is out of range */
|
||||
}
|
||||
|
||||
if(length==((limit-start)+1)) {
|
||||
/* direct access on a linear array */
|
||||
return UCNV_EXT_TO_U_GET_VALUE(toUSection[byte-start]); /* could be 0 */
|
||||
}
|
||||
|
||||
/*
|
||||
* Shift byte once instead of each section word and add 0xffffff.
|
||||
* We will compare the shifted/added byte (bbffffff) against
|
||||
* section words which have byte values in the same bit position.
|
||||
* If and only if byte bb < section byte ss then bbffffff<ssvvvvvv
|
||||
* for all v=0..f
|
||||
* so we need not mask off the lower 24 bits of each section word.
|
||||
*/
|
||||
word=UCNV_EXT_TO_U_MAKE_WORD(byte, UCNV_EXT_TO_U_VALUE_MASK);
|
||||
|
||||
/* binary search */
|
||||
start=0;
|
||||
limit=length;
|
||||
for(;;) {
|
||||
i=limit-start;
|
||||
if(i<=1) {
|
||||
break; /* done */
|
||||
}
|
||||
/* start<limit-1 */
|
||||
|
||||
if(i<=4) {
|
||||
/* linear search for the last part */
|
||||
if(word>=toUSection[start]) {
|
||||
break;
|
||||
}
|
||||
if(++start<limit && word>=toUSection[start]) {
|
||||
break;
|
||||
}
|
||||
if(++start<limit && word>=toUSection[start]) {
|
||||
break;
|
||||
}
|
||||
/* always break at start==limit-1 */
|
||||
++start;
|
||||
break;
|
||||
}
|
||||
|
||||
i=(start+limit)/2;
|
||||
if(word<toUSection[i]) {
|
||||
limit=i;
|
||||
} else {
|
||||
start=i;
|
||||
}
|
||||
}
|
||||
|
||||
/* did we really find it? */
|
||||
if(start<limit && byte==UCNV_EXT_TO_U_GET_BYTE(word=toUSection[start])) {
|
||||
return UCNV_EXT_TO_U_GET_VALUE(word); /* never 0 */
|
||||
} else {
|
||||
return 0; /* not found */
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* this works like ucnv_extMatchFromU() except
|
||||
* - the first character is in pre
|
||||
* - no trie is used
|
||||
* - the returned matchLength is not offset by 2
|
||||
*/
|
||||
static int32_t
|
||||
ucnv_extMatchToU(const int32_t *cx,
|
||||
const char *pre, int32_t preLength,
|
||||
const char *src, int32_t srcLength,
|
||||
const UChar **pResult, int32_t *pResultLength,
|
||||
UBool useFallback, UBool flush) {
|
||||
const uint32_t *toUTable, *toUSection;
|
||||
|
||||
uint32_t value, matchValue;
|
||||
int32_t i, j, index, length, matchLength;
|
||||
uint8_t b;
|
||||
|
||||
if(cx==NULL) {
|
||||
return 0; /* no extension data, no match */
|
||||
}
|
||||
|
||||
/* initialize */
|
||||
toUTable=UCNV_EXT_ARRAY(cx, UCNV_EXT_TO_U_INDEX, uint32_t);
|
||||
index=0;
|
||||
|
||||
matchValue=0;
|
||||
i=j=matchLength=0;
|
||||
|
||||
/* we must not remember fallback matches when not using fallbacks */
|
||||
|
||||
/* match input units until there is a full match or the input is consumed */
|
||||
for(;;) {
|
||||
/* go to the next section */
|
||||
toUSection=toUTable+index;
|
||||
|
||||
/* read first pair of the section */
|
||||
value=*toUSection++;
|
||||
length=UCNV_EXT_TO_U_GET_BYTE(value);
|
||||
value=UCNV_EXT_TO_U_GET_VALUE(value);
|
||||
if( value!=0 &&
|
||||
(UCNV_EXT_TO_U_IS_ROUNDTRIP(value) ||
|
||||
TO_U_USE_FALLBACK(useFallback))
|
||||
) {
|
||||
/* remember longest match so far */
|
||||
matchValue=value;
|
||||
matchLength=i+j;
|
||||
}
|
||||
|
||||
/* match pre[] then src[] */
|
||||
if(i<preLength) {
|
||||
b=(uint8_t)pre[i++];
|
||||
} else if(j<srcLength) {
|
||||
b=(uint8_t)src[j++];
|
||||
} else {
|
||||
/* all input consumed, partial match */
|
||||
if(flush || (length=(i+j))>UCNV_EXT_MAX_BYTES) {
|
||||
/*
|
||||
* end of the entire input stream, stop with the longest match so far
|
||||
* or: partial match must not be longer than UCNV_EXT_MAX_BYTES
|
||||
* because it must fit into state buffers
|
||||
*/
|
||||
break;
|
||||
} else {
|
||||
/* continue with more input next time */
|
||||
return -length;
|
||||
}
|
||||
}
|
||||
|
||||
/* search for the current UChar */
|
||||
value=ucnv_extFindToU(toUSection, length, b);
|
||||
if(value==0) {
|
||||
/* no match here, stop with the longest match so far */
|
||||
break;
|
||||
} else {
|
||||
if(UCNV_EXT_TO_U_IS_PARTIAL(value)) {
|
||||
/* partial match, continue */
|
||||
index=(int32_t)UCNV_EXT_TO_U_GET_PARTIAL_INDEX(value);
|
||||
} else {
|
||||
if( UCNV_EXT_TO_U_IS_ROUNDTRIP(value) ||
|
||||
TO_U_USE_FALLBACK(useFallback)
|
||||
) {
|
||||
/* full match, stop with result */
|
||||
matchValue=value;
|
||||
matchLength=i+j;
|
||||
} else {
|
||||
/* full match on fallback not taken, stop with the longest match so far */
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(matchLength==0) {
|
||||
/* no match at all */
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* return result */
|
||||
matchValue=UCNV_EXT_TO_U_MASK_ROUNDTRIP(matchValue);
|
||||
if(UCNV_EXT_TO_U_IS_CODE_POINT(matchValue)) {
|
||||
*pResultLength=-(int32_t)matchValue;
|
||||
} else {
|
||||
*pResultLength=UCNV_EXT_TO_U_GET_LENGTH(matchValue);
|
||||
*pResult=UCNV_EXT_ARRAY(cx, UCNV_EXT_TO_U_UCHARS_INDEX, UChar)+UCNV_EXT_TO_U_GET_INDEX(matchValue);
|
||||
}
|
||||
|
||||
return matchLength;
|
||||
}
|
||||
|
||||
static U_INLINE void
|
||||
ucnv_extWriteToU(UConverter *cnv,
|
||||
const UChar *result, int32_t resultLength,
|
||||
UChar **target, const UChar *targetLimit,
|
||||
int32_t **offsets, int32_t srcIndex,
|
||||
UErrorCode *pErrorCode) {
|
||||
/* output the result */
|
||||
if(resultLength<0) {
|
||||
/* output a single code point */
|
||||
ucnv_toUWriteCodePoint(
|
||||
cnv, UCNV_EXT_TO_U_GET_CODE_POINT(-resultLength),
|
||||
target, targetLimit,
|
||||
offsets, srcIndex,
|
||||
pErrorCode);
|
||||
} else {
|
||||
/* output a string - with correct data we have resultLength>0 */
|
||||
ucnv_toUWriteUChars(
|
||||
cnv,
|
||||
result, resultLength,
|
||||
target, targetLimit,
|
||||
offsets, srcIndex,
|
||||
pErrorCode);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* target<targetLimit; set error code for overflow
|
||||
*/
|
||||
U_CFUNC UBool
|
||||
ucnv_extInitialMatchToU(UConverter *cnv, const int32_t *cx,
|
||||
int32_t firstLength,
|
||||
const char **src, const char *srcLimit,
|
||||
UChar **target, const UChar *targetLimit,
|
||||
int32_t **offsets, int32_t srcIndex,
|
||||
UBool flush,
|
||||
UErrorCode *pErrorCode) {
|
||||
const UChar *result;
|
||||
int32_t resultLength, match;
|
||||
|
||||
/* try to match */
|
||||
match=ucnv_extMatchToU(cx,
|
||||
(const char *)cnv->toUBytes, firstLength,
|
||||
*src, (int32_t)(srcLimit-*src),
|
||||
&result, &resultLength,
|
||||
cnv->useFallback, flush);
|
||||
if(match>0) {
|
||||
/* advance src pointer for the consumed input */
|
||||
*src+=match-firstLength;
|
||||
|
||||
/* write result to target */
|
||||
ucnv_extWriteToU(cnv,
|
||||
result, resultLength,
|
||||
target, targetLimit,
|
||||
offsets, srcIndex,
|
||||
pErrorCode);
|
||||
return TRUE;
|
||||
} else if(match<0) {
|
||||
/* save state for partial match */
|
||||
const char *s;
|
||||
int32_t j;
|
||||
|
||||
/* copy the first code point */
|
||||
s=(const char *)cnv->toUBytes;
|
||||
cnv->preToUFirstLength=(int8_t)firstLength;
|
||||
for(j=0; j<firstLength; ++j) {
|
||||
cnv->preToU[j]=*s++;
|
||||
}
|
||||
|
||||
/* now copy the newly consumed input */
|
||||
s=*src;
|
||||
match=-match;
|
||||
for(; j<match; ++j) {
|
||||
cnv->preToU[j]=*s++;
|
||||
}
|
||||
*src=s; /* same as *src=srcLimit; because we reached the end of input */
|
||||
cnv->preToULength=(int8_t)match;
|
||||
return TRUE;
|
||||
} else /* match==0 no match */ {
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
|
||||
#if 0
|
||||
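/* ### TODO: finish this disabled draft of ucnv_extSimpleMatchToU(); as written, its call to ucnv_extMatchToU() passes a code point and extra arguments that do not match that function's (cx, pre, preLength, src, srcLength, pResult, pResultLength, useFallback, flush) parameter list */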
|
||||
|
||||
U_CFUNC int32_t
|
||||
ucnv_extSimpleMatchToU(const int32_t *cx,
|
||||
UChar32 cp, uint32_t *pValue,
|
||||
UBool useFallback,
|
||||
UErrorCode *pErrorCode) {
|
||||
const uint8_t *result;
|
||||
int32_t resultLength, match;
|
||||
|
||||
/* try to match */
|
||||
match=ucnv_extMatchToU(cx,
|
||||
cp,
|
||||
NULL, 0,
|
||||
NULL, 0,
|
||||
&result, &resultLength,
|
||||
useFallback, TRUE);
|
||||
if(match>=2) {
|
||||
/* write result for simple, single-character conversion */
|
||||
if(resultLength<0) {
|
||||
resultLength=-resultLength;
|
||||
*pValue=(uint32_t)UCNV_EXT_TO_U_GET_DATA(resultLength);
|
||||
return UCNV_EXT_TO_U_GET_LENGTH(resultLength);
|
||||
} else if(resultLength==4) {
|
||||
/* de-serialize a 4-byte result */
|
||||
*pValue=
|
||||
((uint32_t)result[0]<<24)|
|
||||
((uint32_t)result[1]<<16)|
|
||||
((uint32_t)result[2]<<8)|
|
||||
result[3];
|
||||
return 4;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* return no match because
|
||||
* - match>1 && resultLength>4: result too long for simple conversion
|
||||
* - match==1: no match found, <subchar1> preferred
|
||||
* - match==0: no match found in the first place
|
||||
* - match<0: partial match, not supported for simple conversion (and flush==TRUE)
|
||||
*/
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
* continue partial match with new input
|
||||
* never called for simple, single-character conversion
|
||||
*/
|
||||
U_CFUNC void
|
||||
ucnv_extContinueMatchToU(UConverter *cnv,
|
||||
UConverterToUnicodeArgs *pArgs, int32_t srcIndex,
|
||||
UErrorCode *pErrorCode) {
|
||||
const UChar *result;
|
||||
int32_t resultLength, match, length;
|
||||
|
||||
match=ucnv_extMatchToU(cnv->sharedData->table->mbcs.extIndexes,
|
||||
cnv->preToU, cnv->preToULength,
|
||||
pArgs->source, (int32_t)(pArgs->sourceLimit-pArgs->source),
|
||||
&result, &resultLength,
|
||||
cnv->useFallback, pArgs->flush);
|
||||
if(match>0) {
|
||||
if(match>=cnv->preToULength) {
|
||||
/* advance src pointer for the consumed input */
|
||||
pArgs->source+=match-cnv->preToULength;
|
||||
cnv->preToULength=0;
|
||||
} else {
|
||||
/* the match did not use all of preToU[] - keep the rest for replay */
|
||||
int32_t length=cnv->preToULength-match;
|
||||
uprv_memmove(cnv->preToU, cnv->preToU+match, length);
|
||||
cnv->preToULength=(int8_t)-length;
|
||||
}
|
||||
|
||||
/* write result */
|
||||
ucnv_extWriteToU(cnv,
|
||||
result, resultLength,
|
||||
&pArgs->target, pArgs->targetLimit,
|
||||
&pArgs->offsets, srcIndex,
|
||||
pErrorCode);
|
||||
} else if(match<0) {
|
||||
/* save state for partial match */
|
||||
const char *s;
|
||||
int32_t j;
|
||||
|
||||
/* just _append_ the newly consumed input to preToU[] */
|
||||
s=pArgs->source;
|
||||
match=-match;
|
||||
for(j=cnv->preToULength; j<match; ++j) {
|
||||
cnv->preToU[j]=*s++;
|
||||
}
|
||||
pArgs->source=s; /* same as *src=srcLimit; because we reached the end of input */
|
||||
cnv->preToULength=(int8_t)match;
|
||||
} else /* match==0 */ {
|
||||
/*
|
||||
* no match
|
||||
*
|
||||
* We need to split the previous input into two parts:
|
||||
*
|
||||
* 1. The first codepage character is unmappable - that's how we got into
|
||||
* trying the extension data in the first place.
|
||||
* We need to move it from the preToU buffer
|
||||
* to the error buffer, set an error code,
|
||||
* and prepare the rest of the previous input for 2.
|
||||
*
|
||||
* 2. The rest of the previous input must be converted once we
|
||||
* come back from the callback for the first character.
|
||||
* At that time, we have to try again from scratch to convert
|
||||
* these input characters.
|
||||
* The replay will be handled by the ucnv.c conversion code.
|
||||
*/
|
||||
|
||||
/* move the first codepage character to the error field */
|
||||
uprv_memcpy(cnv->toUBytes, cnv->preToU, cnv->preToUFirstLength);
|
||||
cnv->toULength=cnv->preToUFirstLength;
|
||||
|
||||
/* move the rest up inside the buffer */
|
||||
length=cnv->preToULength-cnv->preToUFirstLength;
|
||||
if(length>0) {
|
||||
uprv_memmove(cnv->preToU, cnv->preToU+cnv->preToUFirstLength, length);
|
||||
}
|
||||
|
||||
/* mark preToU for replay */
|
||||
cnv->preToULength=(int8_t)-length;
|
||||
|
||||
/* set the error code for unassigned */
|
||||
*pErrorCode=U_INVALID_CHAR_FOUND;
|
||||
}
|
||||
}
|
||||
|
||||
/* from Unicode ------------------------------------------------------------- */
|
||||
|
||||
/*
|
||||
* @return index of the UChar, if found; else <0
|
||||
*/
|
||||
static U_INLINE int32_t
|
||||
ucnv_extFindFromU(const UChar *fromUSection, int32_t length, UChar u) {
|
||||
int32_t i, start, limit;
|
||||
|
||||
/* binary search */
|
||||
start=0;
|
||||
limit=length;
|
||||
for(;;) {
|
||||
i=limit-start;
|
||||
if(i<=1) {
|
||||
break; /* done */
|
||||
}
|
||||
/* start<limit-1 */
|
||||
|
||||
if(i<=4) {
|
||||
/* linear search for the last part */
|
||||
if(u>=fromUSection[start]) {
|
||||
break;
|
||||
}
|
||||
if(++start<limit && u>=fromUSection[start]) {
|
||||
break;
|
||||
}
|
||||
if(++start<limit && u>=fromUSection[start]) {
|
||||
break;
|
||||
}
|
||||
/* always break at start==limit-1 */
|
||||
++start;
|
||||
break;
|
||||
}
|
||||
|
||||
i=(start+limit)/2;
|
||||
if(u<fromUSection[i]) {
|
||||
limit=i;
|
||||
} else {
|
||||
start=i;
|
||||
}
|
||||
}
|
||||
|
||||
/* did we really find it? */
|
||||
if(start<limit && u==fromUSection[start]) {
|
||||
return start;
|
||||
} else {
|
||||
return -1; /* not found */
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* @param cx pointer to extension data; if NULL, returns 0
|
||||
* @param firstCP the first code point before all the other UChars
|
||||
* @param pre UChars that must match; !initialMatch: partial match with them
|
||||
* @param preLength length of pre, >=0
|
||||
* @param src UChars that can be used to complete a match
|
||||
* @param srcLength length of src, >=0
|
||||
* @param pResult [out] address of pointer to result bytes
|
||||
* set only in case of a match
|
||||
* @param pResultLength [out] address of result length variable;
|
||||
* gets a negative value if the length variable
|
||||
* itself contains the length and bytes, encoded in
|
||||
* the format of fromUTableValues[] and then inverted
|
||||
* @param useFallback "use fallback" flag, usually from cnv->useFallback
|
||||
* @param flush TRUE if the end of the input stream is reached
|
||||
* @return >1: matched, return value=total match length (number of input units matched)
|
||||
* 1: matched, no mapping but request for <subchar1>
|
||||
* (only for the first code point)
|
||||
* 0: no match
|
||||
* <0: partial match, return value=negative total match length
|
||||
* (partial matches are never returned for flush==TRUE)
|
||||
* (partial matches are never returned as being longer than UCNV_EXT_MAX_UCHARS)
|
||||
* the matchLength is 2 if only firstCP matched, and >2 if firstCP and
|
||||
* further code units matched
|
||||
*/
|
||||
static int32_t
|
||||
ucnv_extMatchFromU(const int32_t *cx,
|
||||
UChar32 firstCP,
|
||||
const UChar *pre, int32_t preLength,
|
||||
const UChar *src, int32_t srcLength,
|
||||
const uint8_t **pResult, int32_t *pResultLength,
|
||||
UBool useFallback, UBool flush) {
|
||||
const uint16_t *stage12, *stage3;
|
||||
const uint32_t *stage3b;
|
||||
|
||||
const UChar *fromUTableUChars, *fromUSectionUChars;
|
||||
const uint32_t *fromUTableValues, *fromUSectionValues;
|
||||
|
||||
uint32_t value, matchValue;
|
||||
int32_t i, j, index, length, matchLength;
|
||||
UChar c;
|
||||
|
||||
if(cx==NULL) {
|
||||
return 0; /* no extension data, no match */
|
||||
}
|
||||
|
||||
/* trie lookup of firstCP */
|
||||
index=firstCP>>10; /* stage 1 index */
|
||||
if(index>=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH]) {
|
||||
return 0; /* the first code point is outside the trie */
|
||||
}
|
||||
|
||||
stage12=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_12_INDEX, uint16_t);
|
||||
stage3=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3_INDEX, uint16_t);
|
||||
index=UCNV_EXT_FROM_U(stage12, stage3, index, firstCP);
|
||||
|
||||
stage3b=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3B_INDEX, uint32_t);
|
||||
value=stage3b[index];
|
||||
if(value==0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
if(UCNV_EXT_TO_U_IS_PARTIAL(value)) {
|
||||
/* partial match, enter the loop below */
|
||||
index=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value);
|
||||
|
||||
/* initialize */
|
||||
fromUTableUChars=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_UCHARS_INDEX, UChar);
|
||||
fromUTableValues=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_VALUES_INDEX, uint32_t);
|
||||
|
||||
matchValue=0;
|
||||
i=j=matchLength=0;
|
||||
|
||||
/* we must not remember fallback matches when not using fallbacks */
|
||||
|
||||
/* match input units until there is a full match or the input is consumed */
|
||||
for(;;) {
|
||||
/* go to the next section */
|
||||
fromUSectionUChars=fromUTableUChars+index;
|
||||
fromUSectionValues=fromUTableValues+index;
|
||||
|
||||
/* read first pair of the section */
|
||||
length=*fromUSectionUChars++;
|
||||
value=*fromUSectionValues++;
|
||||
if( value!=0 &&
|
||||
(UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) ||
|
||||
FROM_U_USE_FALLBACK(useFallback, firstCP))
|
||||
) {
|
||||
/* remember longest match so far */
|
||||
matchValue=value;
|
||||
matchLength=2+i+j;
|
||||
}
|
||||
|
||||
/* match pre[] then src[] */
|
||||
if(i<preLength) {
|
||||
c=pre[i++];
|
||||
} else if(j<srcLength) {
|
||||
c=src[j++];
|
||||
} else {
|
||||
/* all input consumed, partial match */
|
||||
if(flush || (length=(i+j))>UCNV_EXT_MAX_UCHARS) {
|
||||
/*
|
||||
* end of the entire input stream, stop with the longest match so far
|
||||
* or: partial match must not be longer than UCNV_EXT_MAX_UCHARS
|
||||
* because it must fit into state buffers
|
||||
*/
|
||||
break;
|
||||
} else {
|
||||
/* continue with more input next time */
|
||||
return -(2+length);
|
||||
}
|
||||
}
|
||||
|
||||
/* search for the current UChar */
|
||||
index=ucnv_extFindFromU(fromUSectionUChars, length, c);
|
||||
if(index<0) {
|
||||
/* no match here, stop with the longest match so far */
|
||||
break;
|
||||
} else {
|
||||
value=fromUSectionValues[index];
|
||||
if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
|
||||
/* partial match, continue */
|
||||
index=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value);
|
||||
} else {
|
||||
if( UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) ||
|
||||
FROM_U_USE_FALLBACK(useFallback, firstCP)
|
||||
) {
|
||||
/* full match, stop with result */
|
||||
matchValue=value;
|
||||
matchLength=2+i+j;
|
||||
} else {
|
||||
/* full match on fallback not taken, stop with the longest match so far */
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(matchLength==0) {
|
||||
/* no match at all */
|
||||
return 0;
|
||||
}
|
||||
} else /* result from firstCP trie lookup */ {
|
||||
if( UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) ||
|
||||
FROM_U_USE_FALLBACK(useFallback, firstCP)
|
||||
) {
|
||||
/* full match, stop with result */
|
||||
matchValue=value;
|
||||
matchLength=2;
|
||||
} else {
|
||||
/* fallback not taken */
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
if(matchValue&UCNV_EXT_FROM_U_RESERVED_MASK) {
|
||||
/* do not interpret values with reserved bits used, for forward compatibility */
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* return result */
|
||||
if(matchValue==UCNV_EXT_FROM_U_SUBCHAR1) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
matchValue=UCNV_EXT_FROM_U_MASK_ROUNDTRIP(matchValue);
|
||||
length=(int32_t)UCNV_EXT_FROM_U_GET_LENGTH(matchValue);
|
||||
if(length<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH) {
|
||||
*pResultLength=-(int32_t)matchValue;
|
||||
} else {
|
||||
*pResultLength=length;
|
||||
*pResult=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_BYTES_INDEX, uint8_t)+UCNV_EXT_FROM_U_GET_DATA(matchValue);
|
||||
}
|
||||
|
||||
return matchLength;
|
||||
}
|
||||
|
||||
static U_INLINE void
|
||||
ucnv_extWriteFromU(UConverter *cnv,
|
||||
const uint8_t *result, int32_t resultLength,
|
||||
char **target, const char *targetLimit,
|
||||
int32_t **offsets, int32_t srcIndex,
|
||||
UErrorCode *pErrorCode) {
|
||||
uint8_t buffer[4];
|
||||
|
||||
/* output the result */
|
||||
if(resultLength<0) {
|
||||
/*
|
||||
* Generate a byte array and then write it below.
|
||||
* This is not the fastest possible way, but it should be ok for
|
||||
* extension mappings, and it is much simpler.
|
||||
* Offset and overflow handling are only done once this way.
|
||||
*/
|
||||
uint8_t *p;
|
||||
uint32_t value;
|
||||
|
||||
resultLength=-resultLength;
|
||||
value=(uint32_t)UCNV_EXT_FROM_U_GET_DATA(resultLength);
|
||||
resultLength=UCNV_EXT_FROM_U_GET_LENGTH(resultLength);
|
||||
/* resultLength<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH==3 */
|
||||
|
||||
p=buffer;
|
||||
switch(resultLength) {
|
||||
case 3:
|
||||
*p++=(uint8_t)(value>>16);
|
||||
case 2:
|
||||
*p++=(uint8_t)(value>>8);
|
||||
case 1:
|
||||
*p++=(uint8_t)value;
|
||||
default:
|
||||
break; /* will never occur */
|
||||
}
|
||||
result=buffer;
|
||||
}
|
||||
|
||||
/* with correct data we have resultLength>0 */
|
||||
ucnv_fromUWriteBytes(cnv, (const char *)result, resultLength,
|
||||
target, targetLimit,
|
||||
offsets, srcIndex,
|
||||
pErrorCode);
|
||||
}
|
||||
|
||||
/*
|
||||
* target<targetLimit; set error code for overflow
|
||||
*/
|
||||
U_CFUNC UBool
|
||||
ucnv_extInitialMatchFromU(UConverter *cnv, const int32_t *cx,
|
||||
UChar32 cp,
|
||||
const UChar **src, const UChar *srcLimit,
|
||||
char **target, const char *targetLimit,
|
||||
int32_t **offsets, int32_t srcIndex,
|
||||
UBool flush,
|
||||
UErrorCode *pErrorCode) {
|
||||
const uint8_t *result;
|
||||
int32_t resultLength, match;
|
||||
|
||||
/* try to match */
|
||||
match=ucnv_extMatchFromU(cx, cp,
|
||||
NULL, 0,
|
||||
*src, (int32_t)(srcLimit-*src),
|
||||
&result, &resultLength,
|
||||
cnv->useFallback, flush);
|
||||
if(match>=2) {
|
||||
/* advance src pointer for the consumed input */
|
||||
*src+=match-2; /* remove 2 for the initial code point */
|
||||
|
||||
/* write result to target */
|
||||
ucnv_extWriteFromU(cnv,
|
||||
result, resultLength,
|
||||
target, targetLimit,
|
||||
offsets, srcIndex,
|
||||
pErrorCode);
|
||||
return TRUE;
|
||||
} else if(match<0) {
|
||||
/* save state for partial match */
|
||||
const UChar *s;
|
||||
int32_t j;
|
||||
|
||||
/* copy the first code point */
|
||||
cnv->preFromUFirstCP=cp;
|
||||
|
||||
/* now copy the newly consumed input */
|
||||
s=*src;
|
||||
match=-match-2; /* remove 2 for the initial code point */
|
||||
for(j=0; j<match; ++j) {
|
||||
cnv->preFromU[j]=*s++;
|
||||
}
|
||||
*src=s; /* same as *src=srcLimit; because we reached the end of input */
|
||||
cnv->preFromULength=(int8_t)match;
|
||||
return TRUE;
|
||||
} else if(match==1) {
|
||||
/* matched, no mapping but request for <subchar1> */
|
||||
cnv->useSubChar1=TRUE;
|
||||
return FALSE;
|
||||
} else /* match==0 no match */ {
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
|
||||
U_CFUNC int32_t
|
||||
ucnv_extSimpleMatchFromU(const int32_t *cx,
|
||||
UChar32 cp, uint32_t *pValue,
|
||||
UBool useFallback,
|
||||
UErrorCode *pErrorCode) {
|
||||
const uint8_t *result;
|
||||
int32_t resultLength, match;
|
||||
|
||||
/* try to match */
|
||||
match=ucnv_extMatchFromU(cx,
|
||||
cp,
|
||||
NULL, 0,
|
||||
NULL, 0,
|
||||
&result, &resultLength,
|
||||
useFallback, TRUE);
|
||||
if(match>=2) {
|
||||
/* write result for simple, single-character conversion */
|
||||
if(resultLength<0) {
|
||||
resultLength=-resultLength;
|
||||
*pValue=(uint32_t)UCNV_EXT_FROM_U_GET_DATA(resultLength);
|
||||
return UCNV_EXT_FROM_U_GET_LENGTH(resultLength);
|
||||
} else if(resultLength==4) {
|
||||
/* de-serialize a 4-byte result */
|
||||
*pValue=
|
||||
((uint32_t)result[0]<<24)|
|
||||
((uint32_t)result[1]<<16)|
|
||||
((uint32_t)result[2]<<8)|
|
||||
result[3];
|
||||
return 4;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* return no match because
|
||||
* - match>1 && resultLength>4: result too long for simple conversion
|
||||
* - match==1: no match found, <subchar1> preferred
|
||||
* - match==0: no match found in the first place
|
||||
* - match<0: partial match, not supported for simple conversion (and flush==TRUE)
|
||||
*/
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* continue partial match with new input, requires cnv->preFromUFirstCP>=0
|
||||
* never called for simple, single-character conversion
|
||||
*/
|
||||
U_CFUNC void
|
||||
ucnv_extContinueMatchFromU(UConverter *cnv,
|
||||
UConverterFromUnicodeArgs *pArgs, int32_t srcIndex,
|
||||
UErrorCode *pErrorCode) {
|
||||
const uint8_t *result;
|
||||
int32_t resultLength, match;
|
||||
|
||||
match=ucnv_extMatchFromU(cnv->sharedData->table->mbcs.extIndexes,
|
||||
cnv->preFromUFirstCP,
|
||||
cnv->preFromU, cnv->preFromULength,
|
||||
pArgs->source, (int32_t)(pArgs->sourceLimit-pArgs->source),
|
||||
&result, &resultLength,
|
||||
cnv->useFallback, pArgs->flush);
|
||||
if(match>=2) {
|
||||
match-=2; /* remove 2 for the initial code point */
|
||||
|
||||
if(match>=cnv->preFromULength) {
|
||||
/* advance src pointer for the consumed input */
|
||||
pArgs->source+=match-cnv->preFromULength;
|
||||
cnv->preFromULength=0;
|
||||
} else {
|
||||
/* the match did not use all of preFromU[] - keep the rest for replay */
|
||||
int32_t length=cnv->preFromULength-match;
|
||||
uprv_memmove(cnv->preFromU, cnv->preFromU+match, length*U_SIZEOF_UCHAR);
|
||||
cnv->preFromULength=(int8_t)-length;
|
||||
}
|
||||
|
||||
/* finish the partial match */
|
||||
cnv->preFromUFirstCP=U_SENTINEL;
|
||||
|
||||
/* write result */
|
||||
ucnv_extWriteFromU(cnv,
|
||||
result, resultLength,
|
||||
&pArgs->target, pArgs->targetLimit,
|
||||
&pArgs->offsets, srcIndex,
|
||||
pErrorCode);
|
||||
} else if(match<0) {
|
||||
/* save state for partial match */
|
||||
const UChar *s;
|
||||
int32_t j;
|
||||
|
||||
/* just _append_ the newly consumed input to preFromU[] */
|
||||
s=pArgs->source;
|
||||
match=-match-2; /* remove 2 for the initial code point */
|
||||
for(j=cnv->preFromULength; j<match; ++j) {
|
||||
cnv->preFromU[j]=*s++;
|
||||
}
|
||||
pArgs->source=s; /* same as *src=srcLimit; because we reached the end of input */
|
||||
cnv->preFromULength=(int8_t)match;
|
||||
} else /* match==0 or 1 */ {
|
||||
/*
|
||||
* no match
|
||||
*
|
||||
* We need to split the previous input into two parts:
|
||||
*
|
||||
* 1. The first code point is unmappable - that's how we got into
|
||||
* trying the extension data in the first place.
|
||||
* We need to move it from the preFromU buffer
|
||||
* to the error buffer, set an error code,
|
||||
* and prepare the rest of the previous input for 2.
|
||||
*
|
||||
* 2. The rest of the previous input must be converted once we
|
||||
* come back from the callback for the first code point.
|
||||
* At that time, we have to try again from scratch to convert
|
||||
* these input characters.
|
||||
* The replay will be handled by the ucnv.c conversion code.
|
||||
*/
|
||||
|
||||
if(match==1) {
|
||||
/* matched, no mapping but request for <subchar1> */
|
||||
cnv->useSubChar1=TRUE;
|
||||
}
|
||||
|
||||
/* move the first code point to the error field */
|
||||
cnv->fromUChar32=cnv->preFromUFirstCP;
|
||||
cnv->preFromUFirstCP=U_SENTINEL;
|
||||
|
||||
/* mark preFromU for replay */
|
||||
cnv->preFromULength=-cnv->preFromULength;
|
||||
|
||||
/* set the error code for unassigned */
|
||||
*pErrorCode=U_INVALID_CHAR_FOUND;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* ### TODO
|
||||
*
|
||||
* - test toU() functions
|
||||
*
|
||||
* - EBCDIC_STATEFUL: support extensions, but the charset string must be
|
||||
* either one single-byte character or a sequence of double-byte ones,
|
||||
* to avoid state transitions inside the mapping and to avoid having to
|
||||
* store character boundaries.
|
||||
* The extension functions will need an additional EBCDIC state in/out
|
||||
* parameter and will have to be able to insert an SI or SO before writing
|
||||
* the mapping result.
|
||||
* - EBCDIC_STATEFUL: toU() may need to check if in DB mode, do nothing if in SB
|
||||
* - EBCDIC_STATEFUL: fix prefix checking to keep SBCS & DBCS separate
|
||||
* - make dbcsonly work with extensions
|
||||
*
|
||||
* - test |2 to <subchar1> for regular code point, prefix code point,
|
||||
* multiple code points
|
||||
* - test fallback from non-zero to 00
|
||||
* - try a smaller U_CNV_SAFECLONE_BUFFERSIZE and try ccapitst/TestConvertSafeClone()
|
||||
*/
|
||||
|
||||
#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
|
417
icu4c/source/common/ucnv_ext.h
Normal file
417
icu4c/source/common/ucnv_ext.h
Normal file
|
@ -0,0 +1,417 @@
|
|||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2003, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
* file name: ucnv_ext.h
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2003jun13
|
||||
* created by: Markus W. Scherer
|
||||
*
|
||||
* Conversion extensions
|
||||
*/
|
||||
|
||||
#ifndef __UCNV_EXT_H__
|
||||
#define __UCNV_EXT_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/ucnv.h"
|
||||
|
||||
/*
|
||||
* See icuhtml/design/conversion/conversion_extensions.html
|
||||
*
|
||||
* Conversion extensions serve two purposes:
|
||||
* 1. They support m:n mappings.
|
||||
* 2. They support extension-only conversion files that are used together
|
||||
* with the regular conversion data in base files.
|
||||
*
|
||||
* A base file may contain an extension table (explicitly requested or
|
||||
* implicitly generated for m:n mappings), but its extension table is not
|
||||
* used when an extension-only file is used.
|
||||
*
|
||||
* It is an error if a base file contains any regular (not extension) mapping
|
||||
* from the same sequence as a mapping in the extension file
|
||||
* because the base mapping would hide the extension mapping.
|
||||
*
|
||||
*
|
||||
* Data for conversion extensions:
|
||||
*
|
||||
* One set of data structures per conversion direction (to/from Unicode).
|
||||
* The data structures are sorted by input units to allow for binary search.
|
||||
* Input sequences of more than one unit are handled like contraction tables
|
||||
* in collation:
|
||||
* The lookup value of a unit points to another table that is to be searched
|
||||
* for the next unit, recursively.
|
||||
*
|
||||
* For conversion from Unicode, the initial code point is looked up in
|
||||
* a 3-stage trie for speed,
|
||||
* with an additional table of unique results to save space.
|
||||
*
|
||||
* Long output strings are stored in separate arrays, with length and index
|
||||
* in the lookup tables.
|
||||
* Output results also include a flag distinguishing roundtrip from
|
||||
* (reverse) fallback mappings.
|
||||
*
|
||||
* Input Unicode strings must not begin or end with unpaired surrogates
|
||||
* to avoid problems with matches on parts of surrogate pairs.
|
||||
*
|
||||
* Mappings from multiple characters (code points or codepage state
|
||||
* table sequences) must be searched preferring the longest match.
|
||||
* For this to work and be efficient, the variable-width table must contain
|
||||
* all mappings that contain prefixes of the multiple characters.
|
||||
* If an extension table is built on top of a base table in another file
|
||||
* and a base table entry is a prefix of a multi-character mapping, then
|
||||
* this is an error.
|
||||
*
|
||||
*
|
||||
* Implementation note:
|
||||
*
|
||||
* Currently, the parser and several checks in the code limit the number
|
||||
* of UChars or bytes in a mapping to
|
||||
* UCNV_EXT_MAX_UCHARS and UCNV_EXT_MAX_BYTES, respectively,
|
||||
* which are output value limits in the data structure.
|
||||
*
|
||||
* For input, this is not strictly necessary - it is a hard limit only for the
|
||||
* buffers in UConverter that are used to store partial matches.
|
||||
*
|
||||
* Input sequences could otherwise be arbitrarily long if partial matches
|
||||
* need not be stored (i.e., if a sequence does not span several buffers with too
|
||||
* many units before the last buffer), although then results would differ
|
||||
* depending on whether partial matches exceed the limits or not,
|
||||
* which depends on the pattern of buffer sizes.
|
||||
*
|
||||
*
|
||||
* Data structure:
|
||||
*
|
||||
* int32_t indexes[>=32];
|
||||
*
|
||||
* Array of indexes and lengths etc. The length of the array is at least 32.
|
||||
* The actual length is stored in indexes[0] to be forward compatible.
|
||||
*
|
||||
* Each index to another array is the number of bytes from indexes[].
|
||||
* Each length of an array is the number of array base units in that array.
|
||||
*
|
||||
* Some of the structures may not be present, in which case their indexes
|
||||
* and lengths are 0.
|
||||
*
|
||||
* Usage of indexes[i]:
|
||||
* [0] length of indexes[]
|
||||
*
|
||||
* // to Unicode table
|
||||
* [1] index of toUTable[] (array of uint32_t)
|
||||
* [2] length of toUTable[]
|
||||
* [3] index of toUUChars[] (array of UChar)
|
||||
* [4] length of toUUChars[]
|
||||
*
|
||||
* // from Unicode table, not for the initial code point
|
||||
* [5] index of fromUTableUChars[] (array of UChar)
|
||||
* [6] index of fromUTableValues[] (array of uint32_t)
|
||||
* [7] length of fromUTableUChars[] and fromUTableValues[]
|
||||
* [8] index of fromUBytes[] (array of char)
|
||||
* [9] length of fromUBytes[]
|
||||
*
|
||||
* // from Unicode trie for initial-code point lookup
|
||||
* [10] index of fromUStage12[] (combined array of uint16_t for stages 1 & 2)
|
||||
* [11] length of stage 1 portion of fromUStage12[]
|
||||
* [12] length of fromUStage12[]
|
||||
* [13] index of fromUStage3[] (array of uint16_t indexes into fromUStage3b[])
|
||||
* [14] length of fromUStage3[]
|
||||
* [15] index of fromUStage3b[] (array of uint32_t like fromUTableValues[])
|
||||
* [16] length of fromUStage3b[]
|
||||
*
|
||||
* [17]..[30] reserved
|
||||
* [31] number of bytes for the entire extension structure
|
||||
* [>31] reserved; there are indexes[0] indexes
|
||||
*
|
||||
*
|
||||
* uint32_t toUTable[];
|
||||
*
|
||||
* Array of byte/value pairs for lookups for toUnicode conversion.
|
||||
* The array is partitioned into sections like collation contraction tables.
|
||||
* Each section contains one word with the number of following words and
|
||||
* a default value for when the lookup in this section yields no match.
|
||||
*
|
||||
* A section is sorted in ascending order of input bytes,
|
||||
* allowing for fast linear or binary searches.
|
||||
* The builder may store entries for a contiguous range of byte values
|
||||
* (compare difference between the first and last one with count),
|
||||
* which then allows for direct array access.
|
||||
* The builder should always do this for the initial table section.
|
||||
*
|
||||
* Entries may have 0 values, see below.
|
||||
* No two entries in a section have the same byte values.
|
||||
*
|
||||
* Each uint32_t contains an input byte value in bits 31..24 and the
|
||||
* corresponding lookup value in bits 23..0.
|
||||
* Interpret the value as follows:
|
||||
* if(value==0) {
|
||||
* no match, see below
|
||||
* } else if(value<0x1f0000) {
|
||||
* partial match - use value as index to the next toUTable section
|
||||
* and match the next unit; (value indexes toUTable[value])
|
||||
* } else {
|
||||
* if(bit 23 set) {
|
||||
* roundtrip;
|
||||
* } else {
|
||||
* fallback;
|
||||
* }
|
||||
* unset value bit 23;
|
||||
* if(value<=0x2fffff) {
|
||||
* (value-0x1f0000) is a code point; (BMP: value<=0x1fffff)
|
||||
* } else {
|
||||
* bits 17..0 (value&0x3ffff) is an index to
|
||||
* the result UChars in toUUChars[]; (0 indexes toUUChars[0])
|
||||
* length of the result=((value>>18)-12); (length=0..19)
|
||||
* }
|
||||
* }
|
||||
*
|
||||
* The first word in a section contains the number of following words in the
|
||||
* input byte position (bits 31..24, number=1..0xff).
|
||||
* The value of the initial word is used when the current byte is not found
|
||||
* in this section.
|
||||
* If the value is not 0, then it represents a result as above.
|
||||
* If the value is 0, then the search has to return a shorter match with an
|
||||
* earlier default value as the result, or result in "unmappable" even for the
|
||||
* initial bytes.
|
||||
* If the value is 0 for the initial toUTable entry, then the initial byte
|
||||
* does not start any mapping input.
|
||||
*
|
||||
*
|
||||
* UChar toUUChars[];
|
||||
*
|
||||
* Contains toUnicode mapping results, stored as sequences of UChars.
|
||||
* Indexes and lengths stored in the toUTable[].
|
||||
*
|
||||
*
|
||||
* UChar fromUTableUChars[];
|
||||
* uint32_t fromUTableValues[];
|
||||
*
|
||||
* The fromUTable is split into two arrays, but works otherwise much like
|
||||
* the toUTable. The array is partitioned into sections like collation
|
||||
* contraction tables and toUTable.
|
||||
* A row in the table consists of same-index entries in fromUTableUChars[]
|
||||
* and fromUTableValues[].
|
||||
*
|
||||
* Interpret a value as follows:
|
||||
* if(value==0) {
|
||||
* no match, see below
|
||||
* } else if(value<=0xffffff) { (bits 31..24 are 0)
|
||||
* partial match - use value as index to the next fromUTable section
|
||||
* and match the next unit; (value indexes fromUTable[value])
|
||||
* } else {
|
||||
* if(value==0x80000001) {
|
||||
* return no mapping, but request for <subchar1>;
|
||||
* }
|
||||
* if(bit 31 set) {
|
||||
* roundtrip;
|
||||
* } else {
|
||||
* fallback;
|
||||
* }
|
||||
* // bits 30..29 reserved, 0
|
||||
* length=(value>>24)&0x1f; (bits 28..24)
|
||||
* if(length==1..3) {
|
||||
* bits 23..0 contain 1..3 bytes, padded with 00s on the left;
|
||||
* } else {
|
||||
* bits 23..0 (value&0xffffff) is an index to
|
||||
* the result bytes in fromUBytes[]; (0 indexes fromUBytes[0])
|
||||
* }
|
||||
* }
|
||||
*
|
||||
* The first pair in a section contains the number of following pairs in the
|
||||
* UChar position (16 bits, number=1..0xffff).
|
||||
* The value of the initial pair is used when the current UChar is not found
|
||||
* in this section.
|
||||
* If the value is not 0, then it represents a result as above.
|
||||
* If the value is 0, then the search has to return a shorter match with an
|
||||
* earlier default value as the result, or result in "unmappable" even for the
|
||||
* initial UChars.
|
||||
*
|
||||
* If the from Unicode trie is present, then the from Unicode search tables
|
||||
* are not used for initial code points.
|
||||
* In this case, the first entries (index 0) in the tables are not used
|
||||
* (reserved, set to 0) because a value of 0 is used in trie results
|
||||
* to indicate no mapping.
|
||||
*
|
||||
*
|
||||
* uint16_t fromUStage12[];
|
||||
*
|
||||
* Stages 1 & 2 of a trie that maps an initial code point.
|
||||
* Indexes in stage 1 are all offset by the length of stage 1 so that the
|
||||
* same array pointer can be used for both stages.
|
||||
* If (c>>10)>=(length of stage 1) then c does not start any mapping.
|
||||
* Same bit distribution as for regular conversion tries.
|
||||
*
|
||||
*
|
||||
* uint16_t fromUStage3[];
|
||||
* uint32_t fromUStage3b[];
|
||||
*
|
||||
* Stage 3 of the trie. The first array simply contains indexes to the second,
|
||||
* which contains words in the same format as fromUTableValues[].
|
||||
* Use a stage 3 granularity of 4, which allows for 256k stage 3 entries,
|
||||
* and 16-bit entries in stage 3 allow for 64k stage 3b entries.
|
||||
* The stage 3 granularity means that the stage 2 entry needs to be left-shifted.
|
||||
*
|
||||
* Two arrays are used because it is expected that more than half of the stage 3
|
||||
* entries will be zero. The 16-bit index stage 3 array saves space even
|
||||
* considering storing a total of 6 bytes per non-zero entry in both arrays
|
||||
* together.
|
||||
* Using a stage 3 granularity of >1 diminishes the compactability in that stage
|
||||
* but provides a larger effective addressing space in stage 2.
|
||||
* All but the final result stage use 16-bit entries to save space.
|
||||
*
|
||||
* fromUStage3b[] contains a zero for "no mapping" at its index 0,
|
||||
* and may contain UCNV_EXT_FROM_U_SUBCHAR1 at index 1 for "<subchar1> SUB mapping"
|
||||
* (i.e., "no mapping" with preference for <subchar1> rather than <subchar>),
|
||||
* and all other items are unique non-zero results.
|
||||
*
|
||||
*
|
||||
* char fromUBytes[];
|
||||
*
|
||||
* Contains fromUnicode mapping results, stored as sequences of chars.
|
||||
* Indexes and lengths stored in the fromUTableValues[].
|
||||
*/
|
||||
/*
 * Positions in the indexes[] array at the start of the extension data.
 * The values are part of the data format; they are spelled out explicitly
 * so that they cannot shift by accident.
 */
enum {
    UCNV_EXT_INDEXES_LENGTH=0,          /* length of indexes[] */

    /* to Unicode table */
    UCNV_EXT_TO_U_INDEX=1,              /* index of toUTable[] */
    UCNV_EXT_TO_U_LENGTH=2,             /* length of toUTable[] */
    UCNV_EXT_TO_U_UCHARS_INDEX=3,       /* index of toUUChars[] */
    UCNV_EXT_TO_U_UCHARS_LENGTH=4,      /* length of toUUChars[] */

    /* from Unicode table, not for the initial code point */
    UCNV_EXT_FROM_U_UCHARS_INDEX=5,     /* index of fromUTableUChars[] */
    UCNV_EXT_FROM_U_VALUES_INDEX=6,     /* index of fromUTableValues[] */
    UCNV_EXT_FROM_U_LENGTH=7,           /* length of both fromUTable arrays */
    UCNV_EXT_FROM_U_BYTES_INDEX=8,      /* index of fromUBytes[] */
    UCNV_EXT_FROM_U_BYTES_LENGTH=9,     /* length of fromUBytes[] */

    /* from Unicode trie for initial-code point lookup */
    UCNV_EXT_FROM_U_STAGE_12_INDEX=10,  /* index of fromUStage12[] */
    UCNV_EXT_FROM_U_STAGE_1_LENGTH=11,  /* length of the stage 1 portion */
    UCNV_EXT_FROM_U_STAGE_12_LENGTH=12, /* length of fromUStage12[] */
    UCNV_EXT_FROM_U_STAGE_3_INDEX=13,   /* index of fromUStage3[] */
    UCNV_EXT_FROM_U_STAGE_3_LENGTH=14,  /* length of fromUStage3[] */
    UCNV_EXT_FROM_U_STAGE_3B_INDEX=15,  /* index of fromUStage3b[] */
    UCNV_EXT_FROM_U_STAGE_3B_LENGTH=16, /* length of fromUStage3b[] */

    /* first reserved position; moves up when further indexes are added */
    UCNV_EXT_RESERVED_INDEX=17,

    UCNV_EXT_SIZE=31,                   /* number of bytes for the entire extension structure */
    UCNV_EXT_INDEXES_MIN_LENGTH=32      /* indexes[] has at least this many entries */
};
|
||||
|
||||
/*
 * Get the pointer to an extension array.
 * indexes[index] is the number of bytes from the start of indexes[]
 * to the array; the result is a const pointer to itemType.
 * Note that the indexes argument is evaluated twice.
 */
#define UCNV_EXT_ARRAY(indexes, index, itemType) \
    ((const itemType *)((const char *)(indexes)+(indexes)[index]))
|
||||
|
||||
/* internal API ------------------------------------------------------------- */
|
||||
|
||||
U_CFUNC UBool
|
||||
ucnv_extInitialMatchToU(UConverter *cnv, const int32_t *cx,
|
||||
int32_t firstLength,
|
||||
const char **src, const char *srcLimit,
|
||||
UChar **target, const UChar *targetLimit,
|
||||
int32_t **offsets, int32_t srcIndex,
|
||||
UBool flush,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
U_CFUNC void
|
||||
ucnv_extContinueMatchToU(UConverter *cnv,
|
||||
UConverterToUnicodeArgs *pArgs, int32_t srcIndex,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
|
||||
/*
 * Start a fromUnicode match for code point cp against the extension data cx.
 * The _extFromU() caller in ucnvmbcs.c treats a TRUE return as
 * "an extension mapping handled the input"; on FALSE it continues with its
 * other handling.
 * NOTE(review): how src, target and offsets are advanced, and what happens on a
 * partial match, is defined in ucnv_ext.c -- confirm there.
 */
U_CFUNC UBool
|
||||
ucnv_extInitialMatchFromU(UConverter *cnv, const int32_t *cx,
|
||||
UChar32 cp,
|
||||
const UChar **src, const UChar *srcLimit,
|
||||
char **target, const char *targetLimit,
|
||||
int32_t **offsets, int32_t srcIndex,
|
||||
UBool flush,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/*
 * fromUnicode lookup that works on the extension data cx alone, without a UConverter.
 * NOTE(review): no caller is visible next to this declaration. By its signature it
 * looks up the single code point cp, stores a result through pValue and returns an
 * int32_t -- presumably a byte length; confirm the exact contract in ucnv_ext.c.
 */
U_CFUNC int32_t
|
||||
ucnv_extSimpleMatchFromU(const int32_t *cx,
|
||||
UChar32 cp, uint32_t *pValue,
|
||||
UBool useFallback,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/*
 * fromUnicode counterpart of ucnv_extContinueMatchToU(), with the same parameter shape.
 * NOTE(review): presumably called to continue a match left over from a previous
 * buffer -- no caller is visible next to this declaration; confirm in ucnv_ext.c.
 */
U_CFUNC void
|
||||
ucnv_extContinueMatchFromU(UConverter *cnv,
|
||||
UConverterFromUnicodeArgs *pArgs, int32_t srcIndex,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/* toUnicode helpers -------------------------------------------------------- */
|
||||
|
||||
/*
 * Layout of a 32-bit toUnicode table word, as implemented by the macros below:
 *   bits 31..24  input byte   (UCNV_EXT_TO_U_GET_BYTE)
 *   bits 23..0   value        (UCNV_EXT_TO_U_GET_VALUE)
 * Interpretation of the 24-bit value:
 *   value < 0x1f0000              partial match; the value itself is the index
 *   bit 23 set                    roundtrip flag; mask it off before the tests below
 *   0x1f0000 <= value <= 0x2fffff code point = value - 0x1f0000
 *   value > 0x2fffff              bits 17..0 = index, (value>>18)-12 = length
 */
#define UCNV_EXT_TO_U_BYTE_SHIFT 24
|
||||
#define UCNV_EXT_TO_U_VALUE_MASK 0xffffff
|
||||
#define UCNV_EXT_TO_U_MIN_CODE_POINT 0x1f0000
|
||||
#define UCNV_EXT_TO_U_MAX_CODE_POINT 0x2fffff
|
||||
#define UCNV_EXT_TO_U_ROUNDTRIP_FLAG ((uint32_t)1<<23)
|
||||
#define UCNV_EXT_TO_U_INDEX_MASK 0x3ffff
|
||||
#define UCNV_EXT_TO_U_LENGTH_SHIFT 18
|
||||
#define UCNV_EXT_TO_U_LENGTH_OFFSET 12
|
||||
|
||||
/* maximum number of indexed UChars */
/* (19 = 31-12: the largest 5-bit field value above bit 18, minus UCNV_EXT_TO_U_LENGTH_OFFSET) */
|
||||
#define UCNV_EXT_MAX_UCHARS 19
|
||||
|
||||
/* build a table word from an input byte and a 24-bit value */
#define UCNV_EXT_TO_U_MAKE_WORD(byte, value) (((uint32_t)(byte)<<UCNV_EXT_TO_U_BYTE_SHIFT)|(value))
|
||||
|
||||
/* split a table word into its input byte and its 24-bit value */
#define UCNV_EXT_TO_U_GET_BYTE(word) ((word)>>UCNV_EXT_TO_U_BYTE_SHIFT)
|
||||
#define UCNV_EXT_TO_U_GET_VALUE(word) ((word)&UCNV_EXT_TO_U_VALUE_MASK)
|
||||
|
||||
/* a value below UCNV_EXT_TO_U_MIN_CODE_POINT is a partial match; the value is used unchanged as the index */
#define UCNV_EXT_TO_U_IS_PARTIAL(value) ((value)<UCNV_EXT_TO_U_MIN_CODE_POINT)
|
||||
#define UCNV_EXT_TO_U_GET_PARTIAL_INDEX(value) (value)
|
||||
|
||||
/* test and clear the roundtrip flag (bit 23) */
#define UCNV_EXT_TO_U_IS_ROUNDTRIP(value) (((value)&UCNV_EXT_TO_U_ROUNDTRIP_FLAG)!=0)
|
||||
#define UCNV_EXT_TO_U_MASK_ROUNDTRIP(value) ((value)&~UCNV_EXT_TO_U_ROUNDTRIP_FLAG)
|
||||
|
||||
/* use after masking off the roundtrip flag */
/* (IS_CODE_POINT only checks the upper bound; partial values must have been excluded before) */
|
||||
#define UCNV_EXT_TO_U_IS_CODE_POINT(value) ((value)<=UCNV_EXT_TO_U_MAX_CODE_POINT)
|
||||
#define UCNV_EXT_TO_U_GET_CODE_POINT(value) ((value)-UCNV_EXT_TO_U_MIN_CODE_POINT)
|
||||
|
||||
/* for values above UCNV_EXT_TO_U_MAX_CODE_POINT: index in the low 18 bits, length in the bits above */
#define UCNV_EXT_TO_U_GET_INDEX(value) ((value)&UCNV_EXT_TO_U_INDEX_MASK)
|
||||
#define UCNV_EXT_TO_U_GET_LENGTH(value) (((value)>>UCNV_EXT_TO_U_LENGTH_SHIFT)-UCNV_EXT_TO_U_LENGTH_OFFSET)
|
||||
|
||||
/* fromUnicode helpers ------------------------------------------------------ */
|
||||
|
||||
/* most trie constants are shared with ucnvmbcs.h */
|
||||
|
||||
/* see similar utrie.h UTRIE_INDEX_SHIFT and UTRIE_DATA_GRANULARITY */
|
||||
#define UCNV_EXT_STAGE_2_LEFT_SHIFT 2
|
||||
#define UCNV_EXT_STAGE_3_GRANULARITY 4
|
||||
|
||||
/* trie access, returns the stage 3 value=index to stage 3b; s1Index=c>>10 */
/*
 * Lookup steps as written in the macro:
 *   stage12[s1Index]                     start of the stage 2 block inside stage12[]
 *   + ((c>>4)&0x3f)                      6-bit stage 2 index
 *   << UCNV_EXT_STAGE_2_LEFT_SHIFT       stage 2 entry scaled to a stage 3 block start
 *   + (c&0xf)                            4-bit stage 3 index
 */
|
||||
#define UCNV_EXT_FROM_U(stage12, stage3, s1Index, c) \
|
||||
(stage3)[ ((int32_t)(stage12)[ (stage12)[s1Index] +(((c)>>4)&0x3f) ]<<UCNV_EXT_STAGE_2_LEFT_SHIFT) +((c)&0xf) ]
|
||||
|
||||
/*
 * Layout of a 32-bit fromUnicode value, as implemented by the macros below:
 *   bit 31       roundtrip flag
 *   bits 30..29  reserved (UCNV_EXT_FROM_U_RESERVED_MASK)
 *   bits 28..24  length (masked with UCNV_EXT_MAX_BYTES)
 *   bits 23..0   data: bytes or bytes index
 * A value whose bits 31..24 are all 0 is a partial match; the value itself is the index.
 */
#define UCNV_EXT_FROM_U_LENGTH_SHIFT 24
|
||||
#define UCNV_EXT_FROM_U_ROUNDTRIP_FLAG ((uint32_t)1<<31)
|
||||
#define UCNV_EXT_FROM_U_RESERVED_MASK 0x60000000
|
||||
#define UCNV_EXT_FROM_U_DATA_MASK 0xffffff
|
||||
|
||||
/* special value for "no mapping" to <subchar1> (impossible roundtrip to 0 bytes, value 01) */
|
||||
#define UCNV_EXT_FROM_U_SUBCHAR1 0x80000001
|
||||
|
||||
/* at most 3 bytes in the lower part of the value */
|
||||
#define UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH 3
|
||||
|
||||
/* maximum number of indexed bytes */
|
||||
#define UCNV_EXT_MAX_BYTES 0x1f
|
||||
|
||||
/* partial match: nothing set in bits 31..24; the value is used unchanged as the index */
#define UCNV_EXT_FROM_U_IS_PARTIAL(value) (((value)>>UCNV_EXT_FROM_U_LENGTH_SHIFT)==0)
|
||||
#define UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value) (value)
|
||||
|
||||
/* test and clear the roundtrip flag (bit 31) */
#define UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) (((value)&UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)!=0)
|
||||
#define UCNV_EXT_FROM_U_MASK_ROUNDTRIP(value) ((value)&~UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)
|
||||
|
||||
/* use after masking off the roundtrip flag */
/* (the &UCNV_EXT_MAX_BYTES keeps only bits 28..24 of the value) */
|
||||
#define UCNV_EXT_FROM_U_GET_LENGTH(value) (((value)>>UCNV_EXT_FROM_U_LENGTH_SHIFT)&UCNV_EXT_MAX_BYTES)
|
||||
|
||||
/* get bytes or bytes index */
|
||||
#define UCNV_EXT_FROM_U_GET_DATA(value) ((value)&UCNV_EXT_FROM_U_DATA_MASK)
|
||||
|
||||
#endif
|
|
@ -46,6 +46,7 @@
|
|||
#include "unicode/uset.h"
|
||||
#include "ucnv_bld.h"
|
||||
#include "ucnvmbcs.h"
|
||||
#include "ucnv_ext.h"
|
||||
#include "ucnv_cnv.h"
|
||||
#include "umutex.h"
|
||||
#include "cmemory.h"
|
||||
|
@ -56,9 +57,18 @@
|
|||
#define MBCS_UNROLL_SINGLE_FROM_BMP 0
|
||||
|
||||
/*
|
||||
* _MBCSHeader versions 4.1
|
||||
* _MBCSHeader versions 4.2
|
||||
* (Note that the _MBCSHeader version is in addition to the converter formatVersion.)
|
||||
*
|
||||
* Change from version 4.1:
|
||||
* - Added an optional extension table structure at the end of the .cnv file.
|
||||
* It is present if the upper bits of the header flags field contain a non-zero
|
||||
* byte offset to it.
|
||||
* Files that contain only an extension table and no base table
|
||||
* use the special outputType MBCS_OUTPUT_EXT_ONLY.
|
||||
* These contain the base table name between the MBCS header and the extension
|
||||
* data.
|
||||
*
|
||||
* Change from version 4.0:
|
||||
* - Replace header.reserved with header.fromUBytesLength so that all
|
||||
* fields in the data have length.
|
||||
|
@ -524,11 +534,6 @@ _MBCSGetUnicodeSet(const UConverter *cnv,
|
|||
* code. The framework will then call the callback function.
|
||||
*/
|
||||
|
||||
/*
|
||||
* TODO when implementing real extensions, review whether the useFallback parameter
|
||||
* should get cnv->useFallback or the full resolution considering cp as well
|
||||
*/
|
||||
|
||||
/*
|
||||
* @return if(U_FAILURE) return the code point for cnv->fromUChar32
|
||||
* else return 0 after output has been written to the target
|
||||
|
@ -539,10 +544,26 @@ _extFromU(UConverter *cnv, const UConverterSharedData *sharedData,
|
|||
const UChar **source, const UChar *sourceLimit,
|
||||
char **target, const char *targetLimit,
|
||||
int32_t **offsets, int32_t sourceIndex,
|
||||
UBool useFallback, UBool flush,
|
||||
UBool flush,
|
||||
UErrorCode *pErrorCode) {
|
||||
const int32_t *cx;
|
||||
|
||||
cnv->useSubChar1=FALSE;
|
||||
|
||||
if( (cx=sharedData->table->mbcs.extIndexes)!=NULL &&
|
||||
ucnv_extInitialMatchFromU(
|
||||
cnv, cx,
|
||||
cp, source, sourceLimit,
|
||||
target, targetLimit,
|
||||
offsets, sourceIndex,
|
||||
flush,
|
||||
pErrorCode)
|
||||
) {
|
||||
return 0; /* an extension mapping handled the input */
|
||||
}
|
||||
|
||||
/* GB 18030 */
|
||||
if(cnv!=NULL && (cnv->options&_MBCS_OPTION_GB18030)!=0) {
|
||||
if((cnv->options&_MBCS_OPTION_GB18030)!=0) {
|
||||
const uint32_t *range;
|
||||
int32_t i;
|
||||
|
||||
|
@ -590,10 +611,24 @@ _extToU(UConverter *cnv, const UConverterSharedData *sharedData,
|
|||
const char **source, const char *sourceLimit,
|
||||
UChar **target, const UChar *targetLimit,
|
||||
int32_t **offsets, int32_t sourceIndex,
|
||||
UBool useFallback, UBool flush,
|
||||
UBool flush,
|
||||
UErrorCode *pErrorCode) {
|
||||
const int32_t *cx;
|
||||
|
||||
if( (cx=sharedData->table->mbcs.extIndexes)!=NULL &&
|
||||
ucnv_extInitialMatchToU(
|
||||
cnv, cx,
|
||||
length, source, sourceLimit,
|
||||
target, targetLimit,
|
||||
offsets, sourceIndex,
|
||||
flush,
|
||||
pErrorCode)
|
||||
) {
|
||||
return 0; /* an extension mapping handled the input */
|
||||
}
|
||||
|
||||
/* GB 18030 */
|
||||
if(length==4 && cnv!=NULL && (cnv->options&_MBCS_OPTION_GB18030)!=0) {
|
||||
if(length==4 && (cnv->options&_MBCS_OPTION_GB18030)!=0) {
|
||||
const uint32_t *range;
|
||||
uint32_t linear;
|
||||
int32_t i;
|
||||
|
@ -789,6 +824,7 @@ _MBCSLoad(UConverterSharedData *sharedData,
|
|||
UDataInfo info;
|
||||
UConverterMBCSTable *mbcsTable=&sharedData->table->mbcs;
|
||||
_MBCSHeader *header=(_MBCSHeader *)raw;
|
||||
uint32_t offset;
|
||||
|
||||
if(header->version[0]!=4) {
|
||||
*pErrorCode=U_INVALID_TABLE_FORMAT;
|
||||
|
@ -806,6 +842,12 @@ _MBCSLoad(UConverterSharedData *sharedData,
|
|||
mbcsTable->fromUBytesLength=header->fromUBytesLength;
|
||||
mbcsTable->outputType=(uint8_t)header->flags;
|
||||
|
||||
/* extension data, header version 4.2 and higher */
|
||||
offset=header->flags>>8;
|
||||
if(offset!=0) {
|
||||
mbcsTable->extIndexes=(const int32_t *)(raw+offset);
|
||||
}
|
||||
|
||||
/* make sure that the output type is known */
|
||||
switch(mbcsTable->outputType) {
|
||||
case MBCS_OUTPUT_1:
|
||||
|
@ -817,6 +859,8 @@ _MBCSLoad(UConverterSharedData *sharedData,
|
|||
case MBCS_OUTPUT_2_SISO:
|
||||
/* OK */
|
||||
break;
|
||||
case MBCS_OUTPUT_EXT_ONLY:
|
||||
/* ### TODO */
|
||||
default:
|
||||
*pErrorCode=U_INVALID_TABLE_FORMAT;
|
||||
return;
|
||||
|
@ -1062,7 +1106,7 @@ _MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
|
|||
1, (const char **)&source, (const char *)sourceLimit,
|
||||
&target, targetLimit,
|
||||
&offsets, sourceIndex,
|
||||
(UBool)UCNV_TO_U_USE_FALLBACK(cnv), pArgs->flush,
|
||||
pArgs->flush,
|
||||
pErrorCode);
|
||||
sourceIndex+=1+(int32_t)(source-(const uint8_t *)pArgs->source);
|
||||
|
||||
|
@ -1263,7 +1307,7 @@ unrolled:
|
|||
1, (const char **)&source, (const char *)sourceLimit,
|
||||
&target, target+targetCapacity,
|
||||
&offsets, sourceIndex,
|
||||
(UBool)UCNV_TO_U_USE_FALLBACK(cnv), pArgs->flush,
|
||||
pArgs->flush,
|
||||
pErrorCode);
|
||||
sourceIndex+=1+(int32_t)(source-lastSource);
|
||||
|
||||
|
@ -1299,266 +1343,6 @@ unrolled:
|
|||
pArgs->offsets=offsets;
|
||||
}
|
||||
|
||||
/*
|
||||
* This version of _MBCSGetNextUChar() is optimized for single-byte, single-state codepages.
|
||||
* We still need a conversion loop in case we find reserved action codes, which are to be ignored.
|
||||
*
* Only row 0 of the state table is consulted (the swapLFNL table if
* UCNV_OPTION_SWAP_LFNL is set in cnv->options).
*
* @return the code point for the next input byte (fallback entries are used only if
*         UCNV_TO_U_USE_FALLBACK(cnv)), with pArgs->source pointing past that byte;
*         or UCNV_GET_NEXT_UCHAR_USE_TO_U for a byte without a usable mapping,
*         with pArgs->source moved back to that byte;
*         or 0xffff with *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR when the loop ends
*         without output
*/
|
||||
static UChar32
|
||||
_MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs,
|
||||
UErrorCode *pErrorCode) {
|
||||
UConverter *cnv;
|
||||
const int32_t (*stateTable)[256];
|
||||
const uint8_t *source, *sourceLimit;
|
||||
|
||||
int32_t entry;
|
||||
uint8_t action;
|
||||
|
||||
/* set up the local pointers */
|
||||
cnv=pArgs->converter;
|
||||
source=(const uint8_t *)pArgs->source;
|
||||
sourceLimit=(const uint8_t *)pArgs->sourceLimit;
|
||||
if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
|
||||
stateTable=(const int32_t (*)[256])cnv->sharedData->table->mbcs.swapLFNLStateTable;
|
||||
} else {
|
||||
stateTable=cnv->sharedData->table->mbcs.stateTable;
|
||||
}
|
||||
|
||||
/* conversion loop */
|
||||
while(source<sourceLimit) {
|
||||
entry=stateTable[0][*source++];
|
||||
/* MBCS_ENTRY_IS_FINAL(entry) */
|
||||
|
||||
/* write back the updated pointer early so that we can return directly */
|
||||
pArgs->source=(const char *)source;
|
||||
|
||||
if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
|
||||
/* output BMP code point */
|
||||
return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
|
||||
}
|
||||
|
||||
/*
|
||||
* An if-else-if chain provides more reliable performance for
|
||||
* the most common cases compared to a switch.
|
||||
*/
|
||||
action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
|
||||
if( action==MBCS_STATE_VALID_DIRECT_20 ||
|
||||
(action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
|
||||
) {
|
||||
/* output supplementary code point */
|
||||
return (UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
|
||||
} else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
|
||||
if(UCNV_TO_U_USE_FALLBACK(cnv)) {
|
||||
/* output BMP code point */
|
||||
return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
|
||||
}
|
||||
/* without fallbacks: handled below like an unassigned byte */
} else if(action==MBCS_STATE_UNASSIGNED) {
|
||||
/* just fall through */
|
||||
} else if(action==MBCS_STATE_ILLEGAL) {
|
||||
/* callback(illegal) */
|
||||
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
|
||||
} else {
|
||||
/* reserved, must never occur */
|
||||
continue;
|
||||
}
|
||||
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
/* callback(illegal) */
|
||||
/*
 * NOTE(review): this break reaches the statement after the loop, which
 * overwrites U_ILLEGAL_CHAR_FOUND with U_INDEX_OUTOFBOUNDS_ERROR; unlike
 * _MBCSGetNextUChar(), the offending byte is also not copied to cnv->toUBytes.
 * Confirm whether this is intended.
 */
break;
|
||||
} else /* unassigned sequence */ {
|
||||
/* defer to the generic implementation */
|
||||
pArgs->source=(const char *)source-1;
|
||||
return UCNV_GET_NEXT_UCHAR_USE_TO_U;
|
||||
}
|
||||
}
|
||||
|
||||
/* no output because of empty input or only state changes */
|
||||
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0xffff;
|
||||
}
|
||||
|
||||
/*
 * getNextUChar() implementation for MBCS converters.
 *
 * Returns UCNV_GET_NEXT_UCHAR_USE_TO_U right away if the table's unicodeMask has
 * UCNV_HAS_SURROGATES, and delegates to _MBCSSingleGetNextUChar() if countStates==1.
 * Otherwise it walks the state table, starting from cnv->mode and
 * cnv->toUnicodeStatus, until one code point is complete.
 *
 * @return the code point, with pArgs->source advanced past its bytes;
 *         or UCNV_GET_NEXT_UCHAR_USE_TO_U for a sequence without a usable mapping,
 *         with pArgs->source set back to lastSource;
 *         or 0xffff with *pErrorCode set to U_ILLEGAL_CHAR_FOUND,
 *         U_TRUNCATED_CHAR_FOUND or U_INDEX_OUTOFBOUNDS_ERROR
 */
static UChar32
|
||||
_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs,
|
||||
UErrorCode *pErrorCode) {
|
||||
UConverter *cnv;
|
||||
const uint8_t *source, *sourceLimit, *lastSource;
|
||||
|
||||
const int32_t (*stateTable)[256];
|
||||
const uint16_t *unicodeCodeUnits;
|
||||
|
||||
uint32_t offset;
|
||||
uint8_t state;
|
||||
|
||||
int32_t entry;
|
||||
UChar32 c;
|
||||
uint8_t action;
|
||||
|
||||
/* use optimized function if possible */
|
||||
cnv=pArgs->converter;
|
||||
if(cnv->sharedData->table->mbcs.unicodeMask&UCNV_HAS_SURROGATES) {
|
||||
/*
|
||||
* Using the generic ucnv_getNextUChar() code lets us deal correctly
|
||||
* with the rare case of a codepage that maps single surrogates
|
||||
* without adding the complexity to this already complicated function here.
|
||||
*/
|
||||
return UCNV_GET_NEXT_UCHAR_USE_TO_U;
|
||||
} else if(cnv->sharedData->table->mbcs.countStates==1) {
|
||||
return _MBCSSingleGetNextUChar(pArgs, pErrorCode);
|
||||
}
|
||||
|
||||
/* set up the local pointers */
|
||||
source=lastSource=(const uint8_t *)pArgs->source;
|
||||
sourceLimit=(const uint8_t *)pArgs->sourceLimit;
|
||||
|
||||
if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
|
||||
stateTable=(const int32_t (*)[256])cnv->sharedData->table->mbcs.swapLFNLStateTable;
|
||||
} else {
|
||||
stateTable=cnv->sharedData->table->mbcs.stateTable;
|
||||
}
|
||||
unicodeCodeUnits=cnv->sharedData->table->mbcs.unicodeCodeUnits;
|
||||
|
||||
/* get the converter state from UConverter */
|
||||
offset=cnv->toUnicodeStatus;
|
||||
state=(uint8_t)(cnv->mode);
|
||||
|
||||
/* conversion loop */
|
||||
/* c stays negative (U_SENTINEL) until some branch stores a value into it */
c=U_SENTINEL;
|
||||
while(source<sourceLimit) {
|
||||
entry=stateTable[state][*source++];
|
||||
if(MBCS_ENTRY_IS_TRANSITION(entry)) {
|
||||
state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
|
||||
offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
|
||||
|
||||
/* optimization for 1/2-byte input and BMP output */
|
||||
/* peeks at the next byte without consuming it; source advances only if the whole test succeeds */
if( source<sourceLimit &&
|
||||
MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
|
||||
MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
|
||||
(c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
|
||||
) {
|
||||
++source;
|
||||
state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
|
||||
/* output BMP code point */
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
/* set the next state early so that we can reuse the entry variable */
|
||||
state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
|
||||
|
||||
/*
|
||||
* An if-else-if chain provides more reliable performance for
|
||||
* the most common cases compared to a switch.
|
||||
*/
|
||||
action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
|
||||
if(action==MBCS_STATE_VALID_DIRECT_16) {
|
||||
/* output BMP code point */
|
||||
c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
|
||||
break;
|
||||
} else if(action==MBCS_STATE_VALID_16) {
|
||||
offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
|
||||
c=unicodeCodeUnits[offset];
|
||||
if(c<0xfffe) {
|
||||
/* output BMP code point */
|
||||
break;
|
||||
} else if(c==0xfffe) {
|
||||
if(UCNV_TO_U_USE_FALLBACK(cnv) && (c=_MBCSGetFallback(&cnv->sharedData->table->mbcs, offset))!=0xfffe) {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
/* callback(illegal) */
|
||||
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
|
||||
}
|
||||
} else if(action==MBCS_STATE_VALID_16_PAIR) {
|
||||
offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
|
||||
c=unicodeCodeUnits[offset++];
|
||||
if(c<0xd800) {
|
||||
/* output BMP code point below 0xd800 */
|
||||
break;
|
||||
} else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
|
||||
/* output roundtrip or fallback supplementary code point */
|
||||
c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00);
|
||||
break;
|
||||
} else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
|
||||
/* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
|
||||
c=unicodeCodeUnits[offset];
|
||||
break;
|
||||
} else if(c==0xffff) {
|
||||
/* callback(illegal) */
|
||||
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
|
||||
}
|
||||
} else if(action==MBCS_STATE_VALID_DIRECT_20 ||
|
||||
(action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
|
||||
) {
|
||||
/* output supplementary code point */
|
||||
c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
|
||||
break;
|
||||
} else if(action==MBCS_STATE_CHANGE_ONLY) {
|
||||
/*
|
||||
* This serves as a state change without any output.
|
||||
* It is useful for reading simple stateful encodings,
|
||||
* for example using just Shift-In/Shift-Out codes.
|
||||
* The 21 unused bits may later be used for more sophisticated
|
||||
* state transitions.
|
||||
*/
|
||||
/*
 * NOTE(review): this branch does not "continue"; control reaches the
 * "unassigned sequence" exit below, which returns UCNV_GET_NEXT_UCHAR_USE_TO_U
 * with cnv->mode already set to the new state and pArgs->source set back
 * to lastSource. Confirm that this is intended.
 */
} else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
|
||||
if(UCNV_TO_U_USE_FALLBACK(cnv)) {
|
||||
/* output BMP code point */
|
||||
c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
|
||||
break;
|
||||
}
|
||||
} else if(action==MBCS_STATE_UNASSIGNED) {
|
||||
/* just fall through */
|
||||
} else if(action==MBCS_STATE_ILLEGAL) {
|
||||
/* callback(illegal) */
|
||||
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
|
||||
} else {
|
||||
/* reserved (must never occur), or only state change */
|
||||
offset=0;
|
||||
lastSource=source;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* end of action codes: prepare for a new character */
|
||||
offset=0;
|
||||
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
/* callback(illegal) */
|
||||
break;
|
||||
} else /* unassigned sequence */ {
|
||||
/* defer to the generic implementation */
|
||||
cnv->toUnicodeStatus=0;
|
||||
cnv->mode=state;
|
||||
pArgs->source=(const char *)lastSource;
|
||||
return UCNV_GET_NEXT_UCHAR_USE_TO_U;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * NOTE(review): when the illegal marker 0xffff was read from unicodeCodeUnits[]
 * (VALID_16 and VALID_16_PAIR branches), c holds 0xffff and is not negative,
 * so this block -- including the copy into cnv->toUBytes -- is skipped while
 * *pErrorCode is U_ILLEGAL_CHAR_FOUND. Confirm that this is intended.
 */
if(c<0) {
|
||||
if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) {
|
||||
*pErrorCode=U_TRUNCATED_CHAR_FOUND;
|
||||
}
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
/* incomplete character byte sequence */
|
||||
/* copy the bytes from lastSource up to source into cnv->toUBytes */
uint8_t *bytes=cnv->toUBytes;
|
||||
cnv->toULength=(int8_t)(source-lastSource);
|
||||
do {
|
||||
*bytes++=*lastSource++;
|
||||
} while(lastSource<source);
|
||||
} else {
|
||||
/* no output because of empty input or only state changes */
|
||||
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
}
|
||||
c=0xffff;
|
||||
}
|
||||
|
||||
/* set the converter state back into UConverter, ready for a new character */
|
||||
cnv->toUnicodeStatus=0;
|
||||
cnv->mode=state;
|
||||
|
||||
/* write back the updated pointer */
|
||||
pArgs->source=(const char *)source;
|
||||
return c;
|
||||
}
|
||||
|
||||
U_CFUNC void
|
||||
_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
|
||||
UErrorCode *pErrorCode) {
|
||||
|
@ -1584,6 +1368,19 @@ _MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
|
|||
|
||||
/* use optimized function if possible */
|
||||
cnv=pArgs->converter;
|
||||
|
||||
if(cnv->preToULength>0) {
|
||||
/*
|
||||
* pass sourceIndex=-1 because we continue from an earlier buffer
|
||||
* in the future, this may change with continuous offsets
|
||||
*/
|
||||
ucnv_extContinueMatchToU(cnv, pArgs, -1, pErrorCode);
|
||||
|
||||
if(U_FAILURE(*pErrorCode) || cnv->preToULength<0) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if(cnv->sharedData->table->mbcs.countStates==1) {
|
||||
if(!(cnv->sharedData->table->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
|
||||
_MBCSSingleToBMPWithOffsets(pArgs, pErrorCode);
|
||||
|
@ -1890,7 +1687,7 @@ _MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
|
|||
byteIndex, (const char **)&source, (const char *)sourceLimit,
|
||||
&target, targetLimit,
|
||||
&offsets, sourceIndex,
|
||||
(UBool)UCNV_TO_U_USE_FALLBACK(cnv), pArgs->flush,
|
||||
pArgs->flush,
|
||||
pErrorCode);
|
||||
sourceIndex=nextSourceIndex+(int32_t)(source-(const uint8_t *)pArgs->source);
|
||||
|
||||
|
@ -1912,6 +1709,328 @@ _MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
|
|||
pArgs->offsets=offsets;
|
||||
}
|
||||
|
||||
/*
|
||||
* This version of _MBCSGetNextUChar() is optimized for single-byte, single-state codepages.
|
||||
* We still need a conversion loop in case we find reserved action codes, which are to be ignored.
|
||||
*
* Only row 0 of the state table is consulted (the swapLFNL table if
* UCNV_OPTION_SWAP_LFNL is set in cnv->options).
*
* @return the code point for the next input byte (fallback entries are used only if
*         UCNV_TO_U_USE_FALLBACK(cnv)), with pArgs->source pointing past that byte;
*         or UCNV_GET_NEXT_UCHAR_USE_TO_U for a byte without a usable mapping,
*         with pArgs->source moved back to that byte;
*         or 0xffff with *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR when the loop ends
*         without output
*/
|
||||
static UChar32
|
||||
_MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs,
|
||||
UErrorCode *pErrorCode) {
|
||||
UConverter *cnv;
|
||||
const int32_t (*stateTable)[256];
|
||||
const uint8_t *source, *sourceLimit;
|
||||
|
||||
int32_t entry;
|
||||
uint8_t action;
|
||||
|
||||
/* set up the local pointers */
|
||||
cnv=pArgs->converter;
|
||||
source=(const uint8_t *)pArgs->source;
|
||||
sourceLimit=(const uint8_t *)pArgs->sourceLimit;
|
||||
if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
|
||||
stateTable=(const int32_t (*)[256])cnv->sharedData->table->mbcs.swapLFNLStateTable;
|
||||
} else {
|
||||
stateTable=cnv->sharedData->table->mbcs.stateTable;
|
||||
}
|
||||
|
||||
/* conversion loop */
|
||||
while(source<sourceLimit) {
|
||||
entry=stateTable[0][*source++];
|
||||
/* MBCS_ENTRY_IS_FINAL(entry) */
|
||||
|
||||
/* write back the updated pointer early so that we can return directly */
|
||||
pArgs->source=(const char *)source;
|
||||
|
||||
if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
|
||||
/* output BMP code point */
|
||||
return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
|
||||
}
|
||||
|
||||
/*
|
||||
* An if-else-if chain provides more reliable performance for
|
||||
* the most common cases compared to a switch.
|
||||
*/
|
||||
action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
|
||||
if( action==MBCS_STATE_VALID_DIRECT_20 ||
|
||||
(action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
|
||||
) {
|
||||
/* output supplementary code point */
|
||||
return (UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
|
||||
} else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
|
||||
if(UCNV_TO_U_USE_FALLBACK(cnv)) {
|
||||
/* output BMP code point */
|
||||
return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
|
||||
}
|
||||
/* without fallbacks: handled below like an unassigned byte */
} else if(action==MBCS_STATE_UNASSIGNED) {
|
||||
/* just fall through */
|
||||
} else if(action==MBCS_STATE_ILLEGAL) {
|
||||
/* callback(illegal) */
|
||||
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
|
||||
} else {
|
||||
/* reserved, must never occur */
|
||||
continue;
|
||||
}
|
||||
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
/* callback(illegal) */
|
||||
/*
 * NOTE(review): this break reaches the statement after the loop, which
 * overwrites U_ILLEGAL_CHAR_FOUND with U_INDEX_OUTOFBOUNDS_ERROR; unlike
 * _MBCSGetNextUChar(), the offending byte is also not copied to cnv->toUBytes.
 * Confirm whether this is intended.
 */
break;
|
||||
} else /* unassigned sequence */ {
|
||||
/* defer to the generic implementation */
|
||||
pArgs->source=(const char *)source-1;
|
||||
return UCNV_GET_NEXT_UCHAR_USE_TO_U;
|
||||
}
|
||||
}
|
||||
|
||||
/* no output because of empty input or only state changes */
|
||||
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0xffff;
|
||||
}
|
||||
|
||||
/*
 * getNextUChar() implementation for MBCS converters.
 *
 * Returns UCNV_GET_NEXT_UCHAR_USE_TO_U right away if the table has extension data
 * (extIndexes!=NULL) or its unicodeMask has UCNV_HAS_SURROGATES, and delegates to
 * _MBCSSingleGetNextUChar() if countStates==1.
 * Otherwise it walks the state table, starting from cnv->mode and
 * cnv->toUnicodeStatus, until one code point is complete.
 *
 * @return the code point, with pArgs->source advanced past its bytes;
 *         or UCNV_GET_NEXT_UCHAR_USE_TO_U for a sequence without a usable mapping,
 *         with pArgs->source set back to lastSource;
 *         or 0xffff with *pErrorCode set to U_ILLEGAL_CHAR_FOUND,
 *         U_TRUNCATED_CHAR_FOUND or U_INDEX_OUTOFBOUNDS_ERROR
 */
static UChar32
|
||||
_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs,
|
||||
UErrorCode *pErrorCode) {
|
||||
UConverter *cnv;
|
||||
const uint8_t *source, *sourceLimit, *lastSource;
|
||||
|
||||
const int32_t (*stateTable)[256];
|
||||
const uint16_t *unicodeCodeUnits;
|
||||
|
||||
uint32_t offset;
|
||||
uint8_t state;
|
||||
|
||||
int32_t entry;
|
||||
UChar32 c;
|
||||
uint8_t action;
|
||||
|
||||
/* use optimized function if possible */
|
||||
cnv=pArgs->converter;
|
||||
|
||||
/* ### TODO extension */
|
||||
/* any table with extension data is handed to the generic implementation for now */
if(cnv->sharedData->table->mbcs.extIndexes!=NULL) {
|
||||
return UCNV_GET_NEXT_UCHAR_USE_TO_U;
|
||||
}
|
||||
/* ### TODO end cheap-trick extension */
|
||||
|
||||
if(cnv->sharedData->table->mbcs.unicodeMask&UCNV_HAS_SURROGATES) {
|
||||
/*
|
||||
* Using the generic ucnv_getNextUChar() code lets us deal correctly
|
||||
* with the rare case of a codepage that maps single surrogates
|
||||
* without adding the complexity to this already complicated function here.
|
||||
*/
|
||||
return UCNV_GET_NEXT_UCHAR_USE_TO_U;
|
||||
} else if(cnv->sharedData->table->mbcs.countStates==1) {
|
||||
return _MBCSSingleGetNextUChar(pArgs, pErrorCode);
|
||||
}
|
||||
|
||||
/* set up the local pointers */
|
||||
source=lastSource=(const uint8_t *)pArgs->source;
|
||||
sourceLimit=(const uint8_t *)pArgs->sourceLimit;
|
||||
|
||||
if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
|
||||
stateTable=(const int32_t (*)[256])cnv->sharedData->table->mbcs.swapLFNLStateTable;
|
||||
} else {
|
||||
stateTable=cnv->sharedData->table->mbcs.stateTable;
|
||||
}
|
||||
unicodeCodeUnits=cnv->sharedData->table->mbcs.unicodeCodeUnits;
|
||||
|
||||
/* get the converter state from UConverter */
|
||||
offset=cnv->toUnicodeStatus;
|
||||
state=(uint8_t)(cnv->mode);
|
||||
|
||||
/* conversion loop */
|
||||
/* c stays negative (U_SENTINEL) until some branch stores a value into it */
c=U_SENTINEL;
|
||||
while(source<sourceLimit) {
|
||||
entry=stateTable[state][*source++];
|
||||
if(MBCS_ENTRY_IS_TRANSITION(entry)) {
|
||||
state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
|
||||
offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
|
||||
|
||||
/* optimization for 1/2-byte input and BMP output */
|
||||
/* peeks at the next byte without consuming it; source advances only if the whole test succeeds */
if( source<sourceLimit &&
|
||||
MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
|
||||
MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
|
||||
(c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
|
||||
) {
|
||||
++source;
|
||||
state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
|
||||
/* output BMP code point */
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
/* set the next state early so that we can reuse the entry variable */
|
||||
state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
|
||||
|
||||
/*
|
||||
* An if-else-if chain provides more reliable performance for
|
||||
* the most common cases compared to a switch.
|
||||
*/
|
||||
action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
|
||||
if(action==MBCS_STATE_VALID_DIRECT_16) {
|
||||
/* output BMP code point */
|
||||
c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
|
||||
break;
|
||||
} else if(action==MBCS_STATE_VALID_16) {
|
||||
offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
|
||||
c=unicodeCodeUnits[offset];
|
||||
if(c<0xfffe) {
|
||||
/* output BMP code point */
|
||||
break;
|
||||
} else if(c==0xfffe) {
|
||||
if(UCNV_TO_U_USE_FALLBACK(cnv) && (c=_MBCSGetFallback(&cnv->sharedData->table->mbcs, offset))!=0xfffe) {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
/* callback(illegal) */
|
||||
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
|
||||
}
|
||||
} else if(action==MBCS_STATE_VALID_16_PAIR) {
|
||||
offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
|
||||
c=unicodeCodeUnits[offset++];
|
||||
if(c<0xd800) {
|
||||
/* output BMP code point below 0xd800 */
|
||||
break;
|
||||
} else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
|
||||
/* output roundtrip or fallback supplementary code point */
|
||||
c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00);
|
||||
break;
|
||||
} else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
|
||||
/* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
|
||||
c=unicodeCodeUnits[offset];
|
||||
break;
|
||||
} else if(c==0xffff) {
|
||||
/* callback(illegal) */
|
||||
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
|
||||
}
|
||||
} else if(action==MBCS_STATE_VALID_DIRECT_20 ||
|
||||
(action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
|
||||
) {
|
||||
/* output supplementary code point */
|
||||
c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
|
||||
break;
|
||||
} else if(action==MBCS_STATE_CHANGE_ONLY) {
|
||||
/*
|
||||
* This serves as a state change without any output.
|
||||
* It is useful for reading simple stateful encodings,
|
||||
* for example using just Shift-In/Shift-Out codes.
|
||||
* The 21 unused bits may later be used for more sophisticated
|
||||
* state transitions.
|
||||
*/
|
||||
/*
 * NOTE(review): this branch does not "continue"; control reaches the
 * "unassigned sequence" exit below, which returns UCNV_GET_NEXT_UCHAR_USE_TO_U
 * with cnv->mode already set to the new state and pArgs->source set back
 * to lastSource. Confirm that this is intended.
 */
} else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
|
||||
if(UCNV_TO_U_USE_FALLBACK(cnv)) {
|
||||
/* output BMP code point */
|
||||
c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
|
||||
break;
|
||||
}
|
||||
} else if(action==MBCS_STATE_UNASSIGNED) {
|
||||
/* just fall through */
|
||||
} else if(action==MBCS_STATE_ILLEGAL) {
|
||||
/* callback(illegal) */
|
||||
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
|
||||
} else {
|
||||
/* reserved (must never occur), or only state change */
|
||||
offset=0;
|
||||
lastSource=source;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* end of action codes: prepare for a new character */
|
||||
offset=0;
|
||||
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
/* callback(illegal) */
|
||||
break;
|
||||
} else /* unassigned sequence */ {
|
||||
/* defer to the generic implementation */
|
||||
cnv->toUnicodeStatus=0;
|
||||
cnv->mode=state;
|
||||
pArgs->source=(const char *)lastSource;
|
||||
return UCNV_GET_NEXT_UCHAR_USE_TO_U;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * NOTE(review): when the illegal marker 0xffff was read from unicodeCodeUnits[]
 * (VALID_16 and VALID_16_PAIR branches), c holds 0xffff and is not negative,
 * so this block -- including the copy into cnv->toUBytes -- is skipped while
 * *pErrorCode is U_ILLEGAL_CHAR_FOUND. Confirm that this is intended.
 */
if(c<0) {
|
||||
if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) {
|
||||
*pErrorCode=U_TRUNCATED_CHAR_FOUND;
|
||||
}
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
/* incomplete character byte sequence */
|
||||
/* copy the bytes from lastSource up to source into cnv->toUBytes */
uint8_t *bytes=cnv->toUBytes;
|
||||
cnv->toULength=(int8_t)(source-lastSource);
|
||||
do {
|
||||
*bytes++=*lastSource++;
|
||||
} while(lastSource<source);
|
||||
} else {
|
||||
/* no output because of empty input or only state changes */
|
||||
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
}
|
||||
c=0xffff;
|
||||
}
|
||||
|
||||
/* set the converter state back into UConverter, ready for a new character */
|
||||
cnv->toUnicodeStatus=0;
|
||||
cnv->mode=state;
|
||||
|
||||
/* write back the updated pointer */
|
||||
pArgs->source=(const char *)source;
|
||||
return c;
|
||||
}
|
||||
|
||||
#if 0
|
||||
/*
|
||||
* Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
|
||||
* Removal improves code coverage.
|
||||
*/
|
||||
/**
|
||||
* This version of _MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages.
|
||||
* It does not handle the EBCDIC swaplfnl option (set in UConverter).
|
||||
* It does not handle conversion extensions (_extToU()).
|
||||
*
* Looks up the single byte b in row 0 of the state table.
*
* @return the code point for b;
*         0xfffe if b is unassigned, or maps only via a fallback while
*         TO_U_USE_FALLBACK(useFallback) is false;
*         0xffff if b is illegal or has a reserved action code
*/
|
||||
U_CFUNC UChar32
|
||||
_MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData,
|
||||
uint8_t b, UBool useFallback) {
|
||||
int32_t entry;
|
||||
uint8_t action;
|
||||
|
||||
entry=sharedData->table->mbcs.stateTable[0][b];
|
||||
/* MBCS_ENTRY_IS_FINAL(entry) */
|
||||
|
||||
if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
|
||||
/* output BMP code point */
|
||||
return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
|
||||
}
|
||||
|
||||
/*
|
||||
* An if-else-if chain provides more reliable performance for
|
||||
* the most common cases compared to a switch.
|
||||
*/
|
||||
action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
|
||||
if(action==MBCS_STATE_VALID_DIRECT_20) {
|
||||
/* output supplementary code point */
|
||||
return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
|
||||
} else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
|
||||
if(!TO_U_USE_FALLBACK(useFallback)) {
|
||||
return 0xfffe;
|
||||
}
|
||||
/* output BMP code point */
|
||||
return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
|
||||
} else if(action==MBCS_STATE_FALLBACK_DIRECT_20) {
|
||||
if(!TO_U_USE_FALLBACK(useFallback)) {
|
||||
return 0xfffe;
|
||||
}
|
||||
/* output supplementary code point */
|
||||
return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
|
||||
} else if(action==MBCS_STATE_UNASSIGNED) {
|
||||
return 0xfffe;
|
||||
} else if(action==MBCS_STATE_ILLEGAL) {
|
||||
return 0xffff;
|
||||
} else {
|
||||
/* reserved, must never occur */
|
||||
return 0xffff;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* This is a simple version of getNextUChar() that is used
|
||||
* by other converter implementations.
|
||||
|
@ -1945,6 +2064,8 @@ _MBCSSimpleGetNextUChar(UConverterSharedData *sharedData,
|
|||
return 0xffff;
|
||||
}
|
||||
|
||||
/* ### TODO extension */
|
||||
|
||||
#if 0
|
||||
/*
|
||||
* Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
|
||||
|
@ -2054,61 +2175,6 @@ _MBCSSimpleGetNextUChar(UConverterSharedData *sharedData,
|
|||
return 0xffff;
|
||||
}
|
||||
|
||||
#if 0
|
||||
/*
|
||||
* Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
|
||||
* Removal improves code coverage.
|
||||
*/
|
||||
/**
|
||||
* This version of _MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages.
|
||||
* It does not handle the EBCDIC swaplfnl option (set in UConverter).
|
||||
* It does not handle conversion extensions (_extToU()).
|
||||
*/
|
||||
U_CFUNC UChar32
|
||||
_MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData,
|
||||
uint8_t b, UBool useFallback) {
|
||||
int32_t entry;
|
||||
uint8_t action;
|
||||
|
||||
entry=sharedData->table->mbcs.stateTable[0][b];
|
||||
/* MBCS_ENTRY_IS_FINAL(entry) */
|
||||
|
||||
if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
|
||||
/* output BMP code point */
|
||||
return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
|
||||
}
|
||||
|
||||
/*
|
||||
* An if-else-if chain provides more reliable performance for
|
||||
* the most common cases compared to a switch.
|
||||
*/
|
||||
action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
|
||||
if(action==MBCS_STATE_VALID_DIRECT_20) {
|
||||
/* output supplementary code point */
|
||||
return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
|
||||
} else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
|
||||
if(!TO_U_USE_FALLBACK(useFallback)) {
|
||||
return 0xfffe;
|
||||
}
|
||||
/* output BMP code point */
|
||||
return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
|
||||
} else if(action==MBCS_STATE_FALLBACK_DIRECT_20) {
|
||||
if(!TO_U_USE_FALLBACK(useFallback)) {
|
||||
return 0xfffe;
|
||||
}
|
||||
/* output supplementary code point */
|
||||
return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
|
||||
} else if(action==MBCS_STATE_UNASSIGNED) {
|
||||
return 0xfffe;
|
||||
} else if(action==MBCS_STATE_ILLEGAL) {
|
||||
return 0xffff;
|
||||
} else {
|
||||
/* reserved, must never occur */
|
||||
return 0xffff;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/* MBCS-from-Unicode conversion functions ----------------------------------- */
|
||||
|
||||
/* This version of _MBCSFromUnicodeWithOffsets() is optimized for double-byte codepages. */
|
||||
|
@ -2251,7 +2317,7 @@ unassigned:
|
|||
c, &source, sourceLimit,
|
||||
(char **)&target, (char *)target+targetCapacity,
|
||||
&offsets, sourceIndex,
|
||||
(UBool)UCNV_FROM_U_USE_FALLBACK(cnv, c), pArgs->flush,
|
||||
pArgs->flush,
|
||||
pErrorCode);
|
||||
nextSourceIndex+=(int32_t)(source-pArgs->source);
|
||||
|
||||
|
@ -2454,7 +2520,7 @@ unassigned:
|
|||
c, &source, sourceLimit,
|
||||
(char **)&target, (char *)target+targetCapacity,
|
||||
&offsets, sourceIndex,
|
||||
(UBool)UCNV_FROM_U_USE_FALLBACK(cnv, c), pArgs->flush,
|
||||
pArgs->flush,
|
||||
pErrorCode);
|
||||
nextSourceIndex+=(int32_t)(source-pArgs->source);
|
||||
|
||||
|
@ -2681,7 +2747,7 @@ getTrail:
|
|||
c, &source, sourceLimit,
|
||||
(char **)&target, (char *)target+targetCapacity,
|
||||
&offsets, sourceIndex,
|
||||
(UBool)UCNV_FROM_U_USE_FALLBACK(cnv, c), pArgs->flush,
|
||||
pArgs->flush,
|
||||
pErrorCode);
|
||||
sourceIndex+=length+(int32_t)(source-lastSource);
|
||||
lastSource=source;
|
||||
|
@ -2744,8 +2810,21 @@ _MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
|
|||
int32_t length, prevLength;
|
||||
uint8_t unicodeMask;
|
||||
|
||||
/* use optimized function if possible */
|
||||
cnv=pArgs->converter;
|
||||
|
||||
if(cnv->preFromUFirstCP>=0) {
|
||||
/*
|
||||
* pass sourceIndex=-1 because we continue from an earlier buffer
|
||||
* in the future, this may change with continuous offsets
|
||||
*/
|
||||
ucnv_extContinueMatchFromU(cnv, pArgs, -1, pErrorCode);
|
||||
|
||||
if(U_FAILURE(*pErrorCode) || cnv->preFromULength<0) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/* use optimized function if possible */
|
||||
outputType=cnv->sharedData->table->mbcs.outputType;
|
||||
unicodeMask=cnv->sharedData->table->mbcs.unicodeMask;
|
||||
if(outputType==MBCS_OUTPUT_1 && !(unicodeMask&UCNV_HAS_SURROGATES)) {
|
||||
|
@ -2768,6 +2847,7 @@ _MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
|
|||
offsets=pArgs->offsets;
|
||||
|
||||
table=cnv->sharedData->table->mbcs.fromUnicodeTable;
|
||||
|
||||
if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
|
||||
bytes=cnv->sharedData->table->mbcs.swapLFNLFromUnicodeBytes;
|
||||
} else {
|
||||
|
@ -3025,7 +3105,7 @@ unassigned:
|
|||
c, &source, sourceLimit,
|
||||
(char **)&target, (char *)target+targetCapacity,
|
||||
&offsets, sourceIndex,
|
||||
(UBool)UCNV_FROM_U_USE_FALLBACK(cnv, c), pArgs->flush,
|
||||
pArgs->flush,
|
||||
pErrorCode);
|
||||
nextSourceIndex+=(int32_t)(source-pArgs->source);
|
||||
prevLength=cnv->fromUnicodeStatus; /* restore SISO state */
|
||||
|
@ -3222,6 +3302,8 @@ _MBCSFromUChar32(UConverterSharedData *sharedData,
|
|||
uint32_t value;
|
||||
int32_t length;
|
||||
|
||||
/* ### TODO extension mapping */
|
||||
|
||||
/* BMP-only codepages are stored without stage 1 entries for supplementary code points */
|
||||
if(c>=0x10000 && !(sharedData->table->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
|
||||
return 0;
|
||||
|
@ -3404,7 +3486,11 @@ _MBCSWriteSub(UConverterFromUnicodeArgs *pArgs,
|
|||
int32_t length;
|
||||
|
||||
/* first, select between subChar and subChar1 */
|
||||
if(cnv->subChar1!=0 && cnv->invalidUCharBuffer[0]<=0xff) {
|
||||
if( cnv->subChar1!=0 &&
|
||||
(cnv->sharedData->table->mbcs.extIndexes!=NULL ?
|
||||
cnv->useSubChar1 :
|
||||
(cnv->invalidUCharBuffer[0]<=0xff))
|
||||
) {
|
||||
/* select subChar1 if it is set (not 0) and the unmappable Unicode code point is up to U+00ff (IBM MBCS behavior) */
|
||||
subchar=(char *)&cnv->subChar1;
|
||||
length=1;
|
||||
|
@ -3414,6 +3500,9 @@ _MBCSWriteSub(UConverterFromUnicodeArgs *pArgs,
|
|||
length=cnv->subCharLen;
|
||||
}
|
||||
|
||||
/* reset the selector for the next code point */
|
||||
cnv->useSubChar1=FALSE;
|
||||
|
||||
switch(cnv->sharedData->table->mbcs.outputType) {
|
||||
case MBCS_OUTPUT_2_SISO:
|
||||
p=buffer;
|
||||
|
|
|
@ -37,7 +37,11 @@
|
|||
* At the moment, there are only variations of MBCS converters. They all have
|
||||
* the same toUnicode structures, while the fromUnicode structures for SBCS
|
||||
* differ from those for other MBCS-style converters.
|
||||
*
|
||||
*
|
||||
* _MBCSHeader.version 4.2 adds an optional conversion extension data structure.
|
||||
* If it is present, then an ICU version reading header versions 4.0 or 4.1
|
||||
* will be able to use the base table and ignore the extension.
|
||||
*
|
||||
* MBCS-style data structure following the static data.
|
||||
* Offsets are counted in bytes from the beginning of the MBCS header structure.
|
||||
* Details about usage in comments in ucnvmbcs.c.
|
||||
|
@ -45,61 +49,79 @@
|
|||
* struct _MBCSHeader (see the definition in this header file below)
|
||||
* contains 32-bit fields as follows:
|
||||
* 8 values:
|
||||
* 0 uint8_t[4] MBCS version in UVersionInfo format (currently 4.1.0.0)
|
||||
* 0 uint8_t[4] MBCS version in UVersionInfo format (currently 4.2.0.0)
|
||||
* 1 uint32_t countStates
|
||||
* 2 uint32_t countToUFallbacks
|
||||
* 3 uint32_t offsetToUCodeUnits
|
||||
* 4 uint32_t offsetFromUTable
|
||||
* 5 uint32_t offsetFromUBytes
|
||||
* 6 uint32_t flags, bits:
|
||||
* 31.. 8 reserved
|
||||
* 31.. 8 offsetExtension -- _MBCSHeader.version 4.2 (ICU 2.8) and higher
|
||||
* 0 for older versions and if
|
||||
* there is no extension structure
|
||||
* 7.. 0 outputType
|
||||
* 7 uint32_t fromUBytesLength -- _MBCSHeader.version 4.1 (ICU 2.4) and higher
|
||||
* counts bytes in fromUBytes[]
|
||||
*
|
||||
* int32_t stateTable[countStates][256];
|
||||
* if(outputType==MBCS_OUTPUT_EXT_ONLY) {
|
||||
* -- base table name for extension-only table
|
||||
* char baseTableName[variable]; -- with NUL plus padding for 4-alignment
|
||||
*
|
||||
* struct _MBCSToUFallback { (fallbacks are sorted by offset)
|
||||
* uint32_t offset;
|
||||
* UChar32 codePoint;
|
||||
* } toUFallbacks[countToUFallbacks];
|
||||
*
|
||||
* uint16_t unicodeCodeUnits[(offsetFromUTable-offsetToUCodeUnits)/2];
|
||||
* (padded to an even number of units)
|
||||
*
|
||||
* -- stage 1 tables
|
||||
* if(staticData.unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
|
||||
* -- stage 1 table for all of Unicode
|
||||
* uint16_t fromUTable[0x440]; (32-bit-aligned)
|
||||
* -- all _MBCSHeader fields except for version and flags are 0
|
||||
* } else {
|
||||
* -- BMP-only tables have a smaller stage 1 table
|
||||
* uint16_t fromUTable[0x40]; (32-bit-aligned)
|
||||
* -- normal base table with optional extension
|
||||
*
|
||||
* int32_t stateTable[countStates][256];
|
||||
*
|
||||
* struct _MBCSToUFallback { (fallbacks are sorted by offset)
|
||||
* uint32_t offset;
|
||||
* UChar32 codePoint;
|
||||
* } toUFallbacks[countToUFallbacks];
|
||||
*
|
||||
* uint16_t unicodeCodeUnits[(offsetFromUTable-offsetToUCodeUnits)/2];
|
||||
* (padded to an even number of units)
|
||||
*
|
||||
* -- stage 1 tables
|
||||
* if(staticData.unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
|
||||
* -- stage 1 table for all of Unicode
|
||||
* uint16_t fromUTable[0x440]; (32-bit-aligned)
|
||||
* } else {
|
||||
* -- BMP-only tables have a smaller stage 1 table
|
||||
* uint16_t fromUTable[0x40]; (32-bit-aligned)
|
||||
* }
|
||||
*
|
||||
* -- stage 2 tables
|
||||
* length determined by top of stage 1 and bottom of stage 3 tables
|
||||
* if(outputType==MBCS_OUTPUT_1) {
|
||||
* -- SBCS: pure indexes
|
||||
* uint16_t stage 2 indexes[?];
|
||||
* } else {
|
||||
* -- DBCS, MBCS, EBCDIC_STATEFUL, ...: roundtrip flags and indexes
|
||||
* uint32_t stage 2 flags and indexes[?];
|
||||
* }
|
||||
*
|
||||
* -- stage 3 tables with byte results
|
||||
* if(outputType==MBCS_OUTPUT_1) {
|
||||
* -- SBCS: each 16-bit result contains flags and the result byte, see ucnvmbcs.c
|
||||
* uint16_t fromUBytes[fromUBytesLength/2];
|
||||
* } else {
|
||||
* -- DBCS, MBCS, EBCDIC_STATEFUL, ... 2/3/4 bytes result, see ucnvmbcs.c
|
||||
* uint8_t fromUBytes[fromUBytesLength]; or
|
||||
* uint16_t fromUBytes[fromUBytesLength/2]; or
|
||||
* uint32_t fromUBytes[fromUBytesLength/4];
|
||||
* }
|
||||
* }
|
||||
*
|
||||
* -- stage 2 tables
|
||||
* length determined by top of stage 1 and bottom of stage 3 tables
|
||||
* if(outputType==MBCS_OUTPUT_1) {
|
||||
* -- SBCS: pure indexes
|
||||
* uint16_t stage 2 indexes[?];
|
||||
* } else {
|
||||
* -- DBCS, MBCS, EBCDIC_STATEFUL, ...: roundtrip flags and indexes
|
||||
* uint32_t stage 2 flags and indexes[?];
|
||||
* }
|
||||
*
|
||||
* -- stage 3 tables with byte results
|
||||
* if(outputType==MBCS_OUTPUT_1) {
|
||||
* -- SBCS: each 16-bit result contains flags and the result byte, see ucnvmbcs.c
|
||||
* uint16_t fromUBytes[fromUBytesLength/2];
|
||||
* } else {
|
||||
* -- DBCS, MBCS, EBCDIC_STATEFUL, ... 2/3/4 bytes result, see ucnvmbcs.c
|
||||
* uint8_t fromUBytes[fromUBytesLength]; or
|
||||
* uint16_t fromUBytes[fromUBytesLength/2]; or
|
||||
* uint32_t fromUBytes[fromUBytesLength/4];
|
||||
* }
|
||||
* -- extension table, details see ucnv_ext.h
|
||||
* int32_t indexes[>=32]; ...
|
||||
*/
|
||||
|
||||
/* MBCS converter data and state -------------------------------------------- */
|
||||
|
||||
enum {
|
||||
MBCS_MAX_STATE_COUNT=128
|
||||
};
|
||||
|
||||
/**
|
||||
* MBCS action codes for conversions to Unicode.
|
||||
* These values are in bits 23..20 of the state table entries.
|
||||
|
@ -175,7 +197,11 @@ enum {
|
|||
MBCS_OUTPUT_4_EUC, /* 9 */
|
||||
|
||||
MBCS_OUTPUT_2_SISO=12, /* c */
|
||||
MBCS_OUTPUT_2_HZ /* d */
|
||||
MBCS_OUTPUT_2_HZ, /* d */
|
||||
|
||||
MBCS_OUTPUT_EXT_ONLY, /* e */
|
||||
|
||||
MBCS_OUTPUT_COUNT
|
||||
};
|
||||
|
||||
/**
|
||||
|
@ -210,6 +236,9 @@ typedef struct UConverterMBCSTable {
|
|||
|
||||
/* converter name for swaplfnl */
|
||||
char *swapLFNLName;
|
||||
|
||||
/* extension data */
|
||||
const int32_t *extIndexes;
|
||||
} UConverterMBCSTable;
|
||||
|
||||
/**
|
||||
|
|
|
@ -455,7 +455,7 @@ ucnv_safeClone(const UConverter *cnv,
|
|||
UErrorCode *status);
|
||||
|
||||
/** @stable ICU 2.0 */
|
||||
#define U_CNV_SAFECLONE_BUFFERSIZE 3072
|
||||
#define U_CNV_SAFECLONE_BUFFERSIZE 4096
|
||||
|
||||
/**
|
||||
* Deletes the unicode converter and releases resources associated
|
||||
|
|
28
icu4c/source/test/testdata/conversion.txt
vendored
28
icu4c/source/test/testdata/conversion.txt
vendored
|
@ -43,6 +43,16 @@ conversion {
|
|||
toUnicode {
|
||||
Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" }
|
||||
Cases {
|
||||
// extensions
|
||||
{
|
||||
"*test3",
|
||||
:bin{ 00050601020b0701020a01020c },
|
||||
"\u20ac\x05\x06\x0b\U00101234\U00023456\ufffd",
|
||||
:intvector{ 0, 1, 2, 3, 6, 6, 7, 7, 10 },
|
||||
:int{1}, :int{0}, "", "?", :bin{""}
|
||||
}
|
||||
|
||||
// normal conversions
|
||||
{
|
||||
"UTF-16LE",
|
||||
:bin{ 310000d801dc00d902dc320000d8330001dc3400 },
|
||||
|
@ -110,6 +120,24 @@ conversion {
|
|||
fromUnicode {
|
||||
Headers { "charset", "unicode", "bytes", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidUChars" }
|
||||
Cases {
|
||||
// extensions
|
||||
{
|
||||
"*test3",
|
||||
"\xc4\xc4\xc4\U00101234\xc4\xc4\U00101234\x05",
|
||||
:bin{ ffffff070501020c },
|
||||
:intvector{ 0, 1, 2, 3, 5, 5, 5, 5 },
|
||||
:int{1}, :int{0}, "", "?", ""
|
||||
}
|
||||
|
||||
{
|
||||
"*test3",
|
||||
"\U00101234\U00101234\U00050005\U00101234\U00050005\U00060006",
|
||||
:bin{ 07070001020e05070001020f09 },
|
||||
:intvector{ 0, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6 },
|
||||
:int{1}, :int{0}, "", "?", ""
|
||||
}
|
||||
|
||||
// normal conversions
|
||||
{
|
||||
"UTF-16LE",
|
||||
"1\U00010001\U000500022\ud8003\udc014",
|
||||
|
|
7
icu4c/source/test/testdata/test1.ucm
vendored
7
icu4c/source/test/testdata/test1.ucm
vendored
|
@ -1,18 +1,19 @@
|
|||
# *******************************************************************************
|
||||
# * Copyright (C) 2001, International Business Machines
|
||||
# * Copyright (C) 2001-2003, International Business Machines
|
||||
# * Corporation and others. All Rights Reserved.
|
||||
# *******************************************************************************
|
||||
#
|
||||
# test1.ucm
|
||||
#
|
||||
# Test file for MBCS conversion with single-byte codepage data.
|
||||
# Also contains extension mappings (m:n).
|
||||
|
||||
<code_set_name> "test1"
|
||||
<mb_cur_max> 1
|
||||
<mb_cur_min> 1
|
||||
<uconv_class> "MBCS"
|
||||
<subchar> \xff
|
||||
<icu:state> 0, 5-9, ff
|
||||
<subchar> \xff
|
||||
<icu:state> 0, 5-9, ff
|
||||
|
||||
CHARMAP
|
||||
|
||||
|
|
26
icu4c/source/test/testdata/test3.ucm
vendored
26
icu4c/source/test/testdata/test3.ucm
vendored
|
@ -1,20 +1,21 @@
|
|||
# *******************************************************************************
|
||||
# * Copyright (C) 2001, International Business Machines
|
||||
# * Copyright (C) 2001-2003, International Business Machines
|
||||
# * Corporation and others. All Rights Reserved.
|
||||
# *******************************************************************************
|
||||
#
|
||||
# test3.ucm
|
||||
#
|
||||
# Test file for MBCS conversion with three-byte codepage data.
|
||||
# Also contains extension mappings (m:n).
|
||||
|
||||
<code_set_name> "test3"
|
||||
<mb_cur_max> 3
|
||||
<mb_cur_min> 1
|
||||
<uconv_class> "MBCS"
|
||||
<subchar> \xff
|
||||
<icu:state> 0, 1:1, 5-9, ff
|
||||
<icu:state> 2:2
|
||||
<icu:state> a-f.p
|
||||
<subchar> \xff
|
||||
<icu:state> 0, 1:1, 5-9, ff
|
||||
<icu:state> 2:2
|
||||
<icu:state> a-f.p
|
||||
|
||||
CHARMAP
|
||||
|
||||
|
@ -24,6 +25,11 @@ CHARMAP
|
|||
# nothing special
|
||||
<U0005> \x05 |0
|
||||
|
||||
# extensions
|
||||
<U00c0> \x05+\x01\x02\x0d |0
|
||||
<U00c0> \x05+\x01\x02\x0e |3
|
||||
<U00c0> \x05+\xff |3
|
||||
|
||||
# toUnicode result is fallback direct
|
||||
<U0006> \x06 |3
|
||||
|
||||
|
@ -31,8 +37,18 @@ CHARMAP
|
|||
<U101234> \x07 |0
|
||||
<Ufebcd> \x08 |3
|
||||
|
||||
# extensions
|
||||
<U101234>+<U50005>+<U60006> \x07+\x00+\x01\x02\x0f+\x09 |0
|
||||
<U101234>+<U50005> \x07+\x00+\x01\x02\x0e+\x05 |0
|
||||
<U101234>+<U60006> \x07+\x00+\x01\x02\x0f+\x06 |0
|
||||
<U101234>+<U70007> \x07+\x00+\x01\x02\x0f |1
|
||||
|
||||
#unassigned \x09
|
||||
|
||||
# extensions where the first code point is unassigned, for replay testing
|
||||
#<U00c4><U0300> \x09+\x09 |0
|
||||
<U00c4><U00c4><U101234><U0005> \x05+\x01\x02\x0c |0
|
||||
|
||||
# toUnicode result is surrogate pair: test real pair, single unit, unassigned
|
||||
<U23456> \x01\x02\x0a |0
|
||||
<U000b> \x01\x02\x0b |0
|
||||
|
|
14
icu4c/source/test/testdata/test4.ucm
vendored
14
icu4c/source/test/testdata/test4.ucm
vendored
|
@ -1,21 +1,21 @@
|
|||
# *******************************************************************************
|
||||
# * Copyright (C) 2001, International Business Machines
|
||||
# * Copyright (C) 2001-2003, International Business Machines
|
||||
# * Corporation and others. All Rights Reserved.
|
||||
# *******************************************************************************
|
||||
#
|
||||
# test4.ucm
|
||||
#
|
||||
# Test file for MBCS conversion with three-byte codepage data.
|
||||
# Test file for MBCS conversion with four-byte codepage data.
|
||||
|
||||
<code_set_name> "test4"
|
||||
<mb_cur_max> 4
|
||||
<mb_cur_min> 1
|
||||
<uconv_class> "MBCS"
|
||||
<subchar> \xff
|
||||
<icu:state> 0, 1:1, 5-9, ff
|
||||
<icu:state> 2:2
|
||||
<icu:state> 3:3
|
||||
<icu:state> a-f.p
|
||||
<subchar> \xff
|
||||
<icu:state> 0, 1:1, 5-9, ff
|
||||
<icu:state> 2:2
|
||||
<icu:state> 3:3
|
||||
<icu:state> a-f.p
|
||||
|
||||
CHARMAP
|
||||
|
||||
|
|
|
@ -26,6 +26,7 @@
|
|||
#include "unicode/udata.h"
|
||||
#include "utrie.h"
|
||||
#include "unicode/uset.h"
|
||||
#include "toolutil.h"
|
||||
#include "unewdata.h"
|
||||
#include "unormimp.h"
|
||||
#include "gennorm.h"
|
||||
|
@ -86,87 +87,6 @@ setUnicodeVersion(const char *v) {
|
|||
|
||||
static int32_t indexes[_NORM_INDEX_TOP]={ 0 };
|
||||
|
||||
/* tool memory helper ------------------------------------------------------- */
|
||||
|
||||
/*
|
||||
* UToolMemory is used for generic, custom memory management.
|
||||
* It is allocated with enough space for count*size bytes starting
|
||||
* at array.
|
||||
* The array is declared with a union of large data types so
|
||||
* that its base address is aligned for any types.
|
||||
* If size is a multiple of a data type size, then such items
|
||||
* can be safely allocated inside the array, at offsets that
|
||||
* are themselves multiples of size.
|
||||
*/
|
||||
typedef struct UToolMemory {
    /* label used in error messages */
    char name[64];
    /* number of items that fit into array[] */
    uint32_t count;
    /* size of one item in bytes */
    uint32_t size;
    /* number of items handed out so far */
    uint32_t index;
    /*
     * Start of the item storage; the union of large types makes
     * the base address suitably aligned for any item type.
     */
    union {
        uint32_t u;
        double d;
        void *p;
    } array[1];
} UToolMemory;
|
||||
|
||||
static UToolMemory *
|
||||
utm_open(const char *name, uint32_t count, uint32_t size) {
|
||||
UToolMemory *mem=(UToolMemory *)uprv_malloc(sizeof(UToolMemory)+count*size);
|
||||
if(mem==NULL) {
|
||||
fprintf(stderr, "error: %s - out of memory\n", name);
|
||||
exit(U_MEMORY_ALLOCATION_ERROR);
|
||||
}
|
||||
uprv_strcpy(mem->name, name);
|
||||
mem->count=count;
|
||||
mem->size=size;
|
||||
mem->index=0;
|
||||
return mem;
|
||||
}
|
||||
|
||||
static void
|
||||
utm_close(UToolMemory *mem) {
|
||||
if(mem!=NULL) {
|
||||
uprv_free(mem);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
static void *
|
||||
utm_getStart(UToolMemory *mem) {
|
||||
return (char *)mem->array;
|
||||
}
|
||||
|
||||
static int32_t
|
||||
utm_countItems(UToolMemory *mem) {
|
||||
return mem->index;
|
||||
}
|
||||
|
||||
static void *
|
||||
utm_alloc(UToolMemory *mem) {
|
||||
char *p=(char *)mem->array+mem->index*mem->size;
|
||||
if(++mem->index<=mem->count) {
|
||||
uprv_memset(p, 0, mem->size);
|
||||
return p;
|
||||
} else {
|
||||
fprintf(stderr, "error: %s - trying to use more than %ld preallocated units\n",
|
||||
mem->name, (long)mem->count);
|
||||
exit(U_MEMORY_ALLOCATION_ERROR);
|
||||
}
|
||||
}
|
||||
|
||||
static void *
|
||||
utm_allocN(UToolMemory *mem, int32_t n) {
|
||||
char *p=(char *)mem->array+mem->index*mem->size;
|
||||
if((mem->index+=(uint32_t)n)<=mem->count) {
|
||||
uprv_memset(p, 0, n*mem->size);
|
||||
return p;
|
||||
} else {
|
||||
fprintf(stderr, "error: %s - trying to use more than %ld preallocated units\n",
|
||||
mem->name, (long)mem->count);
|
||||
exit(U_MEMORY_ALLOCATION_ERROR);
|
||||
}
|
||||
}
|
||||
|
||||
/* builder data ------------------------------------------------------------- */
|
||||
|
||||
typedef void EnumTrieFn(void *context, uint32_t code, Norm *norm);
|
||||
|
@ -244,23 +164,23 @@ init() {
|
|||
}
|
||||
|
||||
/* allocate Norm structures and reset the first one */
|
||||
normMem=utm_open("gennorm normalization structs", 20000, sizeof(Norm));
|
||||
normMem=utm_open("gennorm normalization structs", 20000, 20000, sizeof(Norm));
|
||||
norms=utm_alloc(normMem);
|
||||
|
||||
/* allocate UTF-32 string memory */
|
||||
utf32Mem=utm_open("gennorm UTF-32 strings", 30000, 4);
|
||||
utf32Mem=utm_open("gennorm UTF-32 strings", 30000, 30000, 4);
|
||||
|
||||
/* reset all "have seen" flags */
|
||||
uprv_memset(haveSeenFlags, 0, sizeof(haveSeenFlags));
|
||||
|
||||
/* allocate extra data memory for UTF-16 decomposition strings and other values */
|
||||
extraMem=utm_open("gennorm extra 16-bit memory", _NORM_EXTRA_INDEX_TOP, 2);
|
||||
extraMem=utm_open("gennorm extra 16-bit memory", _NORM_EXTRA_INDEX_TOP, _NORM_EXTRA_INDEX_TOP, 2);
|
||||
/* initialize the extraMem counter for the top of FNC strings */
|
||||
p16=(uint16_t *)utm_alloc(extraMem);
|
||||
*p16=1;
|
||||
|
||||
/* allocate temporary memory for combining triples */
|
||||
combiningTriplesMem=utm_open("gennorm combining triples", 0x4000, sizeof(CombiningTriple));
|
||||
combiningTriplesMem=utm_open("gennorm combining triples", 0x4000, 0x4000, sizeof(CombiningTriple));
|
||||
|
||||
/* set the minimum code points for no/maybe quick check values to the end of the BMP */
|
||||
indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]=0xffff;
|
||||
|
@ -508,7 +428,7 @@ processCombining() {
|
|||
triples=utm_getStart(combiningTriplesMem);
|
||||
|
||||
/* add lead and trail indexes to the triples for sorting */
|
||||
count=(uint16_t)combiningTriplesMem->index;
|
||||
count=(uint16_t)utm_countItems(combiningTriplesMem);
|
||||
for(i=0; i<count; ++i) {
|
||||
/* findCombiningCP() must always find the code point */
|
||||
triples[i].leadIndex=findCombiningCP(triples[i].lead, TRUE);
|
||||
|
@ -1265,7 +1185,7 @@ makeAll32() {
|
|||
uint32_t n;
|
||||
int32_t i, normLength, count;
|
||||
|
||||
count=(int32_t)normMem->index;
|
||||
count=(int32_t)utm_countItems(normMem);
|
||||
for(i=0; i<count; ++i) {
|
||||
norms[i].value32=make32BitNorm(norms+i);
|
||||
}
|
||||
|
@ -1292,7 +1212,7 @@ makeFCD() {
|
|||
int32_t i, count, fcdLength;
|
||||
uint16_t bothCCs;
|
||||
|
||||
count=(int32_t)normMem->index;
|
||||
count=utm_countItems(normMem);
|
||||
for(i=0; i<count; ++i) {
|
||||
bothCCs=norms[i].canonBothCCs;
|
||||
if(bothCCs==0) {
|
||||
|
@ -1400,7 +1320,7 @@ combine(uint32_t lead, uint32_t trail) {
|
|||
|
||||
/* search for all triples with c as lead code point */
|
||||
triples=utm_getStart(combiningTriplesMem);
|
||||
count=combiningTriplesMem->index;
|
||||
count=utm_countItems(combiningTriplesMem);
|
||||
|
||||
/* triples are not sorted by code point but for each lead CP there is one contiguous block */
|
||||
for(i=0; i<count && lead!=triples[i].lead; ++i) {}
|
||||
|
@ -1512,7 +1432,7 @@ canChangeWithFollowing(const uint32_t *s, int32_t length, uint8_t trailCC) {
|
|||
|
||||
/* search for all triples with c as lead code point */
|
||||
triples=utm_getStart(combiningTriplesMem);
|
||||
count=combiningTriplesMem->index;
|
||||
count=utm_countItems(combiningTriplesMem);
|
||||
c=s[0];
|
||||
|
||||
/* triples are not sorted by code point but for each lead CP there is one contiguous block */
|
||||
|
@ -1838,7 +1758,7 @@ generateData(const char *dataDir) {
|
|||
canonStartSetsTop+=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH];
|
||||
|
||||
/* make sure that the FCD trie is 4-aligned */
|
||||
if((extraMem->index+combiningTableTop)&1) {
|
||||
if((utm_countItems(extraMem)+combiningTableTop)&1) {
|
||||
combiningTable[combiningTableTop++]=0x1234; /* add one 16-bit word for an even number */
|
||||
}
|
||||
|
||||
|
@ -1850,7 +1770,7 @@ generateData(const char *dataDir) {
|
|||
size=
|
||||
_NORM_INDEX_TOP*4+
|
||||
normTrieSize+
|
||||
extraMem->index*2+
|
||||
utm_countItems(extraMem)*2+
|
||||
combiningTableTop*2+
|
||||
fcdTrieSize+
|
||||
auxTrieSize+
|
||||
|
@ -1858,7 +1778,7 @@ generateData(const char *dataDir) {
|
|||
|
||||
if(beVerbose) {
|
||||
printf("size of normalization trie %5u bytes\n", normTrieSize);
|
||||
printf("size of 16-bit extra memory %5u UChars/uint16_t\n", extraMem->index);
|
||||
printf("size of 16-bit extra memory %5u UChars/uint16_t\n", utm_countItems(extraMem));
|
||||
printf(" of that: FC_NFKC_Closure size %5u UChars/uint16_t\n", ((uint16_t *)utm_getStart(extraMem))[0]);
|
||||
printf("size of combining table %5u uint16_t\n", combiningTableTop);
|
||||
printf("size of FCD trie %5u bytes\n", fcdTrieSize);
|
||||
|
@ -1873,7 +1793,7 @@ generateData(const char *dataDir) {
|
|||
}
|
||||
|
||||
indexes[_NORM_INDEX_TRIE_SIZE]=normTrieSize;
|
||||
indexes[_NORM_INDEX_UCHAR_COUNT]=(uint16_t)extraMem->index;
|
||||
indexes[_NORM_INDEX_UCHAR_COUNT]=(uint16_t)utm_countItems(extraMem);
|
||||
|
||||
indexes[_NORM_INDEX_COMBINE_DATA_COUNT]=combiningTableTop;
|
||||
indexes[_NORM_INDEX_COMBINE_FWD_COUNT]=combineFwdTop;
|
||||
|
@ -1900,7 +1820,7 @@ generateData(const char *dataDir) {
|
|||
|
||||
udata_writeBlock(pData, indexes, sizeof(indexes));
|
||||
udata_writeBlock(pData, normTrieBlock, normTrieSize);
|
||||
udata_writeBlock(pData, utm_getStart(extraMem), extraMem->index*2);
|
||||
udata_writeBlock(pData, utm_getStart(extraMem), utm_countItems(extraMem)*2);
|
||||
udata_writeBlock(pData, combiningTable, combiningTableTop*2);
|
||||
udata_writeBlock(pData, fcdTrieBlock, fcdTrieSize);
|
||||
udata_writeBlock(pData, auxTrieBlock, auxTrieSize);
|
||||
|
@ -1928,7 +1848,7 @@ extern void
|
|||
cleanUpData(void) {
|
||||
int32_t i, count;
|
||||
|
||||
count=(int32_t)normMem->index;
|
||||
count=utm_countItems(normMem);
|
||||
for(i=0; i<count; ++i) {
|
||||
uset_close(norms[i].canonStart);
|
||||
}
|
||||
|
|
|
@ -34,7 +34,7 @@ TARGET = makeconv$(EXEEXT)
|
|||
CPPFLAGS += -I$(top_builddir)/common -I$(top_srcdir)/common -I$(srcdir)/../toolutil
|
||||
LIBS = $(LIBICUTOOLUTIL) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M)
|
||||
|
||||
OBJECTS = makeconv.o ucnvstat.o genmbcs.o
|
||||
OBJECTS = makeconv.o ucnvstat.o genmbcs.o gencnvex.o
|
||||
|
||||
DEPS = $(OBJECTS:.o=.d)
|
||||
|
||||
|
|
996
icu4c/source/tools/makeconv/gencnvex.c
Normal file
996
icu4c/source/tools/makeconv/gencnvex.c
Normal file
|
@ -0,0 +1,996 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2003, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: gencnvex.c
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2003oct12
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "cstring.h"
|
||||
#include "cmemory.h"
|
||||
#include "ucnv_cnv.h"
|
||||
#include "ucnvmbcs.h"
|
||||
#include "toolutil.h"
|
||||
#include "unewdata.h"
|
||||
#include "ucm.h"
|
||||
#include "makeconv.h"
|
||||
#include "genmbcs.h"
|
||||
|
||||
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
|
||||
|
||||
static void
|
||||
CnvExtClose(NewConverter *cnvData);
|
||||
|
||||
static UBool
|
||||
CnvExtIsValid(NewConverter *cnvData,
|
||||
const uint8_t *bytes, int32_t length);
|
||||
|
||||
static UBool
|
||||
CnvExtAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData);
|
||||
|
||||
static uint32_t
|
||||
CnvExtWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
|
||||
UNewDataMemory *pData, int32_t tableType);
|
||||
|
||||
typedef struct CnvExtData {
|
||||
NewConverter newConverter;
|
||||
|
||||
UCMFile *ucm;
|
||||
|
||||
/* toUnicode (state table in ucm->states) */
|
||||
UToolMemory *toUTable, *toUUChars;
|
||||
|
||||
/* fromUnicode */
|
||||
UToolMemory *fromUTableUChars, *fromUTableValues, *fromUBytes;
|
||||
|
||||
uint16_t stage1[MBCS_STAGE_1_SIZE];
|
||||
uint16_t stage2[MBCS_STAGE_2_SIZE];
|
||||
uint16_t stage3[0x10000<<UCNV_EXT_STAGE_2_LEFT_SHIFT]; /* 0x10000 because of 16-bit stage 2/3 indexes */
|
||||
uint32_t stage3b[0x10000];
|
||||
|
||||
int32_t stage1Top, stage2Top, stage3Top, stage3bTop;
|
||||
|
||||
/* for stage3 compaction of <subchar1> |2 mappings */
|
||||
uint16_t stage3Sub1Block;
|
||||
} CnvExtData;
|
||||
|
||||
NewConverter *
|
||||
CnvExtOpen(UCMFile *ucm) {
|
||||
CnvExtData *extData;
|
||||
|
||||
extData=(CnvExtData *)uprv_malloc(sizeof(CnvExtData));
|
||||
if(extData!=NULL) {
|
||||
uprv_memset(extData, 0, sizeof(CnvExtData));
|
||||
|
||||
extData->ucm=ucm; /* aliased, not owned */
|
||||
|
||||
extData->newConverter.close=CnvExtClose;
|
||||
extData->newConverter.isValid=CnvExtIsValid;
|
||||
extData->newConverter.addTable=CnvExtAddTable;
|
||||
extData->newConverter.write=CnvExtWrite;
|
||||
}
|
||||
return &extData->newConverter;
|
||||
}
|
||||
|
||||
static void
|
||||
CnvExtClose(NewConverter *cnvData) {
|
||||
CnvExtData *extData=(CnvExtData *)cnvData;
|
||||
if(extData!=NULL) {
|
||||
utm_close(extData->toUTable);
|
||||
utm_close(extData->toUUChars);
|
||||
utm_close(extData->fromUTableUChars);
|
||||
utm_close(extData->fromUTableValues);
|
||||
utm_close(extData->fromUBytes);
|
||||
}
|
||||
}
|
||||
|
||||
/* we do not expect this to be called */
|
||||
static UBool
|
||||
CnvExtIsValid(NewConverter *cnvData,
|
||||
const uint8_t *bytes, int32_t length) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
static uint32_t
|
||||
CnvExtWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
|
||||
UNewDataMemory *pData, int32_t tableType) {
|
||||
CnvExtData *extData=(CnvExtData *)cnvData;
|
||||
int32_t length, top, headerSize;
|
||||
|
||||
int32_t indexes[UCNV_EXT_INDEXES_MIN_LENGTH]={ 0 };
|
||||
|
||||
if(tableType&TABLE_BASE) {
|
||||
headerSize=0;
|
||||
} else {
|
||||
_MBCSHeader header={ 0 };
|
||||
|
||||
/* write the header and base table name for an extension-only table */
|
||||
length=uprv_strlen(extData->ucm->baseName)+1;
|
||||
while(length&3) {
|
||||
/* add padding */
|
||||
extData->ucm->baseName[length++]=0;
|
||||
}
|
||||
|
||||
headerSize=sizeof(header)+length;
|
||||
|
||||
/* fill the header */
|
||||
header.version[0]=4;
|
||||
header.version[1]=2;
|
||||
header.flags=(uint32_t)((headerSize<<8)|MBCS_OUTPUT_EXT_ONLY);
|
||||
|
||||
/* write the header and the base table name */
|
||||
udata_writeBlock(pData, &header, sizeof(header));
|
||||
udata_writeBlock(pData, extData->ucm->baseName, length);
|
||||
}
|
||||
|
||||
/* fill indexes[] - offsets/indexes are in units of the target array */
|
||||
top=0;
|
||||
|
||||
indexes[UCNV_EXT_INDEXES_LENGTH]=length=UCNV_EXT_INDEXES_MIN_LENGTH;
|
||||
top+=length*4;
|
||||
|
||||
indexes[UCNV_EXT_TO_U_INDEX]=top;
|
||||
indexes[UCNV_EXT_TO_U_LENGTH]=length=utm_countItems(extData->toUTable);
|
||||
top+=length*4;
|
||||
|
||||
indexes[UCNV_EXT_TO_U_UCHARS_INDEX]=top;
|
||||
indexes[UCNV_EXT_TO_U_UCHARS_LENGTH]=length=utm_countItems(extData->toUUChars);
|
||||
top+=length*2;
|
||||
|
||||
indexes[UCNV_EXT_FROM_U_UCHARS_INDEX]=top;
|
||||
length=utm_countItems(extData->fromUTableUChars);
|
||||
top+=length*2;
|
||||
|
||||
if(top&3) {
|
||||
/* add padding */
|
||||
*((UChar *)utm_alloc(extData->fromUTableUChars))=0;
|
||||
*((uint32_t *)utm_alloc(extData->fromUTableValues))=0;
|
||||
++length;
|
||||
top+=2;
|
||||
}
|
||||
indexes[UCNV_EXT_FROM_U_LENGTH]=length;
|
||||
|
||||
indexes[UCNV_EXT_FROM_U_VALUES_INDEX]=top;
|
||||
top+=length*4;
|
||||
|
||||
indexes[UCNV_EXT_FROM_U_BYTES_INDEX]=top;
|
||||
length=utm_countItems(extData->fromUBytes);
|
||||
top+=length;
|
||||
|
||||
if(top&1) {
|
||||
/* add padding */
|
||||
*((uint8_t *)utm_alloc(extData->fromUBytes))=0;
|
||||
++length;
|
||||
++top;
|
||||
}
|
||||
indexes[UCNV_EXT_FROM_U_BYTES_LENGTH]=length;
|
||||
|
||||
indexes[UCNV_EXT_FROM_U_STAGE_12_INDEX]=top;
|
||||
indexes[UCNV_EXT_FROM_U_STAGE_1_LENGTH]=length=extData->stage1Top;
|
||||
indexes[UCNV_EXT_FROM_U_STAGE_12_LENGTH]=length+=extData->stage2Top;
|
||||
top+=length*2;
|
||||
|
||||
indexes[UCNV_EXT_FROM_U_STAGE_3_INDEX]=top;
|
||||
length=extData->stage3Top;
|
||||
top+=length*2;
|
||||
|
||||
if(top&3) {
|
||||
/* add padding */
|
||||
extData->stage3[extData->stage3Top++]=0;
|
||||
++length;
|
||||
top+=2;
|
||||
}
|
||||
indexes[UCNV_EXT_FROM_U_STAGE_3_LENGTH]=length;
|
||||
|
||||
indexes[UCNV_EXT_FROM_U_STAGE_3B_INDEX]=top;
|
||||
indexes[UCNV_EXT_FROM_U_STAGE_3B_LENGTH]=length=extData->stage3bTop;
|
||||
top+=length*4;
|
||||
|
||||
indexes[UCNV_EXT_SIZE]=top;
|
||||
|
||||
/* write the extension data */
|
||||
udata_writeBlock(pData, indexes, sizeof(indexes));
|
||||
udata_writeBlock(pData, utm_getStart(extData->toUTable), indexes[UCNV_EXT_TO_U_LENGTH]*4);
|
||||
udata_writeBlock(pData, utm_getStart(extData->toUUChars), indexes[UCNV_EXT_TO_U_UCHARS_LENGTH]*2);
|
||||
|
||||
udata_writeBlock(pData, utm_getStart(extData->fromUTableUChars), indexes[UCNV_EXT_FROM_U_LENGTH]*2);
|
||||
udata_writeBlock(pData, utm_getStart(extData->fromUTableValues), indexes[UCNV_EXT_FROM_U_LENGTH]*4);
|
||||
udata_writeBlock(pData, utm_getStart(extData->fromUBytes), indexes[UCNV_EXT_FROM_U_BYTES_LENGTH]);
|
||||
|
||||
udata_writeBlock(pData, extData->stage1, extData->stage1Top*2);
|
||||
udata_writeBlock(pData, extData->stage2, extData->stage2Top*2);
|
||||
udata_writeBlock(pData, extData->stage3, extData->stage3Top*2);
|
||||
udata_writeBlock(pData, extData->stage3b, extData->stage3bTop*4);
|
||||
|
||||
{
|
||||
int32_t i, j;
|
||||
|
||||
length=extData->stage1Top;
|
||||
printf("\nstage1[%x]:\n", length);
|
||||
|
||||
for(i=0; i<length; ++i) {
|
||||
if(extData->stage1[i]!=length) {
|
||||
printf("stage1[%04x]=%04x\n", i, extData->stage1[i]);
|
||||
}
|
||||
}
|
||||
|
||||
j=length;
|
||||
length=extData->stage2Top;
|
||||
printf("\nstage2[%x]:\n", length);
|
||||
|
||||
for(i=0; i<length; ++j, ++i) {
|
||||
if(extData->stage2[i]!=0) {
|
||||
printf("stage12[%04x]=%04x\n", j, extData->stage2[i]);
|
||||
}
|
||||
}
|
||||
|
||||
length=extData->stage3Top;
|
||||
printf("\nstage3[%x]:\n", length);
|
||||
|
||||
for(i=0; i<length; ++i) {
|
||||
if(extData->stage3[i]!=0) {
|
||||
printf("stage3[%04x]=%04x\n", i, extData->stage3[i]);
|
||||
}
|
||||
}
|
||||
|
||||
length=extData->stage3bTop;
|
||||
printf("\nstage3b[%x]:\n", length);
|
||||
|
||||
for(i=0; i<length; ++i) {
|
||||
if(extData->stage3b[i]!=0) {
|
||||
printf("stage3b[%04x]=%08x\n", i, extData->stage3b[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(VERBOSE) {
|
||||
printf("size of extension data: %ld\n", top);
|
||||
}
|
||||
|
||||
/* return the number of bytes that should have been written */
|
||||
return (uint32_t)(headerSize+top);
|
||||
}
|
||||
|
||||
/* to Unicode --------------------------------------------------------------- */
|
||||
|
||||
/*
|
||||
* Remove fromUnicode fallbacks and SUB mappings which are irrelevant for
|
||||
* the toUnicode table.
|
||||
* The table must be sorted.
|
||||
* Destroys previous data in the reverseMap.
|
||||
*/
|
||||
static int32_t
|
||||
reduceToUMappings(UCMTable *table) {
|
||||
UCMapping *mappings;
|
||||
int32_t *map;
|
||||
int32_t i, j, count;
|
||||
int8_t flag;
|
||||
|
||||
mappings=table->mappings;
|
||||
map=table->reverseMap;
|
||||
count=table->mappingsLength;
|
||||
|
||||
/* leave the map alone for the initial mappings with desired flags */
|
||||
for(i=j=0; i<count; ++i) {
|
||||
flag=mappings[map[i]].f;
|
||||
if(flag!=0 && flag!=3) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* reduce from here to the rest */
|
||||
for(j=i; i<count; ++i) {
|
||||
flag=mappings[map[i]].f;
|
||||
if(flag==0 || flag==3) {
|
||||
map[j++]=map[i];
|
||||
}
|
||||
}
|
||||
|
||||
return j;
|
||||
}
|
||||
|
||||
static uint32_t
|
||||
getToUnicodeValue(CnvExtData *extData, UCMTable *table, UCMapping *m) {
|
||||
UChar32 *u32;
|
||||
UChar *u;
|
||||
uint32_t value;
|
||||
int32_t u16Length;
|
||||
UErrorCode errorCode;
|
||||
|
||||
/* write the Unicode result code point or string index */
|
||||
if(m->uLen==1) {
|
||||
value=(uint32_t)(UCNV_EXT_TO_U_MIN_CODE_POINT+m->u);
|
||||
} else {
|
||||
/* the parser enforces m->uLen<=UCNV_EXT_MAX_UCHARS */
|
||||
|
||||
/* get the result code point string and its 16-bit string length */
|
||||
u32=UCM_GET_CODE_POINTS(table, m);
|
||||
errorCode=U_ZERO_ERROR;
|
||||
u_strFromUTF32(NULL, 0, &u16Length, u32, m->uLen, &errorCode);
|
||||
if(U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) {
|
||||
exit(errorCode);
|
||||
}
|
||||
|
||||
/* allocate it and put its length and index into the value */
|
||||
value=
|
||||
(((uint32_t)m->uLen+UCNV_EXT_TO_U_LENGTH_OFFSET)<<UCNV_EXT_TO_U_LENGTH_SHIFT)|
|
||||
((uint32_t)utm_countItems(extData->toUUChars));
|
||||
u=utm_allocN(extData->toUUChars, u16Length);
|
||||
|
||||
/* write the result 16-bit string */
|
||||
errorCode=U_ZERO_ERROR;
|
||||
u_strFromUTF32(u, u16Length, NULL, u32, m->uLen, &errorCode);
|
||||
if(U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) {
|
||||
exit(errorCode);
|
||||
}
|
||||
}
|
||||
if(m->f==0) {
|
||||
value|=UCNV_EXT_TO_U_ROUNDTRIP_FLAG;
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
/*
|
||||
* Recursive toUTable generator core function.
|
||||
* Preconditions:
|
||||
* - start<limit (There is at least one mapping.)
|
||||
* - The mappings are sorted lexically. (Access is through the reverseMap.)
|
||||
* - All mappings between start and limit have input sequences that share
|
||||
* the same prefix of unitIndex length, and therefore all of these sequences
|
||||
* are at least unitIndex+1 long.
|
||||
* - There are only relevant mappings available through the reverseMap,
|
||||
* see reduceToUMappings().
|
||||
*
|
||||
* One function invocation generates one section table.
|
||||
*
|
||||
* Steps:
|
||||
* 1. Count the number of unique unit values and get the low/high unit values
|
||||
* that occur at unitIndex.
|
||||
* 2. Allocate the section table with possible optimization for linear access.
|
||||
* 3. Write temporary version of the section table with start indexes of
|
||||
* subsections, each corresponding to one unit value at unitIndex.
|
||||
* 4. Iterate through the table once more, and depending on the subsection length:
|
||||
* 0: write 0 as a result value (unused byte in linear-access section table)
|
||||
* >0: if there is one mapping with an input unit sequence of unitIndex+1
|
||||
* then defaultValue=compute the mapping result for this whole sequence
|
||||
* else defaultValue=0
|
||||
*
|
||||
* recurse into the subsection
|
||||
*/
|
||||
static UBool
|
||||
generateToUTable(CnvExtData *extData, UCMTable *table,
|
||||
int32_t start, int32_t limit, int32_t unitIndex,
|
||||
uint32_t defaultValue) {
|
||||
UCMapping *mappings, *m;
|
||||
int32_t *map;
|
||||
int32_t i, j, uniqueCount, count, subStart, subLimit;
|
||||
|
||||
uint8_t *bytes;
|
||||
int32_t low, high, prev;
|
||||
|
||||
uint32_t *section;
|
||||
|
||||
mappings=table->mappings;
|
||||
map=table->reverseMap;
|
||||
|
||||
/* step 1: examine the input units; set low, high, uniqueCount */
|
||||
m=mappings+map[start];
|
||||
bytes=UCM_GET_BYTES(table, m);
|
||||
low=bytes[unitIndex];
|
||||
uniqueCount=1;
|
||||
|
||||
prev=high=low;
|
||||
for(i=start+1; i<limit; ++i) {
|
||||
m=mappings+map[i];
|
||||
bytes=UCM_GET_BYTES(table, m);
|
||||
high=bytes[unitIndex];
|
||||
|
||||
if(high!=prev) {
|
||||
prev=high;
|
||||
++uniqueCount;
|
||||
}
|
||||
}
|
||||
|
||||
/* step 2: allocate the section; set count, section */
|
||||
count=(high-low)+1;
|
||||
if(unitIndex==0 || uniqueCount>=(3*count)/4) {
|
||||
/*
|
||||
* for the root table and for fairly full tables:
|
||||
* allocate for direct, linear array access
|
||||
* by keeping count, to write an entry for each unit value
|
||||
* from low to high
|
||||
*/
|
||||
} else {
|
||||
count=uniqueCount;
|
||||
}
|
||||
|
||||
/* allocate the section: 1 entry for the header + count for the items */
|
||||
section=(uint32_t *)utm_allocN(extData->toUTable, 1+count);
|
||||
|
||||
/* write the section header */
|
||||
*section++=((uint32_t)count<<UCNV_EXT_TO_U_BYTE_SHIFT)|defaultValue;
|
||||
|
||||
/* step 3: write temporary section table with subsection starts */
|
||||
prev=low-1; /* just before low to prevent empty subsections before low */
|
||||
j=0; /* section table index */
|
||||
for(i=start; i<limit; ++i) {
|
||||
m=mappings+map[i];
|
||||
bytes=UCM_GET_BYTES(table, m);
|
||||
high=bytes[unitIndex];
|
||||
|
||||
if(high!=prev) {
|
||||
/* start of a new subsection for unit high */
|
||||
if(count>uniqueCount) {
|
||||
/* write empty subsections for unused units in a linear table */
|
||||
while(++prev<high) {
|
||||
section[j++]=((uint32_t)prev<<UCNV_EXT_TO_U_BYTE_SHIFT)|(uint32_t)i;
|
||||
}
|
||||
} else {
|
||||
prev=high;
|
||||
}
|
||||
|
||||
/* write the entry with the subsection start */
|
||||
section[j++]=((uint32_t)high<<UCNV_EXT_TO_U_BYTE_SHIFT)|(uint32_t)i;
|
||||
}
|
||||
}
|
||||
/* assert(j==count) */
|
||||
|
||||
/* step 4: recurse and write results */
|
||||
subLimit=UCNV_EXT_TO_U_GET_VALUE(section[0]);
|
||||
for(j=0; j<count; ++j) {
|
||||
subStart=subLimit;
|
||||
subLimit= (j+1)<count ? UCNV_EXT_TO_U_GET_VALUE(section[j+1]) : limit;
|
||||
|
||||
/* remove the subStart temporary value */
|
||||
section[j]&=~UCNV_EXT_TO_U_VALUE_MASK;
|
||||
|
||||
if(subStart==subLimit) {
|
||||
/* leave the value zero: empty subsection for unused unit in a linear table */
|
||||
continue;
|
||||
}
|
||||
|
||||
/* see if there is exactly one input unit sequence of length unitIndex+1 */
|
||||
defaultValue=0;
|
||||
m=mappings+map[subStart];
|
||||
if(m->bLen==unitIndex+1) {
|
||||
/* do not include this in generateToUTable() */
|
||||
++subStart;
|
||||
|
||||
if(subStart<subLimit && mappings[map[subStart]].bLen==unitIndex+1) {
|
||||
/* print error for multiple same-input-sequence mappings */
|
||||
fprintf(stderr, "error: multiple mappings from same bytes\n");
|
||||
ucm_printMapping(table, m, stderr);
|
||||
ucm_printMapping(table, mappings+map[subStart], stderr);
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
defaultValue=getToUnicodeValue(extData, table, m);
|
||||
}
|
||||
|
||||
if(subStart==subLimit) {
|
||||
/* write the result for the input sequence ending here */
|
||||
section[j]|=defaultValue;
|
||||
} else {
|
||||
/* write the index to the subsection table */
|
||||
section[j]|=(uint32_t)utm_countItems(extData->toUTable);
|
||||
|
||||
/* recurse */
|
||||
if(!generateToUTable(extData, table, subStart, subLimit, unitIndex+1, defaultValue)) {
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Generate the toUTable and toUUChars from the input table.
|
||||
* The input table must be sorted, and all precision flags must be 0..3.
|
||||
* This function will modify the table's reverseMap.
|
||||
*/
|
||||
static UBool
|
||||
makeToUTable(CnvExtData *extData, UCMTable *table) {
|
||||
int32_t toUCount;
|
||||
|
||||
toUCount=reduceToUMappings(table);
|
||||
|
||||
extData->toUTable=utm_open("cnv extension toUTable", 0x10000, UCNV_EXT_TO_U_MIN_CODE_POINT, 4);
|
||||
extData->toUUChars=utm_open("cnv extension toUUChars", 0x10000, UCNV_EXT_TO_U_INDEX_MASK+1, 2);
|
||||
|
||||
return generateToUTable(extData, table, 0, toUCount, 0, 0);
|
||||
}
|
||||
|
||||
/* from Unicode ------------------------------------------------------------- */
|
||||
|
||||
/*
|
||||
* preprocessing:
|
||||
* rebuild reverseMap with mapping indexes for mappings relevant for from Unicode
|
||||
* change each Unicode string to encode all but the first code point in 16-bit form
|
||||
*
|
||||
* generation:
|
||||
* for each unique code point
|
||||
* write an entry in the 3-stage trie
|
||||
* check that there is only one single-code point sequence
|
||||
* start recursion for following 16-bit input units
|
||||
*/
|
||||
|
||||
/*
|
||||
* Remove toUnicode fallbacks and non-<subchar1> SUB mappings
|
||||
* which are irrelevant for the fromUnicode extension table.
|
||||
* Overwrite the reverseMap with an index array to the relevant mappings.
|
||||
* Modify the code point sequences to a generator-friendly format where
|
||||
* the first code points remains unchanged but the following are recoded
|
||||
* into 16-bit Unicode string form.
|
||||
* The table must be sorted.
|
||||
* Destroys previous data in the reverseMap.
|
||||
*/
|
||||
static int32_t
|
||||
prepareFromUMappings(UCMTable *table) {
|
||||
UCMapping *mappings, *m;
|
||||
int32_t *map;
|
||||
int32_t i, j, count;
|
||||
int8_t flag;
|
||||
|
||||
mappings=table->mappings;
|
||||
map=table->reverseMap;
|
||||
count=table->mappingsLength;
|
||||
|
||||
/*
|
||||
* we do not go through the map on input because the mappings are
|
||||
* sorted lexically
|
||||
*/
|
||||
m=mappings;
|
||||
|
||||
for(i=j=0; i<count; ++m, ++i) {
|
||||
flag=m->f;
|
||||
if(flag==0 || flag==1 || (flag==2 && m->bLen==1)) {
|
||||
map[j++]=i;
|
||||
|
||||
if(m->uLen>1) {
|
||||
/* recode all but the first code point to 16-bit Unicode */
|
||||
UChar32 *u32;
|
||||
UChar *u;
|
||||
UChar32 c;
|
||||
int32_t q, r;
|
||||
|
||||
u32=UCM_GET_CODE_POINTS(table, m);
|
||||
u=(UChar *)u32; /* destructive in-place recoding */
|
||||
for(r=2, q=1; q<m->uLen; ++q) {
|
||||
c=u32[q];
|
||||
U16_APPEND_UNSAFE(u, r, c);
|
||||
}
|
||||
|
||||
/* counts the first code point always at 2 - the first 16-bit unit is at 16-bit index 2 */
|
||||
m->uLen=(int8_t)r;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return j;
|
||||
}
|
||||
|
||||
static uint32_t
|
||||
getFromUBytesValue(CnvExtData *extData, UCMTable *table, UCMapping *m) {
|
||||
uint8_t *bytes, *resultBytes;
|
||||
uint32_t value;
|
||||
|
||||
if(m->f==2) {
|
||||
return UCNV_EXT_FROM_U_SUBCHAR1; /* <subchar1> SUB mapping */
|
||||
}
|
||||
|
||||
bytes=UCM_GET_BYTES(table, m);
|
||||
value=0;
|
||||
switch(m->bLen) {
|
||||
/* 1..3: store the bytes in the value word */
|
||||
case 3:
|
||||
value=((uint32_t)*bytes++)<<16;
|
||||
case 2:
|
||||
value|=((uint32_t)*bytes++)<<8;
|
||||
case 1:
|
||||
value|=*bytes;
|
||||
break;
|
||||
default:
|
||||
/* the parser enforces m->bLen<=UCNV_EXT_MAX_BYTES */
|
||||
/* store the bytes in fromUBytes[] and the index in the value word */
|
||||
value=(uint32_t)utm_countItems(extData->fromUBytes);
|
||||
resultBytes=utm_allocN(extData->fromUBytes, m->bLen);
|
||||
uprv_memcpy(resultBytes, bytes, m->bLen);
|
||||
break;
|
||||
}
|
||||
value|=(uint32_t)m->bLen<<UCNV_EXT_FROM_U_LENGTH_SHIFT;
|
||||
if(m->f==0) {
|
||||
value|=UCNV_EXT_FROM_U_ROUNDTRIP_FLAG;
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
/*
|
||||
* works like generateToUTable(), except that the
|
||||
* output section consists of two arrays, one for input UChars and one
|
||||
* for result values
|
||||
*
|
||||
* also, fromUTable sections are always stored in a compact form for
|
||||
* access via binary search
|
||||
*/
|
||||
static UBool
|
||||
generateFromUTable(CnvExtData *extData, UCMTable *table,
|
||||
int32_t start, int32_t limit, int32_t unitIndex,
|
||||
uint32_t defaultValue) {
|
||||
UCMapping *mappings, *m;
|
||||
int32_t *map;
|
||||
int32_t i, j, uniqueCount, count, subStart, subLimit;
|
||||
|
||||
UChar *uchars;
|
||||
UChar32 low, high, prev;
|
||||
|
||||
UChar *sectionUChars;
|
||||
uint32_t *sectionValues;
|
||||
|
||||
mappings=table->mappings;
|
||||
map=table->reverseMap;
|
||||
|
||||
/* step 1: examine the input units; set low, high, uniqueCount */
|
||||
m=mappings+map[start];
|
||||
uchars=(UChar *)UCM_GET_CODE_POINTS(table, m);
|
||||
low=uchars[unitIndex];
|
||||
uniqueCount=1;
|
||||
|
||||
prev=high=low;
|
||||
for(i=start+1; i<limit; ++i) {
|
||||
m=mappings+map[i];
|
||||
uchars=(UChar *)UCM_GET_CODE_POINTS(table, m);
|
||||
high=uchars[unitIndex];
|
||||
|
||||
if(high!=prev) {
|
||||
prev=high;
|
||||
++uniqueCount;
|
||||
}
|
||||
}
|
||||
|
||||
/* step 2: allocate the section; set count, section */
|
||||
/* the fromUTable always stores for access via binary search */
|
||||
count=uniqueCount;
|
||||
|
||||
/* allocate the section: 1 entry for the header + count for the items */
|
||||
sectionUChars=(UChar *)utm_allocN(extData->fromUTableUChars, 1+count);
|
||||
sectionValues=(uint32_t *)utm_allocN(extData->fromUTableValues, 1+count);
|
||||
|
||||
/* write the section header */
|
||||
*sectionUChars++=(UChar)count;
|
||||
*sectionValues++=defaultValue;
|
||||
|
||||
/* step 3: write temporary section table with subsection starts */
|
||||
prev=low-1; /* just before low to prevent empty subsections before low */
|
||||
j=0; /* section table index */
|
||||
for(i=start; i<limit; ++i) {
|
||||
m=mappings+map[i];
|
||||
uchars=(UChar *)UCM_GET_CODE_POINTS(table, m);
|
||||
high=uchars[unitIndex];
|
||||
|
||||
if(high!=prev) {
|
||||
/* start of a new subsection for unit high */
|
||||
prev=high;
|
||||
|
||||
/* write the entry with the subsection start */
|
||||
sectionUChars[j]=(UChar)high;
|
||||
sectionValues[j]=(uint32_t)i;
|
||||
++j;
|
||||
}
|
||||
}
|
||||
/* assert(j==count) */
|
||||
|
||||
/* step 4: recurse and write results */
|
||||
subLimit=(int32_t)(sectionValues[0]);
|
||||
for(j=0; j<count; ++j) {
|
||||
subStart=subLimit;
|
||||
subLimit= (j+1)<count ? (int32_t)(sectionValues[j+1]) : limit;
|
||||
|
||||
/* see if there is exactly one input unit sequence of length unitIndex+1 */
|
||||
defaultValue=0;
|
||||
m=mappings+map[subStart];
|
||||
if(m->uLen==unitIndex+1) {
|
||||
/* do not include this in generateToUTable() */
|
||||
++subStart;
|
||||
|
||||
if(subStart<subLimit && mappings[map[subStart]].uLen==unitIndex+1) {
|
||||
/* print error for multiple same-input-sequence mappings */
|
||||
fprintf(stderr, "error: multiple mappings from same Unicode code points\n");
|
||||
ucm_printMapping(table, m, stderr);
|
||||
ucm_printMapping(table, mappings+map[subStart], stderr);
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
defaultValue=getFromUBytesValue(extData, table, m);
|
||||
}
|
||||
|
||||
if(subStart==subLimit) {
|
||||
/* write the result for the input sequence ending here */
|
||||
sectionValues[j]=defaultValue;
|
||||
} else {
|
||||
/* write the index to the subsection table */
|
||||
sectionValues[j]=(uint32_t)utm_countItems(extData->fromUTableValues);
|
||||
|
||||
/* recurse */
|
||||
if(!generateFromUTable(extData, table, subStart, subLimit, unitIndex+1, defaultValue)) {
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/*
|
||||
* add entries to the fromUnicode trie,
|
||||
* assume to be called with code points in ascending order
|
||||
* and use that to build the trie in precompacted form
|
||||
*/
|
||||
static void
|
||||
addFromUTrieEntry(CnvExtData *extData, UChar32 c, uint32_t value) {
|
||||
int32_t i1, i2, i3, i3b, nextOffset, min, newBlock;
|
||||
|
||||
if(value==0) {
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* compute the index for each stage,
|
||||
* allocate a stage block if necessary,
|
||||
* and write the stage value
|
||||
*/
|
||||
i1=c>>10;
|
||||
if(i1>=extData->stage1Top) {
|
||||
extData->stage1Top=i1+1;
|
||||
}
|
||||
|
||||
nextOffset=(c>>4)&0x3f;
|
||||
|
||||
if(extData->stage1[i1]==0) {
|
||||
/* allocate another block in stage 2; overlap with the previous block */
|
||||
newBlock=extData->stage2Top;
|
||||
min=newBlock-nextOffset; /* minimum block start with overlap */
|
||||
while(min<newBlock && extData->stage2[newBlock-1]==0) {
|
||||
--newBlock;
|
||||
}
|
||||
|
||||
extData->stage1[i1]=(uint16_t)newBlock;
|
||||
extData->stage2Top=newBlock+MBCS_STAGE_2_BLOCK_SIZE;
|
||||
if(extData->stage2Top>LENGTHOF(extData->stage2)) {
|
||||
fprintf(stderr, "error: too many stage 2 entries at U+%04x\n", c);
|
||||
exit(U_MEMORY_ALLOCATION_ERROR);
|
||||
}
|
||||
}
|
||||
|
||||
i2=extData->stage1[i1]+nextOffset;
|
||||
nextOffset=c&0xf;
|
||||
|
||||
if(extData->stage2[i2]==0) {
|
||||
/* allocate another block in stage 3; overlap with the previous block */
|
||||
newBlock=extData->stage3Top;
|
||||
min=newBlock-nextOffset; /* minimum block start with overlap */
|
||||
while(min<newBlock && extData->stage3[newBlock-1]==0) {
|
||||
--newBlock;
|
||||
}
|
||||
|
||||
/* round up to a multiple of stage 3 granularity >1 (similar to utrie.c) */
|
||||
newBlock=(newBlock+(UCNV_EXT_STAGE_3_GRANULARITY-1))&~(UCNV_EXT_STAGE_3_GRANULARITY-1);
|
||||
extData->stage2[i2]=(uint16_t)(newBlock>>UCNV_EXT_STAGE_2_LEFT_SHIFT);
|
||||
|
||||
extData->stage3Top=newBlock+MBCS_STAGE_3_BLOCK_SIZE;
|
||||
if(extData->stage3Top>LENGTHOF(extData->stage3)) {
|
||||
fprintf(stderr, "error: too many stage 3 entries at U+%04x\n", c);
|
||||
exit(U_MEMORY_ALLOCATION_ERROR);
|
||||
}
|
||||
}
|
||||
|
||||
i3=((int32_t)extData->stage2[i2]<<UCNV_EXT_STAGE_2_LEFT_SHIFT)+nextOffset;
|
||||
/*
|
||||
* assume extData->stage3[i3]==0 because we get
|
||||
* code points in strictly ascending order
|
||||
*/
|
||||
|
||||
if(value==UCNV_EXT_FROM_U_SUBCHAR1) {
|
||||
/* <subchar1> SUB mapping, see getFromUBytesValue() and prepareFromUMappings() */
|
||||
extData->stage3[i3]=1;
|
||||
|
||||
/*
|
||||
* precompaction is not optimal for <subchar1> |2 mappings because
|
||||
* stage3 values for them are all the same, unlike for other mappings
|
||||
* which all have unique values;
|
||||
* use a simple compaction of reusing a whole block filled with these
|
||||
* mappings
|
||||
*/
|
||||
|
||||
/* is the entire block filled with <subchar1> |2 mappings? */
|
||||
if(nextOffset==MBCS_STAGE_3_BLOCK_SIZE-1) {
|
||||
for(min=i3-nextOffset;
|
||||
min<i3 && extData->stage3[min]==1;
|
||||
++min) {}
|
||||
|
||||
if(min==i3) {
|
||||
/* the entire block is filled with these mappings */
|
||||
if(extData->stage3Sub1Block!=0) {
|
||||
/* point to the previous such block and remove this block from stage3 */
|
||||
extData->stage2[i2]=extData->stage3Sub1Block;
|
||||
extData->stage3Top-=MBCS_STAGE_3_BLOCK_SIZE;
|
||||
uprv_memset(extData->stage3+extData->stage3Top, 0, MBCS_STAGE_3_BLOCK_SIZE*2);
|
||||
} else {
|
||||
/* remember this block's stage2 entry */
|
||||
extData->stage3Sub1Block=extData->stage2[i2];
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if((i3b=extData->stage3bTop++)>=LENGTHOF(extData->stage3b)) {
|
||||
fprintf(stderr, "error: too many stage 3b entries at U+%04x\n", c);
|
||||
exit(U_MEMORY_ALLOCATION_ERROR);
|
||||
}
|
||||
|
||||
/* roundtrip or fallback mapping */
|
||||
extData->stage3[i3]=(uint16_t)i3b;
|
||||
extData->stage3b[i3b]=value;
|
||||
}
|
||||
}
|
||||
|
||||
static UBool
|
||||
generateFromUTrie(CnvExtData *extData, UCMTable *table, int32_t mapLength) {
|
||||
UCMapping *mappings, *m;
|
||||
int32_t *map;
|
||||
uint32_t value;
|
||||
int32_t subStart, subLimit;
|
||||
|
||||
UChar32 *codePoints;
|
||||
UChar32 c, next;
|
||||
|
||||
if(mapLength==0) {
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
mappings=table->mappings;
|
||||
map=table->reverseMap;
|
||||
|
||||
/*
|
||||
* iterate over same-initial-code point mappings,
|
||||
* enter the initial code point into the trie,
|
||||
* and start a recursion on the corresponding mappings section
|
||||
* with generateFromUTable()
|
||||
*/
|
||||
m=mappings+map[0];
|
||||
codePoints=UCM_GET_CODE_POINTS(table, m);
|
||||
next=codePoints[0];
|
||||
subLimit=0;
|
||||
while(subLimit<mapLength) {
|
||||
/* get a new subsection of mappings starting with the same code point */
|
||||
subStart=subLimit;
|
||||
c=next;
|
||||
while(next==c && ++subLimit<mapLength) {
|
||||
m=mappings+map[subLimit];
|
||||
codePoints=UCM_GET_CODE_POINTS(table, m);
|
||||
next=codePoints[0];
|
||||
}
|
||||
|
||||
/*
|
||||
* compute the value for this code point;
|
||||
* if there is a mapping for this code point alone, it is at subStart
|
||||
* because the table is sorted lexically
|
||||
*/
|
||||
value=0;
|
||||
m=mappings+map[subStart];
|
||||
codePoints=UCM_GET_CODE_POINTS(table, m);
|
||||
if(m->uLen==1) {
|
||||
/* do not include this in generateFromUTable() */
|
||||
++subStart;
|
||||
|
||||
if(subStart<subLimit && mappings[map[subStart]].uLen==1) {
|
||||
/* print error for multiple same-input-sequence mappings */
|
||||
fprintf(stderr, "error: multiple mappings from same Unicode code points\n");
|
||||
ucm_printMapping(table, m, stderr);
|
||||
ucm_printMapping(table, mappings+map[subStart], stderr);
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
value=getFromUBytesValue(extData, table, m);
|
||||
}
|
||||
|
||||
if(subStart==subLimit) {
|
||||
/* write the result for this one code point */
|
||||
addFromUTrieEntry(extData, c, value);
|
||||
} else {
|
||||
/* write the index to the subsection table */
|
||||
addFromUTrieEntry(extData, c, (uint32_t)utm_countItems(extData->fromUTableValues));
|
||||
|
||||
/* recurse, starting from 16-bit-unit index 2, the first 16-bit unit after c */
|
||||
if(!generateFromUTable(extData, table, subStart, subLimit, 2, value)) {
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Generate the fromU data structures from the input table.
|
||||
* The input table must be sorted, and all precision flags must be 0..3.
|
||||
* This function will modify the table's reverseMap.
|
||||
*/
|
||||
static UBool
|
||||
makeFromUTable(CnvExtData *extData, UCMTable *table) {
|
||||
uint16_t *stage1;
|
||||
int32_t i, stage1Top, fromUCount;
|
||||
|
||||
fromUCount=prepareFromUMappings(table);
|
||||
|
||||
extData->fromUTableUChars=utm_open("cnv extension fromUTableUChars", 0x10000, UCNV_EXT_FROM_U_DATA_MASK+1, 2);
|
||||
extData->fromUTableValues=utm_open("cnv extension fromUTableValues", 0x10000, UCNV_EXT_FROM_U_DATA_MASK+1, 4);
|
||||
extData->fromUBytes=utm_open("cnv extension fromUBytes", 0x10000, UCNV_EXT_FROM_U_DATA_MASK+1, 1);
|
||||
|
||||
/* allocate all-unassigned stage blocks */
|
||||
extData->stage2Top=MBCS_STAGE_2_FIRST_ASSIGNED;
|
||||
extData->stage3Top=MBCS_STAGE_3_FIRST_ASSIGNED;
|
||||
|
||||
/*
|
||||
* stage 3b stores only unique values, and in
|
||||
* index 0: 0 for "no mapping"
|
||||
* index 1: "no mapping" with preference for <subchar1> rather than <subchar>
|
||||
*/
|
||||
extData->stage3b[1]=UCNV_EXT_FROM_U_SUBCHAR1;
|
||||
extData->stage3bTop=2;
|
||||
|
||||
/* allocate the first entry in the fromUTable because index 0 means "no result" */
|
||||
utm_alloc(extData->fromUTableUChars);
|
||||
utm_alloc(extData->fromUTableValues);
|
||||
|
||||
if(!generateFromUTrie(extData, table, fromUCount)) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
/*
|
||||
* offset the stage 1 trie entries by stage1Top because they will
|
||||
* be stored in a single array
|
||||
*/
|
||||
stage1=extData->stage1;
|
||||
stage1Top=extData->stage1Top;
|
||||
for(i=0; i<stage1Top; ++i) {
|
||||
stage1[i]=(uint16_t)(stage1[i]+stage1Top);
|
||||
}
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/* -------------------------------------------------------------------------- */
|
||||
|
||||
static UBool
|
||||
CnvExtAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData) {
|
||||
CnvExtData *extData;
|
||||
|
||||
staticData->unicodeMask=table->unicodeMask;
|
||||
if(staticData->unicodeMask&UCNV_HAS_SURROGATES) {
|
||||
fprintf(stderr, "error: contains mappings for surrogate code points\n");
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
staticData->conversionType=UCNV_MBCS;
|
||||
|
||||
extData=(CnvExtData *)cnvData;
|
||||
|
||||
/*
|
||||
* assume that the table is sorted
|
||||
*
|
||||
* call the functions in this order because
|
||||
* makeToUTable() modifies the original reverseMap,
|
||||
* makeFromUTable() writes a whole new mapping into reverseMap
|
||||
*/
|
||||
return
|
||||
makeToUTable(extData, table) &&
|
||||
makeFromUTable(extData, table);
|
||||
}
|
File diff suppressed because it is too large
Load diff
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2000, International Business Machines
|
||||
* Copyright (C) 2000-2003, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -19,10 +19,27 @@
|
|||
|
||||
#include "makeconv.h"
|
||||
|
||||
U_CFUNC NewConverter *
|
||||
MBCSOpen(uint8_t maxCharLength);
|
||||
enum {
|
||||
MBCS_STAGE_2_BLOCK_SIZE=0x40, /* 64; 64=1<<6 for 6 bits in stage 2 */
|
||||
MBCS_STAGE_2_BLOCK_SIZE_SHIFT=6, /* log2(MBCS_STAGE_2_BLOCK_SIZE) */
|
||||
MBCS_STAGE_1_SIZE=0x440, /* 0x110000>>10, or 17*64 for one entry per 1k code points */
|
||||
MBCS_STAGE_2_SIZE=0xfbc0, /* 0x10000-MBCS_STAGE_1_SIZE */
|
||||
MBCS_MAX_STAGE_2_TOP=MBCS_STAGE_2_SIZE,
|
||||
MBCS_STAGE_2_MAX_BLOCKS=MBCS_STAGE_2_SIZE>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT,
|
||||
|
||||
U_CFUNC UBool
|
||||
MBCSAddState(NewConverter *cnvData, const char *s);
|
||||
MBCS_STAGE_2_ALL_UNASSIGNED_INDEX=0, /* stage 1 entry for the all-unassigned stage 2 block */
|
||||
MBCS_STAGE_2_FIRST_ASSIGNED=MBCS_STAGE_2_BLOCK_SIZE, /* start of the first stage 2 block after the all-unassigned one */
|
||||
|
||||
MBCS_STAGE_3_BLOCK_SIZE=16, /* 16; 16=1<<4 for 4 bits in stage 3 */
|
||||
MBCS_STAGE_3_FIRST_ASSIGNED=MBCS_STAGE_3_BLOCK_SIZE, /* start of the first stage 3 block after the all-unassigned one */
|
||||
|
||||
MBCS_MAX_FALLBACK_COUNT=8192
|
||||
};
|
||||
|
||||
U_CFUNC NewConverter *
|
||||
MBCSOpen(UCMFile *ucm);
|
||||
|
||||
U_CFUNC NewConverter *
|
||||
CnvExtOpen(UCMFile *ucm);
|
||||
|
||||
#endif
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -183,6 +183,10 @@ SOURCE="$(InputPath)"
|
|||
# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\gencnvex.c
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\genmbcs.c
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2000-2001, International Business Machines
|
||||
* Copyright (C) 2000-2003, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -20,10 +20,19 @@
|
|||
#include "unicode/utypes.h"
|
||||
#include "ucnv_bld.h"
|
||||
#include "unewdata.h"
|
||||
#include "ucm.h"
|
||||
|
||||
/* exports from makeconv.c */
|
||||
U_CFUNC UBool VERBOSE;
|
||||
|
||||
/*
 * Converter table type for writing.
 * The enumerators are numbered consecutively starting at 0.
 */
enum {
    TABLE_NONE=0,
    TABLE_BASE=1,
    TABLE_EXT=2,
    TABLE_BASE_AND_EXT=3
};
|
||||
|
||||
/* abstract converter generator struct, C++ - style */
|
||||
struct NewConverter;
|
||||
typedef struct NewConverter NewConverter;
|
||||
|
@ -32,32 +41,17 @@ struct NewConverter {
|
|||
void
|
||||
(*close)(NewConverter *cnvData);
|
||||
|
||||
UBool
|
||||
(*startMappings)(NewConverter *cnvData);
|
||||
|
||||
/** is this byte sequence valid? */
|
||||
UBool
|
||||
(*isValid)(NewConverter *cnvData,
|
||||
const uint8_t *bytes, int32_t length,
|
||||
uint32_t b);
|
||||
const uint8_t *bytes, int32_t length);
|
||||
|
||||
UBool
|
||||
(*addToUnicode)(NewConverter *cnvData,
|
||||
const uint8_t *bytes, int32_t length,
|
||||
UChar32 c, uint32_t b,
|
||||
int8_t isFallback);
|
||||
|
||||
UBool
|
||||
(*addFromUnicode)(NewConverter *cnvData,
|
||||
const uint8_t *bytes, int32_t length,
|
||||
UChar32 c, uint32_t b,
|
||||
int8_t isFallback);
|
||||
|
||||
void
|
||||
(*finishMappings)(NewConverter *cnvData, const UConverterStaticData *staticData);
|
||||
(*addTable)(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData);
|
||||
|
||||
uint32_t
|
||||
(*write)(NewConverter *cnvData, const UConverterStaticData *staticData, UNewDataMemory *pData);
|
||||
(*write)(NewConverter *cnvData, const UConverterStaticData *staticData,
|
||||
UNewDataMemory *pData, int32_t tableType);
|
||||
};
|
||||
|
||||
#endif
|
||||
|
|
|
@ -132,6 +132,9 @@
|
|||
<Filter
|
||||
Name="Source Files"
|
||||
Filter="cpp;c;cxx;rc;def;r;odl;idl;hpj;bat">
|
||||
<File
|
||||
RelativePath=".\gencnvex.c">
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\genmbcs.c">
|
||||
</File>
|
||||
|
|
|
@ -38,7 +38,7 @@ DYNAMICCXXFLAGS = $(SHAREDLIBCXXFLAGS)
|
|||
CPPFLAGS += -I$(top_builddir)/common -I$(top_srcdir)/common -I$(top_srcdir)/i18n -I$(top_srcdir)/tools/ctestfw $(LIBCPPFLAGS)
|
||||
LIBS = $(LIBICUUC) $(DEFAULT_LIBS)
|
||||
|
||||
OBJECTS = toolutil.o unewdata.o ucmpwrit.o uoptions.o uparse.o ucbuf.o uperf.o
|
||||
OBJECTS = toolutil.o unewdata.o ucm.o ucmstate.o ucmpwrit.o uoptions.o uparse.o ucbuf.o uperf.o
|
||||
|
||||
STATIC_OBJECTS = $(OBJECTS:.o=.$(STATIC_O))
|
||||
|
||||
|
|
|
@ -26,6 +26,7 @@
|
|||
# define NOMCX
|
||||
# include <windows.h>
|
||||
#endif
|
||||
#include <stdio.h>
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/putil.h"
|
||||
#include "cmemory.h"
|
||||
|
@ -73,3 +74,117 @@ findBasename(const char *filename) {
|
|||
return filename;
|
||||
}
|
||||
}
|
||||
|
||||
/* tool memory helper ------------------------------------------------------- */
|
||||
|
||||
typedef struct UToolMemory {
|
||||
char name[64];
|
||||
int32_t capacity, maxCapacity, size, index;
|
||||
void *array;
|
||||
UAlignedMemory staticArray[1];
|
||||
} UToolMemory;
|
||||
|
||||
/*
 * Create a tool memory object for items of size bytes each.
 *
 * name            label used in error messages; it is copied into a
 *                 fixed-size buffer and truncated if it does not fit
 * initialCapacity number of items for which storage is allocated right away
 * maxCapacity     limit for later growth; raised to initialCapacity if smaller
 * size            size of one item in bytes
 *
 * Returns the new object; prints a message and exits the process if the
 * allocation fails.
 */
U_CAPI UToolMemory * U_EXPORT2
utm_open(const char *name, int32_t initialCapacity, int32_t maxCapacity, int32_t size) {
    UToolMemory *mem;
    int32_t i;

    if(maxCapacity<initialCapacity) {
        maxCapacity=initialCapacity;
    }

    /* do the size arithmetic in size_t, not in int32_t */
    mem=(UToolMemory *)uprv_malloc(sizeof(UToolMemory)+(size_t)initialCapacity*(size_t)size);
    if(mem==NULL) {
        fprintf(stderr, "error: %s - out of memory\n", name);
        exit(U_MEMORY_ALLOCATION_ERROR);
    }
    mem->array=mem->staticArray;

    /* bounded copy: mem->name is a fixed-size buffer, longer names are truncated */
    for(i=0; i<(int32_t)sizeof(mem->name)-1 && name[i]!=0; ++i) {
        mem->name[i]=name[i];
    }
    mem->name[i]=0;

    mem->capacity=initialCapacity;
    mem->maxCapacity=maxCapacity;
    mem->size=size;
    mem->index=0;
    return mem;
}
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
utm_close(UToolMemory *mem) {
|
||||
if(mem!=NULL) {
|
||||
if(mem->array!=mem->staticArray) {
|
||||
uprv_free(mem->array);
|
||||
}
|
||||
uprv_free(mem);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
U_CAPI void * U_EXPORT2
|
||||
utm_getStart(UToolMemory *mem) {
|
||||
return (char *)mem->array;
|
||||
}
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
utm_countItems(UToolMemory *mem) {
|
||||
return mem->index;
|
||||
}
|
||||
|
||||
|
||||
static UBool
|
||||
utm_hasCapacity(UToolMemory *mem, int32_t capacity) {
|
||||
if(mem->capacity<capacity) {
|
||||
int32_t newCapacity;
|
||||
|
||||
if(mem->maxCapacity<capacity) {
|
||||
fprintf(stderr, "error: %s - trying to use more than maxCapacity=%ld units\n",
|
||||
mem->name, (long)mem->maxCapacity);
|
||||
exit(U_MEMORY_ALLOCATION_ERROR);
|
||||
}
|
||||
|
||||
/* try to allocate a larger array */
|
||||
if(capacity>=2*mem->capacity) {
|
||||
newCapacity=capacity;
|
||||
} else if(mem->capacity<=mem->maxCapacity/3) {
|
||||
newCapacity=2*mem->capacity;
|
||||
} else {
|
||||
newCapacity=mem->maxCapacity;
|
||||
}
|
||||
|
||||
if(mem->array==mem->staticArray) {
|
||||
mem->array=uprv_malloc(newCapacity*mem->size);
|
||||
if(mem->array!=NULL) {
|
||||
uprv_memcpy(mem->array, mem->staticArray, mem->index*mem->size);
|
||||
}
|
||||
} else {
|
||||
mem->array=uprv_realloc(mem->array, newCapacity*mem->size);
|
||||
}
|
||||
|
||||
if(mem->array==NULL) {
|
||||
fprintf(stderr, "error: %s - out of memory\n", mem->name);
|
||||
exit(U_MEMORY_ALLOCATION_ERROR);
|
||||
}
|
||||
}
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
U_CAPI void * U_EXPORT2
|
||||
utm_alloc(UToolMemory *mem) {
|
||||
char *p=(char *)mem->array+mem->index*mem->size;
|
||||
int32_t newIndex=mem->index+1;
|
||||
if(utm_hasCapacity(mem, newIndex)) {
|
||||
mem->index=newIndex;
|
||||
uprv_memset(p, 0, mem->size);
|
||||
}
|
||||
return p;
|
||||
}
|
||||
|
||||
U_CAPI void * U_EXPORT2
|
||||
utm_allocN(UToolMemory *mem, int32_t n) {
|
||||
char *p=(char *)mem->array+mem->index*mem->size;
|
||||
int32_t newIndex=mem->index+n;
|
||||
if(utm_hasCapacity(mem, newIndex)) {
|
||||
mem->index=newIndex;
|
||||
uprv_memset(p, 0, n*mem->size);
|
||||
}
|
||||
return p;
|
||||
}
|
||||
|
|
|
@ -163,10 +163,18 @@ SOURCE=.\ucbuf.c
|
|||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\ucm.c
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\ucmpwrit.c
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\ucmstate.c
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\unewdata.c
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
@ -195,6 +203,10 @@ SOURCE=.\ucbuf.h
|
|||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\ucm.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\ucmpwrit.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
|
|
@ -20,8 +20,7 @@
|
|||
#define __TOOLUTIL_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
|
||||
#include "cmemory.h"
|
||||
|
||||
/*
|
||||
* For Windows, a path/filename may be the short (8.3) version
|
||||
|
@ -51,4 +50,55 @@ getLongPathname(const char *pathname);
|
|||
U_CAPI const char * U_EXPORT2
|
||||
findBasename(const char *filename);
|
||||
|
||||
/*
|
||||
* UToolMemory is used for generic, custom memory management.
|
||||
* It is allocated with enough space for count*size bytes starting
|
||||
* at array.
|
||||
* The array is declared with a union of large data types so
|
||||
* that its base address is aligned for any types.
|
||||
* If size is a multiple of a data type size, then such items
|
||||
* can be safely allocated inside the array, at offsets that
|
||||
* are themselves multiples of size.
|
||||
*/
|
||||
struct UToolMemory;
|
||||
typedef struct UToolMemory UToolMemory;
|
||||
|
||||
/**
|
||||
* Open a UToolMemory object for allocation of initialCapacity to maxCapacity
|
||||
* items with size bytes each.
|
||||
*/
|
||||
U_CAPI UToolMemory * U_EXPORT2
|
||||
utm_open(const char *name, int32_t initialCapacity, int32_t maxCapacity, int32_t size);
|
||||
|
||||
/**
|
||||
* Close a UToolMemory object.
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
utm_close(UToolMemory *mem);
|
||||
|
||||
/**
|
||||
* Get the pointer to the beginning of the array of items.
|
||||
* The pointer becomes invalid after allocation of new items.
|
||||
*/
|
||||
U_CAPI void * U_EXPORT2
|
||||
utm_getStart(UToolMemory *mem);
|
||||
|
||||
/**
|
||||
* Get the current number of items.
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
utm_countItems(UToolMemory *mem);
|
||||
|
||||
/**
|
||||
* Allocate one more item and return the pointer to its start in the array.
|
||||
*/
|
||||
U_CAPI void * U_EXPORT2
|
||||
utm_alloc(UToolMemory *mem);
|
||||
|
||||
/**
|
||||
* Allocate n items and return the pointer to the start of the first one in the array.
|
||||
*/
|
||||
U_CAPI void * U_EXPORT2
|
||||
utm_allocN(UToolMemory *mem, int32_t n);
|
||||
|
||||
#endif
|
||||
|
|
|
@ -136,9 +136,15 @@
|
|||
<File
|
||||
RelativePath=".\ucbuf.c">
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\ucm.c">
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\ucmpwrit.c">
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\ucmstate.c">
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\unewdata.c">
|
||||
</File>
|
||||
|
@ -161,6 +167,9 @@
|
|||
<File
|
||||
RelativePath=".\ucbuf.h">
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\ucm.h">
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\ucmpwrit.h">
|
||||
</File>
|
||||
|
|
910
icu4c/source/tools/toolutil/ucm.c
Normal file
910
icu4c/source/tools/toolutil/ucm.c
Normal file
|
@ -0,0 +1,910 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2003, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: ucm.c
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2003jun20
|
||||
* created by: Markus W. Scherer
|
||||
*
|
||||
* This file reads a .ucm file, stores its mappings and sorts them.
|
||||
* It implements handling of Unicode conversion mappings from .ucm files
|
||||
* for makeconv, canonucm, rptp2ucm, etc.
|
||||
*
|
||||
* Unicode code point sequences with a length of more than 1,
|
||||
* as well as byte sequences with more than 4 bytes or more than one complete
|
||||
* character sequence are handled to support m:n mappings.
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "cstring.h"
|
||||
#include "cmemory.h"
|
||||
#include "uarrsort.h"
|
||||
#include "ucnvmbcs.h"
|
||||
#include "ucnv_ext.h"
|
||||
#include "uparse.h"
|
||||
#include "ucm.h"
|
||||
#include <stdio.h>
|
||||
|
||||
/* -------------------------------------------------------------------------- */
|
||||
|
||||
/*
|
||||
### TODO
|
||||
allow file without fallback indicators for backward compatibility
|
||||
only for makeconv
|
||||
must not sort such mappings
|
||||
disallow when using extension tables because that requires sorting
|
||||
|
||||
rptp2ucm has its own mapping parser and sets all-|1 and |3 mappings; normalization function generates |0 and |2
|
||||
|
||||
*/
|
||||
|
||||
static void
|
||||
printMapping(UCMapping *m, UChar32 *codePoints, uint8_t *bytes, FILE *f) {
|
||||
int32_t j;
|
||||
|
||||
for(j=0; j<m->uLen; ++j) {
|
||||
fprintf(f, "<U%04lX>", codePoints[j]);
|
||||
}
|
||||
|
||||
fputc(' ', f);
|
||||
|
||||
for(j=0; j<m->bLen; ++j) {
|
||||
fprintf(f, "\\x%02X", bytes[j]);
|
||||
}
|
||||
|
||||
if(m->f>=0) {
|
||||
fprintf(f, " |%lu\n", m->f);
|
||||
} else {
|
||||
fputs("\n", f);
|
||||
}
|
||||
}
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f) {
|
||||
printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), f);
|
||||
}
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode) {
|
||||
UCMapping *m;
|
||||
int32_t i, length;
|
||||
|
||||
m=table->mappings;
|
||||
length=table->mappingsLength;
|
||||
if(byUnicode) {
|
||||
for(i=0; i<length; ++m, ++i) {
|
||||
ucm_printMapping(table, m, f);
|
||||
}
|
||||
} else {
|
||||
const int32_t *map=table->reverseMap;
|
||||
for(i=0; i<length; ++i) {
|
||||
ucm_printMapping(table, m+map[i], f);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* mapping comparisons ------------------------------------------------------ */
|
||||
|
||||
static int32_t
|
||||
compareUnicode(UCMTable *lTable, const UCMapping *l,
|
||||
UCMTable *rTable, const UCMapping *r) {
|
||||
const UChar32 *lu, *ru;
|
||||
int32_t result, i, length;
|
||||
|
||||
if(l->uLen==1 && r->uLen==1) {
|
||||
/* compare two single code points */
|
||||
return l->u-r->u;
|
||||
}
|
||||
|
||||
/* get pointers to the code point sequences */
|
||||
lu=UCM_GET_CODE_POINTS(lTable, l);
|
||||
ru=UCM_GET_CODE_POINTS(rTable, r);
|
||||
|
||||
/* get the minimum length */
|
||||
if(l->uLen<=r->uLen) {
|
||||
length=l->uLen;
|
||||
} else {
|
||||
length=r->uLen;
|
||||
}
|
||||
|
||||
/* compare the code points */
|
||||
for(i=0; i<length; ++i) {
|
||||
result=lu[i]-ru[i];
|
||||
if(result!=0) {
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
/* compare the lengths */
|
||||
return l->uLen-r->uLen;
|
||||
}
|
||||
|
||||
static int32_t
|
||||
compareBytes(UCMTable *lTable, const UCMapping *l,
|
||||
UCMTable *rTable, const UCMapping *r,
|
||||
UBool lexical) {
|
||||
const uint8_t *lb, *rb;
|
||||
int32_t result, i, length;
|
||||
|
||||
/*
|
||||
* A lexical comparison is used for sorting in the builder, to allow
|
||||
* an efficient search for a byte sequence that could be a prefix
|
||||
* of a previously entered byte sequence.
|
||||
*
|
||||
* Comparing by lengths first is for compatibility with old .ucm tools
|
||||
* like canonucm and rptp2ucm.
|
||||
*/
|
||||
if(lexical) {
|
||||
/* get the minimum length and continue */
|
||||
if(l->bLen<=r->bLen) {
|
||||
length=l->bLen;
|
||||
} else {
|
||||
length=r->bLen;
|
||||
}
|
||||
} else {
|
||||
/* compare lengths first */
|
||||
result=l->bLen-r->bLen;
|
||||
if(result!=0) {
|
||||
return result;
|
||||
} else {
|
||||
length=l->bLen;
|
||||
}
|
||||
}
|
||||
|
||||
/* get pointers to the byte sequences */
|
||||
lb=UCM_GET_BYTES(lTable, l);
|
||||
rb=UCM_GET_BYTES(rTable, r);
|
||||
|
||||
/* compare the bytes */
|
||||
for(i=0; i<length; ++i) {
|
||||
result=lb[i]-rb[i];
|
||||
if(result!=0) {
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
/* compare the lengths */
|
||||
return l->bLen-r->bLen;
|
||||
}
|
||||
|
||||
/* compare UCMappings for sorting */
|
||||
static int32_t
|
||||
compareMappings(UCMTable *table, const void *left, const void *right, UBool uFirst) {
|
||||
const UCMapping *l=(const UCMapping *)left, *r=(const UCMapping *)right;
|
||||
int32_t result;
|
||||
|
||||
/* choose which side to compare first */
|
||||
if(uFirst) {
|
||||
/* Unicode then bytes */
|
||||
result=compareUnicode(table, l, table, r);
|
||||
if(result==0) {
|
||||
result=compareBytes(table, l, table, r, FALSE); /* not lexically, like canonucm */
|
||||
}
|
||||
} else {
|
||||
/* bytes then Unicode */
|
||||
result=compareBytes(table, l, table, r, TRUE); /* lexically, for builder */
|
||||
if(result==0) {
|
||||
result=compareUnicode(table, l, table, r);
|
||||
}
|
||||
}
|
||||
|
||||
if(result!=0) {
|
||||
return result;
|
||||
}
|
||||
|
||||
/* compare the flags */
|
||||
return l->f-r->f;
|
||||
}
|
||||
|
||||
/* sorting by Unicode first sorts mappings directly */
|
||||
static int32_t
|
||||
compareMappingsUnicodeFirst(const void *context, const void *left, const void *right) {
|
||||
return compareMappings((UCMTable *)context, left, right, TRUE);
|
||||
}
|
||||
|
||||
/* sorting by bytes first sorts the reverseMap; use indirection to mappings */
|
||||
static int32_t
|
||||
compareMappingsBytesFirst(const void *context, const void *left, const void *right) {
|
||||
UCMTable *table=(UCMTable *)context;
|
||||
int32_t l=*(const int32_t *)left, r=*(const int32_t *)right;
|
||||
return compareMappings(table, table->mappings+l, table->mappings+r, FALSE);
|
||||
}
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
ucm_sortTable(UCMTable *t) {
|
||||
UErrorCode errorCode;
|
||||
int32_t i;
|
||||
|
||||
errorCode=U_ZERO_ERROR;
|
||||
|
||||
/* 1. sort by Unicode first */
|
||||
uprv_sortArray(t->mappings, t->mappingsLength, sizeof(UCMapping),
|
||||
compareMappingsUnicodeFirst, t,
|
||||
FALSE, &errorCode);
|
||||
|
||||
/* build the reverseMap */
|
||||
if(t->reverseMap==NULL) {
|
||||
/*
|
||||
* allocate mappingsCapacity instead of mappingsLength so that
|
||||
* if mappings are added, the reverseMap need not be
|
||||
* reallocated each time
|
||||
* (see moveMappings() and ucm_addMapping())
|
||||
*/
|
||||
t->reverseMap=(int32_t *)uprv_malloc(t->mappingsCapacity*sizeof(int32_t));
|
||||
if(t->reverseMap==NULL) {
|
||||
fprintf(stderr, "ucm error: unable to allocate reverseMap\n");
|
||||
exit(U_MEMORY_ALLOCATION_ERROR);
|
||||
}
|
||||
}
|
||||
for(i=0; i<t->mappingsLength; ++i) {
|
||||
t->reverseMap[i]=i;
|
||||
}
|
||||
|
||||
/* 2. sort reverseMap by mappings bytes first */
|
||||
uprv_sortArray(t->reverseMap, t->mappingsLength, sizeof(int32_t),
|
||||
compareMappingsBytesFirst, t,
|
||||
FALSE, &errorCode);
|
||||
|
||||
if(U_FAILURE(errorCode)) {
|
||||
fprintf(stderr, "ucm error: sortTable()/uprv_sortArray() fails - %s\n",
|
||||
u_errorName(errorCode));
|
||||
exit(errorCode);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
### TODO normalization function for a table (in or for rptp2ucm)
|
||||
sort table
|
||||
if there are mappings with the same code points and bytes but |1 and |3, merge them into one |0 (or make |2 where necessary)
|
||||
if mappings were merged, sort again
|
||||
-> for rptp2ucm
|
||||
|
||||
*/
|
||||
|
||||
/* lookups ------------------------------------------------------------------ */
|
||||
|
||||
/*
|
||||
### TODO lookups?
|
||||
|
||||
binary search for first mapping with some code point or byte sequence
|
||||
check if a code point is the first of any mapping (RT or FB)
|
||||
check if a byte sequence is a prefix of any mapping (RT or RFB)
|
||||
check if there is a mapping with the same source units; return whether the target is same or different
|
||||
|
||||
*/
|
||||
|
||||
/* marker bits that are temporarily ORed into a mapping's flag field f */
enum {
    MOVE_TO_EXT=1<<4,                       /* 0x10 */
    REMOVE_MAPPING=1<<5,                    /* 0x20 */
    MOVE_ANY=MOVE_TO_EXT|REMOVE_MAPPING     /* 0x30: mask for both bits */
};
|
||||
|
||||
/*
|
||||
* move mappings with MOVE_ANY ored into their flags from the base table
|
||||
* to the extension table
|
||||
*/
|
||||
static void
|
||||
moveMappings(UCMTable *base, UCMTable *ext) {
|
||||
UCMapping *mb, *mbLimit;
|
||||
int8_t flag;
|
||||
UBool didMove;
|
||||
|
||||
mb=base->mappings;
|
||||
mbLimit=mb+base->mappingsLength;
|
||||
didMove=FALSE;
|
||||
|
||||
while(mb<mbLimit) {
|
||||
flag=mb->f;
|
||||
if(flag&MOVE_ANY) {
|
||||
/* restore the original flag value */
|
||||
mb->f=flag&~MOVE_ANY;
|
||||
didMove=TRUE;
|
||||
|
||||
if(ext!=NULL && (flag&MOVE_TO_EXT)) {
|
||||
/* add the mapping to the extension table */
|
||||
ucm_addMapping(ext, mb, UCM_GET_CODE_POINTS(base, mb), UCM_GET_BYTES(base, mb));
|
||||
}
|
||||
|
||||
/* move the last base mapping down and overwrite the current one */
|
||||
if(mb<(mbLimit-1)) {
|
||||
uprv_memcpy(mb, mbLimit-1, sizeof(UCMapping));
|
||||
}
|
||||
--mbLimit;
|
||||
--base->mappingsLength;
|
||||
} else {
|
||||
++mb;
|
||||
}
|
||||
}
|
||||
|
||||
if(didMove) {
|
||||
ucm_sortTable(base);
|
||||
ucm_printTable(base, stdout, TRUE); puts(""); /* ### TODO */
|
||||
if(ext!=NULL) {
|
||||
ucm_sortTable(ext);
|
||||
ucm_printTable(ext, stdout, TRUE); puts(""); /* ### TODO */
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* result bits of the checkBaseExt...() functions */
enum {
    NEEDS_MOVE=1<<0,    /* 1 */
    HAS_ERRORS=1<<1     /* 2 */
};
|
||||
|
||||
static uint8_t
|
||||
checkBaseExtUnicode(UCMTable *base, UCMTable *ext, UBool moveToExt) {
|
||||
UCMapping *mb, *me, *mbLimit, *meLimit;
|
||||
int32_t cmp;
|
||||
uint8_t result;
|
||||
|
||||
mb=base->mappings;
|
||||
mbLimit=mb+base->mappingsLength;
|
||||
|
||||
me=ext->mappings;
|
||||
meLimit=me+ext->mappingsLength;
|
||||
|
||||
result=0;
|
||||
|
||||
for(;;) {
|
||||
/* skip irrelevant mappings on both sides */
|
||||
for(;;) {
|
||||
if(mb==mbLimit) {
|
||||
return result;
|
||||
}
|
||||
|
||||
if(0<=mb->f && mb->f<=2) {
|
||||
break;
|
||||
}
|
||||
|
||||
++mb;
|
||||
}
|
||||
|
||||
for(;;) {
|
||||
if(me==meLimit) {
|
||||
return result;
|
||||
}
|
||||
|
||||
if(0<=me->f && me->f<=2) {
|
||||
break;
|
||||
}
|
||||
|
||||
++me;
|
||||
}
|
||||
|
||||
/* compare the base and extension mappings */
|
||||
cmp=compareUnicode(base, mb, ext, me);
|
||||
if(cmp<0) {
|
||||
/* does mb map from an input sequence that is a prefix of me's? */
|
||||
if( mb->uLen<me->uLen &&
|
||||
0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
|
||||
) {
|
||||
if(moveToExt) {
|
||||
/* mark this mapping to be moved to the extension table */
|
||||
mb->f|=MOVE_TO_EXT;
|
||||
} else {
|
||||
fprintf(stderr,
|
||||
"ucm error: the base table contains a mapping whose input sequence\n"
|
||||
" is a prefix of the input sequence of an extension mapping\n");
|
||||
ucm_printMapping(base, mb, stderr);
|
||||
ucm_printMapping(ext, me, stderr);
|
||||
}
|
||||
result|=NEEDS_MOVE;
|
||||
}
|
||||
|
||||
++mb;
|
||||
} else if(cmp==0) {
|
||||
/*
|
||||
* same output: remove the extension mapping,
|
||||
* otherwise treat as an error
|
||||
*/
|
||||
if( mb->f==me->f && mb->bLen==me->bLen &&
|
||||
0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
|
||||
) {
|
||||
me->f|=REMOVE_MAPPING;
|
||||
result|=NEEDS_MOVE;
|
||||
} else {
|
||||
fprintf(stderr,
|
||||
"ucm error: the base table contains a mapping whose input sequence\n"
|
||||
" is the same as the input sequence of an extension mapping\n"
|
||||
" but it maps differently\n");
|
||||
ucm_printMapping(base, mb, stderr);
|
||||
ucm_printMapping(ext, me, stderr);
|
||||
result|=HAS_ERRORS;
|
||||
}
|
||||
|
||||
++mb;
|
||||
} else /* cmp>0 */ {
|
||||
++me;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static uint8_t
|
||||
checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext, UBool moveToExt) {
|
||||
UCMapping *mb, *me;
|
||||
int32_t *baseMap, *extMap;
|
||||
int32_t b, e, bLimit, eLimit, cmp;
|
||||
uint8_t result;
|
||||
UBool isSISO;
|
||||
|
||||
baseMap=base->reverseMap;
|
||||
extMap=ext->reverseMap;
|
||||
|
||||
b=e=0;
|
||||
bLimit=base->mappingsLength;
|
||||
eLimit=ext->mappingsLength;
|
||||
|
||||
result=0;
|
||||
|
||||
isSISO=(UBool)(baseStates->outputType==MBCS_OUTPUT_2_SISO);
|
||||
|
||||
for(;;) {
|
||||
/* skip irrelevant mappings on both sides */
|
||||
for(;;) {
|
||||
if(b==bLimit) {
|
||||
return result;
|
||||
}
|
||||
mb=base->mappings+baseMap[b];
|
||||
|
||||
if(mb->f==0 || mb->f==3) {
|
||||
break;
|
||||
}
|
||||
|
||||
++b;
|
||||
}
|
||||
|
||||
for(;;) {
|
||||
if(e==eLimit) {
|
||||
return result;
|
||||
}
|
||||
me=ext->mappings+extMap[e];
|
||||
|
||||
if(me->f==0 || me->f==3) {
|
||||
break;
|
||||
}
|
||||
|
||||
++e;
|
||||
}
|
||||
|
||||
/* compare the base and extension mappings */
|
||||
cmp=compareBytes(base, mb, ext, me, TRUE);
|
||||
if(cmp<0) {
|
||||
/*
|
||||
* does mb map from an input sequence that is a prefix of me's?
|
||||
* for SI/SO tables, a single byte is never a prefix because it
|
||||
* occurs in a separate single-byte state
|
||||
*/
|
||||
if( mb->bLen<me->bLen &&
|
||||
(!isSISO || mb->bLen>1) &&
|
||||
0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
|
||||
) {
|
||||
if(moveToExt) {
|
||||
/* mark this mapping to be moved to the extension table */
|
||||
mb->f|=MOVE_TO_EXT;
|
||||
result|=NEEDS_MOVE;
|
||||
} else {
|
||||
fprintf(stderr,
|
||||
"ucm error: the base table contains a mapping whose input sequence\n"
|
||||
" is a prefix of the input sequence of an extension mapping\n");
|
||||
ucm_printMapping(base, mb, stderr);
|
||||
ucm_printMapping(ext, me, stderr);
|
||||
result|=HAS_ERRORS;
|
||||
}
|
||||
}
|
||||
|
||||
++b;
|
||||
} else if(cmp==0) {
|
||||
/*
|
||||
* same output: remove the extension mapping,
|
||||
* otherwise treat as an error
|
||||
*/
|
||||
if( mb->f==me->f && mb->uLen==me->uLen &&
|
||||
0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
|
||||
) {
|
||||
me->f|=REMOVE_MAPPING;
|
||||
result|=NEEDS_MOVE;
|
||||
} else {
|
||||
fprintf(stderr,
|
||||
"ucm error: the base table contains a mapping whose input sequence\n"
|
||||
" is the same as the input sequence of an extension mapping\n"
|
||||
" but it maps differently\n");
|
||||
ucm_printMapping(base, mb, stderr);
|
||||
ucm_printMapping(ext, me, stderr);
|
||||
result|=HAS_ERRORS;
|
||||
}
|
||||
|
||||
++b;
|
||||
} else /* cmp>0 */ {
|
||||
++e;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
U_CAPI UBool U_EXPORT2
|
||||
ucm_checkValidity(UCMTable *table, UCMStates *baseStates) {
|
||||
UCMapping *m, *mLimit;
|
||||
int32_t count;
|
||||
UBool isOK;
|
||||
|
||||
m=table->mappings;
|
||||
mLimit=m+table->mappingsLength;
|
||||
isOK=TRUE;
|
||||
|
||||
while(m<mLimit) {
|
||||
count=ucm_countChars(baseStates, UCM_GET_BYTES(table, m), m->bLen);
|
||||
if(count<1) {
|
||||
ucm_printMapping(table, m, stderr);
|
||||
isOK=FALSE;
|
||||
}
|
||||
++m;
|
||||
}
|
||||
|
||||
return isOK;
|
||||
}
|
||||
|
||||
U_CAPI UBool U_EXPORT2
|
||||
ucm_checkBaseExt(UCMStates *baseStates, UCMTable *base, UCMTable *ext, UBool moveToExt) {
|
||||
uint8_t result;
|
||||
|
||||
/* if we have an extension table, we must always use precision flags */
|
||||
if(base->flagsType!=UCM_FLAGS_EXPLICIT || ext->flagsType!=UCM_FLAGS_EXPLICIT) {
|
||||
fprintf(stderr, "ucm error: the base or extension table contains mappings without precision flags\n");
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
/* checking requires both tables to be sorted */
|
||||
ucm_sortTable(base);
|
||||
ucm_sortTable(ext);
|
||||
|
||||
/* check */
|
||||
result=
|
||||
checkBaseExtUnicode(base, ext, moveToExt)|
|
||||
checkBaseExtBytes(baseStates, base, ext, moveToExt);
|
||||
|
||||
if(result&HAS_ERRORS) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
if(result&NEEDS_MOVE) {
|
||||
moveMappings(ext, NULL);
|
||||
moveMappings(base, ext);
|
||||
}
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/* ucm parser --------------------------------------------------------------- */
|
||||
|
||||
U_CAPI int8_t U_EXPORT2
|
||||
ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps) {
|
||||
const char *s=*ps;
|
||||
char *end;
|
||||
int8_t bLen;
|
||||
|
||||
bLen=0;
|
||||
for(;;) {
|
||||
/* skip an optional plus sign */
|
||||
if(bLen>0 && *s=='+') {
|
||||
++s;
|
||||
}
|
||||
if(*s!='\\') {
|
||||
break;
|
||||
}
|
||||
|
||||
if(bLen==UCNV_EXT_MAX_BYTES) {
|
||||
fprintf(stderr, "ucm error: too many bytes on \"%s\"\n", line);
|
||||
return -1;
|
||||
}
|
||||
if( s[1]!='x' ||
|
||||
(bytes[bLen]=(uint8_t)uprv_strtoul(s+2, &end, 16), end)!=s+4
|
||||
) {
|
||||
fprintf(stderr, "ucm error: byte must be formatted as \\xXX (2 hex digits) - \"%s\"\n", line);
|
||||
return -1;
|
||||
}
|
||||
++bLen;
|
||||
s=end;
|
||||
}
|
||||
|
||||
*ps=s;
|
||||
return bLen;
|
||||
}
|
||||
|
||||
/* parse a mapping line; must not be empty */
|
||||
U_CAPI UBool U_EXPORT2
|
||||
ucm_parseMappingLine(UCMapping *m,
|
||||
UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
|
||||
uint8_t bytes[UCNV_EXT_MAX_BYTES],
|
||||
const char *line) {
|
||||
const char *s;
|
||||
char *end;
|
||||
int32_t u16Length;
|
||||
int8_t uLen, bLen, f;
|
||||
|
||||
s=line;
|
||||
uLen=bLen=0;
|
||||
|
||||
/* parse code points */
|
||||
for(;;) {
|
||||
/* skip an optional plus sign */
|
||||
if(uLen>0 && *s=='+') {
|
||||
++s;
|
||||
}
|
||||
if(*s!='<') {
|
||||
break;
|
||||
}
|
||||
|
||||
if(uLen==UCNV_EXT_MAX_UCHARS) {
|
||||
fprintf(stderr, "ucm error: too many code points on \"%s\"\n", line);
|
||||
return FALSE;
|
||||
}
|
||||
if( s[1]!='U' ||
|
||||
(codePoints[uLen]=(UChar32)uprv_strtoul(s+2, &end, 16), end)==s+2 ||
|
||||
*end!='>'
|
||||
) {
|
||||
fprintf(stderr, "ucm error: Unicode code point must be formatted as <UXXXX> (1..6 hex digits) - \"%s\"\n", line);
|
||||
return FALSE;
|
||||
}
|
||||
if((uint32_t)codePoints[uLen]>0x10ffff || U_IS_SURROGATE(codePoints[uLen])) {
|
||||
fprintf(stderr, "ucm error: Unicode code point must be 0..d7ff or e000..10ffff - \"%s\"\n", line);
|
||||
return FALSE;
|
||||
}
|
||||
++uLen;
|
||||
s=end+1;
|
||||
}
|
||||
|
||||
if(uLen==0) {
|
||||
fprintf(stderr, "ucm error: no Unicode code points on \"%s\"\n", line);
|
||||
return FALSE;
|
||||
} else if(uLen==1) {
|
||||
m->u=codePoints[0];
|
||||
} else {
|
||||
UErrorCode errorCode=U_ZERO_ERROR;
|
||||
u_strFromUTF32(NULL, 0, &u16Length, codePoints, uLen, &errorCode);
|
||||
if( (U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) ||
|
||||
u16Length>UCNV_EXT_MAX_UCHARS
|
||||
) {
|
||||
fprintf(stderr, "ucm error: too many UChars on \"%s\"\n", line);
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
|
||||
s=u_skipWhitespace(s);
|
||||
|
||||
/* parse bytes */
|
||||
bLen=ucm_parseBytes(bytes, line, &s);
|
||||
|
||||
if(bLen<0) {
|
||||
return FALSE;
|
||||
} else if(bLen==0) {
|
||||
fprintf(stderr, "ucm error: no bytes on \"%s\"\n", line);
|
||||
return FALSE;
|
||||
} else if(bLen<=4) {
|
||||
uprv_memcpy(m->b.bytes, bytes, bLen);
|
||||
}
|
||||
|
||||
/* skip everything until the fallback indicator, even the start of a comment */
|
||||
for(;;) {
|
||||
if(*s==0) {
|
||||
f=-1; /* no fallback indicator */
|
||||
break;
|
||||
} else if(*s=='|') {
|
||||
f=(int8_t)(s[1]-'0');
|
||||
if((uint8_t)f>3) {
|
||||
fprintf(stderr, "ucm error: fallback indicator must be |0..|3 - \"%s\"\n", line);
|
||||
return FALSE;
|
||||
}
|
||||
break;
|
||||
}
|
||||
++s;
|
||||
}
|
||||
|
||||
m->uLen=uLen;
|
||||
m->bLen=bLen;
|
||||
m->f=f;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/* general APIs ------------------------------------------------------------- */
|
||||
|
||||
/*
 * Allocate a new UCMTable with all fields set to zero.
 * If the allocation fails, prints an error message to stderr and
 * terminates the program with exit(U_MEMORY_ALLOCATION_ERROR).
 */
U_CAPI UCMTable * U_EXPORT2
ucm_openTable() {
    UCMTable *newTable;

    newTable=(UCMTable *)uprv_malloc(sizeof(UCMTable));
    if(newTable==NULL) {
        fprintf(stderr, "ucm error: unable to allocate a UCMTable\n");
        exit(U_MEMORY_ALLOCATION_ERROR);
    }

    /* memset() returns its first argument */
    return (UCMTable *)memset(newTable, 0, sizeof(UCMTable));
}
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
ucm_closeTable(UCMTable *table) {
|
||||
if(table!=NULL) {
|
||||
uprv_free(table->mappings);
|
||||
uprv_free(table->codePoints);
|
||||
uprv_free(table->bytes);
|
||||
uprv_free(table->reverseMap);
|
||||
uprv_free(table);
|
||||
}
|
||||
}
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
ucm_addMapping(UCMTable *table,
|
||||
UCMapping *m,
|
||||
UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
|
||||
uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
|
||||
UCMapping *tm;
|
||||
UChar32 c;
|
||||
int32_t index;
|
||||
|
||||
if(table->mappingsLength>=table->mappingsCapacity) {
|
||||
/* make the mappings array larger */
|
||||
if(table->mappingsCapacity==0) {
|
||||
table->mappingsCapacity=1000;
|
||||
} else {
|
||||
table->mappingsCapacity*=10;
|
||||
}
|
||||
table->mappings=(UCMapping *)uprv_realloc(table->mappings,
|
||||
table->mappingsCapacity*sizeof(UCMapping));
|
||||
if(table->mappings==NULL) {
|
||||
fprintf(stderr, "ucm error: unable to allocate %d UCMappings\n",
|
||||
table->mappingsCapacity);
|
||||
exit(U_MEMORY_ALLOCATION_ERROR);
|
||||
}
|
||||
|
||||
if(table->reverseMap!=NULL) {
|
||||
/* the reverseMap must be reallocated in a new sort */
|
||||
uprv_free(table->reverseMap);
|
||||
table->reverseMap=NULL;
|
||||
}
|
||||
}
|
||||
|
||||
if(m->uLen>1 && table->codePointsCapacity==0) {
|
||||
table->codePointsCapacity=10000;
|
||||
table->codePoints=(UChar32 *)uprv_malloc(table->codePointsCapacity*4);
|
||||
if(table->codePoints==NULL) {
|
||||
fprintf(stderr, "ucm error: unable to allocate %d UChar32s\n",
|
||||
table->codePointsCapacity);
|
||||
exit(U_MEMORY_ALLOCATION_ERROR);
|
||||
}
|
||||
}
|
||||
|
||||
if(m->bLen>4 && table->bytesCapacity==0) {
|
||||
table->bytesCapacity=10000;
|
||||
table->bytes=(uint8_t *)uprv_malloc(table->bytesCapacity);
|
||||
if(table->bytes==NULL) {
|
||||
fprintf(stderr, "ucm error: unable to allocate %d bytes\n",
|
||||
table->bytesCapacity);
|
||||
exit(U_MEMORY_ALLOCATION_ERROR);
|
||||
}
|
||||
}
|
||||
|
||||
if(m->uLen>1) {
|
||||
index=table->codePointsLength;
|
||||
table->codePointsLength+=m->uLen;
|
||||
if(table->codePointsLength>table->codePointsCapacity) {
|
||||
fprintf(stderr, "ucm error: too many code points in multiple-code point mappings\n");
|
||||
exit(U_MEMORY_ALLOCATION_ERROR);
|
||||
}
|
||||
|
||||
uprv_memcpy(table->codePoints+index, codePoints, m->uLen*4);
|
||||
m->u=index;
|
||||
}
|
||||
|
||||
if(m->bLen>4) {
|
||||
index=table->bytesLength;
|
||||
table->bytesLength+=m->bLen;
|
||||
if(table->bytesLength>table->bytesCapacity) {
|
||||
fprintf(stderr, "ucm error: too many bytes in mappings with >4 charset bytes\n");
|
||||
exit(U_MEMORY_ALLOCATION_ERROR);
|
||||
}
|
||||
|
||||
uprv_memcpy(table->bytes+index, bytes, m->bLen);
|
||||
m->b.index=index;
|
||||
}
|
||||
|
||||
/* set unicodeMask */
|
||||
for(index=0; index<m->uLen; ++index) {
|
||||
c=codePoints[index];
|
||||
if(c>=0x10000) {
|
||||
table->unicodeMask|=UCNV_HAS_SUPPLEMENTARY; /* there are supplementary code points */
|
||||
} else if(U_IS_SURROGATE(c)) {
|
||||
table->unicodeMask|=UCNV_HAS_SURROGATES; /* there are surrogate code points */
|
||||
}
|
||||
}
|
||||
|
||||
/* set flagsType */
|
||||
if(m->f<0) {
|
||||
table->flagsType|=UCM_FLAGS_IMPLICIT;
|
||||
} else {
|
||||
table->flagsType|=UCM_FLAGS_EXPLICIT;
|
||||
}
|
||||
|
||||
tm=table->mappings+table->mappingsLength++;
|
||||
uprv_memcpy(tm, m, sizeof(UCMapping));
|
||||
}
|
||||
|
||||
/*
 * Allocate a new UCMFile with empty base and extension tables and with
 * initial values for its states.
 * If the allocation fails, prints an error message to stderr and
 * terminates the program with exit(U_MEMORY_ALLOCATION_ERROR).
 */
U_CAPI UCMFile * U_EXPORT2
ucm_open() {
    UCMStates *states;
    UCMFile *file;

    file=(UCMFile *)uprv_malloc(sizeof(UCMFile));
    if(file==NULL) {
        fprintf(stderr, "ucm error: unable to allocate a UCMFile\n");
        exit(U_MEMORY_ALLOCATION_ERROR);
    }
    memset(file, 0, sizeof(UCMFile));

    file->base=ucm_openTable();
    file->ext=ucm_openTable();

    /* everything else in the states stays zero from the memset() above */
    states=&file->states;
    states->stateFlags[0]=MBCS_STATE_FLAG_DIRECT;
    states->conversionType=UCNV_UNSUPPORTED_CONVERTER;
    states->outputType=-1;
    states->minCharLength=1;
    states->maxCharLength=1;

    return file;
}
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
ucm_close(UCMFile *ucm) {
|
||||
if(ucm!=NULL) {
|
||||
uprv_free(ucm->base);
|
||||
uprv_free(ucm->ext);
|
||||
uprv_free(ucm);
|
||||
}
|
||||
}
|
||||
|
||||
U_CAPI UBool U_EXPORT2
|
||||
ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates) {
|
||||
UCMapping m={ 0 };
|
||||
UChar32 codePoints[UCNV_EXT_MAX_UCHARS];
|
||||
uint8_t bytes[UCNV_EXT_MAX_BYTES];
|
||||
int32_t count;
|
||||
|
||||
if(!ucm_parseMappingLine(&m, codePoints, bytes, line)) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
if(baseStates!=NULL) {
|
||||
/* check validity of the bytes and count the characters in them */
|
||||
count=ucm_countChars(baseStates, bytes, m.bLen);
|
||||
if(count<1) {
|
||||
/* illegal byte sequence */
|
||||
printMapping(&m, codePoints, bytes, stderr);
|
||||
return FALSE;
|
||||
}
|
||||
} else {
|
||||
/* not used - adding a mapping for an extension-only table before its base table is read */
|
||||
count=0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Add the mapping to the base table if this is requested
|
||||
* and it is a 1:1 mapping.
|
||||
* Otherwise, add it to the extension table.
|
||||
*
|
||||
* Also add |2 SUB mappings for <subchar1>
|
||||
* and |1 fallbacks from something other than U+0000 to 0x00
|
||||
* to the extension table.
|
||||
*/
|
||||
if( forBase && m.uLen==1 && count==1 &&
|
||||
!((m.f==2 && m.bLen==1 && ucm->states.maxCharLength>1) ||
|
||||
(m.f==1 && m.bLen==1 && bytes[0]==0 && !(m.uLen==1 && codePoints[0]==0)))
|
||||
) {
|
||||
ucm_addMapping(ucm->base, &m, codePoints, bytes);
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
ucm_addMapping(ucm->ext, &m, codePoints, bytes);
|
||||
return TRUE;
|
||||
}
|
217
icu4c/source/tools/toolutil/ucm.h
Normal file
217
icu4c/source/tools/toolutil/ucm.h
Normal file
|
@ -0,0 +1,217 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2003, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: ucm.h
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2003jun20
|
||||
* created by: Markus W. Scherer
|
||||
*
|
||||
* Definitions for the .ucm file parser and handler module ucm.c.
|
||||
*/
|
||||
|
||||
#ifndef __UCM_H__
|
||||
#define __UCM_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "ucnvmbcs.h"
|
||||
#include "ucnv_ext.h"
|
||||
#include <stdio.h>
|
||||
|
||||
U_CDECL_BEGIN
|
||||
|
||||
/*
|
||||
* Per-mapping data structure
|
||||
*
|
||||
* u if uLen==1: Unicode code point
|
||||
* else index to uLen code points
|
||||
* b if bLen<=4: up to 4 bytes
|
||||
* else index to bLen bytes
|
||||
* uLen number of code points
|
||||
* bLen number of words containing left-justified bytes
|
||||
* bIsMultipleChars indicates that the bytes contain more than one sequence
|
||||
* according to the state table
|
||||
* f flag for roundtrip (0), fallback (1), sub mapping (2), reverse fallback (3)
|
||||
* same values as in the source file after |
|
||||
*/
|
||||
typedef struct UCMapping {
|
||||
UChar32 u;
|
||||
union {
|
||||
uint32_t index;
|
||||
uint8_t bytes[4];
|
||||
} b;
|
||||
int8_t uLen, bLen, f;
|
||||
} UCMapping;
|
||||
|
||||
/*
 * Values for UCMTable.flagsType.
 * EXPLICIT and IMPLICIT are distinct bits so that they can be combined
 * with |= as mappings are added; both together yield MIXED.
 */
enum {
    UCM_FLAGS_INITIAL=0,    /* no mappings parsed yet */
    UCM_FLAGS_EXPLICIT=1,   /* .ucm file has mappings with | fallback indicators */
    UCM_FLAGS_IMPLICIT=2,   /* .ucm file has mappings without | fallback indicators, later wins */
    UCM_FLAGS_MIXED=3       /* both implicit and explicit */
};
|
||||
|
||||
typedef struct UCMTable {
|
||||
UCMapping *mappings;
|
||||
int32_t mappingsCapacity, mappingsLength;
|
||||
|
||||
UChar32 *codePoints;
|
||||
int32_t codePointsCapacity, codePointsLength;
|
||||
|
||||
uint8_t *bytes;
|
||||
int32_t bytesCapacity, bytesLength;
|
||||
|
||||
/* index map for mapping by bytes first */
|
||||
int32_t *reverseMap;
|
||||
|
||||
uint8_t unicodeMask;
|
||||
int8_t flagsType; /* UCM_FLAGS_INITIAL etc. */
|
||||
} UCMTable;
|
||||
|
||||
/* Values used in UCMStates.stateFlags[] */
enum {
    MBCS_STATE_FLAG_DIRECT=1,
    MBCS_STATE_FLAG_SURROGATES=2,

    MBCS_STATE_FLAG_READY=16
};
|
||||
|
||||
typedef struct UCMStates {
|
||||
int32_t stateTable[MBCS_MAX_STATE_COUNT][256];
|
||||
uint32_t stateFlags[MBCS_MAX_STATE_COUNT],
|
||||
stateOffsetSum[MBCS_MAX_STATE_COUNT];
|
||||
|
||||
int32_t countStates, minCharLength, maxCharLength, countToUCodeUnits;
|
||||
int8_t conversionType, outputType;
|
||||
} UCMStates;
|
||||
|
||||
typedef struct UCMFile {
|
||||
UCMTable *base, *ext;
|
||||
UCMStates states;
|
||||
|
||||
char baseName[UCNV_MAX_CONVERTER_NAME_LENGTH];
|
||||
} UCMFile;
|
||||
|
||||
/* simple accesses ---------------------------------------------------------- */
|
||||
|
||||
/*
 * Get a pointer to the first code point of mapping m in table t:
 * a single code point is stored in the mapping itself,
 * several code points are stored in the table at index m->u.
 * Note: m is evaluated more than once.
 */
#define UCM_GET_CODE_POINTS(t, m) \
    (((m)->uLen!=1) ? (t)->codePoints+(m)->u : &(m)->u)

/*
 * Get a pointer to the first byte of mapping m in table t:
 * up to 4 bytes are stored in the mapping itself,
 * more bytes are stored in the table at index m->b.index.
 * Note: m is evaluated more than once.
 */
#define UCM_GET_BYTES(t, m) \
    (((m)->bLen>4) ? (t)->bytes+(m)->b.index : (m)->b.bytes)
|
||||
|
||||
/* APIs --------------------------------------------------------------------- */
|
||||
|
||||
U_CAPI UCMFile * U_EXPORT2
|
||||
ucm_open(void);
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
ucm_close(UCMFile *ucm);
|
||||
|
||||
U_CAPI UBool U_EXPORT2
|
||||
ucm_parseHeaderLine(UCMFile *ucm,
|
||||
char *line, char **pKey, char **pValue);
|
||||
|
||||
U_CAPI UBool U_EXPORT2
|
||||
ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates);
|
||||
|
||||
|
||||
U_CAPI UCMTable * U_EXPORT2
|
||||
ucm_openTable(void);
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
ucm_closeTable(UCMTable *table);
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
ucm_sortTable(UCMTable *t);
|
||||
|
||||
/**
|
||||
* Check the validity of mappings against a base table's states;
|
||||
* necessary for extension-only tables that were read before their base tables.
|
||||
*/
|
||||
U_CAPI UBool U_EXPORT2
|
||||
ucm_checkValidity(UCMTable *ext, UCMStates *baseStates);
|
||||
|
||||
/**
|
||||
* Check a base table against an extension table.
|
||||
* Set moveToExt=TRUE for where base and extension tables are parsed
|
||||
* from a single file,
|
||||
* and moveToExt=FALSE for where the extension table is in a separate file.
|
||||
*
|
||||
* For both tables in the same file, the extension table is automatically
|
||||
* built.
|
||||
* For separate files, the extension file can use a complete mapping table,
|
||||
* so that common mappings need not be stripped out manually.
|
||||
*
|
||||
*
|
||||
* Sort both tables, and then for each mapping direction:
|
||||
*
|
||||
* If the base table contains a mapping for which the input sequence is
|
||||
* the same as the extension input, then
|
||||
* - if the output is the same: remove the extension mapping
|
||||
* - else: error
|
||||
*
|
||||
* If the base table contains a mapping for which the input sequence is
|
||||
* a prefix of the extension input, then
|
||||
* - if moveToExt: move the base mapping to the extension table
|
||||
* - else: error
|
||||
*
|
||||
* @return FALSE in case of an irreparable error
|
||||
*/
|
||||
U_CAPI UBool U_EXPORT2
|
||||
ucm_checkBaseExt(UCMStates *baseStates, UCMTable *base, UCMTable *ext, UBool moveToExt);
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode);
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f);
|
||||
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
ucm_addState(UCMStates *states, const char *s);
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
ucm_processStates(UCMStates *states);
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
ucm_countChars(UCMStates *states,
|
||||
const uint8_t *bytes, int32_t length);
|
||||
|
||||
|
||||
U_CAPI int8_t U_EXPORT2
|
||||
ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps);
|
||||
|
||||
U_CAPI UBool U_EXPORT2
|
||||
ucm_parseMappingLine(UCMapping *m,
|
||||
UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
|
||||
uint8_t bytes[UCNV_EXT_MAX_BYTES],
|
||||
const char *line);
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
ucm_addMapping(UCMTable *table,
|
||||
UCMapping *m,
|
||||
UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
|
||||
uint8_t bytes[UCNV_EXT_MAX_BYTES]);
|
||||
|
||||
/* very makeconv-specific functions ----------------------------------------- */
|
||||
|
||||
/* finalize and optimize states after the toUnicode mappings are processed */
|
||||
U_CAPI void U_EXPORT2
|
||||
ucm_optimizeStates(UCMStates *states,
|
||||
uint16_t **pUnicodeCodeUnits,
|
||||
_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
|
||||
UBool verbose);
|
||||
|
||||
/* moved here because it is used inside ucmstate.c */
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
ucm_findFallback(_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
|
||||
uint32_t offset);
|
||||
|
||||
U_CDECL_END
|
||||
|
||||
#endif
|
1042
icu4c/source/tools/toolutil/ucmstate.c
Normal file
1042
icu4c/source/tools/toolutil/ucmstate.c
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Add table
Reference in a new issue