ICU-3346 support stateless DBCS-only and simple (single-character) conversions

X-SVN-Rev: 13655
This commit is contained in:
Markus Scherer 2003-11-11 18:37:55 +00:00
parent 31a8625180
commit 506bc1495f
4 changed files with 162 additions and 112 deletions

View file

@ -26,12 +26,6 @@
#include "ucnv_ext.h"
#include "cmemory.h"
/*
* ### TODO: probably need pointer to baseTableSharedData
* and also copy the base table's pointers for the base table arrays etc.
* into this sharedData
*/
/* to Unicode --------------------------------------------------------------- */
/*
@ -331,25 +325,24 @@ ucnv_extInitialMatchToU(UConverter *cnv, const int32_t *cx,
}
}
#if 0
/* ### TODO */
U_CFUNC UChar32
ucnv_extSimpleMatchToU(const int32_t *cx,
UChar32 cp,
UBool useFallback,
UErrorCode *pErrorCode) {
const char *source, int32_t length,
UBool useFallback) {
uint32_t value;
int32_t match;
if(length<=0) {
return 0xffff;
}
/* try to match */
match=ucnv_extMatchToU(cx, -1,
cp,
NULL, 0,
source, length,
NULL, 0,
&value,
useFallback, TRUE);
if(match>0) {
if(match==length) {
/* write result for simple, single-character conversion */
if(UCNV_EXT_TO_U_IS_CODE_POINT(value)) {
return UCNV_EXT_TO_U_GET_CODE_POINT(value);
@ -359,14 +352,13 @@ ucnv_extSimpleMatchToU(const int32_t *cx,
/*
* return no match because
* - match>0 && value points to string: simple conversion cannot handle multiple code points
* - match>0 && match!=length: not all input consumed, forbidden for this function
* - match==0: no match found in the first place
* - match<0: partial match, not supported for simple conversion (and flush==TRUE)
*/
return 0;
return 0xfffe;
}
#endif
/*
* continue partial match with new input
* never called for simple, single-character conversion
@ -800,14 +792,10 @@ ucnv_extInitialMatchFromU(UConverter *cnv, const int32_t *cx,
}
}
#if 0
/* ### TODO */
U_CFUNC int32_t
ucnv_extSimpleMatchFromU(const int32_t *cx,
UChar32 cp, uint32_t *pValue,
UBool useFallback,
UErrorCode *pErrorCode) {
UBool useFallback) {
uint32_t value;
int32_t match;
@ -828,6 +816,7 @@ ucnv_extSimpleMatchFromU(const int32_t *cx,
if(length<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH) {
*pValue=value;
return length;
#if 0 /* not currently used */
} else if(length==4) {
/* de-serialize a 4-byte result */
const uint8_t *result=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_BYTES_INDEX, uint8_t)+value;
@ -837,6 +826,7 @@ ucnv_extSimpleMatchFromU(const int32_t *cx,
((uint32_t)result[2]<<8)|
result[3];
return 4;
#endif
}
}
@ -850,8 +840,6 @@ ucnv_extSimpleMatchFromU(const int32_t *cx,
return 0;
}
#endif
/*
* continue partial match with new input, requires cnv->preFromUFirstCP>=0
* never called for simple, single-character conversion

View file

@ -342,6 +342,11 @@ ucnv_extInitialMatchToU(UConverter *cnv, const int32_t *cx,
UBool flush,
UErrorCode *pErrorCode);
/*
 * Simple, single-character to-Unicode lookup in the extension data.
 *
 * cx           extension indexes array to search
 * source       input bytes
 * length       number of input bytes
 * useFallback  passed through to the underlying match function
 *
 * NOTE(review): the definition is only partly visible here; from what is
 * shown it appears to return 0xffff for length<=0, the code point when the
 * entire input matches a single code point, and 0xfffe otherwise --
 * confirm against the full definition.
 */
U_CFUNC UChar32
ucnv_extSimpleMatchToU(const int32_t *cx,
const char *source, int32_t length,
UBool useFallback);
U_CFUNC void
ucnv_extContinueMatchToU(UConverter *cnv,
UConverterToUnicodeArgs *pArgs, int32_t srcIndex,
@ -360,8 +365,7 @@ ucnv_extInitialMatchFromU(UConverter *cnv, const int32_t *cx,
U_CFUNC int32_t
ucnv_extSimpleMatchFromU(const int32_t *cx,
UChar32 cp, uint32_t *pValue,
UBool useFallback,
UErrorCode *pErrorCode);
UBool useFallback);
U_CFUNC void
ucnv_extContinueMatchFromU(UConverter *cnv,

View file

@ -937,22 +937,64 @@ _MBCSLoad(UConverterSharedData *sharedData,
/*
* Set a special, runtime-only outputType if the extension converter
* is a DBCS version of an SI/SO-stateful base converter.
* is a DBCS version of a base converter that also maps single bytes.
*/
if( baseSharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO &&
(sharedData->staticData->conversionType==UCNV_DBCS ||
if( sharedData->staticData->conversionType==UCNV_DBCS ||
(sharedData->staticData->conversionType==UCNV_MBCS &&
sharedData->staticData->minBytesPerChar>=2))
sharedData->staticData->minBytesPerChar>=2)
) {
int32_t entry;
if(baseSharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO) {
/* the base converter is SI/SO-stateful */
int32_t entry;
/* get the dbcs state from the state table entry for SO=0x0e */
entry=mbcsTable->stateTable[0][0xe];
if( MBCS_ENTRY_IS_FINAL(entry) &&
MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_CHANGE_ONLY &&
MBCS_ENTRY_FINAL_STATE(entry)!=0
/* get the dbcs state from the state table entry for SO=0x0e */
entry=mbcsTable->stateTable[0][0xe];
if( MBCS_ENTRY_IS_FINAL(entry) &&
MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_CHANGE_ONLY &&
MBCS_ENTRY_FINAL_STATE(entry)!=0
) {
mbcsTable->dbcsOnlyState=MBCS_ENTRY_FINAL_STATE(entry);
mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY;
}
} else if(
baseSharedData->staticData->conversionType==UCNV_MBCS &&
baseSharedData->staticData->minBytesPerChar==1 &&
baseSharedData->staticData->maxBytesPerChar==2 &&
mbcsTable->countStates<=127
) {
mbcsTable->dbcsOnlyState=MBCS_ENTRY_FINAL_STATE(entry);
/* non-stateful base converter, need to modify the state table */
int32_t (*newStateTable)[256];
int32_t *state;
int32_t i, count;
/* allocate a new state table and copy the base state table contents */
count=mbcsTable->countStates;
newStateTable=(int32_t (*)[256])uprv_malloc((count+1)*1024);
if(newStateTable==NULL) {
ucnv_unload(baseSharedData);
*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
return;
}
uprv_memcpy(newStateTable, mbcsTable->stateTable, count*1024);
/* change all final single-byte entries to go to a new all-illegal state */
state=newStateTable[0];
for(i=0; i<256; ++i) {
if(MBCS_ENTRY_IS_FINAL(state[i])) {
state[i]=MBCS_ENTRY_TRANSITION(count, 0);
}
}
/* build the new all-illegal state */
state=newStateTable[count];
for(i=0; i<256; ++i) {
state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0);
}
mbcsTable->stateTable=newStateTable;
mbcsTable->countStates=(uint8_t)(count+1);
mbcsTable->stateTableOwned=TRUE;
mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY;
}
@ -1007,6 +1049,12 @@ _MBCSUnload(UConverterSharedData *sharedData) {
if(mbcsTable->swapLFNLStateTable!=NULL) {
uprv_free(mbcsTable->swapLFNLStateTable);
}
if(mbcsTable->stateTableOwned) {
uprv_free((void *)mbcsTable->stateTable);
}
if(mbcsTable->baseSharedData!=NULL) {
ucnv_unload(mbcsTable->baseSharedData);
}
}
static void
@ -2235,11 +2283,12 @@ _MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData,
#endif
/*
* This is a simple version of getNextUChar() that is used
* This is a simple version of _MBCSGetNextUChar() that is used
* by other converter implementations.
* It only returns an "assigned" result if it consumes the entire input.
* It does not use state from the converter, nor error codes.
* It does not handle the EBCDIC swaplfnl option (set in UConverter).
* It does not handle conversion extensions (_extToU()).
* It handles conversion extensions but not GB 18030.
*
* Return value:
* U+fffe unassigned
@ -2248,27 +2297,22 @@ _MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData,
*/
U_CFUNC UChar32
_MBCSSimpleGetNextUChar(UConverterSharedData *sharedData,
const char **pSource, const char *sourceLimit,
const char *source, int32_t length,
UBool useFallback) {
const uint8_t *source;
const int32_t (*stateTable)[256];
const uint16_t *unicodeCodeUnits;
uint32_t offset;
uint8_t state, action;
int32_t entry;
UChar32 c;
int32_t i, entry;
/* set up the local pointers */
source=(const uint8_t *)*pSource;
if(source>=(const uint8_t *)sourceLimit) {
if(length<=0) {
/* no input at all: "illegal" */
return 0xffff;
}
/* ### TODO extension */
#if 0
/*
* Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
@ -2278,10 +2322,15 @@ _MBCSSimpleGetNextUChar(UConverterSharedData *sharedData,
*/
/* use optimized function if possible */
if(sharedData->mbcs.countStates==1) {
return _MBCSSingleSimpleGetNextUChar(sharedData, (uint8_t)(*(*pSource)++), useFallback);
if(length==1) {
return _MBCSSingleSimpleGetNextUChar(sharedData, (uint8_t)*source, useFallback);
} else {
return 0xffff; /* illegal: more than a single byte for an SBCS converter */
}
}
#endif
/* set up the local pointers */
stateTable=sharedData->mbcs.stateTable;
unicodeCodeUnits=sharedData->mbcs.unicodeCodeUnits;
@ -2290,14 +2339,16 @@ _MBCSSimpleGetNextUChar(UConverterSharedData *sharedData,
state=sharedData->mbcs.dbcsOnlyState;
/* conversion loop */
do {
entry=stateTable[state][*source++];
for(i=0;;) {
entry=stateTable[state][(uint8_t)source[i++]];
if(MBCS_ENTRY_IS_TRANSITION(entry)) {
state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
} else {
*pSource=(const char *)source;
if(i==length) {
return 0xffff; /* truncated character */
}
} else {
/*
* An if-else-if chain provides more reliable performance for
* the most common cases compared to a switch.
@ -2305,81 +2356,82 @@ _MBCSSimpleGetNextUChar(UConverterSharedData *sharedData,
action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
if(action==MBCS_STATE_VALID_16) {
offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
entry=unicodeCodeUnits[offset];
if(entry!=0xfffe) {
return (UChar32)entry;
c=unicodeCodeUnits[offset];
if(c!=0xfffe) {
/* done */
} else if(UCNV_TO_U_USE_FALLBACK(cnv)) {
return _MBCSGetFallback(&sharedData->mbcs, offset);
} else {
return 0xfffe;
c=_MBCSGetFallback(&sharedData->mbcs, offset);
/* else done with 0xfffe */
}
break;
} else if(action==MBCS_STATE_VALID_DIRECT_16) {
/* output BMP code point */
return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
break;
} else if(action==MBCS_STATE_VALID_16_PAIR) {
offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
entry=unicodeCodeUnits[offset++];
if(entry<0xd800) {
c=unicodeCodeUnits[offset++];
if(c<0xd800) {
/* output BMP code point below 0xd800 */
return (UChar32)entry;
} else if(UCNV_TO_U_USE_FALLBACK(cnv) ? entry<=0xdfff : entry<=0xdbff) {
} else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
/* output roundtrip or fallback supplementary code point */
return (UChar32)(((entry&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00));
} else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (entry&0xfffe)==0xe000 : entry==0xe000) {
c=(UChar32)(((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00));
} else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
/* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
return unicodeCodeUnits[offset];
} else if(entry==0xffff) {
c=unicodeCodeUnits[offset];
} else if(c==0xffff) {
return 0xffff;
} else {
return 0xfffe;
c=0xfffe;
}
break;
} else if(action==MBCS_STATE_VALID_DIRECT_20) {
/* output supplementary code point */
return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
break;
} else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
if(!TO_U_USE_FALLBACK(useFallback)) {
return 0xfffe;
c=0xfffe;
break;
}
/* output BMP code point */
return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
break;
} else if(action==MBCS_STATE_FALLBACK_DIRECT_20) {
if(!TO_U_USE_FALLBACK(useFallback)) {
return 0xfffe;
c=0xfffe;
break;
}
/* output supplementary code point */
return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
} else if(action==MBCS_STATE_CHANGE_ONLY) {
/*
* This serves as a state change without any output.
* It is useful for reading simple stateful encodings,
* for example using just Shift-In/Shift-Out codes.
* The 21 unused bits may later be used for more sophisticated
* state transitions.
*/
if(sharedData->mbcs.dbcsOnlyState!=0) {
/* SI/SO are illegal for DBCS-only conversion */
return 0xffff;
}
if(source==(const uint8_t *)sourceLimit) {
/* if there are only state changes, then return "unassigned" */
return 0xfffe;
}
c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
break;
} else if(action==MBCS_STATE_UNASSIGNED) {
return 0xfffe;
} else if(action==MBCS_STATE_ILLEGAL) {
return 0xffff;
} else {
/* reserved, must never occur */
c=0xfffe;
break;
}
/* state change only - prepare for a new character */
state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
offset=0;
/*
* forbid MBCS_STATE_CHANGE_ONLY for this function,
* and MBCS_STATE_ILLEGAL and reserved action codes
*/
return 0xffff;
}
} while(source<(const uint8_t *)sourceLimit);
}
*pSource=(const char *)source;
return 0xffff;
if(i!=length) {
/* illegal for this function: not all input consumed */
return 0xffff;
}
if(c==0xfffe) {
/* try an extension mapping */
const int32_t *cx=sharedData->mbcs.extIndexes;
if(cx!=NULL) {
return ucnv_extSimpleMatchToU(cx, source, length, useFallback);
}
}
return c;
}
/* MBCS-from-Unicode conversion functions ----------------------------------- */
@ -3248,7 +3300,7 @@ getTrail:
}
break;
case MBCS_OUTPUT_DBCS_ONLY:
/* 1/2-byte stateful table but only DBCS mappings used */
/* table with single-byte results, but only DBCS mappings used */
value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);
if(value<=0xff) {
/* no mapping or SBCS result, not taken for DBCS-only */
@ -3524,7 +3576,7 @@ unassigned:
* conversion implementations.
* It does not use the converter state nor call callbacks.
* It does not handle the EBCDIC swaplfnl option (set in UConverter).
* It does not handle conversion extensions (_extFromU()).
* It handles conversion extensions but not GB 18030.
*
* It converts one single Unicode code point into codepage bytes, encoded
* as one 32-bit value. The function returns the number of bytes in *pValue:
@ -3546,8 +3598,6 @@ _MBCSFromUChar32(UConverterSharedData *sharedData,
uint32_t value;
int32_t length;
/* ### TODO extension mapping */
/* BMP-only codepages are stored without stage 1 entries for supplementary code points */
if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
return 0;
@ -3578,7 +3628,7 @@ _MBCSFromUChar32(UConverterSharedData *sharedData,
}
break;
case MBCS_OUTPUT_DBCS_ONLY:
/* 1/2-byte stateful table but only DBCS mappings used */
/* table with single-byte results, but only DBCS mappings used */
value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
if(value<=0xff) {
/* no mapping or SBCS result, not taken for DBCS-only */
@ -3663,6 +3713,12 @@ _MBCSFromUChar32(UConverterSharedData *sharedData,
*pValue=value;
return length;
} else {
const int32_t *cx=sharedData->mbcs.extIndexes;
if(cx!=NULL) {
return ucnv_extSimpleMatchFromU(cx, c, pValue, useFallback);
}
/* unassigned */
return 0;
}
}

View file

@ -221,7 +221,7 @@ typedef struct {
*/
typedef struct UConverterMBCSTable {
/* toUnicode */
uint8_t countStates, dbcsOnlyState;
uint8_t countStates, dbcsOnlyState, stateTableOwned;
uint32_t countToUFallbacks;
const int32_t (*stateTable)/*[countStates]*/[256];
@ -258,12 +258,13 @@ typedef struct {
fromUBytesLength;
} _MBCSHeader;
/**
/*
* This is a simple version of _MBCSGetNextUChar() that is used
* by other converter implementations.
* It only returns an "assigned" result if it consumes the entire input.
* It does not use state from the converter, nor error codes.
* It does not handle the EBCDIC swaplfnl option (set in UConverter).
* It does not handle conversion extensions (_extToU()).
* It handles conversion extensions but not GB 18030.
*
* Return value:
* U+fffe unassigned
@ -272,7 +273,7 @@ typedef struct {
*/
U_CFUNC UChar32
_MBCSSimpleGetNextUChar(UConverterSharedData *sharedData,
const char **pSource, const char *sourceLimit,
const char *source, int32_t length,
UBool useFallback);
/**
@ -304,11 +305,12 @@ _MBCSIsLeadByte(UConverterSharedData *sharedData, char byte);
#define _MBCS_IS_LEAD_BYTE(sharedData, byte) \
(UBool)MBCS_ENTRY_IS_TRANSITION((sharedData)->mbcs.stateTable[0][(uint8_t)(byte)])
/**
/*
* This is another simple conversion function for internal use by other
* conversion implementations.
* It does not use the converter state nor call callbacks.
* It does not handle the EBCDIC swaplfnl option (set in UConverter).
* It handles conversion extensions but not GB 18030.
*
* It converts one single Unicode code point into codepage bytes, encoded
* as one 32-bit value. The function returns the number of bytes in *pValue: