diff --git a/icu4c/source/common/ucnv_ext.c b/icu4c/source/common/ucnv_ext.c index e72d16d7318..260a0ae680e 100644 --- a/icu4c/source/common/ucnv_ext.c +++ b/icu4c/source/common/ucnv_ext.c @@ -26,12 +26,6 @@ #include "ucnv_ext.h" #include "cmemory.h" -/* - * ### TODO: probably need pointer to baseTableSharedData - * and also copy the base table's pointers for the base table arrays etc. - * into this sharedData - */ - /* to Unicode --------------------------------------------------------------- */ /* @@ -331,25 +325,24 @@ ucnv_extInitialMatchToU(UConverter *cnv, const int32_t *cx, } } -#if 0 -/* ### TODO */ - U_CFUNC UChar32 ucnv_extSimpleMatchToU(const int32_t *cx, - UChar32 cp, - UBool useFallback, - UErrorCode *pErrorCode) { + const char *source, int32_t length, + UBool useFallback) { uint32_t value; int32_t match; + if(length<=0) { + return 0xffff; + } + /* try to match */ match=ucnv_extMatchToU(cx, -1, - cp, - NULL, 0, + source, length, NULL, 0, &value, useFallback, TRUE); - if(match>0) { + if(match==length) { /* write result for simple, single-character conversion */ if(UCNV_EXT_TO_U_IS_CODE_POINT(value)) { return UCNV_EXT_TO_U_GET_CODE_POINT(value); @@ -359,14 +352,13 @@ ucnv_extSimpleMatchToU(const int32_t *cx, /* * return no match because * - match>0 && value points to string: simple conversion cannot handle multiple code points + * - match>0 && match!=length: not all input consumed, forbidden for this function * - match==0: no match found in the first place * - match<0: partial match, not supported for simple conversion (and flush==TRUE) */ - return 0; + return 0xfffe; } -#endif - /* * continue partial match with new input * never called for simple, single-character conversion @@ -800,14 +792,10 @@ ucnv_extInitialMatchFromU(UConverter *cnv, const int32_t *cx, } } -#if 0 -/* ### TODO */ - U_CFUNC int32_t ucnv_extSimpleMatchFromU(const int32_t *cx, UChar32 cp, uint32_t *pValue, - UBool useFallback, - UErrorCode *pErrorCode) { + UBool useFallback) { uint32_t value; int32_t match; @@ -828,6 +816,7 @@ ucnv_extSimpleMatchFromU(const int32_t *cx, if(length<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH) { *pValue=value; return length; +#if 0 /* not currently used */ } else if(length==4) { /* de-serialize a 4-byte result */ const uint8_t *result=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_BYTES_INDEX, uint8_t)+value; @@ -837,6 +826,7 @@ ucnv_extSimpleMatchFromU(const int32_t *cx, ((uint32_t)result[2]<<8)| result[3]; return 4; +#endif } } @@ -850,8 +840,6 @@ ucnv_extSimpleMatchFromU(const int32_t *cx, return 0; } -#endif - /* * continue partial match with new input, requires cnv->preFromUFirstCP>=0 * never called for simple, single-character conversion diff --git a/icu4c/source/common/ucnv_ext.h b/icu4c/source/common/ucnv_ext.h index 768c60f7591..aec55a1706a 100644 --- a/icu4c/source/common/ucnv_ext.h +++ b/icu4c/source/common/ucnv_ext.h @@ -342,6 +342,11 @@ ucnv_extInitialMatchToU(UConverter *cnv, const int32_t *cx, UBool flush, UErrorCode *pErrorCode); +U_CFUNC UChar32 +ucnv_extSimpleMatchToU(const int32_t *cx, + const char *source, int32_t length, + UBool useFallback); + U_CFUNC void ucnv_extContinueMatchToU(UConverter *cnv, UConverterToUnicodeArgs *pArgs, int32_t srcIndex, @@ -360,8 +365,7 @@ ucnv_extInitialMatchFromU(UConverter *cnv, const int32_t *cx, U_CFUNC int32_t ucnv_extSimpleMatchFromU(const int32_t *cx, UChar32 cp, uint32_t *pValue, - UBool useFallback, - UErrorCode *pErrorCode); + UBool useFallback); U_CFUNC void ucnv_extContinueMatchFromU(UConverter *cnv, diff --git a/icu4c/source/common/ucnvmbcs.c b/icu4c/source/common/ucnvmbcs.c index 9433eb0a8c8..8fa5e3a3d6e 100644 --- a/icu4c/source/common/ucnvmbcs.c +++ b/icu4c/source/common/ucnvmbcs.c @@ -937,22 +937,64 @@ _MBCSLoad(UConverterSharedData *sharedData, /* * Set a special, runtime-only outputType if the extension converter - * is a DBCS version of an SI/SO-stateful base converter. + * is a DBCS version of a base converter that also maps single bytes. */ - if( baseSharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO && - (sharedData->staticData->conversionType==UCNV_DBCS || + if( sharedData->staticData->conversionType==UCNV_DBCS || (sharedData->staticData->conversionType==UCNV_MBCS && - sharedData->staticData->minBytesPerChar>=2)) + sharedData->staticData->minBytesPerChar>=2) ) { - int32_t entry; + if(baseSharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO) { + /* the base converter is SI/SO-stateful */ + int32_t entry; - /* get the dbcs state from the state table entry for SO=0x0e */ - entry=mbcsTable->stateTable[0][0xe]; - if( MBCS_ENTRY_IS_FINAL(entry) && - MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_CHANGE_ONLY && - MBCS_ENTRY_FINAL_STATE(entry)!=0 + /* get the dbcs state from the state table entry for SO=0x0e */ + entry=mbcsTable->stateTable[0][0xe]; + if( MBCS_ENTRY_IS_FINAL(entry) && + MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_CHANGE_ONLY && + MBCS_ENTRY_FINAL_STATE(entry)!=0 + ) { + mbcsTable->dbcsOnlyState=MBCS_ENTRY_FINAL_STATE(entry); + + mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY; + } + } else if( + baseSharedData->staticData->conversionType==UCNV_MBCS && + baseSharedData->staticData->minBytesPerChar==1 && + baseSharedData->staticData->maxBytesPerChar==2 && + mbcsTable->countStates<=127 ) { - mbcsTable->dbcsOnlyState=MBCS_ENTRY_FINAL_STATE(entry); + /* non-stateful base converter, need to modify the state table */ + int32_t (*newStateTable)[256]; + int32_t *state; + int32_t i, count; + + /* allocate a new state table and copy the base state table contents */ + count=mbcsTable->countStates; + newStateTable=(int32_t (*)[256])uprv_malloc((count+1)*1024); + if(newStateTable==NULL) { + ucnv_unload(baseSharedData); + *pErrorCode=U_MEMORY_ALLOCATION_ERROR; + return; + } + + uprv_memcpy(newStateTable, mbcsTable->stateTable, count*1024); + + /* change all final single-byte entries to go to a new all-illegal state */ + state=newStateTable[0]; + for(i=0; i<256; ++i) { + if(MBCS_ENTRY_IS_FINAL(state[i])) { + state[i]=MBCS_ENTRY_TRANSITION(count, 0); + } + } + + /* build the new all-illegal state */ + state=newStateTable[count]; + for(i=0; i<256; ++i) { + state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0); + } + mbcsTable->stateTable=newStateTable; + mbcsTable->countStates=(uint8_t)(count+1); + mbcsTable->stateTableOwned=TRUE; mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY; } @@ -1007,6 +1049,12 @@ _MBCSUnload(UConverterSharedData *sharedData) { if(mbcsTable->swapLFNLStateTable!=NULL) { uprv_free(mbcsTable->swapLFNLStateTable); } + if(mbcsTable->stateTableOwned) { + uprv_free((void *)mbcsTable->stateTable); + } + if(mbcsTable->baseSharedData!=NULL) { + ucnv_unload(mbcsTable->baseSharedData); + } } static void @@ -2235,11 +2283,12 @@ _MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData, #endif /* - * This is a simple version of getNextUChar() that is used + * This is a simple version of _MBCSGetNextUChar() that is used * by other converter implementations. + * It only returns an "assigned" result if it consumes the entire input. * It does not use state from the converter, nor error codes. * It does not handle the EBCDIC swaplfnl option (set in UConverter). - * It does not handle conversion extensions (_extToU()). + * It handles conversion extensions but not GB 18030. * * Return value: * U+fffe unassigned @@ -2248,27 +2297,22 @@ _MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData, */ U_CFUNC UChar32 _MBCSSimpleGetNextUChar(UConverterSharedData *sharedData, - const char **pSource, const char *sourceLimit, + const char *source, int32_t length, UBool useFallback) { - const uint8_t *source; - const int32_t (*stateTable)[256]; const uint16_t *unicodeCodeUnits; uint32_t offset; uint8_t state, action; - int32_t entry; + UChar32 c; + int32_t i, entry; - /* set up the local pointers */ - source=(const uint8_t *)*pSource; - if(source>=(const uint8_t *)sourceLimit) { + if(length<=0) { /* no input at all: "illegal" */ return 0xffff; } - /* ### TODO extension */ - #if 0 /* * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus @@ -2278,10 +2322,15 @@ _MBCSSimpleGetNextUChar(UConverterSharedData *sharedData, */ /* use optimized function if possible */ if(sharedData->mbcs.countStates==1) { - return _MBCSSingleSimpleGetNextUChar(sharedData, (uint8_t)(*(*pSource)++), useFallback); + if(length==1) { + return _MBCSSingleSimpleGetNextUChar(sharedData, (uint8_t)*source, useFallback); + } else { + return 0xffff; /* illegal: more than a single byte for an SBCS converter */ + } } #endif + /* set up the local pointers */ stateTable=sharedData->mbcs.stateTable; unicodeCodeUnits=sharedData->mbcs.unicodeCodeUnits; @@ -2290,14 +2339,16 @@ _MBCSSimpleGetNextUChar(UConverterSharedData *sharedData, state=sharedData->mbcs.dbcsOnlyState; /* conversion loop */ - do { - entry=stateTable[state][*source++]; + for(i=0;;) { + entry=stateTable[state][(uint8_t)source[i++]]; if(MBCS_ENTRY_IS_TRANSITION(entry)) { state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry); - } else { - *pSource=(const char *)source; + if(i==length) { + return 0xffff; /* truncated character */ + } + } else { /* * An if-else-if chain provides more reliable performance for * the most common cases compared to a switch. @@ -2305,81 +2356,82 @@ _MBCSSimpleGetNextUChar(UConverterSharedData *sharedData, action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); if(action==MBCS_STATE_VALID_16) { offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); - entry=unicodeCodeUnits[offset]; - if(entry!=0xfffe) { - return (UChar32)entry; + c=unicodeCodeUnits[offset]; + if(c!=0xfffe) { + /* done */ } else if(UCNV_TO_U_USE_FALLBACK(cnv)) { - return _MBCSGetFallback(&sharedData->mbcs, offset); - } else { - return 0xfffe; + c=_MBCSGetFallback(&sharedData->mbcs, offset); + /* else done with 0xfffe */ } + break; } else if(action==MBCS_STATE_VALID_DIRECT_16) { /* output BMP code point */ - return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); + c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); + break; } else if(action==MBCS_STATE_VALID_16_PAIR) { offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); - entry=unicodeCodeUnits[offset++]; - if(entry<0xd800) { + c=unicodeCodeUnits[offset++]; + if(c<0xd800) { /* output BMP code point below 0xd800 */ - return (UChar32)entry; - } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? entry<=0xdfff : entry<=0xdbff) { + } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) { /* output roundtrip or fallback supplementary code point */ - return (UChar32)(((entry&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00)); - } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (entry&0xfffe)==0xe000 : entry==0xe000) { + c=(UChar32)(((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00)); + } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) { /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ - return unicodeCodeUnits[offset]; - } else if(entry==0xffff) { + c=unicodeCodeUnits[offset]; + } else if(c==0xffff) { return 0xffff; } else { - return 0xfffe; + c=0xfffe; } + break; } else if(action==MBCS_STATE_VALID_DIRECT_20) { /* output supplementary code point */ - return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry); + c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry); + break; } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { if(!TO_U_USE_FALLBACK(useFallback)) { - return 0xfffe; + c=0xfffe; + break; } /* output BMP code point */ - return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); + c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); + break; } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) { if(!TO_U_USE_FALLBACK(useFallback)) { - return 0xfffe; + c=0xfffe; + break; } /* output supplementary code point */ - return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry); - } else if(action==MBCS_STATE_CHANGE_ONLY) { - /* - * This serves as a state change without any output. - * It is useful for reading simple stateful encodings, - * for example using just Shift-In/Shift-Out codes. - * The 21 unused bits may later be used for more sophisticated - * state transitions. - */ - if(sharedData->mbcs.dbcsOnlyState!=0) { - /* SI/SO are illegal for DBCS-only conversion */ - return 0xffff; - } - if(source==(const uint8_t *)sourceLimit) { - /* if there are only state changes, then return "unassigned" */ - return 0xfffe; - } + c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry); + break; } else if(action==MBCS_STATE_UNASSIGNED) { - return 0xfffe; - } else if(action==MBCS_STATE_ILLEGAL) { - return 0xffff; - } else { - /* reserved, must never occur */ + c=0xfffe; + break; } - /* state change only - prepare for a new character */ - state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ - offset=0; + /* + * forbid MBCS_STATE_CHANGE_ONLY for this function, + * and MBCS_STATE_ILLEGAL and reserved action codes + */ + return 0xffff; } - } while(source<(const uint8_t *)sourceLimit); + } - *pSource=(const char *)source; - return 0xffff; + if(i!=length) { + /* illegal for this function: not all input consumed */ + return 0xffff; + } + + if(c==0xfffe) { + /* try an extension mapping */ + const int32_t *cx=sharedData->mbcs.extIndexes; + if(cx!=NULL) { + return ucnv_extSimpleMatchToU(cx, source, length, useFallback); + } + } + + return c; } /* MBCS-from-Unicode conversion functions ----------------------------------- */ @@ -3248,7 +3300,7 @@ getTrail: } break; case MBCS_OUTPUT_DBCS_ONLY: - /* 1/2-byte stateful table but only DBCS mappings used */ + /* table with single-byte results, but only DBCS mappings used */ value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); if(value<=0xff) { /* no mapping or SBCS result, not taken for DBCS-only */ @@ -3524,7 +3576,7 @@ unassigned: * conversion implementations. * It does not use the converter state nor call callbacks. * It does not handle the EBCDIC swaplfnl option (set in UConverter). - * It does not handle conversion extensions (_extFromU()). + * It handles conversion extensions but not GB 18030. * * It converts one single Unicode code point into codepage bytes, encoded * as one 32-bit value. The function returns the number of bytes in *pValue: @@ -3546,8 +3598,6 @@ _MBCSFromUChar32(UConverterSharedData *sharedData, uint32_t value; int32_t length; - /* ### TODO extension mapping */ - /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { return 0; @@ -3578,7 +3628,7 @@ _MBCSFromUChar32(UConverterSharedData *sharedData, } break; case MBCS_OUTPUT_DBCS_ONLY: - /* 1/2-byte stateful table but only DBCS mappings used */ + /* table with single-byte results, but only DBCS mappings used */ value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); if(value<=0xff) { /* no mapping or SBCS result, not taken for DBCS-only */ @@ -3663,6 +3713,12 @@ _MBCSFromUChar32(UConverterSharedData *sharedData, *pValue=value; return length; } else { + const int32_t *cx=sharedData->mbcs.extIndexes; + if(cx!=NULL) { + return ucnv_extSimpleMatchFromU(cx, c, pValue, useFallback); + } + + /* unassigned */ return 0; } } diff --git a/icu4c/source/common/ucnvmbcs.h b/icu4c/source/common/ucnvmbcs.h index ce951f41219..a7637757684 100644 --- a/icu4c/source/common/ucnvmbcs.h +++ b/icu4c/source/common/ucnvmbcs.h @@ -221,7 +221,7 @@ typedef struct { */ typedef struct UConverterMBCSTable { /* toUnicode */ - uint8_t countStates, dbcsOnlyState; + uint8_t countStates, dbcsOnlyState, stateTableOwned; uint32_t countToUFallbacks; const int32_t (*stateTable)/*[countStates]*/[256]; @@ -258,12 +258,13 @@ typedef struct { fromUBytesLength; } _MBCSHeader; -/** +/* * This is a simple version of _MBCSGetNextUChar() that is used * by other converter implementations. + * It only returns an "assigned" result if it consumes the entire input. * It does not use state from the converter, nor error codes. * It does not handle the EBCDIC swaplfnl option (set in UConverter). - * It does not handle conversion extensions (_extToU()). + * It handles conversion extensions but not GB 18030. * * Return value: * U+fffe unassigned @@ -272,7 +273,7 @@ typedef struct { */ U_CFUNC UChar32 _MBCSSimpleGetNextUChar(UConverterSharedData *sharedData, - const char **pSource, const char *sourceLimit, + const char *source, int32_t length, UBool useFallback); /** @@ -304,11 +305,12 @@ _MBCSIsLeadByte(UConverterSharedData *sharedData, char byte); #define _MBCS_IS_LEAD_BYTE(sharedData, byte) \ (UBool)MBCS_ENTRY_IS_TRANSITION((sharedData)->mbcs.stateTable[0][(uint8_t)(byte)]) -/** +/* * This is another simple conversion function for internal use by other * conversion implementations. * It does not use the converter state nor call callbacks. * It does not handle the EBCDIC swaplfnl option (set in UConverter). + * It handles conversion extensions but not GB 18030. * * It converts one single Unicode code point into codepage bytes, encoded * as one 32-bit value. The function returns the number of bytes in *pValue: