diff --git a/icu4c/source/common/ucnvmbcs.c b/icu4c/source/common/ucnvmbcs.c index bd818e82dfa..e5b00d6d105 100644 --- a/icu4c/source/common/ucnvmbcs.c +++ b/icu4c/source/common/ucnvmbcs.c @@ -43,6 +43,8 @@ #include "ucnv_bld.h" #include "ucnvmbcs.h" #include "ucnv_cnv.h" +#include "umutex.h" +#include "cmemory.h" #include "cstring.h" /* control optimizations according to the platform */ @@ -249,7 +251,7 @@ * * Stage 2 contains a 32-bit word for each 16-block in stage 3: * Bits 31..16 contain flags for which stage 3 entries contain roundtrip results - * test: (stage2Entry&(1<<(16+(c&0xf))))!=0 + * test: MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) * If this test is false, then a non-zero result will be interpreted as * a fallback mapping. * Bits 15..0 contain the index to stage 3, which must be multiplied by 16*(bytes per char) @@ -352,6 +354,247 @@ gb18030Ranges[13][4]={ {0xFFE6, 0xFFFF, LINEAR(0x8431A234), LINEAR(0x8431A439)} }; +/* bit flag for UConverter.options indicating GB 18030 special handling */ +#define _MBCS_OPTION_GB18030 0x8000 + +/* Miscellaneous ------------------------------------------------------------ */ + +static uint32_t +_MBCSSizeofFromUBytes(UConverterMBCSTable *mbcsTable) { + /* ### TODO markus 20020911 Use _MBCSHeader.reserved to store size of fromUBytes[] */ + const uint16_t *table; + + uint32_t st3, maxStage3; + uint16_t st1, maxStage1, st2; + + /* Enumerate the from-Unicode trie table to find the highest stage 3 index. */ + table=mbcsTable->fromUnicodeTable; + maxStage3=0; + if(mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY) { + maxStage1=0x440; + } else { + maxStage1=0x40; + } + + + if(mbcsTable->outputType==MBCS_OUTPUT_1) { + const uint16_t *stage2; + + for(st1=0; st1maxStage3) { + maxStage3=st3; + } + } + } + } + + /* + * add 16 to get the limit not start index of the last stage 3 block, + * times 2 for number of bytes + */ + return (maxStage3+16)*2; + } else { + const uint32_t *stage2; + + for(st1=0; st1maxStage3) { + maxStage3=st3; + } + } + } + } + + /* + * add 16 to get the limit not start index of the last stage 3 block, + * times 2..4 for number of bytes + */ + maxStage3=16*maxStage3+16; + switch(mbcsTable->outputType) { + case MBCS_OUTPUT_3: + case MBCS_OUTPUT_4_EUC: + maxStage3*=3; + break; + case MBCS_OUTPUT_4: + maxStage3*=4; + break; + default: + /* MBCS_OUTPUT_2... and MBCS_OUTPUT_3_EUC */ + maxStage3*=2; + break; + } + return maxStage3; + } +} + +/* EBCDIC swap LF<->NL ------------------------------------------------------ */ + +/* + * This code modifies a standard EBCDIC<->Unicode mapping table for + * OS/390 (z/OS) Unix System Services (Open Edition). + * The difference is in the mapping of Line Feed and New Line control codes: + * Standard EBCDIC maps + * + * \x25 |0 + * \x15 |0 + * + * but OS/390 USS EBCDIC swaps the control codes for LF and NL, + * mapping + * + * \x15 |0 + * \x25 |0 + * + * This code modifies a loaded standard EBCDIC<->Unicode mapping table + * by copying it into allocated memory and swapping the LF and NL values. + * It allows to support the same EBCDIC charset in both versions without + * duplicating the entire installed table. + */ + +/* standard EBCDIC codes */ +#define EBCDIC_LF 0x25 +#define EBCDIC_NL 0x15 + +/* standard EBCDIC codes with roundtrip flag as stored in Unicode-to-single-byte tables */ +#define EBCDIC_RT_LF 0xf25 +#define EBCDIC_RT_NL 0xf15 + +/* Unicode code points */ +#define U_LF 0x0a +#define U_NL 0x85 + +static UBool +_EBCDICSwapLFNL(UConverterSharedData *sharedData, UErrorCode *pErrorCode) { + UConverterMBCSTable *mbcsTable; + + const uint16_t *table, *results; + const uint8_t *bytes; + + int32_t (*newStateTable)[256]; + uint16_t *newResults; + uint8_t *p; + char *name; + + uint32_t stage2Entry; + uint32_t size, sizeofFromUBytes; + + mbcsTable=&sharedData->table->mbcs; + + table=mbcsTable->fromUnicodeTable; + bytes=mbcsTable->fromUnicodeBytes; + results=(const uint16_t *)bytes; + + /* + * Check that this is an EBCDIC table with SBCS portion - + * SBCS or EBCDIC_STATEFUL with standard EBCDIC LF and NL mappings. + * + * If not, ignore the option. Options are always ignored if they do not apply. + */ + if(!( + (mbcsTable->outputType==MBCS_OUTPUT_1 || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) && + mbcsTable->stateTable[0][EBCDIC_LF]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF) && + mbcsTable->stateTable[0][EBCDIC_NL]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL) + )) { + return FALSE; + } + + if(mbcsTable->outputType==MBCS_OUTPUT_1) { + if(!( + EBCDIC_RT_LF==MBCS_SINGLE_RESULT_FROM_U(table, results, U_LF) && + EBCDIC_RT_NL==MBCS_SINGLE_RESULT_FROM_U(table, results, U_NL) + )) { + return FALSE; + } + } else /* MBCS_OUTPUT_2_SISO */ { + stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF); + if(!( + MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_LF)!=0 && + EBCDIC_LF==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_LF) + )) { + return FALSE; + } + + stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL); + if(!( + MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_NL)!=0 && + EBCDIC_NL==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_NL) + )) { + return FALSE; + } + } + + /* + * The table has an appropriate format. + * Allocate and build + * - a modified to-Unicode state table + * - a modified from-Unicode output array + * - a converter name string with the swap option appended + */ + sizeofFromUBytes=_MBCSSizeofFromUBytes(mbcsTable); + size= + mbcsTable->countStates*1024+ + sizeofFromUBytes+ + UCNV_MAX_CONVERTER_NAME_LENGTH+20; + p=(uint8_t *)uprv_malloc(size); + if(p==NULL) { + *pErrorCode=U_MEMORY_ALLOCATION_ERROR; + return FALSE; + } + + /* copy and modify the to-Unicode state table */ + newStateTable=(int32_t (*)[256])p; + uprv_memcpy(newStateTable, mbcsTable->stateTable, mbcsTable->countStates*1024); + + newStateTable[0][EBCDIC_LF]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL); + newStateTable[0][EBCDIC_NL]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF); + + /* copy and modify the from-Unicode result table */ + newResults=(uint16_t *)newStateTable[mbcsTable->countStates]; + uprv_memcpy(newResults, bytes, sizeofFromUBytes); + + /* conveniently, the table access macros work on the left side of expressions */ + if(mbcsTable->outputType==MBCS_OUTPUT_1) { + MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_LF)=EBCDIC_RT_NL; + MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_NL)=EBCDIC_RT_LF; + } else /* MBCS_OUTPUT_2_SISO */ { + stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF); + MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_LF)=EBCDIC_NL; + + stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL); + MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_NL)=EBCDIC_LF; + } + + /* set the canonical converter name */ + name=(char *)newResults+sizeofFromUBytes; + uprv_strcpy(name, sharedData->staticData->name); + uprv_strcat(name, UCNV_SWAP_LFNL_OPTION_STRING); + + /* set the pointers */ + umtx_lock(NULL); + if(mbcsTable->swapLFNLStateTable==NULL) { + mbcsTable->swapLFNLStateTable=newStateTable; + mbcsTable->swapLFNLFromUnicodeBytes=(uint8_t *)newResults; + mbcsTable->swapLFNLName=name; + + newStateTable=NULL; + } + umtx_unlock(NULL); + + /* release the allocated memory if another thread beat us to it */ + if(newStateTable!=NULL) { + uprv_free(newStateTable); + } + return TRUE; +} + /* MBCS setup functions ----------------------------------------------------- */ static void @@ -408,6 +651,15 @@ _MBCSLoad(UConverterSharedData *sharedData, } } +static void +_MBCSUnload(UConverterSharedData *sharedData) { + UConverterMBCSTable *mbcsTable=&sharedData->table->mbcs; + + if(mbcsTable->swapLFNLStateTable!=NULL) { + uprv_free(mbcsTable->swapLFNLStateTable); + } +} + static void _MBCSReset(UConverter *cnv, UConverterResetChoice choice) { if(choice<=UCNV_RESET_TO_UNICODE) { @@ -429,10 +681,29 @@ _MBCSOpen(UConverter *cnv, const char *locale, uint32_t options, UErrorCode *pErrorCode) { - _MBCSReset(cnv, UCNV_RESET_BOTH); + if((options&UCNV_OPTION_SWAP_LFNL)!=0) { + if(cnv->sharedData->table->mbcs.swapLFNLStateTable==NULL) { + if(!_EBCDICSwapLFNL(cnv->sharedData, pErrorCode)) { + /* the option does not apply, remove it */ + cnv->options&=~UCNV_OPTION_SWAP_LFNL; + } + } + } + if(uprv_strstr(name, "gb18030")!=NULL || uprv_strstr(name, "GB18030")!=NULL) { /* set a flag for GB 18030 mode, which changes the callback behavior */ - cnv->extraInfo=(void *)gb18030Ranges; + cnv->options|=_MBCS_OPTION_GB18030; + } + + _MBCSReset(cnv, UCNV_RESET_BOTH); +} + +static const char * +_MBCSGetName(const UConverter *cnv) { + if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0 && cnv->sharedData->table->mbcs.swapLFNLName!=NULL) { + return cnv->sharedData->table->mbcs.swapLFNLName; + } else { + return cnv->sharedData->staticData->name; } } @@ -508,7 +779,11 @@ _MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, targetLimit=pArgs->targetLimit; offsets=pArgs->offsets; - stateTable=cnv->sharedData->table->mbcs.stateTable; + if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { + stateTable=cnv->sharedData->table->mbcs.swapLFNLStateTable; + } else { + stateTable=cnv->sharedData->table->mbcs.stateTable; + } unicodeCodeUnits=cnv->sharedData->table->mbcs.unicodeCodeUnits; /* get the converter state from UConverter */ @@ -798,7 +1073,11 @@ _MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, targetLimit=pArgs->targetLimit; offsets=pArgs->offsets; - stateTable=cnv->sharedData->table->mbcs.stateTable; + if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { + stateTable=cnv->sharedData->table->mbcs.swapLFNLStateTable; + } else { + stateTable=cnv->sharedData->table->mbcs.stateTable; + } /* sourceIndex=-1 if the current character began in the previous buffer */ sourceIndex=0; @@ -975,7 +1254,11 @@ _MBCSSingleToBMPWithOffsets(UConverterToUnicodeArgs *pArgs, targetCapacity=pArgs->targetLimit-pArgs->target; offsets=pArgs->offsets; - stateTable=cnv->sharedData->table->mbcs.stateTable; + if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { + stateTable=cnv->sharedData->table->mbcs.swapLFNLStateTable; + } else { + stateTable=cnv->sharedData->table->mbcs.stateTable; + } /* sourceIndex=-1 if the current character began in the previous buffer */ sourceIndex=0; @@ -1221,7 +1504,11 @@ _MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs, source=(const uint8_t *)pArgs->source; sourceLimit=(const uint8_t *)pArgs->sourceLimit; - stateTable=cnv->sharedData->table->mbcs.stateTable; + if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { + stateTable=cnv->sharedData->table->mbcs.swapLFNLStateTable; + } else { + stateTable=cnv->sharedData->table->mbcs.stateTable; + } unicodeCodeUnits=cnv->sharedData->table->mbcs.unicodeCodeUnits; /* get the converter state from UConverter */ @@ -1419,6 +1706,7 @@ _MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs, UChar buffer[UTF_MAX_CHAR_LENGTH]; UConverter *cnv; + const int32_t (*stateTable)[256]; const uint8_t *source, *sourceLimit; int32_t entry; @@ -1429,10 +1717,15 @@ _MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs, cnv=pArgs->converter; source=(const uint8_t *)pArgs->source; sourceLimit=(const uint8_t *)pArgs->sourceLimit; + if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { + stateTable=cnv->sharedData->table->mbcs.swapLFNLStateTable; + } else { + stateTable=cnv->sharedData->table->mbcs.stateTable; + } /* conversion loop */ while(sourcesharedData->table->mbcs.stateTable[0][*source++]; + entry=stateTable[0][*source++]; /* MBCS_ENTRY_IS_FINAL(entry) */ /* write back the updated pointer early so that we can return directly */ @@ -1523,6 +1816,7 @@ _MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs, * This is a simple version of getNextUChar() that is used * by other converter implementations. * It does not use state from the converter, nor error codes. + * It does not handle the EBCDIC swaplfnl option (set in UConverter). * * Return value: * U+fffe unassigned @@ -1651,7 +1945,10 @@ _MBCSSimpleGetNextUChar(UConverterSharedData *sharedData, return 0xffff; } -/* This version of _MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages. */ +/** + * This version of _MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages. + * It does not handle the EBCDIC swaplfnl option (set in UConverter). + */ U_CFUNC UChar32 _MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData, uint8_t b, UBool useFallback) { @@ -1745,7 +2042,11 @@ _MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, offsets=pArgs->offsets; table=cnv->sharedData->table->mbcs.fromUnicodeTable; - bytes=cnv->sharedData->table->mbcs.fromUnicodeBytes; + if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { + bytes=cnv->sharedData->table->mbcs.swapLFNLFromUnicodeBytes; + } else { + bytes=cnv->sharedData->table->mbcs.fromUnicodeBytes; + } /* get the converter state from UConverter */ c=cnv->fromUSurrogateLead; @@ -1982,7 +2283,7 @@ getTrail: } /* is this code point assigned, or do we use fallbacks? */ - if(!((stage2Entry&(1<<(16+(c&0xf))))!=0 || + if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)!=0 || (UCNV_FROM_U_USE_FALLBACK(cnv, c) && (value!=0 || c==0))) ) { /* @@ -2219,7 +2520,11 @@ _MBCSDoubleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, offsets=pArgs->offsets; table=cnv->sharedData->table->mbcs.fromUnicodeTable; - bytes=cnv->sharedData->table->mbcs.fromUnicodeBytes; + if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { + bytes=cnv->sharedData->table->mbcs.swapLFNLFromUnicodeBytes; + } else { + bytes=cnv->sharedData->table->mbcs.fromUnicodeBytes; + } /* get the converter state from UConverter */ c=cnv->fromUSurrogateLead; @@ -2306,7 +2611,7 @@ getTrail: } /* is this code point assigned, or do we use fallbacks? */ - if(!((stage2Entry&(1<<(16+(c&0xf))))!=0 || + if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) || (UCNV_FROM_U_USE_FALLBACK(cnv, c) && (value!=0 || c==0))) ) { /* @@ -2469,7 +2774,11 @@ _MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, offsets=pArgs->offsets; table=cnv->sharedData->table->mbcs.fromUnicodeTable; - results=(uint16_t *)cnv->sharedData->table->mbcs.fromUnicodeBytes; + if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { + results=(uint16_t *)cnv->sharedData->table->mbcs.swapLFNLFromUnicodeBytes; + } else { + results=(uint16_t *)cnv->sharedData->table->mbcs.fromUnicodeBytes; + } if(cnv->useFallback) { /* use all roundtrip and fallback results */ @@ -2681,7 +2990,11 @@ _MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs, offsets=pArgs->offsets; table=cnv->sharedData->table->mbcs.fromUnicodeTable; - results=(uint16_t *)cnv->sharedData->table->mbcs.fromUnicodeBytes; + if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { + results=(uint16_t *)cnv->sharedData->table->mbcs.swapLFNLFromUnicodeBytes; + } else { + results=(uint16_t *)cnv->sharedData->table->mbcs.fromUnicodeBytes; + } if(cnv->useFallback) { /* use all roundtrip and fallback results */ @@ -2930,6 +3243,8 @@ getTrail: * This is another simple conversion function for internal use by other * conversion implementations. * It does not use the converter state nor call callbacks. + * It does not handle the EBCDIC swaplfnl option (set in UConverter). + * * It converts one single Unicode code point into codepage bytes, encoded * as one 32-bit value. The function returns the number of bytes in *pValue: * 1..4 the number of bytes in *pValue @@ -3041,7 +3356,7 @@ _MBCSFromUChar32(UConverterSharedData *sharedData, } /* is this code point assigned, or do we use fallbacks? */ - if( (stage2Entry&(1<<(16+(c&0xf))))!=0 || + if( MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) || (FROM_U_USE_FALLBACK(useFallback, c) && (value!=0 || c==0)) ) { /* @@ -3058,6 +3373,12 @@ _MBCSFromUChar32(UConverterSharedData *sharedData, } } +/** + * This version of _MBCSFromUChar32() is optimized for single-byte codepages. + * It does not handle the EBCDIC swaplfnl option (set in UConverter). + * + * It returns the codepage byte for the code point, or -1 if it is unassigned. + */ U_CFUNC int32_t _MBCSSingleFromUChar32(UConverterSharedData *sharedData, UChar32 c, @@ -3183,7 +3504,7 @@ static const UConverterImpl _MBCSImpl={ UCNV_MBCS, _MBCSLoad, - NULL, + _MBCSUnload, _MBCSOpen, NULL, @@ -3196,7 +3517,7 @@ static const UConverterImpl _MBCSImpl={ _MBCSGetNextUChar, _MBCSGetStarters, - NULL, + _MBCSGetName, _MBCSWriteSub }; @@ -3223,7 +3544,7 @@ fromUCallback(UConverter *cnv, UConverterCallbackReason reason, UErrorCode *pErrorCode) { int32_t i; - if(cnv->extraInfo==gb18030Ranges && reason==UCNV_UNASSIGNED) { + if((cnv->options&_MBCS_OPTION_GB18030)!=0 && reason==UCNV_UNASSIGNED) { const uint32_t *range; range=gb18030Ranges[0]; @@ -3270,7 +3591,7 @@ toUCallback(UConverter *cnv, UConverterCallbackReason reason, UErrorCode *pErrorCode) { int32_t i; - if(cnv->extraInfo==gb18030Ranges && reason==UCNV_UNASSIGNED && length==4) { + if((cnv->options&_MBCS_OPTION_GB18030)!=0 && reason==UCNV_UNASSIGNED && length==4) { const uint32_t *range; uint32_t linear;