diff --git a/icu4c/source/common/ucnvmbcs.c b/icu4c/source/common/ucnvmbcs.c index 96850355bbe..2411260ac16 100644 --- a/icu4c/source/common/ucnvmbcs.c +++ b/icu4c/source/common/ucnvmbcs.c @@ -1,7 +1,7 @@ /* ****************************************************************************** * -* Copyright (C) 2000-2009, International Business Machines +* Copyright (C) 2000-2010, International Business Machines * Corporation and others. All Rights Reserved. * ****************************************************************************** @@ -398,6 +398,76 @@ gb18030Ranges[13][4]={ /* bit flag for UConverter.options indicating GB 18030 special handling */ #define _MBCS_OPTION_GB18030 0x8000 +/* bit flags for UConverter.options indicating KEIS, JEF, JIPS special handling: each selects that charset's own SI/SO byte sequence; set in ucnv_MBCSOpen() when the converter name contains KEIS, JEF or JIPS */ +#define _MBCS_OPTION_KEIS 0x01000 +#define _MBCS_OPTION_JEF 0x02000 +#define _MBCS_OPTION_JIPS 0x04000 + +#define KEIS_SO_CHAR_1 0x0A /* KEIS: SO is the byte pair 0x0A 0x42, SI is the byte pair 0x0A 0x41 */ +#define KEIS_SO_CHAR_2 0x42 +#define KEIS_SI_CHAR_1 0x0A +#define KEIS_SI_CHAR_2 0x41 + +#define JEF_SO_CHAR 0x28 /* JEF: SO and SI are the single bytes 0x28 and 0x29 */ +#define JEF_SI_CHAR 0x29 + +#define JIPS_SO_CHAR_1 0x1A /* JIPS: SO is the byte pair 0x1A 0x70, SI is the byte pair 0x1A 0x71 */ +#define JIPS_SO_CHAR_2 0x70 +#define JIPS_SI_CHAR_1 0x1A +#define JIPS_SI_CHAR_2 0x71 + +enum SISO_Option { /* selects which sequence getSISOBytes() produces; in the fromUnicode code SI switches the output back to single-byte mode and SO switches it to double-byte mode */ + SI, + SO +}; +typedef enum SISO_Option SISO_Option; + +static int32_t getSISOBytes(SISO_Option option, uint32_t cnvOption, uint8_t *value) { /* Writes the SI or SO byte sequence (chosen by option) that matches the _MBCS_OPTION_* bits in cnvOption into value and returns the number of bytes written: 2 for KEIS and JIPS, 1 for JEF and for the default UCNV_SI/UCNV_SO. The flags are tested in the order KEIS, JEF, JIPS and the first one set wins. value must have room for 2 bytes. */ + int32_t SISOLength = 0; + + switch (option) { + case SI: + if ((cnvOption&_MBCS_OPTION_KEIS)!=0) { + value[0] = KEIS_SI_CHAR_1; + value[1] = KEIS_SI_CHAR_2; + SISOLength = 2; + } else if ((cnvOption&_MBCS_OPTION_JEF)!=0) { + value[0] = JEF_SI_CHAR; + SISOLength = 1; + } else if ((cnvOption&_MBCS_OPTION_JIPS)!=0) { + value[0] = JIPS_SI_CHAR_1; + value[1] = JIPS_SI_CHAR_2; + SISOLength = 2; + } else { + value[0] = UCNV_SI; + SISOLength = 1; + } + break; + case SO: + if ((cnvOption&_MBCS_OPTION_KEIS)!=0) { + value[0] = KEIS_SO_CHAR_1; + value[1] = KEIS_SO_CHAR_2; + SISOLength = 2; + } else if ((cnvOption&_MBCS_OPTION_JEF)!=0) { + value[0] = JEF_SO_CHAR; + 
SISOLength = 1; + } else if ((cnvOption&_MBCS_OPTION_JIPS)!=0) { + value[0] = JIPS_SO_CHAR_1; + value[1] = JIPS_SO_CHAR_2; + SISOLength = 2; + } else { + value[0] = UCNV_SO; + SISOLength = 1; + } + break; + default: + /* Should never happen. In this case value is left unwritten and 0 is returned. */ + break; + } + + return SISOLength; +} + /* Miscellaneous ------------------------------------------------------------ */ /** @@ -1724,6 +1794,12 @@ ucnv_MBCSOpen(UConverter *cnv, /* set a flag for GB 18030 mode, which changes the callback behavior */ cnv->options|=_MBCS_OPTION_GB18030; } + } else if(uprv_strstr(pArgs->name, "KEIS")!=NULL) { + cnv->options|=_MBCS_OPTION_KEIS; + } else if(uprv_strstr(pArgs->name, "JEF")!=NULL) { + cnv->options|=_MBCS_OPTION_JEF; + } else if(uprv_strstr(pArgs->name, "JIPS")!=NULL) { + cnv->options|=_MBCS_OPTION_JIPS; } /* fix maxBytesPerUChar depending on outputType and options etc. */ @@ -3859,6 +3935,7 @@ ucnv_MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, uint32_t stage2Entry; uint32_t asciiRoundtrips; uint32_t value; + uint8_t si_value[2], so_value[2], si_value_length, so_value_length; int32_t length, prevLength; uint8_t unicodeMask; @@ -3930,6 +4007,10 @@ ucnv_MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, sourceIndex= c==0 ? 
0 : -1; nextSourceIndex=0; + /* Get the SI/SO character for the converter */ + si_value_length = getSISOBytes(SI, cnv->options, si_value); + so_value_length = getSISOBytes(SO, cnv->options, so_value); + /* conversion loop */ /* * This is another piece of ugly code: @@ -4019,8 +4100,14 @@ ucnv_MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, length=1; } else { /* change from double-byte mode to single-byte */ - value|=(uint32_t)UCNV_SI<<8; - length=2; + if (si_value_length == 1) { + value|=si_value[0]<<8; + length = 2; + } else if (si_value_length == 2) { + value|=si_value[1]<<8; + value|=si_value[0]<<16; + length = 3; + } prevLength=1; } } else { @@ -4028,8 +4115,14 @@ ucnv_MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, length=2; } else { /* change from single-byte mode to double-byte */ - value|=(uint32_t)UCNV_SO<<16; - length=3; + if (so_value_length == 1) { + value|=so_value[0]<<16; + length = 3; + } else if (so_value_length == 2) { + value|=so_value[1]<<16; + value|=so_value[0]<<24; + length = 4; + } prevLength=2; } } @@ -4239,8 +4332,14 @@ getTrail: length=1; } else { /* change from double-byte mode to single-byte */ - value|=(uint32_t)UCNV_SI<<8; - length=2; + if (si_value_length == 1) { + value|=si_value[0]<<8; + length = 2; + } else if (si_value_length == 2) { + value|=si_value[1]<<8; + value|=si_value[0]<<16; + length = 3; + } prevLength=1; } } else { @@ -4248,8 +4347,14 @@ getTrail: length=2; } else { /* change from single-byte mode to double-byte */ - value|=(uint32_t)UCNV_SO<<16; - length=3; + if (so_value_length == 1) { + value|=so_value[0]<<16; + length = 3; + } else if (so_value_length == 2) { + value|=so_value[1]<<16; + value|=so_value[0]<<24; + length = 4; + } prevLength=2; } } @@ -4502,15 +4607,27 @@ unassigned: ) { /* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */ if(targetCapacity>0) { - *target++=(uint8_t)UCNV_SI; + *target++=(uint8_t)si_value[0]; + if (si_value_length == 2) { + if 
(targetCapacity<2) { + cnv->charErrorBuffer[0]=(uint8_t)si_value[1]; + cnv->charErrorBufferLength=1; + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + } else { + *target++=(uint8_t)si_value[1]; + } + } if(offsets!=NULL) { /* set the last source character's index (sourceIndex points at sourceLimit now) */ *offsets++=prevSourceIndex; } } else { /* target is full */ - cnv->charErrorBuffer[0]=(char)UCNV_SI; - cnv->charErrorBufferLength=1; + cnv->charErrorBuffer[0]=(uint8_t)si_value[0]; + if (si_value_length == 2) { + cnv->charErrorBuffer[1]=(uint8_t)si_value[1]; + } + cnv->charErrorBufferLength=si_value_length; *pErrorCode=U_BUFFER_OVERFLOW_ERROR; } prevLength=1; /* we switched into SBCS */ diff --git a/icu4c/source/data/Makefile.in b/icu4c/source/data/Makefile.in index 633485427ca..e315a30afd0 100644 --- a/icu4c/source/data/Makefile.in +++ b/icu4c/source/data/Makefile.in @@ -258,6 +258,10 @@ ALL_UCM_SOURCE=ibm-37_P100-1995.ucm ibm-1047_P100-1995.ucm $(UCM_SOURCE_CORE) $( UCM_FILES = $(ALL_UCM_SOURCE:%=$(SRCDATADIR)/%) CNV_FILES = $(ALL_UCM_SOURCE:%.ucm=$(BUILDDIR)/%.cnv) CNV_FILES_SHORT = $(ALL_UCM_SOURCE:%.ucm=%.cnv) +UCM_SOURCE_SPECIAL=$(UCM_SOURCE_EBCDIC_IGNORE_SISO) +UCM_FILES_SPECIAL=$(UCM_SOURCE_SPECIAL:%=$(UCMSRCDIR)/%) +CNV_FILES_SPECIAL=$(UCM_SOURCE_SPECIAL:%.ucm=$(BUILDDIR)/%.cnv) +CNV_FILES_SHORT_SPECIAL=$(UCM_SOURCE_SPECIAL:%.ucm=%.cnv) ## RES files -include $(LOCSRCDIR)/resfiles.mk @@ -396,13 +400,13 @@ SPREP_FILES = $(ALL_SPREP_SOURCE:%.txt=$(BUILDDIR)/%.spp) SPREP_FILES_SHORT = $(ALL_SPREP_SOURCE:%.txt=%.spp) ## All generated files -ALL_FILES = $(DAT_FILES) $(CNV_FILES) $(BRK_FILES) $(CTD_FILES) $(RES_FILES) $(CURR_FILES) $(LANG_FILES) $(REGION_FILES) $(ZONE_FILES) $(COLLATION_FILES) $(BRK_RES_FILES) $(RBNF_FILES) $(TRANSLIT_FILES) $(SPREP_FILES) $(CFU_FILES) +ALL_FILES = $(DAT_FILES) $(CNV_FILES) $(CNV_FILES_SPECIAL) $(BRK_FILES) $(CTD_FILES) $(RES_FILES) $(INDEX_RES_FILE) $(CURR_FILES) $(LANG_FILES) $(REGION_FILES) $(ZONE_FILES) $(COLLATION_FILES) 
$(BRK_RES_FILES) $(RBNF_FILES) $(TRANSLIT_FILES) $(SPREP_FILES) $(CFU_FILES) ALL_INDEX_SRC_FILES = $(PKGDATA_LIST) $(INDEX_FILE) $(CURR_INDEX_FILE) $(LANG_INDEX_FILE) $(REGION_INDEX_FILE) $(ZONE_INDEX_FILE) $(COLLATION_INDEX_FILE) $(BRK_RES_INDEX_FILE) $(RBNF_INDEX_FILE) # a list to use in the .lst files (package-relative) COLL_FILES_LIST=$(COLLATION_FILES_SHORT) $(COLLATION_INDEX_RES_SHORT) BRK_FILES_LIST=$(BRK_FILES_SHORT) $(CTD_FILES_SHORT) $(BRK_RES_FILES_SHORT) $(BRK_RES_INDEX_RES_SHORT) LOCALE_FILES_LIST= $(RES_FILES_SHORT) $(LANG_FILES_SHORT) $(REGION_FILES_SHORT) $(ZONE_FILES_SHORT) -MISC_FILES_LIST=$(DAT_FILES_SHORT) $(CNV_FILES_SHORT) $(CURR_FILES_SHORT) $(RBNF_FILES_SHORT) $(RBNF_INDEX_RES_SHORT) $(TRANSLIT_FILES_SHORT) $(SPREP_FILES_SHORT) $(CFU_FILES_SHORT) +MISC_FILES_LIST=$(DAT_FILES_SHORT) $(CNV_FILES_SHORT) $(CNV_FILES_SHORT_SPECIAL) $(CURR_FILES_SHORT) $(RBNF_FILES_SHORT) $(RBNF_INDEX_RES_SHORT) $(TRANSLIT_FILES_SHORT) $(SPREP_FILES_SHORT) $(CFU_FILES_SHORT) UNI_CORE_DATA=uprops.icu ucase.icu ubidi.icu UNI_CORE_TARGET_DATA=$(UNI_CORE_DATA:%=$(BUILDDIR)/%) @@ -543,6 +547,8 @@ $(CFU_FILES): $(ALL_CFU_SOURCE) $(TOOLBINDIR)/gencfu$(EXEEXT) $(DAT_FILES) #################################################### CNV # CNV FILES +$(CNV_FILES_SPECIAL) : $(UCM_FILES_SPECIAL) $(TOOLBINDIR)/makeconv$(TOOLEXEEXT) + $(INVOKE) $(TOOLBINDIR)/makeconv --ignore-siso-check -c -d $(BUILDDIR) $(UCMSRCDIR)/$(@F:%.cnv=%.ucm) $(BUILDDIR)/%.cnv: $(UCMSRCDIR)/%.ucm $(TOOLBINDIR)/makeconv$(TOOLEXEEXT) $(INVOKE) $(TOOLBINDIR)/makeconv -c -d $(BUILDDIR) $(UCMSRCDIR)/$(ucm->states.maxCharLength; if( mbcsData->ucm->states.outputType==MBCS_OUTPUT_2_SISO && - (*bytes==0xe || *bytes==0xf) + (!IGNORE_SISO_CHECK && (*bytes==0xe || *bytes==0xf)) ) { fprintf(stderr, "error: illegal mapping to SI or SO for SI/SO codepage: U+%04x<->0x%s\n", (int)c, printBytes(buffer, bytes, length)); diff --git a/icu4c/source/tools/makeconv/makeconv.c b/icu4c/source/tools/makeconv/makeconv.c index 
bdc12e9e20d..86ead4c0fe7 100644 --- a/icu4c/source/tools/makeconv/makeconv.c +++ b/icu4c/source/tools/makeconv/makeconv.c @@ -79,6 +79,7 @@ extern const UConverterStaticData * ucnv_converterStaticData[UCNV_NUMBER_OF_SUPP */ UBool VERBOSE = FALSE; UBool SMALL = FALSE; +UBool IGNORE_SISO_CHECK = FALSE; static void createConverter(ConvData *data, const char* converterName, UErrorCode *pErrorCode); @@ -174,6 +175,7 @@ enum { OPT_DESTDIR, OPT_VERBOSE, OPT_SMALL, + OPT_IGNORE_SISO_CHECK, OPT_COUNT }; @@ -184,7 +186,8 @@ static UOption options[]={ UOPTION_VERSION, UOPTION_DESTDIR, UOPTION_VERBOSE, - { "small", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 } + { "small", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 }, + { "ignore-siso-check", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 } }; int main(int argc, char* argv[]) @@ -236,7 +239,8 @@ int main(int argc, char* argv[]) "\t --small Generate smaller .cnv files. They will be\n" "\t significantly smaller but may not be compatible with\n" "\t older versions of ICU and will require heap memory\n" - "\t allocation when loaded.\n"); + "\t allocation when loaded.\n" + "\t --ignore-siso-check Use SI/SO other than 0xf/0xe.\n"); return argc<0 ? 
U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; } @@ -253,6 +257,10 @@ int main(int argc, char* argv[]) VERBOSE = options[OPT_VERBOSE].doesOccur; SMALL = options[OPT_SMALL].doesOccur; + if (options[OPT_IGNORE_SISO_CHECK].doesOccur) { + IGNORE_SISO_CHECK = TRUE; + } + if (destdir != NULL && *destdir != 0) { uprv_strcpy(outFileName, destdir); destdirlen = uprv_strlen(destdir); @@ -579,7 +587,7 @@ readFile(ConvData *data, const char* converterName, if(data->ucm->baseName[0]==0) { dataIsBase=TRUE; baseStates=&data->ucm->states; - ucm_processStates(baseStates); + ucm_processStates(baseStates, IGNORE_SISO_CHECK); } else { dataIsBase=FALSE; baseStates=NULL; @@ -782,7 +790,7 @@ createConverter(ConvData *data, const char *converterName, UErrorCode *pErrorCod fprintf(stderr, " the substitution character byte sequence is illegal in this codepage structure!\n"); *pErrorCode=U_INVALID_TABLE_FORMAT; - } else if(staticData->subChar1!=0 && 1!=ucm_countChars(baseStates, &staticData->subChar1, 1)) { + } else if(1!=ucm_countChars(baseStates, &staticData->subChar1, 1)) { fprintf(stderr, " the subchar1 byte is illegal in this codepage structure!\n"); *pErrorCode=U_INVALID_TABLE_FORMAT; diff --git a/icu4c/source/tools/makeconv/makeconv.h b/icu4c/source/tools/makeconv/makeconv.h index a3c2d375a1b..3cb50706e0e 100644 --- a/icu4c/source/tools/makeconv/makeconv.h +++ b/icu4c/source/tools/makeconv/makeconv.h @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 2000-2007, International Business Machines +* Copyright (C) 2000-2010, International Business Machines * Corporation and others. All Rights Reserved. 
* ******************************************************************************* @@ -25,6 +25,7 @@ /* exports from makeconv.c */ U_CFUNC UBool VERBOSE; U_CFUNC UBool SMALL; +U_CFUNC UBool IGNORE_SISO_CHECK; /* converter table type for writing */ enum { diff --git a/icu4c/source/tools/toolutil/ucm.h b/icu4c/source/tools/toolutil/ucm.h index 6ee13215f2b..20324f54c70 100644 --- a/icu4c/source/tools/toolutil/ucm.h +++ b/icu4c/source/tools/toolutil/ucm.h @@ -1,6 +1,6 @@ /* ******************************************************************************* - * Copyright (C) 2003-2009, International Business Machines + * Copyright (C) 2003-2010, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* * file name: ucm.h @@ -237,7 +237,7 @@ U_CAPI void U_EXPORT2 ucm_addState(UCMStates *states, const char *s); U_CAPI void U_EXPORT2 -ucm_processStates(UCMStates *states); +ucm_processStates(UCMStates *states, UBool ignoreSISOCheck); U_CAPI int32_t U_EXPORT2 ucm_countChars(UCMStates *states, diff --git a/icu4c/source/tools/toolutil/ucmstate.c b/icu4c/source/tools/toolutil/ucmstate.c index 393d18b3781..e1adb974a07 100644 --- a/icu4c/source/tools/toolutil/ucmstate.c +++ b/icu4c/source/tools/toolutil/ucmstate.c @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 2003-2005, International Business Machines +* Copyright (C) 2003-2010, International Business Machines * Corporation and others. All Rights Reserved. 
* ******************************************************************************* @@ -421,7 +421,7 @@ sumUpStates(UCMStates *states) { } U_CAPI void U_EXPORT2 -ucm_processStates(UCMStates *states) { +ucm_processStates(UCMStates *states, UBool ignoreSISOCheck) { int32_t entry, state, cell, count; if(states->conversionType==UCNV_UNSUPPORTED_CONVERTER) { @@ -557,10 +557,11 @@ ucm_processStates(UCMStates *states) { exit(U_INVALID_TABLE_FORMAT); } /* are the SI/SO all in the right places? */ - if( states->stateTable[0][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) && + if( ignoreSISOCheck || + (states->stateTable[0][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) && states->stateTable[0][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0) && states->stateTable[1][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) && - states->stateTable[1][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0) + states->stateTable[1][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0)) ) { states->outputType=MBCS_OUTPUT_2_SISO; } else {