diff --git a/icu4c/source/test/testdata/conversion.txt b/icu4c/source/test/testdata/conversion.txt index 3d6a27ed765..2def6c8d9ce 100644 --- a/icu4c/source/test/testdata/conversion.txt +++ b/icu4c/source/test/testdata/conversion.txt @@ -43,6 +43,23 @@ conversion { toUnicode { Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" } Cases { + // DBCS-only extensions + { + "ibm-16684", + :bin{ 430e4395ecc1404042e1 }, + "\ufffd\u30C8\u30C8\u309A\u3000\u20ac", + :intvector{ 0, 2, 4, 4, 6, 8 }, + :int{1}, :int{0}, "", "?", :bin{""} + } + + { + "ibm-1399", + :bin{ 430e4395ecc140400fe1 }, + "\uff62\u30C8\u30C8\u309A\u3000\u20ac", + :intvector{ 0, 2, 4, 4, 6, 9 }, + :int{1}, :int{0}, "", "?", :bin{""} + } + // extensions { "ibm-1390", @@ -144,6 +161,31 @@ conversion { fromUnicode { Headers { "charset", "unicode", "bytes", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidUChars" } Cases { + // DBCS-only extensions + { + "ibm-1390,swaplfnl", + "\uff63\u30C8\u30C8\u309A\u3000\x41\u20ac\x0a", + :bin{ 430e4395ecc140400fc1e115 }, + :intvector{ 0, 1, 1, 1, 2, 2, 4, 4, 5, 5, 6, 7 }, + :int{1}, :int{0}, "", "?", "" + } + + { + "ibm-16684", + "\uff63\u30C8\u30C8\u309A\u3000\x41\u20ac\x0a", + :bin{ fefe4395ecc14040fefe42e1fefe }, + :intvector{ 0, 0, 1, 1, 2, 2, 4, 4, 5, 5, 6, 6, 7, 7 }, + :int{1}, :int{0}, "", "?", "" + } + + { + "ibm-1399", + "\uff63\u30C8\u30C8\u309A\u3000\x41\u20ac\x0a", + :bin{ 440e4395ecc140400fc1e125 }, + :intvector{ 0, 1, 1, 1, 2, 2, 4, 4, 5, 5, 6, 7 }, + :int{1}, :int{0}, "", "?", "" + } + // from |2 mappings { "ibm-1390", @@ -296,6 +338,16 @@ conversion { // which - numeric UConverterUnicodeSet value Headers { "charset", "map", "mapnot", "which" } Cases { + // DBCS-only + { + "ibm-16684", + "[\xa0\xa1\xa4\xa6-\xab\xad-\u017f\u0254\u309b-\u30ff\u4e00-\u4e05\U00023d00\U000243bc\U0002a6b2" + "{\u0254\u0300}{\u0254\u0301}{\u304b\u309a}{\u30ad\u309a}{\u30af\u309a}]",
"[\x00-\x9f\xa2\xa3\xa5\xac\u0200-\u024f\U00010000-\U0001ffff\U0002a61b-\U0002a6b1]", + :int{0} + } + + // extensions { "ibm-1390", "[\x00-\x0d\x10-\u017f\u0254\u309b-\u30ff\u4e00-\u4e05\U00023d00\U000243bc\U0002a6b2" diff --git a/icu4c/source/tools/makeconv/makeconv.c b/icu4c/source/tools/makeconv/makeconv.c index 6cefc4134f9..921710e31d7 100644 --- a/icu4c/source/tools/makeconv/makeconv.c +++ b/icu4c/source/tools/makeconv/makeconv.c @@ -29,7 +29,7 @@ #include "uoptions.h" #include "unicode/udata.h" #include "unewdata.h" -#include "ucmpwrit.h" +#include "uparse.h" #include "ucm.h" #include "makeconv.h" #include "genmbcs.h" @@ -305,18 +305,7 @@ int main(int argc, char* argv[]) const char *basename; /* find the last file sepator */ - basename = uprv_strrchr(arg, U_FILE_SEP_CHAR); - if (basename == NULL) { - basename = uprv_strrchr(arg, U_FILE_ALT_SEP_CHAR); - if (basename == NULL) { - basename = arg; - } else { - ++basename; - } - } else { - ++basename; - } - + basename = findBasename(arg); uprv_strcpy(outBasename, basename); } else @@ -593,53 +582,6 @@ readHeader(ConvData *data, } } -static void -readTable(ConvData *data, FileStream* convFile, - UBool forBase, UCMStates *baseStates, - UErrorCode *pErrorCode) { - char line[500]; - char *end; - UBool isOK; - - if(U_FAILURE(*pErrorCode)) { - return; - } - - isOK=TRUE; - - for(;;) { - /* read the next line */ - if(!T_FileStream_readLine(convFile, line, sizeof(line))) { - fprintf(stderr, "incomplete charmap section\n"); - isOK=FALSE; - break; - } - - /* remove CR LF */ - end=uprv_strchr(line, 0); - while(lineucm, line, forBase, baseStates); - } - - if(!isOK) { - *pErrorCode=U_INVALID_TABLE_FORMAT; - } -} - /* return TRUE if a base table was read, FALSE for an extension table */ static UBool readFile(ConvData *data, const char* converterName, @@ -647,6 +589,8 @@ readFile(ConvData *data, const char* converterName, char line[200]; char *end; FileStream *convFile; + + UCMStates *baseStates; UBool dataIsBase;
if(U_FAILURE(*pErrorCode)) { @@ -668,37 +612,39 @@ readFile(ConvData *data, const char* converterName, if(data->ucm->baseName[0]==0) { dataIsBase=TRUE; - ucm_processStates(&data->ucm->states); - - /* read the base table */ - readTable(data, convFile, TRUE, &data->ucm->states, pErrorCode); - if(U_FAILURE(*pErrorCode)) { - return FALSE; - } - - /* read an extension table if there is one */ - while(T_FileStream_readLine(convFile, line, sizeof(line))) { - end=uprv_strchr(line, 0); - while(lineucm->states, pErrorCode); - break; - } - } + baseStates=&data->ucm->states; + ucm_processStates(baseStates); } else { - /* read only the extension table */ dataIsBase=FALSE; - readTable(data, convFile, FALSE, NULL, pErrorCode); + baseStates=NULL; + } - /* ### TODO enable extension-only tables, Jitterbug 3346 */ - fprintf(stderr, "error: delta/extension-only conversion tables are not yet supported\n"); - *pErrorCode=U_INVALID_TABLE_FORMAT; + /* read the base table */ + ucm_readTable(data->ucm, convFile, dataIsBase, baseStates, pErrorCode); + if(U_FAILURE(*pErrorCode)) { + return FALSE; + } + + /* read an extension table if there is one */ + while(T_FileStream_readLine(convFile, line, sizeof(line))) { + end=uprv_strchr(line, 0); + while(lineucm, convFile, FALSE, baseStates, pErrorCode); + } else { + fprintf(stderr, "unexpected text after the base mapping table\n"); + } + break; } T_FileStream_close(convFile); @@ -712,7 +658,7 @@ readFile(ConvData *data, const char* converterName, } static void -createConverter(ConvData *data, const char* converterName, UErrorCode *pErrorCode) { +createConverter(ConvData *data, const char *converterName, UErrorCode *pErrorCode) { ConvData baseData; UBool dataIsBase; @@ -722,17 +668,11 @@ createConverter(ConvData *data, const char* converterName, UErrorCode *pErrorCod initConvData(data); - /* ### TODO if there is an extension table: - 1. the base table must use precision flags - 2. check base vs. 
extension for mappings overlap - */ dataIsBase=readFile(data, converterName, pErrorCode); if(U_FAILURE(*pErrorCode)) { return; } - initConvData(&baseData); - if(dataIsBase) { data->cnvData=MBCSOpen(data->ucm); if(data->cnvData==NULL) { @@ -751,7 +691,7 @@ createConverter(ConvData *data, const char* converterName, UErrorCode *pErrorCod *pErrorCode=U_MEMORY_ALLOCATION_ERROR; } else if( - !ucm_checkBaseExt(&data->ucm->states, data->ucm->base, data->ucm->ext, TRUE) || + !ucm_checkBaseExt(&data->ucm->states, data->ucm->base, data->ucm->ext, data->ucm->ext, FALSE) || !data->extData->addTable(data->extData, data->ucm->ext, &data->staticData) ) { *pErrorCode=U_INVALID_TABLE_FORMAT; @@ -765,20 +705,41 @@ createConverter(ConvData *data, const char* converterName, UErrorCode *pErrorCod *pErrorCode=U_INVALID_TABLE_FORMAT; } } else { - /* ### TODO assemble a path/filename for data->ucm->states.baseName */ - /* must be TRUE */readFile(&baseData, ""/*extConverterName*/, pErrorCode); - /* ### TODO read extension table */ - /* ### TODO - actually write the mappings into genmbcs or into ext */ + char baseFilename[500]; + char *basename; - if( !ucm_checkValidity(data->ucm->ext, &baseData.ucm->states) || - !ucm_checkBaseExt(&baseData.ucm->states, baseData.ucm->base, data->ucm->ext, FALSE) || - !data->extData->addTable(data->extData, data->ucm->ext, &data->staticData) - ) { + initConvData(&baseData); + + /* assemble a path/filename for data->ucm->baseName */ + uprv_strcpy(baseFilename, converterName); + basename=(char *)findBasename(baseFilename); + uprv_strcpy(basename, data->ucm->baseName); + uprv_strcat(basename, ".ucm"); + + /* read the base table */ + dataIsBase=readFile(&baseData, baseFilename, pErrorCode); + if(U_FAILURE(*pErrorCode)) { + return; + } else if(!dataIsBase) { + fprintf(stderr, "error: the file \"%s\" is not a base table file\n", baseFilename); *pErrorCode=U_INVALID_TABLE_FORMAT; - } - } + } else { + /* prepare the extension table */ + 
data->extData=CnvExtOpen(data->ucm); + if(data->extData==NULL) { + *pErrorCode=U_MEMORY_ALLOCATION_ERROR; - cleanupConvData(&baseData); + } else if( + !ucm_checkValidity(data->ucm->ext, &baseData.ucm->states) || + !ucm_checkBaseExt(&baseData.ucm->states, baseData.ucm->base, data->ucm->ext, data->ucm->ext, FALSE) || + !data->extData->addTable(data->extData, data->ucm->ext, &data->staticData) + ) { + *pErrorCode=U_INVALID_TABLE_FORMAT; + } + } + + cleanupConvData(&baseData); + } } /* diff --git a/icu4c/source/tools/toolutil/toolutil.c b/icu4c/source/tools/toolutil/toolutil.c index fb0778bf197..b7119a39db0 100644 --- a/icu4c/source/tools/toolutil/toolutil.c +++ b/icu4c/source/tools/toolutil/toolutil.c @@ -59,18 +59,19 @@ getLongPathname(const char *pathname) { U_CAPI const char * U_EXPORT2 findBasename(const char *filename) { const char *basename=uprv_strrchr(filename, U_FILE_SEP_CHAR); + +#if U_FILE_ALT_SEP_CHAR!=U_FILE_SEP_CHAR + if(basename==NULL) { + /* Use lenient matching on Windows, which can accept either \ or / + This is useful for environments like Win32+CygWin which have both. 
+ */ + basename=uprv_strrchr(filename, U_FILE_ALT_SEP_CHAR); + } +#endif + if(basename!=NULL) { return basename+1; } else { -#ifdef WIN32 - /* Use lenient matching on Windows, which can accept either \ or / - This is useful for CygWin environments which has both - */ - basename=uprv_strrchr(filename, '/'); - if(basename!=NULL) { - return basename+1; - } -#endif return filename; } } diff --git a/icu4c/source/tools/toolutil/ucm.c b/icu4c/source/tools/toolutil/ucm.c index 43d55c48959..7d3cb5c0393 100644 --- a/icu4c/source/tools/toolutil/ucm.c +++ b/icu4c/source/tools/toolutil/ucm.c @@ -26,8 +26,10 @@ #include "unicode/ustring.h" #include "cstring.h" #include "cmemory.h" +#include "filestrm.h" #include "uarrsort.h" #include "ucnvmbcs.h" +#include "ucnv_bld.h" #include "ucnv_ext.h" #include "uparse.h" #include "ucm.h" @@ -217,6 +219,10 @@ ucm_sortTable(UCMTable *t) { UErrorCode errorCode; int32_t i; + if(t->isSorted) { + return; + } + errorCode=U_ZERO_ERROR; /* 1. sort by Unicode first */ @@ -252,17 +258,18 @@ ucm_sortTable(UCMTable *t) { u_errorName(errorCode)); exit(errorCode); } + + t->isSorted=TRUE; } enum { - MOVE_TO_EXT=0x10, - REMOVE_MAPPING=0x20, - MOVE_ANY=0x30 + MOVE_TO_EXT=1, + REMOVE_MAPPING=2 }; /* - * move mappings with MOVE_ANY ored into their flags from the base table - * to the extension table + * move mappings with their move flag set from the base table + * and optionally to the extension table * * works only with explicit precision flags because it uses some of the * flags bits @@ -276,10 +283,10 @@ moveMappings(UCMTable *base, UCMTable *ext) { mbLimit=mb+base->mappingsLength; while(mbf; - if(flag&MOVE_ANY) { - /* restore the original flag value */ - mb->f=flag&~MOVE_ANY; + flag=mb->moveFlag; + if(flag!=0) { + /* reset the move flag */ + mb->moveFlag=0; if(ext!=NULL && (flag&MOVE_TO_EXT)) { /* add the mapping to the extension table */ @@ -292,6 +299,7 @@ moveMappings(UCMTable *base, UCMTable *ext) { } --mbLimit; --base->mappingsLength; + 
base->isSorted=FALSE; } else { ++mb; } @@ -304,10 +312,12 @@ enum { }; static uint8_t -checkBaseExtUnicode(UCMTable *base, UCMTable *ext, UBool moveToExt) { +checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext, + UBool moveToExt, UBool intersectBase) { UCMapping *mb, *me, *mbLimit, *meLimit; int32_t cmp; uint8_t result; + UBool isSISO; mb=base->mappings; mbLimit=mb+base->mappingsLength; @@ -317,6 +327,8 @@ checkBaseExtUnicode(UCMTable *base, UCMTable *ext, UBool moveToExt) { result=0; + isSISO=(UBool)(baseStates->outputType==MBCS_OUTPUT_2_SISO); + for(;;) { /* skip irrelevant mappings on both sides */ for(;;) { @@ -346,21 +358,32 @@ checkBaseExtUnicode(UCMTable *base, UCMTable *ext, UBool moveToExt) { /* compare the base and extension mappings */ cmp=compareUnicode(base, mb, ext, me); if(cmp<0) { + if(intersectBase && (!(isSISO && intersectBase==2) || mb->bLen>1)) { + /* + * mapping in base but not in ext, move it + * + * if base is EBCDIC_STATEFUL and ext is DBCS, move DBCS mappings here + * and check SBCS ones for Unicode prefix below + */ + mb->moveFlag|=MOVE_TO_EXT; + result|=NEEDS_MOVE; + /* does mb map from an input sequence that is a prefix of me's? 
*/ - if( mb->uLenuLen && + } else if( mb->uLenuLen && 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen) ) { if(moveToExt) { /* mark this mapping to be moved to the extension table */ - mb->f|=MOVE_TO_EXT; + mb->moveFlag|=MOVE_TO_EXT; + result|=NEEDS_MOVE; } else { fprintf(stderr, "ucm error: the base table contains a mapping whose input sequence\n" " is a prefix of the input sequence of an extension mapping\n"); ucm_printMapping(base, mb, stderr); ucm_printMapping(ext, me, stderr); + result|=HAS_ERRORS; } - result|=NEEDS_MOVE; } ++mb; @@ -372,7 +395,11 @@ checkBaseExtUnicode(UCMTable *base, UCMTable *ext, UBool moveToExt) { if( mb->f==me->f && mb->bLen==me->bLen && 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen) ) { - me->f|=REMOVE_MAPPING; + me->moveFlag|=REMOVE_MAPPING; + result|=NEEDS_MOVE; + } else if(intersectBase) { + /* mapping in base but not in ext, move it */ + mb->moveFlag|=MOVE_TO_EXT; result|=NEEDS_MOVE; } else { fprintf(stderr, @@ -392,7 +419,8 @@ checkBaseExtUnicode(UCMTable *base, UCMTable *ext, UBool moveToExt) { } static uint8_t -checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext, UBool moveToExt) { +checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext, + UBool moveToExt, UBool intersectBase) { UCMapping *mb, *me; int32_t *baseMap, *extMap; int32_t b, e, bLimit, eLimit, cmp; @@ -412,17 +440,23 @@ checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext, UBool mo for(;;) { /* skip irrelevant mappings on both sides */ - for(;;) { + for(;; ++b) { if(b==bLimit) { return result; } mb=base->mappings+baseMap[b]; + if(isSISO && intersectBase==2 && mb->bLen==1) { + /* + * comparing an EBCDIC_STATEFUL base against a DBCS extension: + * leave SBCS base mappings alone + */ + continue; + } + if(mb->f==0 || mb->f==3) { break; } - - ++b; } for(;;) { @@ -441,18 +475,23 @@ checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext, UBool 
mo /* compare the base and extension mappings */ cmp=compareBytes(base, mb, ext, me, TRUE); if(cmp<0) { + if(intersectBase) { + /* mapping in base but not in ext, move it */ + mb->moveFlag|=MOVE_TO_EXT; + result|=NEEDS_MOVE; + /* * does mb map from an input sequence that is a prefix of me's? * for SI/SO tables, a single byte is never a prefix because it * occurs in a separate single-byte state */ - if( mb->bLenbLen && + } else if( mb->bLenbLen && (!isSISO || mb->bLen>1) && 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen) ) { if(moveToExt) { /* mark this mapping to be moved to the extension table */ - mb->f|=MOVE_TO_EXT; + mb->moveFlag|=MOVE_TO_EXT; result|=NEEDS_MOVE; } else { fprintf(stderr, @@ -473,7 +512,11 @@ checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext, UBool mo if( mb->f==me->f && mb->uLen==me->uLen && 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen) ) { - me->f|=REMOVE_MAPPING; + me->moveFlag|=REMOVE_MAPPING; + result|=NEEDS_MOVE; + } else if(intersectBase) { + /* mapping in base but not in ext, move it */ + mb->moveFlag|=MOVE_TO_EXT; result|=NEEDS_MOVE; } else { fprintf(stderr, @@ -515,12 +558,18 @@ ucm_checkValidity(UCMTable *table, UCMStates *baseStates) { } U_CAPI UBool U_EXPORT2 -ucm_checkBaseExt(UCMStates *baseStates, UCMTable *base, UCMTable *ext, UBool moveToExt) { +ucm_checkBaseExt(UCMStates *baseStates, + UCMTable *base, UCMTable *ext, UCMTable *moveTarget, + UBool intersectBase) { uint8_t result; /* if we have an extension table, we must always use precision flags */ - if(base->flagsType!=UCM_FLAGS_EXPLICIT || ext->flagsType!=UCM_FLAGS_EXPLICIT) { - fprintf(stderr, "ucm error: the base or extension table contains mappings without precision flags\n"); + if(base->flagsType&UCM_FLAGS_IMPLICIT) { + fprintf(stderr, "ucm error: the base table contains mappings without precision flags\n"); + return FALSE; + } + if(ext->flagsType&UCM_FLAGS_IMPLICIT) { + 
fprintf(stderr, "ucm error: extension table contains mappings without precision flags\n"); return FALSE; } @@ -530,8 +579,8 @@ ucm_checkBaseExt(UCMStates *baseStates, UCMTable *base, UCMTable *ext, UBool mov /* check */ result= - checkBaseExtUnicode(base, ext, moveToExt)| - checkBaseExtBytes(baseStates, base, ext, moveToExt); + checkBaseExtUnicode(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase)| + checkBaseExtBytes(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase); if(result&HAS_ERRORS) { return FALSE; @@ -539,9 +588,12 @@ ucm_checkBaseExt(UCMStates *baseStates, UCMTable *base, UCMTable *ext, UBool mov if(result&NEEDS_MOVE) { moveMappings(ext, NULL); - moveMappings(base, ext); + moveMappings(base, moveTarget); ucm_sortTable(base); ucm_sortTable(ext); + if(moveTarget!=NULL) { + ucm_sortTable(moveTarget); + } } return TRUE; @@ -640,6 +692,8 @@ ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable, ++toUMapping; ++toUIndex; } + + fromUTable->isSorted=FALSE; } /* separate extension mappings out of base table for rptp2ucm --------------- */ @@ -662,7 +716,7 @@ ucm_separateMappings(UCMFile *ucm, UBool isSISO) { if(isSISO && m->bLen==1 && (m->b.bytes[0]==0xe || m->b.bytes[0]==0xf)) { fprintf(stderr, "warning: removing illegal mapping from an SI/SO-stateful table\n"); ucm_printMapping(table, m, stderr); - m->f|=REMOVE_MAPPING; + m->moveFlag|=REMOVE_MAPPING; needsMove=TRUE; continue; } @@ -675,7 +729,7 @@ ucm_separateMappings(UCMFile *ucm, UBool isSISO) { printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), stderr); isOK=FALSE; } else if(type>0) { - m->f|=MOVE_TO_EXT; + m->moveFlag|=MOVE_TO_EXT; needsMove=TRUE; } } @@ -685,7 +739,7 @@ ucm_separateMappings(UCMFile *ucm, UBool isSISO) { } if(needsMove) { moveMappings(ucm->base, ucm->ext); - return ucm_checkBaseExt(&ucm->states, ucm->base, ucm->ext, TRUE); + return ucm_checkBaseExt(&ucm->states, ucm->base, ucm->ext, ucm->ext, FALSE); } else { ucm_sortTable(ucm->base); 
return TRUE; @@ -852,6 +906,17 @@ ucm_closeTable(UCMTable *table) { } } +U_CAPI void U_EXPORT2 +ucm_resetTable(UCMTable *table) { + if(table!=NULL) { + table->mappingsLength=0; + table->flagsType=0; + table->unicodeMask=0; + table->bytesLength=table->codePointsLength=0; + table->isSorted=FALSE; + } +} + U_CAPI void U_EXPORT2 ucm_addMapping(UCMTable *table, UCMapping *m, @@ -946,6 +1011,8 @@ ucm_addMapping(UCMTable *table, tm=table->mappings+table->mappingsLength++; uprv_memcpy(tm, m, sizeof(UCMapping)); + + table->isSorted=FALSE; } U_CAPI UCMFile * U_EXPORT2 @@ -1051,7 +1118,61 @@ ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates UChar32 codePoints[UCNV_EXT_MAX_UCHARS]; uint8_t bytes[UCNV_EXT_MAX_BYTES]; + const char *s; + + /* ignore empty and comment lines */ + if(line[0]=='#' || *(s=u_skipWhitespace(line))==0 || *s=='\n' || *s=='\r') { + return TRUE; + } + return ucm_parseMappingLine(&m, codePoints, bytes, line) && ucm_addMappingAuto(ucm, forBase, baseStates, &m, codePoints, bytes); } + +U_CAPI void U_EXPORT2 +ucm_readTable(UCMFile *ucm, FileStream* convFile, + UBool forBase, UCMStates *baseStates, + UErrorCode *pErrorCode) { + char line[500]; + char *end; + UBool isOK; + + if(U_FAILURE(*pErrorCode)) { + return; + } + + isOK=TRUE; + + for(;;) { + /* read the next line */ + if(!T_FileStream_readLine(convFile, line, sizeof(line))) { + fprintf(stderr, "incomplete charmap section\n"); + isOK=FALSE; + break; + } + + /* remove CR LF */ + end=uprv_strchr(line, 0); + while(line U_CDECL_BEGIN @@ -46,7 +47,7 @@ typedef struct UCMapping { uint32_t index; uint8_t bytes[4]; } b; - int8_t uLen, bLen, f; + int8_t uLen, bLen, f, moveFlag; } UCMapping; enum { @@ -71,6 +72,7 @@ typedef struct UCMTable { uint8_t unicodeMask; int8_t flagsType; /* UCM_FLAGS_INITIAL etc. 
*/ + UBool isSorted; } UCMTable; enum { @@ -140,9 +142,21 @@ ucm_openTable(void); U_CAPI void U_EXPORT2 ucm_closeTable(UCMTable *table); +U_CAPI void U_EXPORT2 +ucm_resetTable(UCMTable *table); + U_CAPI void U_EXPORT2 ucm_sortTable(UCMTable *t); +/** + * Read a table from a .ucm file, from after the CHARMAP line up to + * and including the END CHARMAP line. + */ +U_CAPI void U_EXPORT2 +ucm_readTable(UCMFile *ucm, FileStream* convFile, + UBool forBase, UCMStates *baseStates, + UErrorCode *pErrorCode); + /** * Check the validity of mappings against a base table's states; * necessary for extension-only tables that were read before their base tables. @@ -152,9 +166,22 @@ ucm_checkValidity(UCMTable *ext, UCMStates *baseStates); /** * Check a base table against an extension table. - * Set moveToExt=TRUE for where base and extension tables are parsed - * from a single file, - * and moveToExt=FALSE for where the extension table is in a separate file. + * Set moveTarget!=NULL if it is possible to move mappings from the base. + * This is the case where base and extension tables are parsed from a single file + * (moveTarget==ext) + * or when delta file mappings are subtracted from a base table. + * + * When a base table cannot be modified because a delta file is parsed in makeconv, + * then set moveTarget=NULL. + * + * If intersectBase is set, then mappings that exist in the base table but not in + * the extension table are moved to moveTarget instead of showing an error. + * + * Special mode: If the base table is an SISO table (indicated in the baseStates) + * and intersectBase==2 for a DBCS extension table, then SBCS mappings are + * not moved out of the base unless their Unicode input requires it. + * This helps ucmkbase generate base tables for where the dbcsonly converter + * option will be employed. * * For both tables in the same file, the extension table is automatically * built.
@@ -164,6 +191,12 @@ ucm_checkValidity(UCMTable *ext, UCMStates *baseStates); * * Sort both tables, and then for each mapping direction: * + * If intersectBase is TRUE and the base table contains a mapping + * that does not exist in the extension table, then this mapping is moved + * to moveTarget. + * + * - otherwise - + * * If the base table contains a mapping for which the input sequence is * the same as the extension input, then * - if the output is the same: remove the extension mapping @@ -171,13 +204,14 @@ ucm_checkValidity(UCMTable *ext, UCMStates *baseStates); * * If the base table contains a mapping for which the input sequence is * a prefix of the extension input, then - * - if moveToExt: move the base mapping to the extension table + * - if moveTarget!=NULL: move the base mapping to the moveTarget table * - else: error * * @return FALSE in case of an irreparable error */ U_CAPI UBool U_EXPORT2 -ucm_checkBaseExt(UCMStates *baseStates, UCMTable *base, UCMTable *ext, UBool moveToExt); +ucm_checkBaseExt(UCMStates *baseStates, UCMTable *base, UCMTable *ext, + UCMTable *moveTarget, UBool intersectBase); U_CAPI void U_EXPORT2 ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode);