mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 06:53:45 +00:00
ICU-3346 support DBCS-only and other delta (extension-only) .cnv files
X-SVN-Rev: 13637
This commit is contained in:
parent
558442a420
commit
693cbae3a7
5 changed files with 321 additions and 152 deletions
52
icu4c/source/test/testdata/conversion.txt
vendored
52
icu4c/source/test/testdata/conversion.txt
vendored
|
@ -43,6 +43,23 @@ conversion {
|
|||
toUnicode {
|
||||
Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" }
|
||||
Cases {
|
||||
// DBCS-only extensions
|
||||
{
|
||||
"ibm-16684",
|
||||
:bin{ 430e4395ecc1404042e1 },
|
||||
"\ufffd\u30C8\u30C8\u309A\u3000\u20ac",
|
||||
:intvector{ 0, 2, 4, 4, 6, 8 },
|
||||
:int{1}, :int{0}, "", "?", :bin{""}
|
||||
}
|
||||
|
||||
{
|
||||
"ibm-1399",
|
||||
:bin{ 430e4395ecc140400fe1 },
|
||||
"\uff62\u30C8\u30C8\u309A\u3000\u20ac",
|
||||
:intvector{ 0, 2, 4, 4, 6, 9 },
|
||||
:int{1}, :int{0}, "", "?", :bin{""}
|
||||
}
|
||||
|
||||
// extensions
|
||||
{
|
||||
"ibm-1390",
|
||||
|
@ -144,6 +161,31 @@ conversion {
|
|||
fromUnicode {
|
||||
Headers { "charset", "unicode", "bytes", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidUChars" }
|
||||
Cases {
|
||||
// DBCS-only extensions
|
||||
{
|
||||
"ibm-1390,swaplfnl",
|
||||
"\uff63\u30C8\u30C8\u309A\u3000\x41\u20ac\x0a",
|
||||
:bin{ 430e4395ecc140400fc1e115 },
|
||||
:intvector{ 0, 1, 1, 1, 2, 2, 4, 4, 5, 5, 6, 7 },
|
||||
:int{1}, :int{0}, "", "?", ""
|
||||
}
|
||||
|
||||
{
|
||||
"ibm-16684",
|
||||
"\uff63\u30C8\u30C8\u309A\u3000\x41\u20ac\x0a",
|
||||
:bin{ fefe4395ecc14040fefe42e1fefe },
|
||||
:intvector{ 0, 0, 1, 1, 2, 2, 4, 4, 5, 5, 6, 6, 7, 7 },
|
||||
:int{1}, :int{0}, "", "?", ""
|
||||
}
|
||||
|
||||
{
|
||||
"ibm-1399",
|
||||
"\uff63\u30C8\u30C8\u309A\u3000\x41\u20ac\x0a",
|
||||
:bin{ 440e4395ecc140400fc1e125 },
|
||||
:intvector{ 0, 1, 1, 1, 2, 2, 4, 4, 5, 5, 6, 7 },
|
||||
:int{1}, :int{0}, "", "?", ""
|
||||
}
|
||||
|
||||
// <subchar1> from |2 mappings
|
||||
{
|
||||
"ibm-1390",
|
||||
|
@ -296,6 +338,16 @@ conversion {
|
|||
// which - numeric UConverterUnicodeSet value
|
||||
Headers { "charset", "map", "mapnot", "which" }
|
||||
Cases {
|
||||
// DBCS-only
|
||||
// columns: charset, map (must all be in the set), mapnot (must all be outside the set), which
{
|
||||
"ibm-16684",
|
||||
"[\xa0\xa1\xa4\xa6-\xab\xad-\u017f\u0254\u309b-\u30ff\u4e00-\u4e05\U00023d00\U000243bc\U0002a6b2"
|
||||
"{\u0254\u0300}{\u0254\u0301}{\u304b\u309a}{\u30ad\u309a}{\u30af\u309a}]",
|
||||
"[\x00-\x9f\xa2\xa3\xa5\xac\u0200-\u024f\U00010000-\U0001ffff\U0002a61b-\U0002a6b1]",
|
||||
:int{0}
|
||||
}
|
||||
|
||||
// extensions
|
||||
{
|
||||
"ibm-1390",
|
||||
"[\x00-\x0d\x10-\u017f\u0254\u309b-\u30ff\u4e00-\u4e05\U00023d00\U000243bc\U0002a6b2"
|
||||
|
|
|
@ -29,7 +29,7 @@
|
|||
#include "uoptions.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "unewdata.h"
|
||||
#include "ucmpwrit.h"
|
||||
#include "uparse.h"
|
||||
#include "ucm.h"
|
||||
#include "makeconv.h"
|
||||
#include "genmbcs.h"
|
||||
|
@ -305,18 +305,7 @@ int main(int argc, char* argv[])
|
|||
const char *basename;
|
||||
|
||||
/* find the last file separator */
|
||||
basename = uprv_strrchr(arg, U_FILE_SEP_CHAR);
|
||||
if (basename == NULL) {
|
||||
basename = uprv_strrchr(arg, U_FILE_ALT_SEP_CHAR);
|
||||
if (basename == NULL) {
|
||||
basename = arg;
|
||||
} else {
|
||||
++basename;
|
||||
}
|
||||
} else {
|
||||
++basename;
|
||||
}
|
||||
|
||||
basename = findBasename(arg);
|
||||
uprv_strcpy(outBasename, basename);
|
||||
}
|
||||
else
|
||||
|
@ -593,53 +582,6 @@ readHeader(ConvData *data,
|
|||
}
|
||||
}
|
||||
|
||||
/*
 * Read mapping lines from convFile and add them to data->ucm until a line
 * that is exactly "END CHARMAP" is found.
 *
 * data        conversion data; only data->ucm is used here
 * convFile    stream to read lines from
 * forBase     passed through unchanged to ucm_addMappingFromLine()
 * baseStates  passed through unchanged to ucm_addMappingFromLine()
 * pErrorCode  in/out error code; nothing is read if it already indicates
 *             a failure. Set to U_INVALID_TABLE_FORMAT if the stream ends
 *             before "END CHARMAP" or if any line could not be added.
 */
static void
|
||||
readTable(ConvData *data, FileStream* convFile,
|
||||
UBool forBase, UCMStates *baseStates,
|
||||
UErrorCode *pErrorCode) {
|
||||
char line[500];
|
||||
char *end;
|
||||
UBool isOK;
|
||||
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return;
|
||||
}
|
||||
|
||||
isOK=TRUE;
|
||||
|
||||
for(;;) {
|
||||
/* read the next line */
|
||||
if(!T_FileStream_readLine(convFile, line, sizeof(line))) {
|
||||
fprintf(stderr, "incomplete charmap section\n");
|
||||
isOK=FALSE;
|
||||
break;
|
||||
}
|
||||
|
||||
/* remove CR LF */
|
||||
end=uprv_strchr(line, 0);
|
||||
while(line<end && (*(end-1)=='\r' || *(end-1)=='\n')) {
|
||||
--end;
|
||||
}
|
||||
*end=0;
|
||||
|
||||
/* ignore empty and comment lines */
|
||||
if(line[0]==0 || line[0]=='#') {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* stop at the end of the mapping table */
|
||||
if(0==uprv_strcmp(line, "END CHARMAP")) {
|
||||
break;
|
||||
}
|
||||
|
||||
/* a line that fails clears isOK but does not end the loop */
isOK&=ucm_addMappingFromLine(data->ucm, line, forBase, baseStates);
|
||||
}
|
||||
|
||||
if(!isOK) {
|
||||
*pErrorCode=U_INVALID_TABLE_FORMAT;
|
||||
}
|
||||
}
|
||||
|
||||
/* return TRUE if a base table was read, FALSE for an extension table */
|
||||
static UBool
|
||||
readFile(ConvData *data, const char* converterName,
|
||||
|
@ -647,6 +589,8 @@ readFile(ConvData *data, const char* converterName,
|
|||
char line[200];
|
||||
char *end;
|
||||
FileStream *convFile;
|
||||
|
||||
UCMStates *baseStates;
|
||||
UBool dataIsBase;
|
||||
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
|
@ -668,37 +612,39 @@ readFile(ConvData *data, const char* converterName,
|
|||
|
||||
if(data->ucm->baseName[0]==0) {
|
||||
dataIsBase=TRUE;
|
||||
ucm_processStates(&data->ucm->states);
|
||||
|
||||
/* read the base table */
|
||||
readTable(data, convFile, TRUE, &data->ucm->states, pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
/* read an extension table if there is one */
|
||||
while(T_FileStream_readLine(convFile, line, sizeof(line))) {
|
||||
end=uprv_strchr(line, 0);
|
||||
while(line<end &&
|
||||
(*(end-1)=='\n' || *(end-1)=='\r' || *(end-1)==' ' || *(end-1)=='\t')) {
|
||||
--end;
|
||||
}
|
||||
*end=0;
|
||||
|
||||
if(0==uprv_strcmp(line, "CHARMAP")) {
|
||||
/* read the extension table */
|
||||
readTable(data, convFile, FALSE, &data->ucm->states, pErrorCode);
|
||||
break;
|
||||
}
|
||||
}
|
||||
baseStates=&data->ucm->states;
|
||||
ucm_processStates(baseStates);
|
||||
} else {
|
||||
/* read only the extension table */
|
||||
dataIsBase=FALSE;
|
||||
readTable(data, convFile, FALSE, NULL, pErrorCode);
|
||||
baseStates=NULL;
|
||||
}
|
||||
|
||||
/* ### TODO enable extension-only tables, Jitterbug 3346 */
|
||||
fprintf(stderr, "error: delta/extension-only conversion tables are not yet supported\n");
|
||||
*pErrorCode=U_INVALID_TABLE_FORMAT;
|
||||
/* read the base table */
|
||||
ucm_readTable(data->ucm, convFile, dataIsBase, baseStates, pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
/* read an extension table if there is one */
|
||||
while(T_FileStream_readLine(convFile, line, sizeof(line))) {
|
||||
end=uprv_strchr(line, 0);
|
||||
while(line<end &&
|
||||
(*(end-1)=='\n' || *(end-1)=='\r' || *(end-1)==' ' || *(end-1)=='\t')) {
|
||||
--end;
|
||||
}
|
||||
*end=0;
|
||||
|
||||
if(line[0]=='#' || u_skipWhitespace(line)==end) {
|
||||
continue; /* ignore empty and comment lines */
|
||||
}
|
||||
|
||||
if(0==uprv_strcmp(line, "CHARMAP")) {
|
||||
/* read the extension table */
|
||||
ucm_readTable(data->ucm, convFile, FALSE, baseStates, pErrorCode);
|
||||
} else {
|
||||
fprintf(stderr, "unexpected text after the base mapping table\n");
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
T_FileStream_close(convFile);
|
||||
|
@ -712,7 +658,7 @@ readFile(ConvData *data, const char* converterName,
|
|||
}
|
||||
|
||||
static void
|
||||
createConverter(ConvData *data, const char* converterName, UErrorCode *pErrorCode) {
|
||||
createConverter(ConvData *data, const char *converterName, UErrorCode *pErrorCode) {
|
||||
ConvData baseData;
|
||||
UBool dataIsBase;
|
||||
|
||||
|
@ -722,17 +668,11 @@ createConverter(ConvData *data, const char* converterName, UErrorCode *pErrorCod
|
|||
|
||||
initConvData(data);
|
||||
|
||||
/* ### TODO if there is an extension table:
|
||||
1. the base table must use precision flags
|
||||
2. check base vs. extension for mappings overlap
|
||||
*/
|
||||
dataIsBase=readFile(data, converterName, pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return;
|
||||
}
|
||||
|
||||
initConvData(&baseData);
|
||||
|
||||
if(dataIsBase) {
|
||||
data->cnvData=MBCSOpen(data->ucm);
|
||||
if(data->cnvData==NULL) {
|
||||
|
@ -751,7 +691,7 @@ createConverter(ConvData *data, const char* converterName, UErrorCode *pErrorCod
|
|||
*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
|
||||
|
||||
} else if(
|
||||
!ucm_checkBaseExt(&data->ucm->states, data->ucm->base, data->ucm->ext, TRUE) ||
|
||||
!ucm_checkBaseExt(&data->ucm->states, data->ucm->base, data->ucm->ext, data->ucm->ext, FALSE) ||
|
||||
!data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)
|
||||
) {
|
||||
*pErrorCode=U_INVALID_TABLE_FORMAT;
|
||||
|
@ -765,20 +705,41 @@ createConverter(ConvData *data, const char* converterName, UErrorCode *pErrorCod
|
|||
*pErrorCode=U_INVALID_TABLE_FORMAT;
|
||||
}
|
||||
} else {
|
||||
/* ### TODO assemble a path/filename for data->ucm->states.baseName */
|
||||
/* must be TRUE */readFile(&baseData, ""/*extConverterName*/, pErrorCode);
|
||||
/* ### TODO read extension table */
|
||||
/* ### TODO - actually write the mappings into genmbcs or into ext */
|
||||
char baseFilename[500];
|
||||
char *basename;
|
||||
|
||||
if( !ucm_checkValidity(data->ucm->ext, &baseData.ucm->states) ||
|
||||
!ucm_checkBaseExt(&baseData.ucm->states, baseData.ucm->base, data->ucm->ext, FALSE) ||
|
||||
!data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)
|
||||
) {
|
||||
initConvData(&baseData);
|
||||
|
||||
/* assemble a path/filename for data->ucm->baseName */
|
||||
uprv_strcpy(baseFilename, converterName);
|
||||
basename=(char *)findBasename(baseFilename);
|
||||
uprv_strcpy(basename, data->ucm->baseName);
|
||||
uprv_strcat(basename, ".ucm");
|
||||
|
||||
/* read the base table */
|
||||
dataIsBase=readFile(&baseData, baseFilename, pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return;
|
||||
} else if(!dataIsBase) {
|
||||
fprintf(stderr, "error: the <icu:base> file \"%s\" is not a base table file\n", baseFilename);
|
||||
*pErrorCode=U_INVALID_TABLE_FORMAT;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* prepare the extension table */
|
||||
data->extData=CnvExtOpen(data->ucm);
|
||||
if(data->extData==NULL) {
|
||||
*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
|
||||
|
||||
cleanupConvData(&baseData);
|
||||
} else if(
|
||||
!ucm_checkValidity(data->ucm->ext, &baseData.ucm->states) ||
|
||||
!ucm_checkBaseExt(&baseData.ucm->states, baseData.ucm->base, data->ucm->ext, data->ucm->ext, FALSE) ||
|
||||
!data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)
|
||||
) {
|
||||
*pErrorCode=U_INVALID_TABLE_FORMAT;
|
||||
}
|
||||
}
|
||||
|
||||
cleanupConvData(&baseData);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
@ -59,18 +59,19 @@ getLongPathname(const char *pathname) {
|
|||
U_CAPI const char * U_EXPORT2
|
||||
findBasename(const char *filename) {
|
||||
const char *basename=uprv_strrchr(filename, U_FILE_SEP_CHAR);
|
||||
|
||||
#if U_FILE_ALT_SEP_CHAR!=U_FILE_SEP_CHAR
|
||||
if(basename==NULL) {
|
||||
/* Use lenient matching on Windows, which can accept either \ or /
|
||||
This is useful for environments like Win32+CygWin which have both.
|
||||
*/
|
||||
basename=uprv_strrchr(filename, U_FILE_ALT_SEP_CHAR);
|
||||
}
|
||||
#endif
|
||||
|
||||
if(basename!=NULL) {
|
||||
return basename+1;
|
||||
} else {
|
||||
#ifdef WIN32
|
||||
/* Use lenient matching on Windows, which can accept either \ or /
|
||||
This is useful for CygWin environments which have both
|
||||
*/
|
||||
basename=uprv_strrchr(filename, '/');
|
||||
if(basename!=NULL) {
|
||||
return basename+1;
|
||||
}
|
||||
#endif
|
||||
return filename;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -26,8 +26,10 @@
|
|||
#include "unicode/ustring.h"
|
||||
#include "cstring.h"
|
||||
#include "cmemory.h"
|
||||
#include "filestrm.h"
|
||||
#include "uarrsort.h"
|
||||
#include "ucnvmbcs.h"
|
||||
#include "ucnv_bld.h"
|
||||
#include "ucnv_ext.h"
|
||||
#include "uparse.h"
|
||||
#include "ucm.h"
|
||||
|
@ -217,6 +219,10 @@ ucm_sortTable(UCMTable *t) {
|
|||
UErrorCode errorCode;
|
||||
int32_t i;
|
||||
|
||||
if(t->isSorted) {
|
||||
return;
|
||||
}
|
||||
|
||||
errorCode=U_ZERO_ERROR;
|
||||
|
||||
/* 1. sort by Unicode first */
|
||||
|
@ -252,17 +258,18 @@ ucm_sortTable(UCMTable *t) {
|
|||
u_errorName(errorCode));
|
||||
exit(errorCode);
|
||||
}
|
||||
|
||||
t->isSorted=TRUE;
|
||||
}
|
||||
|
||||
enum {
|
||||
MOVE_TO_EXT=0x10,
|
||||
REMOVE_MAPPING=0x20,
|
||||
MOVE_ANY=0x30
|
||||
MOVE_TO_EXT=1,
|
||||
REMOVE_MAPPING=2
|
||||
};
|
||||
|
||||
/*
|
||||
* move mappings with MOVE_ANY ored into their flags from the base table
|
||||
* to the extension table
|
||||
* move mappings with their move flag set from the base table
|
||||
* and optionally to the extension table
|
||||
*
|
||||
* works only with explicit precision flags because it uses some of the
|
||||
* flags bits
|
||||
|
@ -276,10 +283,10 @@ moveMappings(UCMTable *base, UCMTable *ext) {
|
|||
mbLimit=mb+base->mappingsLength;
|
||||
|
||||
while(mb<mbLimit) {
|
||||
flag=mb->f;
|
||||
if(flag&MOVE_ANY) {
|
||||
/* restore the original flag value */
|
||||
mb->f=flag&~MOVE_ANY;
|
||||
flag=mb->moveFlag;
|
||||
if(flag!=0) {
|
||||
/* reset the move flag */
|
||||
mb->moveFlag=0;
|
||||
|
||||
if(ext!=NULL && (flag&MOVE_TO_EXT)) {
|
||||
/* add the mapping to the extension table */
|
||||
|
@ -292,6 +299,7 @@ moveMappings(UCMTable *base, UCMTable *ext) {
|
|||
}
|
||||
--mbLimit;
|
||||
--base->mappingsLength;
|
||||
base->isSorted=FALSE;
|
||||
} else {
|
||||
++mb;
|
||||
}
|
||||
|
@ -304,10 +312,12 @@ enum {
|
|||
};
|
||||
|
||||
static uint8_t
|
||||
checkBaseExtUnicode(UCMTable *base, UCMTable *ext, UBool moveToExt) {
|
||||
checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
|
||||
UBool moveToExt, UBool intersectBase) {
|
||||
UCMapping *mb, *me, *mbLimit, *meLimit;
|
||||
int32_t cmp;
|
||||
uint8_t result;
|
||||
UBool isSISO;
|
||||
|
||||
mb=base->mappings;
|
||||
mbLimit=mb+base->mappingsLength;
|
||||
|
@ -317,6 +327,8 @@ checkBaseExtUnicode(UCMTable *base, UCMTable *ext, UBool moveToExt) {
|
|||
|
||||
result=0;
|
||||
|
||||
isSISO=(UBool)(baseStates->outputType==MBCS_OUTPUT_2_SISO);
|
||||
|
||||
for(;;) {
|
||||
/* skip irrelevant mappings on both sides */
|
||||
for(;;) {
|
||||
|
@ -346,21 +358,32 @@ checkBaseExtUnicode(UCMTable *base, UCMTable *ext, UBool moveToExt) {
|
|||
/* compare the base and extension mappings */
|
||||
cmp=compareUnicode(base, mb, ext, me);
|
||||
if(cmp<0) {
|
||||
if(intersectBase && (!(isSISO && intersectBase==2) || mb->bLen>1)) {
|
||||
/*
|
||||
* mapping in base but not in ext, move it
|
||||
*
|
||||
* if base is EBCDIC_STATEFUL and ext is DBCS, move DBCS mappings here
|
||||
* and check SBCS ones for Unicode prefix below
|
||||
*/
|
||||
mb->moveFlag|=MOVE_TO_EXT;
|
||||
result|=NEEDS_MOVE;
|
||||
|
||||
/* does mb map from an input sequence that is a prefix of me's? */
|
||||
if( mb->uLen<me->uLen &&
|
||||
} else if( mb->uLen<me->uLen &&
|
||||
0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
|
||||
) {
|
||||
if(moveToExt) {
|
||||
/* mark this mapping to be moved to the extension table */
|
||||
mb->f|=MOVE_TO_EXT;
|
||||
mb->moveFlag|=MOVE_TO_EXT;
|
||||
result|=NEEDS_MOVE;
|
||||
} else {
|
||||
fprintf(stderr,
|
||||
"ucm error: the base table contains a mapping whose input sequence\n"
|
||||
" is a prefix of the input sequence of an extension mapping\n");
|
||||
ucm_printMapping(base, mb, stderr);
|
||||
ucm_printMapping(ext, me, stderr);
|
||||
result|=HAS_ERRORS;
|
||||
}
|
||||
result|=NEEDS_MOVE;
|
||||
}
|
||||
|
||||
++mb;
|
||||
|
@ -372,7 +395,11 @@ checkBaseExtUnicode(UCMTable *base, UCMTable *ext, UBool moveToExt) {
|
|||
if( mb->f==me->f && mb->bLen==me->bLen &&
|
||||
0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
|
||||
) {
|
||||
me->f|=REMOVE_MAPPING;
|
||||
me->moveFlag|=REMOVE_MAPPING;
|
||||
result|=NEEDS_MOVE;
|
||||
} else if(intersectBase) {
|
||||
/* mapping in base but not in ext, move it */
|
||||
mb->moveFlag|=MOVE_TO_EXT;
|
||||
result|=NEEDS_MOVE;
|
||||
} else {
|
||||
fprintf(stderr,
|
||||
|
@ -392,7 +419,8 @@ checkBaseExtUnicode(UCMTable *base, UCMTable *ext, UBool moveToExt) {
|
|||
}
|
||||
|
||||
static uint8_t
|
||||
checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext, UBool moveToExt) {
|
||||
checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
|
||||
UBool moveToExt, UBool intersectBase) {
|
||||
UCMapping *mb, *me;
|
||||
int32_t *baseMap, *extMap;
|
||||
int32_t b, e, bLimit, eLimit, cmp;
|
||||
|
@ -412,17 +440,23 @@ checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext, UBool mo
|
|||
|
||||
for(;;) {
|
||||
/* skip irrelevant mappings on both sides */
|
||||
for(;;) {
|
||||
for(;; ++b) {
|
||||
if(b==bLimit) {
|
||||
return result;
|
||||
}
|
||||
mb=base->mappings+baseMap[b];
|
||||
|
||||
if(isSISO && intersectBase==2 && mb->bLen==1) {
|
||||
/*
|
||||
* comparing an EBCDIC_STATEFUL base against a DBCS extension:
|
||||
* leave SBCS base mappings alone
|
||||
*/
|
||||
continue;
|
||||
}
|
||||
|
||||
if(mb->f==0 || mb->f==3) {
|
||||
break;
|
||||
}
|
||||
|
||||
++b;
|
||||
}
|
||||
|
||||
for(;;) {
|
||||
|
@ -441,18 +475,23 @@ checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext, UBool mo
|
|||
/* compare the base and extension mappings */
|
||||
cmp=compareBytes(base, mb, ext, me, TRUE);
|
||||
if(cmp<0) {
|
||||
if(intersectBase) {
|
||||
/* mapping in base but not in ext, move it */
|
||||
mb->moveFlag|=MOVE_TO_EXT;
|
||||
result|=NEEDS_MOVE;
|
||||
|
||||
/*
|
||||
* does mb map from an input sequence that is a prefix of me's?
|
||||
* for SI/SO tables, a single byte is never a prefix because it
|
||||
* occurs in a separate single-byte state
|
||||
*/
|
||||
if( mb->bLen<me->bLen &&
|
||||
} else if( mb->bLen<me->bLen &&
|
||||
(!isSISO || mb->bLen>1) &&
|
||||
0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
|
||||
) {
|
||||
if(moveToExt) {
|
||||
/* mark this mapping to be moved to the extension table */
|
||||
mb->f|=MOVE_TO_EXT;
|
||||
mb->moveFlag|=MOVE_TO_EXT;
|
||||
result|=NEEDS_MOVE;
|
||||
} else {
|
||||
fprintf(stderr,
|
||||
|
@ -473,7 +512,11 @@ checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext, UBool mo
|
|||
if( mb->f==me->f && mb->uLen==me->uLen &&
|
||||
0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
|
||||
) {
|
||||
me->f|=REMOVE_MAPPING;
|
||||
me->moveFlag|=REMOVE_MAPPING;
|
||||
result|=NEEDS_MOVE;
|
||||
} else if(intersectBase) {
|
||||
/* mapping in base but not in ext, move it */
|
||||
mb->moveFlag|=MOVE_TO_EXT;
|
||||
result|=NEEDS_MOVE;
|
||||
} else {
|
||||
fprintf(stderr,
|
||||
|
@ -515,12 +558,18 @@ ucm_checkValidity(UCMTable *table, UCMStates *baseStates) {
|
|||
}
|
||||
|
||||
U_CAPI UBool U_EXPORT2
|
||||
ucm_checkBaseExt(UCMStates *baseStates, UCMTable *base, UCMTable *ext, UBool moveToExt) {
|
||||
ucm_checkBaseExt(UCMStates *baseStates,
|
||||
UCMTable *base, UCMTable *ext, UCMTable *moveTarget,
|
||||
UBool intersectBase) {
|
||||
uint8_t result;
|
||||
|
||||
/* if we have an extension table, we must always use precision flags */
|
||||
if(base->flagsType!=UCM_FLAGS_EXPLICIT || ext->flagsType!=UCM_FLAGS_EXPLICIT) {
|
||||
fprintf(stderr, "ucm error: the base or extension table contains mappings without precision flags\n");
|
||||
if(base->flagsType&UCM_FLAGS_IMPLICIT) {
|
||||
fprintf(stderr, "ucm error: the base table contains mappings without precision flags\n");
|
||||
return FALSE;
|
||||
}
|
||||
if(ext->flagsType&UCM_FLAGS_IMPLICIT) {
|
||||
fprintf(stderr, "ucm error: extension table contains mappings without precision flags\n");
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
|
@ -530,8 +579,8 @@ ucm_checkBaseExt(UCMStates *baseStates, UCMTable *base, UCMTable *ext, UBool mov
|
|||
|
||||
/* check */
|
||||
result=
|
||||
checkBaseExtUnicode(base, ext, moveToExt)|
|
||||
checkBaseExtBytes(baseStates, base, ext, moveToExt);
|
||||
checkBaseExtUnicode(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase)|
|
||||
checkBaseExtBytes(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase);
|
||||
|
||||
if(result&HAS_ERRORS) {
|
||||
return FALSE;
|
||||
|
@ -539,9 +588,12 @@ ucm_checkBaseExt(UCMStates *baseStates, UCMTable *base, UCMTable *ext, UBool mov
|
|||
|
||||
if(result&NEEDS_MOVE) {
|
||||
moveMappings(ext, NULL);
|
||||
moveMappings(base, ext);
|
||||
moveMappings(base, moveTarget);
|
||||
ucm_sortTable(base);
|
||||
ucm_sortTable(ext);
|
||||
if(moveTarget!=NULL) {
|
||||
ucm_sortTable(moveTarget);
|
||||
}
|
||||
}
|
||||
|
||||
return TRUE;
|
||||
|
@ -640,6 +692,8 @@ ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable,
|
|||
++toUMapping;
|
||||
++toUIndex;
|
||||
}
|
||||
|
||||
fromUTable->isSorted=FALSE;
|
||||
}
|
||||
|
||||
/* separate extension mappings out of base table for rptp2ucm --------------- */
|
||||
|
@ -662,7 +716,7 @@ ucm_separateMappings(UCMFile *ucm, UBool isSISO) {
|
|||
if(isSISO && m->bLen==1 && (m->b.bytes[0]==0xe || m->b.bytes[0]==0xf)) {
|
||||
fprintf(stderr, "warning: removing illegal mapping from an SI/SO-stateful table\n");
|
||||
ucm_printMapping(table, m, stderr);
|
||||
m->f|=REMOVE_MAPPING;
|
||||
m->moveFlag|=REMOVE_MAPPING;
|
||||
needsMove=TRUE;
|
||||
continue;
|
||||
}
|
||||
|
@ -675,7 +729,7 @@ ucm_separateMappings(UCMFile *ucm, UBool isSISO) {
|
|||
printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), stderr);
|
||||
isOK=FALSE;
|
||||
} else if(type>0) {
|
||||
m->f|=MOVE_TO_EXT;
|
||||
m->moveFlag|=MOVE_TO_EXT;
|
||||
needsMove=TRUE;
|
||||
}
|
||||
}
|
||||
|
@ -685,7 +739,7 @@ ucm_separateMappings(UCMFile *ucm, UBool isSISO) {
|
|||
}
|
||||
if(needsMove) {
|
||||
moveMappings(ucm->base, ucm->ext);
|
||||
return ucm_checkBaseExt(&ucm->states, ucm->base, ucm->ext, TRUE);
|
||||
return ucm_checkBaseExt(&ucm->states, ucm->base, ucm->ext, ucm->ext, FALSE);
|
||||
} else {
|
||||
ucm_sortTable(ucm->base);
|
||||
return TRUE;
|
||||
|
@ -852,6 +906,17 @@ ucm_closeTable(UCMTable *table) {
|
|||
}
|
||||
}
|
||||
|
||||
/*
 * Reset a table to an empty state without touching any other field:
 * the mappings, bytes and code points lengths, the flags type and the
 * Unicode mask are set to 0, and the table is marked as not sorted.
 * Does nothing if table is NULL.
 */
U_CAPI void U_EXPORT2
|
||||
ucm_resetTable(UCMTable *table) {
|
||||
if(table!=NULL) {
|
||||
table->mappingsLength=0;
|
||||
table->flagsType=0;
|
||||
table->unicodeMask=0;
|
||||
table->bytesLength=table->codePointsLength=0;
|
||||
table->isSorted=FALSE;
|
||||
}
|
||||
}
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
ucm_addMapping(UCMTable *table,
|
||||
UCMapping *m,
|
||||
|
@ -946,6 +1011,8 @@ ucm_addMapping(UCMTable *table,
|
|||
|
||||
tm=table->mappings+table->mappingsLength++;
|
||||
uprv_memcpy(tm, m, sizeof(UCMapping));
|
||||
|
||||
table->isSorted=FALSE;
|
||||
}
|
||||
|
||||
U_CAPI UCMFile * U_EXPORT2
|
||||
|
@ -1051,7 +1118,61 @@ ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates
|
|||
UChar32 codePoints[UCNV_EXT_MAX_UCHARS];
|
||||
uint8_t bytes[UCNV_EXT_MAX_BYTES];
|
||||
|
||||
const char *s;
|
||||
|
||||
/* ignore empty and comment lines */
|
||||
if(line[0]=='#' || *(s=u_skipWhitespace(line))==0 || *s=='\n' || *s=='\r') {
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
return
|
||||
ucm_parseMappingLine(&m, codePoints, bytes, line) &&
|
||||
ucm_addMappingAuto(ucm, forBase, baseStates, &m, codePoints, bytes);
|
||||
}
|
||||
|
||||
/*
 * Read mapping lines from convFile and add them to ucm until a line that
 * is exactly "END CHARMAP" is found.
 *
 * ucm         file object that receives the mappings
 * convFile    stream to read lines from
 * forBase     passed through unchanged to ucm_addMappingFromLine()
 * baseStates  passed through unchanged to ucm_addMappingFromLine()
 * pErrorCode  in/out error code; nothing is read if it already indicates
 *             a failure. Set to U_INVALID_TABLE_FORMAT if the stream ends
 *             before "END CHARMAP" or if any line could not be added.
 */
U_CAPI void U_EXPORT2
|
||||
ucm_readTable(UCMFile *ucm, FileStream* convFile,
|
||||
UBool forBase, UCMStates *baseStates,
|
||||
UErrorCode *pErrorCode) {
|
||||
char line[500];
|
||||
char *end;
|
||||
UBool isOK;
|
||||
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return;
|
||||
}
|
||||
|
||||
isOK=TRUE;
|
||||
|
||||
for(;;) {
|
||||
/* read the next line */
|
||||
if(!T_FileStream_readLine(convFile, line, sizeof(line))) {
|
||||
fprintf(stderr, "incomplete charmap section\n");
|
||||
isOK=FALSE;
|
||||
break;
|
||||
}
|
||||
|
||||
/* remove CR LF */
|
||||
end=uprv_strchr(line, 0);
|
||||
while(line<end && (*(end-1)=='\r' || *(end-1)=='\n')) {
|
||||
--end;
|
||||
}
|
||||
*end=0;
|
||||
|
||||
/* ignore empty and comment lines */
|
||||
if(line[0]==0 || line[0]=='#') {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* stop at the end of the mapping table */
|
||||
if(0==uprv_strcmp(line, "END CHARMAP")) {
|
||||
break;
|
||||
}
|
||||
|
||||
/* a line that fails clears isOK but does not end the loop */
isOK&=ucm_addMappingFromLine(ucm, line, forBase, baseStates);
|
||||
}
|
||||
|
||||
if(!isOK) {
|
||||
*pErrorCode=U_INVALID_TABLE_FORMAT;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -22,6 +22,7 @@
|
|||
#include "unicode/utypes.h"
|
||||
#include "ucnvmbcs.h"
|
||||
#include "ucnv_ext.h"
|
||||
#include "filestrm.h"
|
||||
#include <stdio.h>
|
||||
|
||||
U_CDECL_BEGIN
|
||||
|
@ -46,7 +47,7 @@ typedef struct UCMapping {
|
|||
uint32_t index;
|
||||
uint8_t bytes[4];
|
||||
} b;
|
||||
int8_t uLen, bLen, f;
|
||||
int8_t uLen, bLen, f, moveFlag;
|
||||
} UCMapping;
|
||||
|
||||
enum {
|
||||
|
@ -71,6 +72,7 @@ typedef struct UCMTable {
|
|||
|
||||
uint8_t unicodeMask;
|
||||
int8_t flagsType; /* UCM_FLAGS_INITIAL etc. */
|
||||
UBool isSorted;
|
||||
} UCMTable;
|
||||
|
||||
enum {
|
||||
|
@ -140,9 +142,21 @@ ucm_openTable(void);
|
|||
U_CAPI void U_EXPORT2
|
||||
ucm_closeTable(UCMTable *table);
|
||||
|
||||
/**
 * Reset a table to an empty state: its mappings, bytes and code points
 * lengths, flags type and Unicode mask are set to 0 and it is marked as
 * not sorted. Does nothing if table is NULL.
 */
U_CAPI void U_EXPORT2
|
||||
ucm_resetTable(UCMTable *table);
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
ucm_sortTable(UCMTable *t);
|
||||
|
||||
/**
|
||||
* Read a table from a .ucm file, from after the CHARMAP line to
|
||||
* including the END CHARMAP line.
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
ucm_readTable(UCMFile *ucm, FileStream* convFile,
|
||||
UBool forBase, UCMStates *baseStates,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Check the validity of mappings against a base table's states;
|
||||
* necessary for extension-only tables that were read before their base tables.
|
||||
|
@ -152,9 +166,22 @@ ucm_checkValidity(UCMTable *ext, UCMStates *baseStates);
|
|||
|
||||
/**
|
||||
* Check a base table against an extension table.
|
||||
* Set moveToExt=TRUE for where base and extension tables are parsed
|
||||
* from a single file,
|
||||
* and moveToExt=FALSE for where the extension table is in a separate file.
|
||||
* Set moveTarget!=NULL if it is possible to move mappings from the base.
|
||||
* This is the case where base and extension tables are parsed from a single file
|
||||
* (moveTarget==ext)
|
||||
* or when delta file mappings are subtracted from a base table.
|
||||
*
|
||||
* When a base table cannot be modified because a delta file is parsed in makeconv,
|
||||
* then set moveTarget=NULL.
|
||||
*
|
||||
* if(intersectBase) then mappings that exist in the base table but not in
|
||||
* the extension table are moved to moveTarget instead of showing an error.
|
||||
*
|
||||
* Special mode: If the base table is an SISO table (indicated in the baseStates)
|
||||
* and intersectBase==2 for a DBCS extension table, then SBCS mappings are
|
||||
* not moved out of the base unless their Unicode input requires it.
|
||||
* This helps ucmkbase generate base tables for where the dbcsonly converter
|
||||
* option will be employed.
|
||||
*
|
||||
* For both tables in the same file, the extension table is automatically
|
||||
* built.
|
||||
|
@ -164,6 +191,12 @@ ucm_checkValidity(UCMTable *ext, UCMStates *baseStates);
|
|||
*
|
||||
* Sort both tables, and then for each mapping direction:
|
||||
*
|
||||
* If intersectBase is TRUE and the base table contains a mapping
|
||||
* that does not exist in the extension table, then this mapping is moved
|
||||
* to moveTarget.
|
||||
*
|
||||
* - otherwise -
|
||||
*
|
||||
* If the base table contains a mapping for which the input sequence is
|
||||
* the same as the extension input, then
|
||||
* - if the output is the same: remove the extension mapping
|
||||
|
@ -171,13 +204,14 @@ ucm_checkValidity(UCMTable *ext, UCMStates *baseStates);
|
|||
*
|
||||
* If the base table contains a mapping for which the input sequence is
|
||||
* a prefix of the extension input, then
|
||||
* - if moveToExt: move the base mapping to the extension table
|
||||
* - if moveTarget!=NULL: move the base mapping to the moveTarget table
|
||||
* - else: error
|
||||
*
|
||||
* @return FALSE in case of an irreparable error
|
||||
*/
|
||||
U_CAPI UBool U_EXPORT2
|
||||
ucm_checkBaseExt(UCMStates *baseStates, UCMTable *base, UCMTable *ext, UBool moveToExt);
|
||||
ucm_checkBaseExt(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
|
||||
UCMTable *moveTarget, UBool intersectBase);
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode);
|
||||
|
|
Loading…
Add table
Reference in a new issue