ICU-4268 ISO 2022 converters must not convert SO/SI/ESC

X-SVN-Rev: 17796
This commit is contained in:
Markus Scherer 2005-06-03 20:17:54 +00:00
parent ff5e6e08e7
commit 2aced37b83
5 changed files with 114 additions and 20 deletions

View file

@ -84,6 +84,14 @@ static const char SHIFT_OUT_STR[] = "\x0E";
#define V_TAB 0x0B
#define SPACE 0x20
/*
* ISO 2022 control codes must not be converted from Unicode
* because they would mess up the byte stream.
* The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
* corresponding to SO, SI, and ESC.
*
* The argument is converted to uint32_t before the range check so that
* a negative value of a signed type is rejected by the check instead of
* reaching the shift (shifting by a negative count is undefined behavior).
* The argument is evaluated more than once; do not pass an expression
* with side effects.
*/
#define IS_2022_CONTROL(c) (((uint32_t)(c)<0x20) && (((uint32_t)1<<(uint32_t)(c))&0x0800c000)!=0)
/* for ISO-2022-JP and -CN implementations */
typedef enum {
/* shared values */
@ -1324,7 +1332,7 @@ UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args
sourceChar = *(source++);
/*check if the char is a First surrogate*/
if(UTF_IS_SURROGATE(sourceChar)) {
if(UTF_IS_SURROGATE(sourceChar)) {
if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
getTrail:
/*look ahead to find the trail surrogate*/
@ -1358,6 +1366,14 @@ getTrail:
}
}
/* do not convert SO/SI/ESC */
if(IS_2022_CONTROL(sourceChar)) {
/* callback(illegal) */
*err=U_ILLEGAL_CHAR_FOUND;
args->converter->fromUChar32=sourceChar;
break;
}
/* do the conversion */
if(choiceCount == 0) {
@ -1901,6 +1917,15 @@ UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args
if(target < (unsigned char*) args->targetLimit){
sourceChar = *source++;
/* do not convert SO/SI/ESC */
if(IS_2022_CONTROL(sourceChar)) {
/* callback(illegal) */
*err=U_ILLEGAL_CHAR_FOUND;
args->converter->fromUChar32=sourceChar;
break;
}
/* length= ucnv_MBCSFromUChar32(converterData->currentConverter->sharedData,
sourceChar,&targetByteUnit,args->converter->useFallback);*/
MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,&length,MBCS_OUTPUT_2);
@ -1997,7 +2022,6 @@ getTrail:
}
args->converter->fromUChar32=sourceChar;
args->converter->fromUnicodeStatus = (int32_t)isTargetByteDBCS;
break;
}
} /* end if(myTargetIndex<myTargetLength) */
@ -2442,6 +2466,14 @@ getTrail:
/* do the conversion */
if(sourceChar <= 0x007f ){
/* do not convert SO/SI/ESC */
if(IS_2022_CONTROL(sourceChar)) {
/* callback(illegal) */
*err=U_ILLEGAL_CHAR_FOUND;
args->converter->fromUChar32=sourceChar;
break;
}
/* US-ASCII */
if(pFromU2022State->g == 0) {
buffer[0] = (char)sourceChar;
@ -3028,17 +3060,18 @@ _ISO_2022_GetUnicodeSet(const UConverter *cnv,
/* there is only one converter for KR, and it is not in the myConverterArray[] */
cnvData->currentConverter->sharedData->impl->getUnicodeSet(
cnvData->currentConverter, sa, which, pErrorCode);
return;
/* the loop over myConverterArray[] will simply not find another converter */
break;
default:
break;
}
/*
* TODO: need to make this version-specific for CN.
* Version-specific for CN:
* CN version 0 does not map CNS planes 3..7 although
* they are all available in the CNS conversion table;
* CN version 1 does map them all.
* The two versions need to create different Unicode sets.
* The two versions create different Unicode sets.
*/
for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
if(cnvData->myConverterArray[i]!=NULL) {
@ -3056,6 +3089,15 @@ _ISO_2022_GetUnicodeSet(const UConverter *cnv,
}
}
}
/*
* ISO 2022 converters must not convert SO/SI/ESC despite what
* sub-converters do by themselves.
* Remove these characters from the set.
*/
sa->remove(sa->set, 0x0e);
sa->remove(sa->set, 0x0f);
sa->remove(sa->set, 0x1b);
}
static const UConverterImpl _ISO2022Impl={

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2003-2004, International Business Machines
* Copyright (C) 2003-2005, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -51,7 +51,8 @@ ucnv_getUnicodeSet(const UConverter *cnv,
NULL,
uset_add,
uset_addRange,
uset_addString
uset_addString,
uset_remove
};
sa.set=setFillIn;

View file

@ -33,6 +33,9 @@ USetAddRange(USet *set, UChar32 start, UChar32 end);
typedef void U_CALLCONV
USetAddString(USet *set, const UChar *str, int32_t length);
/*
* Callback type for removing the single code point c from a set.
* ucnv_getUnicodeSet() supplies uset_remove for this slot; the ISO 2022
* converter calls it to take SO/SI/ESC back out of the set.
*/
typedef void U_CALLCONV
USetRemove(USet *set, UChar32 c);
/**
* Interface for adding items to a USet, to keep low-level code from
* statically depending on the USet implementation.
@ -43,6 +46,7 @@ struct USetAdder {
USetAdd *add;
USetAddRange *addRange;
USetAddString *addString;
USetRemove *remove;
};
typedef struct USetAdder USetAdder;

View file

@ -1,6 +1,6 @@
# *******************************************************************************
# *
# * Copyright (C) 1995-2001, International Business Machines
# * Copyright (C) 1995-2005, International Business Machines
# * Corporation and others. All Rights Reserved.
# *
# *******************************************************************************
@ -38,6 +38,12 @@ CHARMAP
<U000B> \x0B |0
<U000C> \x0C |0
<U000D> \x0D |0
# an ISO-2022 converter must not convert SO/SI/ESC (Jitterbug 4268)
# use <subchar1>
<U000E> \x1A |2
<U000F> \x1A |2
<U0010> \x10 |0
<U0011> \x11 |0
<U0012> \x12 |0
@ -49,7 +55,12 @@ CHARMAP
<U0018> \x18 |0
<U0019> \x19 |0
<U001A> \x1A |0
<U001B> \x1B |0
# an ISO-2022 converter must not convert SO/SI/ESC (Jitterbug 4268)
# <U001B> \x1B |0
# use <subchar1>
<U001B> \x1A |2
<U001C> \x1C |0
<U001D> \x1D |0
<U001E> \x1E |0

View file

@ -452,6 +452,36 @@ conversion {
// See ucnv_extContinueMatchFromU() comment
// "the match did not use all of preFromU[] - keep the rest for replay"
// do not convert SO/SI/ESC
{
"iso-2022-jp",
"A\x0eB\x0f\x09\x1bC",
:bin{ 411a421a091a43 },
:intvector{ 0,1,2,3,4,5,6 },
:int{1}, :int{1}, "", "?", ""
}
{
"iso-2022-cn",
"A\x0eB\x0f\x09\x1bC",
:bin{ 411a421a091a43 },
:intvector{ 0,1,2,3,4,5,6 },
:int{1}, :int{1}, "", "?", ""
}
{
"iso-2022-kr",
"A\x0eB\x0f\x09\x1bC",
:bin{ 1b242943411a421a091a43 },
:intvector{ -1,-1,-1,-1,0,1,2,3,4,5,6 },
:int{1}, :int{1}, "", "?", ""
}
{
"ibm-25546",
"A\x0eB\x0f\x09\x1bC",
:bin{ 1b242943411a421a091a43 },
:intvector{ -1,-1,-1,-1,0,1,2,3,4,5,6 },
:int{1}, :int{1}, "", "?", ""
}
// test ISO 8859-1/7 vs. JIS X 0201
{
"ISO-2022-JP-2",
@ -937,39 +967,45 @@ conversion {
// which - numeric UConverterUnicodeSet value
Headers { "charset", "map", "mapnot", "which" }
Cases {
// ISO-2022-KR
// versions of ISO-2022-KR
{
"ISO-2022-KR",
"[\x00-\x7f\xa1\xa4\xfe\u0111\u4e00\u4e01\uac00-\uac02\uffe6]",
"[\x80-\xa0\xa3\xa5\xff-\u0110\uac03\uffe7-\U0010ffff]",
"[\x00-\x0d\x10-\x1a\x1c-\x7f\xa1\xa4\xfe\u0111\u4e00\u4e01\uac00-\uac02\uffe6]",
"[\x0e\x0f\x1b\x80-\xa0\xa3\xa5\xff-\u0110\uac03\uffe7-\U0010ffff]",
:int{0}
}
{
"ibm-25546",
"[\x00-\x0d\x10-\x1a\x1c-\x7f\xa1\xa4\xfe\u0111\u4e00\u4e01\uac00-\uac01\uffe6]",
"[\x0e\x0f\x1b\x80-\xa0\xa3\xa5\xff-\u0110\uac02\uffe7-\U0010ffff]",
:int{0}
}
// versions of ISO-2022-JP
{
"ISO-2022-JP",
"[\x00-\x7f\u0391-\u03a1\uff61-\uff9f\u4e00\u4e01\uffe5]",
"[\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\uffe6-\U0010ffff]",
"[\x00-\x0d\x10-\x1a\x1c-\x7f\u0391-\u03a1\uff61-\uff9f\u4e00\u4e01\uffe5]",
"[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\uffe6-\U0010ffff]",
:int{0}
}
{
"ISO-2022-JP-2",
"[\x00-\u0113\u0385-\u038a\u0390-\u03a1\uff61-\uff9f\u4e00-\u4e05\uffe6]",
"[\uffe7-\U0010ffff]",
"[\x00-\x0d\x10-\x1a\x1c-\u0113\u0385-\u038a\u0390-\u03a1\uff61-\uff9f\u4e00-\u4e05\uffe6]",
"[\x0e\x0f\x1b\uffe7-\U0010ffff]",
:int{0}
}
// versions of ISO-2022-CN
{
"ISO-2022-CN",
"[\x00-\x7f\u4e00\u4e01\u9f98\ufe6b]",
"[\u4e29\uffe6-\U0010ffff]",
"[\x00-\x0d\x10-\x1a\x1c-\x7f\u4e00\u4e01\u9f98\ufe6b]",
"[\x0e\x0f\x1b\u4e29\uffe6-\U0010ffff]",
:int{0}
}
{
"ISO-2022-CN-EXT",
"[\x00-\x7f\u4e00-\u4e05\u9f98\ufe6b\u4e28-\u4e2b\U00020000\U00020003-\U00020005\U00029664]",
"[\U00020001\U00020002\U0002a6d7-\U0010ffff]",
"[\x00-\x0d\x10-\x1a\x1c-\x7f\u4e00-\u4e05\u9f98\ufe6b\u4e28-\u4e2b\U00020000\U00020003-\U00020005\U00029664]",
"[\x0e\x0f\x1b\U00020001\U00020002\U0002a6d7-\U0010ffff]",
:int{0}
}