mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 06:53:45 +00:00
ICU-4268 ISO 2022 converters must not convert SO/SI/ESC
X-SVN-Rev: 17796
This commit is contained in:
parent
ff5e6e08e7
commit
2aced37b83
5 changed files with 114 additions and 20 deletions
|
@ -84,6 +84,14 @@ static const char SHIFT_OUT_STR[] = "\x0E";
|
|||
#define V_TAB 0x0B
|
||||
#define SPACE 0x20
|
||||
|
||||
/*
|
||||
* ISO 2022 control codes must not be converted from Unicode
|
||||
* because they would mess up the byte stream.
|
||||
* The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
|
||||
* corresponding to SO, SI, and ESC.
|
||||
*/
|
||||
/*
 * Evaluates to non-zero iff c is one of SO (0x0e), SI (0x0f) or ESC (0x1b).
 * The range check is done on the unsigned value so that a negative c
 * (the argument is a signed code point at the call sites) is rejected
 * before it can be used as a shift count; shifting by a negative or
 * >=32 amount would be undefined behavior.
 * Note: c is evaluated more than once; do not pass expressions with side effects.
 */
#define IS_2022_CONTROL(c) (((uint32_t)(c)<0x20) && (((uint32_t)1<<(uint32_t)(c))&0x0800c000)!=0)
|
||||
|
||||
/* for ISO-2022-JP and -CN implementations */
|
||||
typedef enum {
|
||||
/* shared values */
|
||||
|
@ -1324,7 +1332,7 @@ UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args
|
|||
|
||||
sourceChar = *(source++);
|
||||
/*check if the char is a First surrogate*/
|
||||
if(UTF_IS_SURROGATE(sourceChar)) {
|
||||
if(UTF_IS_SURROGATE(sourceChar)) {
|
||||
if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
|
||||
getTrail:
|
||||
/*look ahead to find the trail surrogate*/
|
||||
|
@ -1358,6 +1366,14 @@ getTrail:
|
|||
}
|
||||
}
|
||||
|
||||
/* do not convert SO/SI/ESC */
|
||||
if(IS_2022_CONTROL(sourceChar)) {
|
||||
/* callback(illegal) */
|
||||
*err=U_ILLEGAL_CHAR_FOUND;
|
||||
args->converter->fromUChar32=sourceChar;
|
||||
break;
|
||||
}
|
||||
|
||||
/* do the conversion */
|
||||
|
||||
if(choiceCount == 0) {
|
||||
|
@ -1901,6 +1917,15 @@ UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args
|
|||
|
||||
if(target < (unsigned char*) args->targetLimit){
|
||||
sourceChar = *source++;
|
||||
|
||||
/* do not convert SO/SI/ESC */
|
||||
if(IS_2022_CONTROL(sourceChar)) {
|
||||
/* callback(illegal) */
|
||||
*err=U_ILLEGAL_CHAR_FOUND;
|
||||
args->converter->fromUChar32=sourceChar;
|
||||
break;
|
||||
}
|
||||
|
||||
/* length= ucnv_MBCSFromUChar32(converterData->currentConverter->sharedData,
|
||||
sourceChar,&targetByteUnit,args->converter->useFallback);*/
|
||||
MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,&length,MBCS_OUTPUT_2);
|
||||
|
@ -1997,7 +2022,6 @@ getTrail:
|
|||
}
|
||||
|
||||
args->converter->fromUChar32=sourceChar;
|
||||
args->converter->fromUnicodeStatus = (int32_t)isTargetByteDBCS;
|
||||
break;
|
||||
}
|
||||
} /* end if(myTargetIndex<myTargetLength) */
|
||||
|
@ -2442,6 +2466,14 @@ getTrail:
|
|||
|
||||
/* do the conversion */
|
||||
if(sourceChar <= 0x007f ){
|
||||
/* do not convert SO/SI/ESC */
|
||||
if(IS_2022_CONTROL(sourceChar)) {
|
||||
/* callback(illegal) */
|
||||
*err=U_ILLEGAL_CHAR_FOUND;
|
||||
args->converter->fromUChar32=sourceChar;
|
||||
break;
|
||||
}
|
||||
|
||||
/* US-ASCII */
|
||||
if(pFromU2022State->g == 0) {
|
||||
buffer[0] = (char)sourceChar;
|
||||
|
@ -3028,17 +3060,18 @@ _ISO_2022_GetUnicodeSet(const UConverter *cnv,
|
|||
/* there is only one converter for KR, and it is not in the myConverterArray[] */
|
||||
cnvData->currentConverter->sharedData->impl->getUnicodeSet(
|
||||
cnvData->currentConverter, sa, which, pErrorCode);
|
||||
return;
|
||||
/* the loop over myConverterArray[] will simply not find another converter */
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* TODO: need to make this version-specific for CN.
|
||||
* Version-specific for CN:
|
||||
* CN version 0 does not map CNS planes 3..7 although
|
||||
* they are all available in the CNS conversion table;
|
||||
* CN version 1 does map them all.
|
||||
* The two versions need to create different Unicode sets.
|
||||
* The two versions create different Unicode sets.
|
||||
*/
|
||||
for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
|
||||
if(cnvData->myConverterArray[i]!=NULL) {
|
||||
|
@ -3056,6 +3089,15 @@ _ISO_2022_GetUnicodeSet(const UConverter *cnv,
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* ISO 2022 converters must not convert SO/SI/ESC despite what
|
||||
* sub-converters do by themselves.
|
||||
* Remove these characters from the set.
|
||||
*/
|
||||
sa->remove(sa->set, 0x0e);
|
||||
sa->remove(sa->set, 0x0f);
|
||||
sa->remove(sa->set, 0x1b);
|
||||
}
|
||||
|
||||
static const UConverterImpl _ISO2022Impl={
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2003-2004, International Business Machines
|
||||
* Copyright (C) 2003-2005, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -51,7 +51,8 @@ ucnv_getUnicodeSet(const UConverter *cnv,
|
|||
NULL,
|
||||
uset_add,
|
||||
uset_addRange,
|
||||
uset_addString
|
||||
uset_addString,
|
||||
uset_remove
|
||||
};
|
||||
sa.set=setFillIn;
|
||||
|
||||
|
|
|
@ -33,6 +33,9 @@ USetAddRange(USet *set, UChar32 start, UChar32 end);
|
|||
typedef void U_CALLCONV
|
||||
USetAddString(USet *set, const UChar *str, int32_t length);
|
||||
|
||||
/*
 * Function type for removing the single code point c from set.
 * A pointer to such a function is carried in USetAdder (member "remove"),
 * alongside the add/addRange/addString callbacks.
 */
typedef void U_CALLCONV
|
||||
USetRemove(USet *set, UChar32 c);
|
||||
|
||||
/**
|
||||
* Interface for adding items to a USet, to keep low-level code from
|
||||
* statically depending on the USet implementation.
|
||||
|
@ -43,6 +46,7 @@ struct USetAdder {
|
|||
USetAdd *add;
|
||||
USetAddRange *addRange;
|
||||
USetAddString *addString;
|
||||
USetRemove *remove;
|
||||
};
|
||||
typedef struct USetAdder USetAdder;
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# *******************************************************************************
|
||||
# *
|
||||
# * Copyright (C) 1995-2001, International Business Machines
|
||||
# * Copyright (C) 1995-2005, International Business Machines
|
||||
# * Corporation and others. All Rights Reserved.
|
||||
# *
|
||||
# *******************************************************************************
|
||||
|
@ -38,6 +38,12 @@ CHARMAP
|
|||
<U000B> \x0B |0
|
||||
<U000C> \x0C |0
|
||||
<U000D> \x0D |0
|
||||
|
||||
# an ISO-2022 converter must not convert SO/SI/ESC (Jitterbug 4268)
|
||||
# use <subchar1>
|
||||
<U000E> \x1A |2
|
||||
<U000F> \x1A |2
|
||||
|
||||
<U0010> \x10 |0
|
||||
<U0011> \x11 |0
|
||||
<U0012> \x12 |0
|
||||
|
@ -49,7 +55,12 @@ CHARMAP
|
|||
<U0018> \x18 |0
|
||||
<U0019> \x19 |0
|
||||
<U001A> \x1A |0
|
||||
<U001B> \x1B |0
|
||||
|
||||
# an ISO-2022 converter must not convert SO/SI/ESC (Jitterbug 4268)
|
||||
# <U001B> \x1B |0
|
||||
# use <subchar1>
|
||||
<U001B> \x1A |2
|
||||
|
||||
<U001C> \x1C |0
|
||||
<U001D> \x1D |0
|
||||
<U001E> \x1E |0
|
||||
|
|
58
icu4c/source/test/testdata/conversion.txt
vendored
58
icu4c/source/test/testdata/conversion.txt
vendored
|
@ -452,6 +452,36 @@ conversion {
|
|||
// See ucnv_extContinueMatchFromU() comment
|
||||
// "the match did not use all of preFromU[] - keep the rest for replay"
|
||||
|
||||
// do not convert SO/SI/ESC
|
||||
{
|
||||
"iso-2022-jp",
|
||||
"A\x0eB\x0f\x09\x1bC",
|
||||
:bin{ 411a421a091a43 },
|
||||
:intvector{ 0,1,2,3,4,5,6 },
|
||||
:int{1}, :int{1}, "", "?", ""
|
||||
}
|
||||
{
|
||||
"iso-2022-cn",
|
||||
"A\x0eB\x0f\x09\x1bC",
|
||||
:bin{ 411a421a091a43 },
|
||||
:intvector{ 0,1,2,3,4,5,6 },
|
||||
:int{1}, :int{1}, "", "?", ""
|
||||
}
|
||||
{
|
||||
"iso-2022-kr",
|
||||
"A\x0eB\x0f\x09\x1bC",
|
||||
:bin{ 1b242943411a421a091a43 },
|
||||
:intvector{ -1,-1,-1,-1,0,1,2,3,4,5,6 },
|
||||
:int{1}, :int{1}, "", "?", ""
|
||||
}
|
||||
{
|
||||
"ibm-25546",
|
||||
"A\x0eB\x0f\x09\x1bC",
|
||||
:bin{ 1b242943411a421a091a43 },
|
||||
:intvector{ -1,-1,-1,-1,0,1,2,3,4,5,6 },
|
||||
:int{1}, :int{1}, "", "?", ""
|
||||
}
|
||||
|
||||
// test ISO 8859-1/7 vs. JIS X 0201
|
||||
{
|
||||
"ISO-2022-JP-2",
|
||||
|
@ -937,39 +967,45 @@ conversion {
|
|||
// which - numeric UConverterUnicodeSet value
|
||||
Headers { "charset", "map", "mapnot", "which" }
|
||||
Cases {
|
||||
// ISO-2022-KR
|
||||
// versions of ISO-2022-KR
|
||||
{
|
||||
"ISO-2022-KR",
|
||||
"[\x00-\x7f\xa1\xa4\xfe\u0111\u4e00\u4e01\uac00-\uac02\uffe6]",
|
||||
"[\x80-\xa0\xa3\xa5\xff-\u0110\uac03\uffe7-\U0010ffff]",
|
||||
"[\x00-\x0d\x10-\x1a\x1c-\x7f\xa1\xa4\xfe\u0111\u4e00\u4e01\uac00-\uac02\uffe6]",
|
||||
"[\x0e\x0f\x1b\x80-\xa0\xa3\xa5\xff-\u0110\uac03\uffe7-\U0010ffff]",
|
||||
:int{0}
|
||||
}
|
||||
{
|
||||
"ibm-25546",
|
||||
"[\x00-\x0d\x10-\x1a\x1c-\x7f\xa1\xa4\xfe\u0111\u4e00\u4e01\uac00-\uac01\uffe6]",
|
||||
"[\x0e\x0f\x1b\x80-\xa0\xa3\xa5\xff-\u0110\uac02\uffe7-\U0010ffff]",
|
||||
:int{0}
|
||||
}
|
||||
|
||||
// versions of ISO-2022-JP
|
||||
{
|
||||
"ISO-2022-JP",
|
||||
"[\x00-\x7f\u0391-\u03a1\uff61-\uff9f\u4e00\u4e01\uffe5]",
|
||||
"[\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\uffe6-\U0010ffff]",
|
||||
"[\x00-\x0d\x10-\x1a\x1c-\x7f\u0391-\u03a1\uff61-\uff9f\u4e00\u4e01\uffe5]",
|
||||
"[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\uffe6-\U0010ffff]",
|
||||
:int{0}
|
||||
}
|
||||
{
|
||||
"ISO-2022-JP-2",
|
||||
"[\x00-\u0113\u0385-\u038a\u0390-\u03a1\uff61-\uff9f\u4e00-\u4e05\uffe6]",
|
||||
"[\uffe7-\U0010ffff]",
|
||||
"[\x00-\x0d\x10-\x1a\x1c-\u0113\u0385-\u038a\u0390-\u03a1\uff61-\uff9f\u4e00-\u4e05\uffe6]",
|
||||
"[\x0e\x0f\x1b\uffe7-\U0010ffff]",
|
||||
:int{0}
|
||||
}
|
||||
|
||||
// versions of ISO-2022-CN
|
||||
{
|
||||
"ISO-2022-CN",
|
||||
"[\x00-\x7f\u4e00\u4e01\u9f98\ufe6b]",
|
||||
"[\u4e29\uffe6-\U0010ffff]",
|
||||
"[\x00-\x0d\x10-\x1a\x1c-\x7f\u4e00\u4e01\u9f98\ufe6b]",
|
||||
"[\x0e\x0f\x1b\u4e29\uffe6-\U0010ffff]",
|
||||
:int{0}
|
||||
}
|
||||
{
|
||||
"ISO-2022-CN-EXT",
|
||||
"[\x00-\x7f\u4e00-\u4e05\u9f98\ufe6b\u4e28-\u4e2b\U00020000\U00020003-\U00020005\U00029664]",
|
||||
"[\U00020001\U00020002\U0002a6d7-\U0010ffff]",
|
||||
"[\x00-\x0d\x10-\x1a\x1c-\x7f\u4e00-\u4e05\u9f98\ufe6b\u4e28-\u4e2b\U00020000\U00020003-\U00020005\U00029664]",
|
||||
"[\x0e\x0f\x1b\U00020001\U00020002\U0002a6d7-\U0010ffff]",
|
||||
:int{0}
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue