From 2dbef4e207cad580316795227a241eb3ac0804e5 Mon Sep 17 00:00:00 2001 From: Jim Snyder Grant Date: Fri, 31 Mar 2000 16:53:09 +0000 Subject: [PATCH] ICU-271 First-pass LMBCS support (Lotus Multi-byte Character set) X-SVN-Rev: 1042 --- icu4c/data/convrtrs.txt | 17 +- icu4c/data/ibm-851.ucm | 285 ++++++++ icu4c/data/lmb-excp.ucm | 315 +++++++++ icu4c/source/common/common.dsp | 4 + icu4c/source/common/ucnv_bld.c | 19 +- icu4c/source/common/ucnv_cnv.h | 4 +- icu4c/source/common/ucnv_lmb.c | 870 ++++++++++++++++++++++++ icu4c/source/common/unicode/ucnv_bld.h | 29 +- icu4c/source/data/mappings/convrtrs.txt | 17 +- icu4c/source/data/mappings/ibm-851.ucm | 285 ++++++++ icu4c/source/data/mappings/lmb-excp.ucm | 315 +++++++++ icu4c/source/test/cintltst/nucnvtst.c | 64 ++ icu4c/source/tools/makeconv/ucmfiles.mk | 3 +- 13 files changed, 2220 insertions(+), 7 deletions(-) create mode 100644 icu4c/data/ibm-851.ucm create mode 100644 icu4c/data/lmb-excp.ucm create mode 100644 icu4c/source/common/ucnv_lmb.c create mode 100644 icu4c/source/data/mappings/ibm-851.ucm create mode 100644 icu4c/source/data/mappings/lmb-excp.ucm diff --git a/icu4c/data/convrtrs.txt b/icu4c/data/convrtrs.txt index 9c1078f4479..02b6b07bfb8 100644 --- a/icu4c/data/convrtrs.txt +++ b/icu4c/data/convrtrs.txt @@ -47,7 +47,19 @@ UTF16_PlatformEndian iso-10646-ucs-2 csUnicode utf16 utf-16 ibm-1200 ibm1200 UTF16_OppositeEndian LATIN_1 iso-8859-1 iso_8859-1 ibm-819 ibm819 cp819 latin1 latin-1 ascii ascii-7 us-ascii 8859-1 csisolatin1 iso-ir-100 iso_8859-1:1978 #!!!!! There's whole lot of names for this - cp367 csASCII etc. ISO_2022 iso-2022 2022 cp2022 iso2022 iso_2022 -LMBCS +LMBCS-1 lmbcs +LMBCS-2 +LMBCS-3 +LMBCS-4 +LMBCS-5 +LMBCS-6 +LMBCS-8 +LMBCS-11 +LMBCS-16 +LMBCS-17 +LMBCS-18 +LMBCS-19 + # Table-based @@ -76,10 +88,13 @@ ibm-1383 euc-cn euccn ibm-eucCN # China EUC #ibm-1162 tis-620 cp874 windows-874 ms874 # Thai (w/ euro support) #what is the connection between this and the one below!!! 
ibm-874 ibm-1161 #same as 1162 (w/o euro update) ***This is commented out in Helena's +lmb-excp # special exceptions list for LMBCS algorithm + # Platform codepages ibm-437 ibm437 cp437 csPC8CodePage437 437 # PC US # HSYS: ibm-850 IBM850 cp850 850 csPC850Multilingual # PC latin1 +ibm-851 IBM851 cp851 851 csPC851 # PC DOS Greek (no euro) ibm-858 ibm858 cp858 # PC latin1 with Euro cp850 removed ibm-9044 IBM852 852 csPCp852 cp852 # PC latin2 (w/ euro update) #where should the names go here or below - inconsistency!!! ibm-852 # PC latin2 (w/o euro update) diff --git a/icu4c/data/ibm-851.ucm b/icu4c/data/ibm-851.ucm new file mode 100644 index 00000000000..4a40a1ec663 --- /dev/null +++ b/icu4c/data/ibm-851.ucm @@ -0,0 +1,285 @@ +# ****************************************************************************** +# * +# * Copyright (C) 1995-2000, International Business Machines +# * Corporation and others. All Rights Reserved. +# * +# ****************************************************************************** +# +# File created on Fri Feb 11 14:11:00 2000 +# +# File created manually +# from source files IBM-851.TXMAP100 +# +# Table Version : 1.00 +# + "IBM-851" + "AXXXX" + 1 + 1 + "SBCS" + \x7F +# +CHARMAP +# +# +#ISO 10646 IBM-851 +#_________ _________ + \x00 # ..NUL... + \x01 # ..SOH... + \x02 # ..STX... + \x03 # ..ETX... + \x04 # ..EOT... + \x05 # ..ENQ... + \x06 # ..ACK... + \x07 # ..BEL... + \x08 # ...BS... + \x09 # ...HT... + \x0A # ...LF... + \x0B # ...VT... + \x0C # ...FF... + \x0D # ...CR... + \x0E # .SO/LS1. + \x0F # .SI/LS0. + \x10 # ..DLE... + \x11 # ..DC1... + \x12 # ..DC2... + \x13 # ..DC3... + \x14 # ..DC4... + \x15 # ..NAK... + \x16 # ..SYN... + \x17 # ..ETB... + \x18 # ..CAN... + \x19 # ...EM... + \x1A # ..IFS... + \x1B # ..ESC... + \x1C # ..DEL... + \x1D # ...GS... + \x1E # ...RS... + \x1F # ...US... 
+ \x20 # SP010000 + \x21 # SP020000 + \x22 # SP040000 + \x23 # SM010000 + \x24 # SC030000 + \x25 # SM020000 + \x26 # SM030000 + \x27 # SP050000 + \x28 # SP060000 + \x29 # SP070000 + \x2A # SM040000 + \x2B # SA010000 + \x2C # SP080000 + \x2D # SP100000 + \x2E # SP110000 + \x2F # SP120000 + \x30 # ND100000 + \x31 # ND010000 + \x32 # ND020000 + \x33 # ND030000 + \x34 # ND040000 + \x35 # ND050000 + \x36 # ND060000 + \x37 # ND070000 + \x38 # ND080000 + \x39 # ND090000 + \x3A # SP130000 + \x3B # SP140000 + \x3C # SA030000 + \x3D # SA040000 + \x3E # SA050000 + \x3F # SP150000 + \x40 # SM050000 + \x41 # LA020000 + \x42 # LB020000 + \x43 # LC020000 + \x44 # LD020000 + \x45 # LE020000 + \x46 # LF020000 + \x47 # LG020000 + \x48 # LH020000 + \x49 # LI020000 + \x4A # LJ020000 + \x4B # LK020000 + \x4C # LL020000 + \x4D # LM020000 + \x4E # LN020000 + \x4F # LO020000 + \x50 # LP020000 + \x51 # LQ020000 + \x52 # LR020000 + \x53 # LS020000 + \x54 # LT020000 + \x55 # LU020000 + \x56 # LV020000 + \x57 # LW020000 + \x58 # LX020000 + \x59 # LY020000 + \x5A # LZ020000 + \x5B # SM060000 + \x5C # SM070000 + \x5D # SM080000 + \x5E # SD150000 + \x5F # SP090000 + \x60 # SD130000 + \x61 # LA010000 + \x62 # LB010000 + \x63 # LC010000 + \x64 # LD010000 + \x65 # LE010000 + \x66 # LF010000 + \x67 # LG010000 + \x68 # LH010000 + \x69 # LI010000 + \x6A # LJ010000 + \x6B # LK010000 + \x6C # LL010000 + \x6D # LM010000 + \x6E # LN010000 + \x6F # LO010000 + \x70 # LP010000 + \x71 # LQ010000 + \x72 # LR010000 + \x73 # LS010000 + \x74 # LT010000 + \x75 # LU010000 + \x76 # LV010000 + \x77 # LW010000 + \x78 # LX010000 + \x79 # LY010000 + \x7A # LZ010000 + \x7B # SM110000 + \x7C # SM130000 + \x7D # SM140000 + \x7E # SD190000 + \x7F # ..SUB... 
+ \x80 # LC420000 + \x81 # LU170000 + \x82 # LE110000 + \x83 # LA150000 + \x84 # LA170000 + \x85 # LA130000 + \x86 # GA120000 + \x87 # LC410000 + \x88 # LE150000 + \x89 # LE170000 + \x8A # LE130000 + \x8B # LI170000 + \x8C # LI150000 + \x8D # GE120000 + \x8E # LA180000 + \x8F # GE720000 + \x90 # GI120000 + \x92 # GO120000 + \x93 # LO150000 + \x94 # LO170000 + \x95 # GU120000 + \x96 # LU150000 + \x97 # LU130000 + \x98 # GO720000 + \x99 # LO180000 + \x9A # LU180000 + \x9B # GA110000 + \x9C # SC020000 + \x9D # GE110000 + \x9E # GE710000 + \x9F # GI110000 + \xA0 # GI170000 + \xA1 # GI730000 + \xA2 # GO110000 + \xA3 # GU110000 + \xA4 # GA020000 + \xA5 # GB020000 + \xA6 # GG020000 + \xA7 # GD020000 + \xA8 # GE020000 + \xA9 # GZ020000 + \xAA # GE320000 + \xAB # NF010000 + \xAC # GT620000 + \xAD # GI020000 + \xAE # SP170000 + \xAF # SP180000 + \xB0 # SF140000 + \xB1 # SF150000 + \xB2 # SF160000 + \xB3 # SF110000 + \xB4 # SF090000 + \xB5 # GK020000 + \xB6 # GL020000 + \xB7 # GM020000 + \xB8 # GN020000 + \xB9 # SF230000 + \xBA # SF240000 + \xBB # SF250000 + \xBC # SF260000 + \xBD # GX020000 + \xBE # GO020000 + \xBF # SF030000 + \xC0 # SF020000 + \xC1 # SF070000 + \xC2 # SF060000 + \xC3 # SF080000 + \xC4 # SF100000 + \xC5 # SF050000 + \xC6 # GP020000 + \xC7 # GR020000 + \xC8 # SF380000 + \xC9 # SF390000 + \xCA # SF400000 + \xCB # SF410000 + \xCC # SF420000 + \xCD # SF430000 + \xCE # SF440000 + \xCF # GS020000 + \xD0 # GT020000 + \xD1 # GU020000 + \xD2 # GF020000 + \xD3 # GH020000 + \xD4 # GP620000 + \xD5 # GO320000 + \xD6 # GA010000 + \xD7 # GB010000 + \xD8 # GG010000 + \xD9 # SF040000 + \xDA # SF010000 + \xDB # SF610000 + \xDC # SF570000 + \xDD # GD010000 + \xDE # GE010000 + \xDF # SF600000 + \xE0 # GZ010000 + \xE1 # GE310000 + \xE2 # GT610000 + \xE3 # GI010000 + \xE4 # GK010000 + \xE5 # GL010000 + \xE6 # GM010000 + \xE7 # GN010000 + \xE8 # GX010000 + \xE9 # GO010000 + \xEA # GP010000 + \xEB # GR010000 + \xEC # GS010000 + \xED # GS610000 + \xEE # GT010000 + \xEF # SD110000 + 
\xF0 # SP320000 + \xF1 # SA020000 + \xF2 # GU010000 + \xF3 # GF010000 + \xF4 # GH010000 + \xF5 # SM240000 + \xF6 # GP610000 + \xF7 # SD410000 + \xF8 # SM190000 + \xF9 # SD170000 + \xFA # GO310000 + \xFB # GU170000 + \xFC # GU730000 + \xFD # GO710000 + \xFE # SM470000 + \xFF # SP300000 +# +END CHARMAP +# +#________________________________________________________________________ diff --git a/icu4c/data/lmb-excp.ucm b/icu4c/data/lmb-excp.ucm new file mode 100644 index 00000000000..61b2a3e8386 --- /dev/null +++ b/icu4c/data/lmb-excp.ucm @@ -0,0 +1,315 @@ +# ******************************************************************************* +# * +# * Copyright (C) 1995-2000, International Business Machines +# * Corporation and others. All Rights Reserved. +# * +# ******************************************************************************* +# +# File created on Thu Feb 10 11:47:54 2000 +# +# File created manually from source file LMBCS.ALL +# +# Table Version : 1.00 +# + "lmb-excp" + "AXXXX" + 2 + 1 + "MBCS" + \x3F +# +CHARMAP +# +# +#ISO 10646 LMBCS +#_________ _________ + \x01\x27 + \x01\x23 + \x01\x33 + \x01\x6D + \x01\x24 + \x01\x34 + \x01\x21 + \x01\x31 + \x01\x6C + \x01\x3B + \x01\x15 + \x01\x20 + \x01\x30 + \x01\x67 + \x01\x25 + \x01\x35 + \x01\x14 + \x06\x2E + \x06\x01 + \x06\x02 + \x06\x03 + \x06\x04 + \x06\x05 + \x06\x06 + \x06\x07 + \x06\x08 + \x06\x09 + \x06\x0A + \x06\x0B + \x06\x0C + \x06\x0D + \x06\x0E + \x06\x0F + \x06\x10 + \x06\x11 + \x01\x72 + \x01\x73 + \x06\x12 + \x06\x13 + \x06\x14 + \x06\x15 + \x06\x16 + \x06\x17 + \x01\x61 + \x01\x60 + \x06\x18 + \x06\x19 + \x06\x1A + \x06\x1B + \x01\x7A + \x06\x1C + \x06\x1D + \x01\x66 + \x01\x65 + \x06\x1E + \x06\x1F + \x01\x64 + \x01\x78 + \x01\x79 + \x06\x20 + \x06\x21 + \x01\x40 + \x01\x41 + \x06\x22 + \x06\x23 + \x06\x24 + \x06\x25 + \x01\x74 + \x01\x75 + \x06\x26 + \x06\x27 + \x06\x28 + \x06\x29 + \x06\x2A + \x06\x2B + \x06\x2C + \x06\x2D + \x01\x42 + \x02\x07 + \x02\x08 + \x01\x6B + \x01\x68 + \x01\x43 + 
\x01\x22 + \x01\x32 + \x01\x44 + \x01\x6A + \x01\x69 + \x02\x01 + \x02\x06 + \x02\x02 + \x02\x03 + \x02\x04 + \x02\x6D + \x01\x29 + \x01\x2A + \x02\x05 + \x01\x39 + \x01\x2B + \x01\x2C + \x01\x37 + \x01\x26 + \x01\x38 + \x01\x36 + \x01\x70 + \x01\x71 + \x01\x07 + \x01\x28 + \x02\x7A + \x02\x69 + \x02\x6A + \x01\x2E + \x01\x2F + \x01\x13 + \x02\x09 + \x02\x78 + \x02\x7C + \x01\x7E + \x01\x7F + \x02\x52 + \x01\x77 + \x02\x53 + \x01\x76 + \x01\x4E + \x02\x51 + \x02\x16 + \x02\x15 + \x02\x14 + \x02\x13 + \x01\x1B + \x01\x18 + \x01\x1A + \x01\x19 + \x01\x1D + \x01\x12 + \x01\x17 + \x02\x1B + \x02\x18 + \x02\x1A + \x02\x19 + \x02\x1D + \x02\x12 + \x02\x66 + \x02\x64 + \x02\x50 + \x02\x67 + \x02\x7D + \x02\x41 + \x02\x5C + \x02\x5D + \x02\x5B + \x02\x79 + \x02\x7B + \x02\x6C + \x02\x6B + \x01\x1C + \x02\x40 + \x02\x6F + \x02\x6E + \x02\x65 + \x02\x71 + \x02\x77 + \x02\x76 + \x02\x70 + \x02\x73 + \x02\x72 + \x02\x7E + \x02\x7F + \x02\x5E + \x02\x5F + \x02\x61 + \x02\x60 + \x02\x63 + \x01\x7D + \x01\x4B + \x02\x74 + \x02\x75 + \x01\x55 + \x01\x56 + \x01\x5C + \x01\x5B + \x01\x54 + \x01\x53 + \x01\x5E + \x01\x5D + \x01\x46 + \x01\x47 + \x01\x59 + \x01\x5A + \x01\x51 + \x01\x52 + \x01\x5F + \x01\x50 + \x01\x58 + \x01\x57 + \x01\x48 + \x01\x49 + \x01\x16 + \x01\x1E + \x01\x10 + \x01\x1F + \x01\x11 + \x01\x4A + \x01\x09 + \x01\x08 + \x01\x0A + \x01\x01 + \x01\x02 + \x01\x0F + \x01\x0C + \x01\x0B + \x01\x06 + \x01\x05 + \x01\x03 + \x01\x04 + \x01\x0D + \x01\x0E + \x02\x62 + \x02\x0E + \x02\x0F + \x02\x10 + \x02\x11 + \x02\x3F + \x02\x17 + \x02\x1C + \x02\x1E + \x02\x68 + \x02\x5A + \x02\x59 + \x02\x58 + \x02\x57 + \x02\x56 + \x02\x55 + \x02\x54 + \x02\x4F + \x02\x4E + \x02\x4D + \x02\x4C + \x02\x4B + \x02\x4A + \x02\x49 + \x02\x48 + \x02\x47 + \x02\x46 + \x02\x45 + \x02\x44 + \x02\x43 + \x02\x42 + \x02\x3E + \x02\x3D + \x02\x3C + \x02\x3B + \x02\x3A + \x02\x39 + \x02\x38 + \x02\x37 + \x02\x36 + \x02\x35 + \x02\x34 + \x02\x33 + \x02\x32 + \x02\x31 + \x02\x30 + \x02\x2F + \x02\x2E 
+ \x02\x2D + \x02\x2C + \x02\x2B + \x02\x2A + \x02\x29 + \x02\x28 + \x02\x27 + \x02\x26 + \x02\x25 + \x02\x24 + \x02\x23 + \x02\x22 + \x02\x21 + \x02\x20 + \x02\x1F + \x01\x7C + \x01\x63 + \x01\x62 + \x01\x4D + \x01\x4C + \x01\x3D +# +END CHARMAP +# +#________________________________________________________________________ diff --git a/icu4c/source/common/common.dsp b/icu4c/source/common/common.dsp index 92d4478ca60..862df4d3a64 100644 --- a/icu4c/source/common/common.dsp +++ b/icu4c/source/common/common.dsp @@ -231,6 +231,10 @@ SOURCE=.\ucnv_io.c # End Source File # Begin Source File +SOURCE=.\ucnv_lmb.c +# End Source File +# Begin Source File + SOURCE=.\ucnv_utf.c # End Source File # Begin Source File diff --git a/icu4c/source/common/ucnv_bld.c b/icu4c/source/common/ucnv_bld.c index a7ff20b95ca..0006cbca854 100644 --- a/icu4c/source/common/ucnv_bld.c +++ b/icu4c/source/common/ucnv_bld.c @@ -35,7 +35,9 @@ static const UConverterSharedData * converterData[UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES]={ &_SBCSData, &_DBCSData, &_MBCSData, &_Latin1Data, &_UTF8Data, &_UTF16BEData, &_UTF16LEData, &_EBCDICStatefulData, - &_ISO2022Data + &_ISO2022Data, + &_LMBCSData1,&_LMBCSData2, &_LMBCSData3, &_LMBCSData4, &_LMBCSData5, &_LMBCSData6, + &_LMBCSData8,&_LMBCSData11,&_LMBCSData16,&_LMBCSData17,&_LMBCSData18,&_LMBCSData19 }; static struct { @@ -53,9 +55,22 @@ static struct { { "UTF16_PlatformEndian", UCNV_UTF16_LittleEndian }, { "UTF16_OppositeEndian", UCNV_UTF16_BigEndian}, #endif - { "ISO_2022", UCNV_ISO_2022 } + { "ISO_2022", UCNV_ISO_2022 }, + { "LMBCS-1", UCNV_LMBCS_1 }, + { "LMBCS-2", UCNV_LMBCS_2 }, + { "LMBCS-3", UCNV_LMBCS_3 }, + { "LMBCS-4", UCNV_LMBCS_4 }, + { "LMBCS-5", UCNV_LMBCS_5 }, + { "LMBCS-6", UCNV_LMBCS_6 }, + { "LMBCS-8", UCNV_LMBCS_8 }, + { "LMBCS-11",UCNV_LMBCS_11 }, + { "LMBCS-16",UCNV_LMBCS_16 }, + { "LMBCS-17",UCNV_LMBCS_17 }, + { "LMBCS-18",UCNV_LMBCS_18 }, + { "LMBCS-19",UCNV_LMBCS_19 } }; + /*Takes an alias name gets an actual converter file name 
*goes to disk and opens it. *allocates the memory and returns a new UConverter object diff --git a/icu4c/source/common/ucnv_cnv.h b/icu4c/source/common/ucnv_cnv.h index b645d336008..4daa68959b5 100644 --- a/icu4c/source/common/ucnv_cnv.h +++ b/icu4c/source/common/ucnv_cnv.h @@ -194,7 +194,9 @@ struct UConverterImpl { extern const UConverterSharedData _SBCSData, _DBCSData, _MBCSData, _Latin1Data, _UTF8Data, _UTF16BEData, _UTF16LEData, _EBCDICStatefulData, - _ISO2022Data; + _ISO2022Data, + _LMBCSData1,_LMBCSData2, _LMBCSData3, _LMBCSData4, _LMBCSData5, _LMBCSData6, + _LMBCSData8,_LMBCSData11,_LMBCSData16,_LMBCSData17,_LMBCSData18,_LMBCSData19; U_CDECL_END diff --git a/icu4c/source/common/ucnv_lmb.c b/icu4c/source/common/ucnv_lmb.c new file mode 100644 index 00000000000..465f7941d75 --- /dev/null +++ b/icu4c/source/common/ucnv_lmb.c @@ -0,0 +1,870 @@ +/* +********************************************************************** +* Copyright (C) 2000, International Business Machines +* Corporation and others. All Rights Reserved. 
+********************************************************************** +* file name: ucnv_lmb.cpp +* encoding: US-ASCII +* tab size: 4 (not used) +* indentation:4 +* +* created on: 2000feb09 +* created by: Brendan Murray +*/ + +#include "unicode/utypes.h" +#include "cmemory.h" +#include "ucmp16.h" +#include "ucmp8.h" +#include "unicode/ucnv_bld.h" +#include "unicode/ucnv.h" +#include "ucnv_cnv.h" + +/* LMBCS -------------------------------------------------------------------- */ + +/* Group bytes, and things that look like group bytes, should always be 8-bits */ +typedef uint8_t ulmbcs_grp_t; + + +/* Define some constants instead of using literals */ + + +/* LMBCS groups */ +#define ULMBCS_GRP_EXCEPT 0x00 /* placeholder index for 'oddballs' XY, where Y<0x80 */ +#define ULMBCS_GRP_L1 0x01 /* Latin-1 */ +#define ULMBCS_GRP_GR 0x02 /* Greek */ +#define ULMBCS_GRP_HE 0x03 /* Hebrew */ +#define ULMBCS_GRP_AR 0x04 /* Arabic */ +#define ULMBCS_GRP_RU 0x05 /* Cyrillic */ +#define ULMBCS_GRP_L2 0x06 /* Latin-2 */ +#define ULMBCS_GRP_TR 0x08 /* Turkish */ +#define ULMBCS_GRP_TH 0x0B /* Thai */ +#define ULMBCS_GRP_CTRL 0x0F /* C0/C1 controls */ +#define ULMBCS_GRP_JA 0x10 /* Japanese */ +#define ULMBCS_GRP_KO 0x11 /* Korean */ +#define ULMBCS_GRP_CN 0x12 /* Chinese PRC */ +#define ULMBCS_GRP_TW 0x13 /* Chinese Taiwan */ +#define ULMBCS_GRP_UNICODE 0x14 /* Unicode compatibility group */ +#define ULMBCS_GRP_LAST 0x14 /* last LMBCS group that means anything */ + +/* some special values that can appear in place of optimization groups */ +#define ULMBCS_HT 0x09 /* Fixed control char - Horizontal Tab */ +#define ULMBCS_LF 0x0A /* Fixed control char - Line Feed */ +#define ULMBCS_CR 0x0D /* Fixed control char - Carriage Return */ +#define ULMBCS_123SYSTEMRANGE 0x19 /* Fixed control char for 1-2-3 file data: start system range name */ +#define ULMBCS_DEFAULTOPTGROUP 0x1 /* default optimization group for LMBCS */ +#define ULMBCS_DOUBLEOPTGROUP 0x10 /* start of double-byte 
optimization groups */ + +/* parts of LMBCS values, or ranges for LMBCS data */ +#define ULMBCS_UNICOMPATZERO 0xF6 /* PUA range for Unicode chars containing LSB = 0 */ +#define ULMBCS_CTRLOFFSET 0x20 /* Offset of control range in group 0x0F */ +#define ULMBCS_C1START 0x80 /* Start of 'C1' upper ascii range in ANSI code pages */ +#define ULMBCS_C0END 0x1F /* last of the 'C0' lower ascii contraol range in ANSI code pages */ +#define ULMBCS_INVALIDCHAR 0xFFFF /* Invalid character value = convert failed */ + + +/* special return values for FindLMBCSUniRange */ +#define ULMBCS_AMBIGUOUS_SBCS 0x80 // could fit in more than one + // LMBCS sbcs native encoding (example: most accented latin) +#define ULMBCS_AMBIGUOUS_MBCS 0x81 // could fit in more than one + //LMBCS mbcs native encoding (example: Unihan) + +/* macro to check compatibility of groups */ +#define ULMBCS_AMBIGUOUS_MATCH(agroup, xgroup) \ + ((((agroup) == ULMBCS_AMBIGUOUS_SBCS) && \ + (xgroup) < ULMBCS_DOUBLEOPTGROUP) || \ + (((agroup) == ULMBCS_AMBIGUOUS_MBCS) && \ + (xgroup) >= ULMBCS_DOUBLEOPTGROUP)) + +/* Max size for 1 LMBCS char */ +#define ULMBCS_CHARSIZE_MAX 3 + + +/* JSGTODO: what is ICU standard debug assertion method? + Invent an all-crash stop here, for now */ +#if 1 +#define MyAssert(b) {if (!(b)) {*(char *)0 = 1;}} +#else +#define MyAssert(b) +#endif + + +/* Map Optimization group byte to converter name. Note the following: + 0x00 is dummy, and contains the name of the exceptions converter. + 0x02 is currently unavailable: NLTC have been asked to provide. 
+ 0x0F and 0x14 are algorithmically calculated + 0x09, 0x0A, 0x0D are data bytes (HT, LF, CR) + 0x07, 0x0C and 0x0E are unused +*/ +static const char * OptGroupByteToCPName[ULMBCS_CTRLOFFSET] = { + /* 0x0000 */ "lmb-excp", /* No zero opt group: for non-standard entries */ + /* 0x0001 */ "ibm-850", + /* 0x0002 */ "ibm-851", + /* 0x0003 */ "ibm-1255", + /* 0x0004 */ "ibm-1256", + /* 0x0005 */ "ibm-1251", + /* 0x0006 */ "ibm-852", + /* 0x0007 */ NULL, /* Unused */ + /* 0x0008 */ "ibm-1254", + /* 0x0009 */ NULL, /* Control char HT */ + /* 0x000A */ NULL, /* Control char LF */ + /* 0x000B */ "ibm-874", + /* 0x000C */ NULL, /* Unused */ + /* 0x000D */ NULL, /* Control char CR */ + /* 0x000E */ NULL, /* Unused */ + /* 0x000F */ NULL, /* Control chars: 0x0F20 + C0/C1 character: algorithmic */ + /* 0x0010 */ "ibm-943", + /* 0x0011 */ "ibm-1361", + /* 0x0012 */ "ibm-950", + /* 0x0013 */ "ibm-1386" + + /* The rest are null, including the 0x0014 Unicode compatibility region + and 0x0019, the 1-2-3 system range control char */ + +}; + + + + + +/* map UNICODE ranges to converter indexes (or special values) */ + +ulmbcs_grp_t FindLMBCSUniRange(UChar uniChar, UErrorCode* err); + +struct _UniLMBCSGrpMap +{ + UChar uniStartRange; + UChar uniEndRange; + ulmbcs_grp_t GrpType; +} UniLMBCSGrpMap[] += +{ + 0x0001, 0x001F, ULMBCS_GRP_CTRL, + 0x0080, 0x009F, ULMBCS_GRP_CTRL, + 0x00A0, 0x0113, ULMBCS_AMBIGUOUS_SBCS, + 0x0115, 0x0120, ULMBCS_AMBIGUOUS_SBCS, + 0x0120, 0x012B, ULMBCS_GRP_EXCEPT, + 0x012C, 0x01CD, ULMBCS_AMBIGUOUS_SBCS, + 0x01CE, 0x01CE, ULMBCS_AMBIGUOUS_MBCS, + 0x01CF, 0x1FFF, ULMBCS_AMBIGUOUS_SBCS, + 0x2000, 0xFFFD, ULMBCS_AMBIGUOUS_MBCS, + 0xFFFF, 0xFFFF, +}; + +ulmbcs_grp_t FindLMBCSUniRange(UChar uniChar, UErrorCode* err) +{ + struct _UniLMBCSGrpMap * pTable = UniLMBCSGrpMap; + + while (uniChar > pTable->uniEndRange) + { + pTable++; + } + + if (uniChar >= pTable->uniStartRange) + { + return pTable->GrpType; + } + + if (pTable->uniStartRange == 0xFFFF) + { + *err = 
ULMBCS_INVALIDCHAR; + } + return ULMBCS_GRP_UNICODE; +} + +#if 0 +// JSGTODO (by Brendan?) some incomplete source data from Brendan to be integrated + + 0xFE30, ULMBCS_GRP_JA, ULMBCS_FLAGS_CONTINUE, + 0xFA2E, ULMBCS_GRP_UNICODE, ULMBCS_FLAGS_UNICODE, + 0xF8FF, ULMBCS_GRP_JA, ULMBCS_FLAGS_CONTINUE, + 0xD7FF, ULMBCS_GRP_UNICODE, ULMBCS_FLAGS_UNICODE, + 0xABFF, ULMBCS_GRP_KO, ULMBCS_FLAGS_UNICODE, + 0x9FFF, ULMBCS_GRP_UNICODE, ULMBCS_FLAGS_UNICODE, + 0x31FF, ULMBCS_GRP_JA, ULMBCS_FLAGS_CONTINUE, + 0x318F, ULMBCS_GRP_CN, ULMBCS_FLAGS_CONTINUE, + 0x3130, ULMBCS_GRP_KO, ULMBCS_FLAGS_UNICODE, + 0x3100, ULMBCS_GRP_CN, ULMBCS_FLAGS_CONTINUE, + 0x313F, ULMBCS_GRP_JA, ULMBCS_FLAGS_UNICODE, + 0x2FFF, ULMBCS_GRP_JA, ULMBCS_FLAGS_CONTINUE, + 0x2714, ULMBCS_GRP_UNICODE, ULMBCS_FLAGS_UNICODE, + 0x2000, ULMBCS_GRP_L1, ULMBCS_FLAGS_CONTINUE, + 0x0E5C, ULMBCS_GRP_UNICODE, ULMBCS_FLAGS_UNICODE, + 0x0E00, ULMBCS_GRP_TH, ULMBCS_FLAGS_UNICODE, + 0x06FF, ULMBCS_GRP_UNICODE, ULMBCS_FLAGS_UNICODE, + 0x0600, ULMBCS_GRP_AR, ULMBCS_FLAGS_UNICODE, + 0x0500, ULMBCS_GRP_HE, ULMBCS_FLAGS_UNICODE, + 0x0400, ULMBCS_GRP_RU, ULMBCS_FLAGS_UNICODE, + 0x0300, ULMBCS_GRP_GR, ULMBCS_FLAGS_UNICODE, + 0x001F, ULMBCS_GRP_L1, ULMBCS_FLAGS_CONTINUE, + 0x0000, ULMBCS_GRP_CTRL, ULMBCS_FLAGS_UNICODE +#endif + + +int LMBCSConversionWorker ( + UConverterDataLMBCS * extraInfo, ulmbcs_grp_t group, + uint8_t * pStartLMBCS, UChar * pUniChar, + ulmbcs_grp_t * lastConverterIndex, bool_t * groups_tried, + UErrorCode* err); + +int LMBCSConversionWorker ( + UConverterDataLMBCS * extraInfo, ulmbcs_grp_t group, + uint8_t * pStartLMBCS, UChar * pUniChar, + ulmbcs_grp_t * lastConverterIndex, bool_t * groups_tried, + UErrorCode * err) +{ + uint8_t * pLMBCS = pStartLMBCS; + UConverter * xcnv = extraInfo->OptGrpConverter[group]; + uint8_t mbChar [ULMBCS_CHARSIZE_MAX]; + uint8_t * pmbChar = mbChar; + bool_t isDoubleByteGroup = (group >= ULMBCS_DOUBLEOPTGROUP) ? 
TRUE : FALSE; + UErrorCode localErr = 0; + int bytesConverted =0; + + MyAssert(xcnv); + MyAssert(groupsubChar[0] || U_FAILURE(localErr) || !bytesConverted ) + { + // JSGTODO: are there some local failure modes that ought to be bubbled up in some other way? + groups_tried[group] = TRUE; + return 0; + } + + *lastConverterIndex = group; + + /* All initial byte values in lower ascii range should have been caught by now, + except with the exception group. + + Uncomment this assert to find them. + */ + + // MyAssert((*pmbChar <= ULMBCS_C0END) || (*pmbChar >= ULMBCS_C1START) || (group == ULMBCS_GRP_EXCEPT)); + + /* use converted data: first write 0, 1 or two group bytes */ + if (group != ULMBCS_GRP_EXCEPT && extraInfo->OptGroup != group) + { + *pLMBCS++ = group; + if (bytesConverted == 1 && isDoubleByteGroup) + { + *pLMBCS++ = group; + } + } + /* then move over the converted data */ + do + { + *pLMBCS++ = *pmbChar++; + } + while(--bytesConverted); + + return (pLMBCS - pStartLMBCS); +} + + +/* Convert Unicode string to LMBCS */ +void _LMBCSFromUnicode(UConverter* _this, + char** target, + const char* targetLimit, + const UChar** source, + const UChar* sourceLimit, + int32_t * offsets, + bool_t flush, + UErrorCode* err) +{ + ulmbcs_grp_t lastConverterIndex = 0; + UChar uniChar; + uint8_t LMBCS[ULMBCS_CHARSIZE_MAX]; + uint8_t * pLMBCS; + int bytes_written; + bool_t groups_tried[ULMBCS_GRP_LAST]; + UConverterDataLMBCS * extraInfo = (UConverterDataLMBCS *) _this->extraInfo; + + /* Arguments Check */ + if (!err || U_FAILURE(*err)) + { + return; + } + + if (sourceLimit < *source) + { + *err = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + + + do + { + uniChar = *(*source)++; + bytes_written = 0; + pLMBCS = LMBCS; + + /* single byte matches */ + + if (uniChar == 0 || uniChar == ULMBCS_HT || uniChar == ULMBCS_CR || + uniChar == ULMBCS_LF || uniChar == ULMBCS_123SYSTEMRANGE || + ((uniChar >= ULMBCS_CTRLOFFSET) && (uniChar < ULMBCS_C1START))) + { + *pLMBCS++ = (uint8_t) uniChar; + 
bytes_written = 1; + } + + + if (!bytes_written) + { + /* Check by UNICODE range */ + ulmbcs_grp_t group = FindLMBCSUniRange(uniChar,err); + + if (group == ULMBCS_GRP_UNICODE) + { + /* encode into LMBCS Unicode range */ + uint8_t LowCh = (uint8_t) (uniChar & 0x00FF); + uint8_t HighCh = (uint8_t)(uniChar >> 8); + + *pLMBCS++ = ULMBCS_GRP_UNICODE; + + if (LowCh == 0) + { + *pLMBCS++ = ULMBCS_UNICOMPATZERO; + *pLMBCS++ = HighCh; + } + else + { + *pLMBCS++ = HighCh; + *pLMBCS++ = LowCh; + } + + bytes_written = pLMBCS - LMBCS; + } + else if (group == ULMBCS_GRP_CTRL) + { + /* Handle control characters here */ + if (uniChar <= ULMBCS_C0END) + { + *pLMBCS++ = ULMBCS_GRP_CTRL; + *pLMBCS++ = ULMBCS_CTRLOFFSET + (uint8_t) uniChar; + } + else if (uniChar >= ULMBCS_C1START && uniChar <= ULMBCS_C1START + ULMBCS_CTRLOFFSET) + { + *pLMBCS++ = ULMBCS_GRP_CTRL; + *pLMBCS++ = (uint8_t) (uniChar & 0x00FF); + } + bytes_written = pLMBCS - LMBCS; + } + else if (group < ULMBCS_GRP_UNICODE) + { + /* a specific converter has been identified - use it */ + bytes_written = LMBCSConversionWorker ( + extraInfo, group, pLMBCS, &uniChar, + &lastConverterIndex, groups_tried, err); + + MyAssert(bytes_written); /* table should never return unusable group */ + + } + else /* the ambiguous group cases */ + { + memset(groups_tried, 0, sizeof(groups_tried)); + + /* check for non-default optimization group */ + if (extraInfo->OptGroup != 1 + && ULMBCS_AMBIGUOUS_MATCH(group, extraInfo->OptGroup)) + { + bytes_written = LMBCSConversionWorker (extraInfo, + extraInfo->OptGroup, pLMBCS, &uniChar, + &lastConverterIndex, groups_tried, err); + } + /* check for locale optimization group */ + if (!bytes_written + && (extraInfo->localeConverterIndex) + && (ULMBCS_AMBIGUOUS_MATCH(group, extraInfo->localeConverterIndex))) + { + bytes_written = LMBCSConversionWorker (extraInfo, + extraInfo->localeConverterIndex, pLMBCS, &uniChar, + &lastConverterIndex, groups_tried, err); + } + /* check for last optimization group used 
for this string */ + if (!bytes_written + && (lastConverterIndex) + && (ULMBCS_AMBIGUOUS_MATCH(group, lastConverterIndex))) + { + bytes_written = LMBCSConversionWorker (extraInfo, + lastConverterIndex, pLMBCS, &uniChar, + &lastConverterIndex, groups_tried, err); + + } + if (!bytes_written) + { + /* just check every matching converter */ + ulmbcs_grp_t grp_start; + ulmbcs_grp_t grp_end; + ulmbcs_grp_t grp_ix; + grp_start = (group == ULMBCS_AMBIGUOUS_MBCS) + ? ULMBCS_DOUBLEOPTGROUP + : ULMBCS_GRP_L1; + grp_end = (group == ULMBCS_AMBIGUOUS_MBCS) + ? ULMBCS_GRP_LAST-1 + : ULMBCS_GRP_TH; + + for (grp_ix = grp_start; + grp_ix <= grp_end && !bytes_written; + grp_ix++) + { + if (extraInfo->OptGrpConverter [grp_ix] && !groups_tried [grp_ix]) + { + bytes_written = LMBCSConversionWorker (extraInfo, + grp_ix, pLMBCS, &uniChar, + &lastConverterIndex, groups_tried, err); + } + } + + /* a final conversion fallback for sbcs to the exceptions group */ + if (!bytes_written && group == ULMBCS_AMBIGUOUS_SBCS) + { + bytes_written = LMBCSConversionWorker (extraInfo, + ULMBCS_GRP_EXCEPT, pLMBCS, &uniChar, + &lastConverterIndex, groups_tried, err); + } + /* all of our strategies failed. Fallback to Unicode. 
Consider adding these to table */ + + if (!bytes_written) + { + /* encode into LMBCS Unicode range */ + uint8_t LowCh = (uint8_t) uniChar; + uint8_t HighCh = (uint8_t)(uniChar >> 8); + + *pLMBCS++ = ULMBCS_GRP_UNICODE; + + if (LowCh == 0) + { + *pLMBCS++ = ULMBCS_UNICOMPATZERO; + *pLMBCS++ = HighCh; + } + else + { + *pLMBCS++ = HighCh; + *pLMBCS++ = LowCh; + } + + bytes_written = pLMBCS - LMBCS; + } + } + } + } + + if (*target + bytes_written > targetLimit) + { + /* JSGTODO deal with buffer running out here */ + } + + /* now that we are sure it all fits, move it in */ + for(pLMBCS = LMBCS; bytes_written--; *(*target)++ = *pLMBCS++) + { }; + + } + while (*source<= sourceLimit && + *target <= targetLimit && + !U_FAILURE(*err)); + + /* JSGTODO Check the various exit conditions */ +} + + + +/* Return the Unicode representation for the current LMBCS character */ +UChar _LMBCSGetNextUChar(UConverter* _this, + const char** source, + const char* sourceLimit, + UErrorCode* err) +{ + uint8_t CurByte; // A byte from the input stream + UChar uniChar; // an output UNICODE char + UChar mbChar; // an intermediate multi-byte value (mbcs or LMBCS) + CompactShortArray *MyCArray = NULL; + UConverterDataLMBCS * extraInfo = (UConverterDataLMBCS *) _this->extraInfo; + ulmbcs_grp_t group = 0; + UConverter* cnv = 0; + + /* Opt Group (or first data byte) */ + CurByte = *((uint8_t *) (*source)++); + uniChar = 0; + + // at entry of each if clause: + // 1. 'CurByte' points at the first byte of a LMBCS character + // 2. '*source'points to the next byte of the source stream after 'CurByte' + + // the job of each if clause is: + // 1. set '*source' to point at the beginning of next char (nop if LMBCS char is only 1 byte) + // 2. set 'uniChar' up with the right Unicode value, or set 'err' appropriately + + + // First lets check the simple fixed values. 
+ if (CurByte == 0 || CurByte == ULMBCS_HT || CurByte == ULMBCS_CR || + CurByte == ULMBCS_LF || CurByte == ULMBCS_123SYSTEMRANGE || + ((CurByte >= ULMBCS_CTRLOFFSET) && (CurByte < ULMBCS_C1START))) + { + uniChar = CurByte; + } + else + if (CurByte == ULMBCS_GRP_CTRL) /* Control character group - no opt group update */ + { + if (*source >= sourceLimit) + { + *err = U_TRUNCATED_CHAR_FOUND; + } + else + { + uint8_t C0C1byte = *(*source)++; + uniChar = (C0C1byte < ULMBCS_C1START) ? C0C1byte - ULMBCS_CTRLOFFSET : C0C1byte; + } + } + else + if (CurByte == ULMBCS_GRP_UNICODE) /* Unicode compatibility group: BE as is */ + { + uint8_t HighCh, LowCh; + + + HighCh = *(*source)++; /* Big-endian Unicode in LMBCs compatibility group*/ + LowCh = *(*source)++; + + if (HighCh == ULMBCS_UNICOMPATZERO ) + { + HighCh = LowCh; + LowCh = 0; /* zero-byte in LSB special character */ + } + + uniChar = (HighCh << 8) | LowCh; + + } + + else if (CurByte <= ULMBCS_CTRLOFFSET) + { + group = CurByte; /* group byte is in the source */ + cnv = extraInfo->OptGrpConverter[group]; + + if (!cnv) + { + /* this is not a valid group byte - no converter*/ + *err = U_INVALID_CHAR_FOUND; + } + + + else if (group >= ULMBCS_DOUBLEOPTGROUP) /* double byte conversion */ + { + uint8_t HighCh, LowCh; + + + HighCh = *(*source)++; + LowCh = *(*source)++; + + /* check for LMBCS doubled-group-byte case */ + mbChar = (HighCh == group) ? 
LowCh : (HighCh<<8) | LowCh; + + MyCArray = cnv->sharedData->table->mbcs.toUnicode; + uniChar = (UChar) ucmp16_getu (MyCArray, mbChar); + + } + else /* single byte conversion */ + { + CurByte = *(*source)++; + if (CurByte >= ULMBCS_C1START) + { + uniChar = cnv->sharedData->table->sbcs.toUnicode[CurByte]; + } + else + { + /* The non-optimizable oddballs where there is an explicit byte + * AND the second byte is not in the upper ascii range + */ + cnv = extraInfo->OptGrpConverter [ULMBCS_GRP_EXCEPT]; + + /* Lookup value must include opt group */ + mbChar = (UChar)(group << 8) | (UChar) CurByte; + + MyCArray = cnv->sharedData->table->mbcs.toUnicode; + uniChar = (UChar) ucmp16_getu(MyCArray, mbChar); + + } + } + } + else if (CurByte >= ULMBCS_C1START) /* group byte is implicit */ + { + group = extraInfo->OptGroup; + cnv = extraInfo->OptGrpConverter[group]; + + if (group >= ULMBCS_DOUBLEOPTGROUP) /* double byte conversion */ + { + uint8_t HighCh, LowCh; + + // JSGTODO need to deal with case of single byte G1 + // chars in mbcs groups + + HighCh = CurByte; + LowCh = *(*source)++; + + mbChar = (HighCh<<8) | LowCh; + MyCArray = cnv->sharedData->table->mbcs.toUnicode; + uniChar = (UChar) ucmp16_getu (MyCArray, mbChar); + (*source) += sizeof(UChar); + } + else /* single byte conversion */ + { + uniChar = cnv->sharedData->table->sbcs.toUnicode[CurByte]; + } + } + else + { +#if DEBUG + // JSGTODO: assert here: we should never get here. 
+#endif
+
+    }
+// JSGTODO: need to correctly deal with partial chars
+    return uniChar;
+}
+
+
+
+/* Convert LMBCS bytes to Unicode, optionally recording source offsets.
+ *
+ * Characters are decoded one at a time with _LMBCSGetNextUChar until the
+ * source is exhausted, the target is full, or *err reports a failure.
+ *
+ *   _this        converter to use; must not be NULL
+ *   target       in/out: next free UChar slot; advanced past what is written
+ *   targetLimit  one past the last writable UChar
+ *   source       in/out: next unread byte; advanced past what is consumed
+ *   sourceLimit  one past the last readable byte
+ *   offsets      if non-NULL, receives for every UChar written the byte
+ *                offset of its character, relative to *source on entry
+ *   flush        currently unused
+ *   err          in/out error code; nothing is done if it already holds a
+ *                failure, and U_ILLEGAL_ARGUMENT_ERROR is set for bad
+ *                arguments
+ *
+ * A character for which _LMBCSGetNextUChar returns missingUCharMarker is
+ * consumed from the source but produces no output (see JSGTODO below).
+ *
+ * NOTE(review): when the target fills up before the source is exhausted,
+ * no error code is set here -- confirm that callers do not rely on one.
+ */
+void _LMBCSToUnicodeWithOffsets(UConverter*    _this,
+                     UChar**        target,
+                     const UChar*   targetLimit,
+                     const char**   source,
+                     const char*    sourceLimit,
+                     int32_t*       offsets,
+                     bool_t         flush,
+                     UErrorCode*    err)
+{
+   UChar uniChar;               /* an output UNICODE char */
+   const char * pStartLMBCS;    /* start of the input, base for offsets */
+
+   if (!err || U_FAILURE(*err))
+   {
+      return;
+   }
+   /* Validate before touching _this or *source: the locals that used to be
+    * initialised from them in the declarations dereferenced the pointers
+    * ahead of this check.
+    */
+   if ((_this == NULL) || (target == NULL) || (source == NULL) ||
+       (targetLimit < *target) || (sourceLimit < *source))
+   {
+      *err = U_ILLEGAL_ARGUMENT_ERROR;
+      return;
+   }
+   pStartLMBCS = *source;
+
+#if 0 /* JSGTODO - restore incomplete char handling */
+
+   /* NOTE: this disabled code needs locals that are not declared while it
+    * is switched off (mbChar, CurByte, group, cnv, extraInfo, MyCArray) and
+    * two labels (continueWithPartialMBCSChar, continueWithPartialChar).
+    */
+
+   /* Have we arrived here from a prior conversion ending with a partial char?
+      The only possible configurations are:
+      1. mode contains the group byte of SBCS LMBCS char;
+      2. mode contains the group byte of MBCS LMBCS char
+      For both continue with next char in input buffer
+      3. mode contains group byte + 1st data byte of MBCS LMBCS char
+      Partially process & get the second data byte
+      4. mode contains both group bytes of double group-byte MBCS LMBCS char
+      Nuke contents after setting up converter & continue with buffer data
+   */
+   if (_this->toUnicodeStatus)
+   {
+      mbChar = (UChar) _this->mode;    /* Restore the previously calculated char */
+
+      _this->toUnicodeStatus = 0;      /* Reset other fields*/
+      _this->invalidCharLength = 0;
+
+      /* Check if this is a partial MBCS char (fall through if SBCS) */
+      if (mbChar > 0xFF)
+      {
+         /* Select the correct converter */
+         group = (mbChar >> 8) & 0x00FF;
+         cnv = extraInfo->OptGrpConverter[group];
+
+         /* Pick up the converter table */
+         MyCArray = cnv->sharedData->table->mbcs.toUnicode;
+
+         /* Use only data byte: NULL if the character has pair of group-bytes */
+         /* ('<' binds tighter than '&', so the mask must be parenthesised) */
+         if ((mbChar & 0x00FF) < ULMBCS_MAXGRPBYTE)
+            CurByte = 0;
+         else
+            CurByte = ((mbChar & 0x00FF) << 8);
+
+         /* Add the current char from the buffer */
+         CurByte |= *((uint8_t *) (*source)++);
+
+         goto continueWithPartialMBCSChar;
+
+      }
+      else
+      {
+         goto continueWithPartialChar;
+      }
+   }
+#endif
+
+
+
+   /* Process from source to limit.
+    * U_SUCCESS (not "!*err") so that the loop agrees with the U_FAILURE
+    * test on entry: a non-zero code that is not a failure must not
+    * silently suppress the whole conversion.
+    */
+   while (U_SUCCESS(*err) && sourceLimit > *source && targetLimit > *target)
+   {
+      if(offsets)
+      {
+         /* Offset of the character about to be read; the slot is only
+          * kept (offsets advanced) if a UChar is actually written below.
+          */
+         *offsets = (int32_t)((*source) - pStartLMBCS);
+      }
+
+      uniChar = _LMBCSGetNextUChar(_this, source, sourceLimit, err);
+
+
+      /* last step is always to move the new value into the buffer */
+      if (U_SUCCESS(*err) && uniChar != missingUCharMarker)
+      {
+         /* JSGTODO deal with missingUCharMarker case for error/info reporting. */
+         *(*target)++ = uniChar;
+         if(offsets)
+         {
+            offsets++;
+         }
+
+      }
+   }
+#if 0
+   /* JSGTODO restore partial char handling */
+   /* Check to see if we've fallen through because of a partial char */
+   if (*err == U_TRUNCATED_CHAR_FOUND)
+   {
+      _this->mode = mbChar;      /* Save current partial char */
+   }
+#endif
+}
+
+
+
+/* Convert LMBCS string to Unicode.
+ * Thin wrapper: forwards every argument, including offsets, unchanged to
+ * _LMBCSToUnicodeWithOffsets.
+ */
+void _LMBCSToUnicode(UConverter*    _this,
+                     UChar**        target,
+                     const UChar*   targetLimit,
+                     const char**   source,
+                     const char*    sourceLimit,
+                     int32_t*       offsets,
+                     bool_t         flush,
+                     UErrorCode*    err)
+{
+   _LMBCSToUnicodeWithOffsets(_this, target, targetLimit, source, sourceLimit, offsets, flush,err);
+}
+
+
+
+/* Common open routine behind every _LMBCSOpen<n> entry point.
+ *
+ * Allocates the UConverterDataLMBCS for this converter, fills every slot of
+ * OptGrpConverter (a sub-converter opened by name where
+ * OptGroupByteToCPName has an entry, NULL otherwise), records OptGroup as
+ * the session's optimization group and stores the result in
+ * _this->extraInfo.
+ *
+ * On allocation failure *err is set to U_MEMORY_ALLOCATION_ERROR and
+ * _this->extraInfo is left NULL.  Failures reported by ucnv_open are left
+ * in *err; the structure is still attached so _LMBCSClose can release it.
+ *
+ * name and locale are currently unused.
+ */
+static void _LMBCSOpenWorker(UConverter*  _this,
+                       const char*  name,
+                       const char*  locale,
+                       UErrorCode*  err,
+                       ulmbcs_grp_t OptGroup
+                       )
+{
+   UConverterDataLMBCS * extraInfo = uprv_malloc (sizeof (UConverterDataLMBCS));
+
+   if(extraInfo != NULL)
+   {
+
+       ulmbcs_grp_t i;
+       ulmbcs_grp_t imax;
+
+       /* number of slots in the per-group converter table */
+       imax = sizeof(extraInfo->OptGrpConverter)/sizeof(extraInfo->OptGrpConverter[0]);
+
+       /* NOTE(review): assumes OptGroupByteToCPName has at least imax
+        * entries -- its definition is outside this part of the file.
+        */
+       for (i=0; i < imax; i++)
+       {
+            extraInfo->OptGrpConverter[i] =
+               (OptGroupByteToCPName[i] != NULL) ?
+               ucnv_open(OptGroupByteToCPName[i], err) : NULL;
+       }
+
+       extraInfo->OptGroup = OptGroup;
+       /* JSGTODO: add LocaleConverterIndex logic here */
+       extraInfo->localeConverterIndex = 0;
+   }
+   else
+   {
+       *err = U_MEMORY_ALLOCATION_ERROR;
+   }
+
+   _this->extraInfo = extraInfo;
+}
+
+
+
+
+/* Release everything _LMBCSOpenWorker attached to the converter.
+ *
+ * Closes every non-NULL sub-converter in OptGrpConverter, frees the
+ * UConverterDataLMBCS and clears _this->extraInfo.  Does nothing if
+ * extraInfo is already NULL.
+ *
+ * The loop walks the whole table, the same bound _LMBCSOpenWorker uses when
+ * filling it, so no opened sub-converter can be left behind; unused slots
+ * were set to NULL by the open routine and are skipped.
+ */
+static void _LMBCSClose(UConverter *   _this)
+{
+    UConverterDataLMBCS * extraInfo = (UConverterDataLMBCS *) _this->extraInfo;
+    ulmbcs_grp_t Ix;
+    ulmbcs_grp_t IxMax;
+
+    if (extraInfo == NULL)
+    {
+        return;
+    }
+
+    IxMax = sizeof(extraInfo->OptGrpConverter)/sizeof(extraInfo->OptGrpConverter[0]);
+
+    for (Ix=0; Ix < IxMax; Ix++)
+    {
+        if (extraInfo->OptGrpConverter[Ix] != NULL)
+        {
+            ucnv_close (extraInfo->OptGrpConverter[Ix]);
+        }
+    }
+    uprv_free (extraInfo);
+    _this->extraInfo = NULL;    /* do not leave a dangling pointer behind */
+}
+
+
+
+/* Generates _LMBCSOpen<n>: the open entry point for optimization group n,
+ * which simply calls _LMBCSOpenWorker with n as the group.
+ */
+#define DEFINE_LMBCS_OPEN(n) \
+static void _LMBCSOpen##n(UConverter*  _this,const char* name,const char* locale,UErrorCode*  err) \
+{ _LMBCSOpenWorker(_this, name,locale, err, n);}
+
+
+DEFINE_LMBCS_OPEN(1)
+DEFINE_LMBCS_OPEN(2)
+DEFINE_LMBCS_OPEN(3)
+DEFINE_LMBCS_OPEN(4)
+DEFINE_LMBCS_OPEN(5)
+DEFINE_LMBCS_OPEN(6)
+DEFINE_LMBCS_OPEN(8)
+DEFINE_LMBCS_OPEN(11)
+DEFINE_LMBCS_OPEN(16)
+DEFINE_LMBCS_OPEN(17)
+DEFINE_LMBCS_OPEN(18)
+DEFINE_LMBCS_OPEN(19)
+
+/* Generates, for optimization group n, the function table _LMBCSImpl<n>
+ * and the shared converter data _LMBCSData<n> that points at it.
+ *
+ * The name is built by adjacent string-literal concatenation
+ * ("LMBCS_" #n); pasting two string literals with ## does not form a valid
+ * preprocessing token.  The shared data carries the same per-group type
+ * (UCNV_LMBCS_<n>) as the function table.
+ *
+ * NOTE(review): the name uses an underscore ("LMBCS_<n>") while the alias
+ * file in this same patch lists "LMBCS-<n>" -- confirm which is intended.
+ */
+#define DECLARE_LMBCS_DATA(n) \
+ static const UConverterImpl _LMBCSImpl##n={\
+    UCNV_LMBCS_##n,\
+    NULL,NULL,\
+    _LMBCSOpen##n,\
+    _LMBCSClose,\
+    NULL,\
+    _LMBCSToUnicode,\
+    _LMBCSToUnicodeWithOffsets,\
+    _LMBCSFromUnicode,\
+    NULL,\
+    _LMBCSGetNextUChar,\
+    NULL\
+};\
+extern const UConverterSharedData _LMBCSData##n={\
+    sizeof(UConverterSharedData), ~0,\
+    NULL, NULL, &_LMBCSImpl##n, "LMBCS_" #n,\
+    0, UCNV_IBM, UCNV_LMBCS_##n, 1, 1,\
+    { 0, 1, 0x3f, 0, 0, 0 }\
+};
+
+DECLARE_LMBCS_DATA(1)
+DECLARE_LMBCS_DATA(2)
+DECLARE_LMBCS_DATA(3)
+DECLARE_LMBCS_DATA(4)
+DECLARE_LMBCS_DATA(5)
+DECLARE_LMBCS_DATA(6)
+DECLARE_LMBCS_DATA(8)
+DECLARE_LMBCS_DATA(11)
+DECLARE_LMBCS_DATA(16)
+DECLARE_LMBCS_DATA(17)
+DECLARE_LMBCS_DATA(18)
+DECLARE_LMBCS_DATA(19)
+
+
+
+
diff --git a/icu4c/source/common/unicode/ucnv_bld.h b/icu4c/source/common/unicode/ucnv_bld.h index
df2ef936f11..8a7d7031c53 100644 --- a/icu4c/source/common/unicode/ucnv_bld.h +++ b/icu4c/source/common/unicode/ucnv_bld.h @@ -60,8 +60,24 @@ typedef enum { UCNV_UTF16_LittleEndian = 6, UCNV_EBCDIC_STATEFUL = 7, UCNV_ISO_2022 = 8, + + UCNV_LMBCS_1 = 9, + UCNV_LMBCS_2, + UCNV_LMBCS_3, + UCNV_LMBCS_4, + UCNV_LMBCS_5, + UCNV_LMBCS_6, + UCNV_LMBCS_8, + UCNV_LMBCS_11, + UCNV_LMBCS_16, + UCNV_LMBCS_17, + UCNV_LMBCS_18, + UCNV_LMBCS_19, + UCNV_LMBCS_LAST = UCNV_LMBCS_19, + /* Number of converter types for which we have conversion routines. */ - UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES = 9 + UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES = UCNV_LMBCS_LAST+1 + } UConverterType; /* ### move the following typedef and array into implementation files! */ @@ -256,6 +272,17 @@ typedef struct } UConverterDataISO2022; + +typedef struct + { + UConverter *OptGrpConverter[0x20]; /* Converter per Opt. grp. */ + uint8_t OptGroup; /* default Opt. grp. for this LMBCS session */ + uint8_t localeConverterIndex; /* reasonable locale match for index */ + + } +UConverterDataLMBCS; + + #define CONVERTER_FILE_EXTENSION ".cnv" #endif /* _UCNV_BLD */ diff --git a/icu4c/source/data/mappings/convrtrs.txt b/icu4c/source/data/mappings/convrtrs.txt index 9c1078f4479..02b6b07bfb8 100644 --- a/icu4c/source/data/mappings/convrtrs.txt +++ b/icu4c/source/data/mappings/convrtrs.txt @@ -47,7 +47,19 @@ UTF16_PlatformEndian iso-10646-ucs-2 csUnicode utf16 utf-16 ibm-1200 ibm1200 UTF16_OppositeEndian LATIN_1 iso-8859-1 iso_8859-1 ibm-819 ibm819 cp819 latin1 latin-1 ascii ascii-7 us-ascii 8859-1 csisolatin1 iso-ir-100 iso_8859-1:1978 #!!!!! There's whole lot of names for this - cp367 csASCII etc. 
ISO_2022 iso-2022 2022 cp2022 iso2022 iso_2022 -LMBCS +LMBCS-1 lmbcs +LMBCS-2 +LMBCS-3 +LMBCS-4 +LMBCS-5 +LMBCS-6 +LMBCS-8 +LMBCS-11 +LMBCS-16 +LMBCS-17 +LMBCS-18 +LMBCS-19 + # Table-based @@ -76,10 +88,13 @@ ibm-1383 euc-cn euccn ibm-eucCN # China EUC #ibm-1162 tis-620 cp874 windows-874 ms874 # Thai (w/ euro support) #what is the connection between this and the one below!!! ibm-874 ibm-1161 #same as 1162 (w/o euro update) ***This is commented out in Helena's +lmb-excp # special exceptions list for LMBCS algorithm + # Platform codepages ibm-437 ibm437 cp437 csPC8CodePage437 437 # PC US # HSYS: ibm-850 IBM850 cp850 850 csPC850Multilingual # PC latin1 +ibm-851 IBM851 cp851 851 csPC851 # PC DOS Greek (no euro) ibm-858 ibm858 cp858 # PC latin1 with Euro cp850 removed ibm-9044 IBM852 852 csPCp852 cp852 # PC latin2 (w/ euro update) #where should the names go here or below - inconsistency!!! ibm-852 # PC latin2 (w/o euro update) diff --git a/icu4c/source/data/mappings/ibm-851.ucm b/icu4c/source/data/mappings/ibm-851.ucm new file mode 100644 index 00000000000..4a40a1ec663 --- /dev/null +++ b/icu4c/source/data/mappings/ibm-851.ucm @@ -0,0 +1,285 @@ +# ****************************************************************************** +# * +# * Copyright (C) 1995-2000, International Business Machines +# * Corporation and others. All Rights Reserved. +# * +# ****************************************************************************** +# +# File created on Fri Feb 11 14:11:00 2000 +# +# File created manually +# from source files IBM-851.TXMAP100 +# +# Table Version : 1.00 +# + "IBM-851" + "AXXXX" + 1 + 1 + "SBCS" + \x7F +# +CHARMAP +# +# +#ISO 10646 IBM-851 +#_________ _________ + \x00 # ..NUL... + \x01 # ..SOH... + \x02 # ..STX... + \x03 # ..ETX... + \x04 # ..EOT... + \x05 # ..ENQ... + \x06 # ..ACK... + \x07 # ..BEL... + \x08 # ...BS... + \x09 # ...HT... + \x0A # ...LF... + \x0B # ...VT... + \x0C # ...FF... + \x0D # ...CR... + \x0E # .SO/LS1. + \x0F # .SI/LS0. + \x10 # ..DLE... 
+ \x11 # ..DC1... + \x12 # ..DC2... + \x13 # ..DC3... + \x14 # ..DC4... + \x15 # ..NAK... + \x16 # ..SYN... + \x17 # ..ETB... + \x18 # ..CAN... + \x19 # ...EM... + \x1A # ..IFS... + \x1B # ..ESC... + \x1C # ..DEL... + \x1D # ...GS... + \x1E # ...RS... + \x1F # ...US... + \x20 # SP010000 + \x21 # SP020000 + \x22 # SP040000 + \x23 # SM010000 + \x24 # SC030000 + \x25 # SM020000 + \x26 # SM030000 + \x27 # SP050000 + \x28 # SP060000 + \x29 # SP070000 + \x2A # SM040000 + \x2B # SA010000 + \x2C # SP080000 + \x2D # SP100000 + \x2E # SP110000 + \x2F # SP120000 + \x30 # ND100000 + \x31 # ND010000 + \x32 # ND020000 + \x33 # ND030000 + \x34 # ND040000 + \x35 # ND050000 + \x36 # ND060000 + \x37 # ND070000 + \x38 # ND080000 + \x39 # ND090000 + \x3A # SP130000 + \x3B # SP140000 + \x3C # SA030000 + \x3D # SA040000 + \x3E # SA050000 + \x3F # SP150000 + \x40 # SM050000 + \x41 # LA020000 + \x42 # LB020000 + \x43 # LC020000 + \x44 # LD020000 + \x45 # LE020000 + \x46 # LF020000 + \x47 # LG020000 + \x48 # LH020000 + \x49 # LI020000 + \x4A # LJ020000 + \x4B # LK020000 + \x4C # LL020000 + \x4D # LM020000 + \x4E # LN020000 + \x4F # LO020000 + \x50 # LP020000 + \x51 # LQ020000 + \x52 # LR020000 + \x53 # LS020000 + \x54 # LT020000 + \x55 # LU020000 + \x56 # LV020000 + \x57 # LW020000 + \x58 # LX020000 + \x59 # LY020000 + \x5A # LZ020000 + \x5B # SM060000 + \x5C # SM070000 + \x5D # SM080000 + \x5E # SD150000 + \x5F # SP090000 + \x60 # SD130000 + \x61 # LA010000 + \x62 # LB010000 + \x63 # LC010000 + \x64 # LD010000 + \x65 # LE010000 + \x66 # LF010000 + \x67 # LG010000 + \x68 # LH010000 + \x69 # LI010000 + \x6A # LJ010000 + \x6B # LK010000 + \x6C # LL010000 + \x6D # LM010000 + \x6E # LN010000 + \x6F # LO010000 + \x70 # LP010000 + \x71 # LQ010000 + \x72 # LR010000 + \x73 # LS010000 + \x74 # LT010000 + \x75 # LU010000 + \x76 # LV010000 + \x77 # LW010000 + \x78 # LX010000 + \x79 # LY010000 + \x7A # LZ010000 + \x7B # SM110000 + \x7C # SM130000 + \x7D # SM140000 + \x7E # SD190000 + \x7F # ..SUB... 
+ \x80 # LC420000 + \x81 # LU170000 + \x82 # LE110000 + \x83 # LA150000 + \x84 # LA170000 + \x85 # LA130000 + \x86 # GA120000 + \x87 # LC410000 + \x88 # LE150000 + \x89 # LE170000 + \x8A # LE130000 + \x8B # LI170000 + \x8C # LI150000 + \x8D # GE120000 + \x8E # LA180000 + \x8F # GE720000 + \x90 # GI120000 + \x92 # GO120000 + \x93 # LO150000 + \x94 # LO170000 + \x95 # GU120000 + \x96 # LU150000 + \x97 # LU130000 + \x98 # GO720000 + \x99 # LO180000 + \x9A # LU180000 + \x9B # GA110000 + \x9C # SC020000 + \x9D # GE110000 + \x9E # GE710000 + \x9F # GI110000 + \xA0 # GI170000 + \xA1 # GI730000 + \xA2 # GO110000 + \xA3 # GU110000 + \xA4 # GA020000 + \xA5 # GB020000 + \xA6 # GG020000 + \xA7 # GD020000 + \xA8 # GE020000 + \xA9 # GZ020000 + \xAA # GE320000 + \xAB # NF010000 + \xAC # GT620000 + \xAD # GI020000 + \xAE # SP170000 + \xAF # SP180000 + \xB0 # SF140000 + \xB1 # SF150000 + \xB2 # SF160000 + \xB3 # SF110000 + \xB4 # SF090000 + \xB5 # GK020000 + \xB6 # GL020000 + \xB7 # GM020000 + \xB8 # GN020000 + \xB9 # SF230000 + \xBA # SF240000 + \xBB # SF250000 + \xBC # SF260000 + \xBD # GX020000 + \xBE # GO020000 + \xBF # SF030000 + \xC0 # SF020000 + \xC1 # SF070000 + \xC2 # SF060000 + \xC3 # SF080000 + \xC4 # SF100000 + \xC5 # SF050000 + \xC6 # GP020000 + \xC7 # GR020000 + \xC8 # SF380000 + \xC9 # SF390000 + \xCA # SF400000 + \xCB # SF410000 + \xCC # SF420000 + \xCD # SF430000 + \xCE # SF440000 + \xCF # GS020000 + \xD0 # GT020000 + \xD1 # GU020000 + \xD2 # GF020000 + \xD3 # GH020000 + \xD4 # GP620000 + \xD5 # GO320000 + \xD6 # GA010000 + \xD7 # GB010000 + \xD8 # GG010000 + \xD9 # SF040000 + \xDA # SF010000 + \xDB # SF610000 + \xDC # SF570000 + \xDD # GD010000 + \xDE # GE010000 + \xDF # SF600000 + \xE0 # GZ010000 + \xE1 # GE310000 + \xE2 # GT610000 + \xE3 # GI010000 + \xE4 # GK010000 + \xE5 # GL010000 + \xE6 # GM010000 + \xE7 # GN010000 + \xE8 # GX010000 + \xE9 # GO010000 + \xEA # GP010000 + \xEB # GR010000 + \xEC # GS010000 + \xED # GS610000 + \xEE # GT010000 + \xEF # SD110000 + 
\xF0 # SP320000 + \xF1 # SA020000 + \xF2 # GU010000 + \xF3 # GF010000 + \xF4 # GH010000 + \xF5 # SM240000 + \xF6 # GP610000 + \xF7 # SD410000 + \xF8 # SM190000 + \xF9 # SD170000 + \xFA # GO310000 + \xFB # GU170000 + \xFC # GU730000 + \xFD # GO710000 + \xFE # SM470000 + \xFF # SP300000 +# +END CHARMAP +# +#________________________________________________________________________ diff --git a/icu4c/source/data/mappings/lmb-excp.ucm b/icu4c/source/data/mappings/lmb-excp.ucm new file mode 100644 index 00000000000..61b2a3e8386 --- /dev/null +++ b/icu4c/source/data/mappings/lmb-excp.ucm @@ -0,0 +1,315 @@ +# ******************************************************************************* +# * +# * Copyright (C) 1995-2000, International Business Machines +# * Corporation and others. All Rights Reserved. +# * +# ******************************************************************************* +# +# File created on Thu Feb 10 11:47:54 2000 +# +# File created manually from source file LMBCS.ALL +# +# Table Version : 1.00 +# + "lmb-excp" + "AXXXX" + 2 + 1 + "MBCS" + \x3F +# +CHARMAP +# +# +#ISO 10646 LMBCS +#_________ _________ + \x01\x27 + \x01\x23 + \x01\x33 + \x01\x6D + \x01\x24 + \x01\x34 + \x01\x21 + \x01\x31 + \x01\x6C + \x01\x3B + \x01\x15 + \x01\x20 + \x01\x30 + \x01\x67 + \x01\x25 + \x01\x35 + \x01\x14 + \x06\x2E + \x06\x01 + \x06\x02 + \x06\x03 + \x06\x04 + \x06\x05 + \x06\x06 + \x06\x07 + \x06\x08 + \x06\x09 + \x06\x0A + \x06\x0B + \x06\x0C + \x06\x0D + \x06\x0E + \x06\x0F + \x06\x10 + \x06\x11 + \x01\x72 + \x01\x73 + \x06\x12 + \x06\x13 + \x06\x14 + \x06\x15 + \x06\x16 + \x06\x17 + \x01\x61 + \x01\x60 + \x06\x18 + \x06\x19 + \x06\x1A + \x06\x1B + \x01\x7A + \x06\x1C + \x06\x1D + \x01\x66 + \x01\x65 + \x06\x1E + \x06\x1F + \x01\x64 + \x01\x78 + \x01\x79 + \x06\x20 + \x06\x21 + \x01\x40 + \x01\x41 + \x06\x22 + \x06\x23 + \x06\x24 + \x06\x25 + \x01\x74 + \x01\x75 + \x06\x26 + \x06\x27 + \x06\x28 + \x06\x29 + \x06\x2A + \x06\x2B + \x06\x2C + \x06\x2D + \x01\x42 + \x02\x07 
+ \x02\x08 + \x01\x6B + \x01\x68 + \x01\x43 + \x01\x22 + \x01\x32 + \x01\x44 + \x01\x6A + \x01\x69 + \x02\x01 + \x02\x06 + \x02\x02 + \x02\x03 + \x02\x04 + \x02\x6D + \x01\x29 + \x01\x2A + \x02\x05 + \x01\x39 + \x01\x2B + \x01\x2C + \x01\x37 + \x01\x26 + \x01\x38 + \x01\x36 + \x01\x70 + \x01\x71 + \x01\x07 + \x01\x28 + \x02\x7A + \x02\x69 + \x02\x6A + \x01\x2E + \x01\x2F + \x01\x13 + \x02\x09 + \x02\x78 + \x02\x7C + \x01\x7E + \x01\x7F + \x02\x52 + \x01\x77 + \x02\x53 + \x01\x76 + \x01\x4E + \x02\x51 + \x02\x16 + \x02\x15 + \x02\x14 + \x02\x13 + \x01\x1B + \x01\x18 + \x01\x1A + \x01\x19 + \x01\x1D + \x01\x12 + \x01\x17 + \x02\x1B + \x02\x18 + \x02\x1A + \x02\x19 + \x02\x1D + \x02\x12 + \x02\x66 + \x02\x64 + \x02\x50 + \x02\x67 + \x02\x7D + \x02\x41 + \x02\x5C + \x02\x5D + \x02\x5B + \x02\x79 + \x02\x7B + \x02\x6C + \x02\x6B + \x01\x1C + \x02\x40 + \x02\x6F + \x02\x6E + \x02\x65 + \x02\x71 + \x02\x77 + \x02\x76 + \x02\x70 + \x02\x73 + \x02\x72 + \x02\x7E + \x02\x7F + \x02\x5E + \x02\x5F + \x02\x61 + \x02\x60 + \x02\x63 + \x01\x7D + \x01\x4B + \x02\x74 + \x02\x75 + \x01\x55 + \x01\x56 + \x01\x5C + \x01\x5B + \x01\x54 + \x01\x53 + \x01\x5E + \x01\x5D + \x01\x46 + \x01\x47 + \x01\x59 + \x01\x5A + \x01\x51 + \x01\x52 + \x01\x5F + \x01\x50 + \x01\x58 + \x01\x57 + \x01\x48 + \x01\x49 + \x01\x16 + \x01\x1E + \x01\x10 + \x01\x1F + \x01\x11 + \x01\x4A + \x01\x09 + \x01\x08 + \x01\x0A + \x01\x01 + \x01\x02 + \x01\x0F + \x01\x0C + \x01\x0B + \x01\x06 + \x01\x05 + \x01\x03 + \x01\x04 + \x01\x0D + \x01\x0E + \x02\x62 + \x02\x0E + \x02\x0F + \x02\x10 + \x02\x11 + \x02\x3F + \x02\x17 + \x02\x1C + \x02\x1E + \x02\x68 + \x02\x5A + \x02\x59 + \x02\x58 + \x02\x57 + \x02\x56 + \x02\x55 + \x02\x54 + \x02\x4F + \x02\x4E + \x02\x4D + \x02\x4C + \x02\x4B + \x02\x4A + \x02\x49 + \x02\x48 + \x02\x47 + \x02\x46 + \x02\x45 + \x02\x44 + \x02\x43 + \x02\x42 + \x02\x3E + \x02\x3D + \x02\x3C + \x02\x3B + \x02\x3A + \x02\x39 + \x02\x38 + \x02\x37 + \x02\x36 + \x02\x35 + \x02\x34 + \x02\x33 + 
\x02\x32 + \x02\x31 + \x02\x30 + \x02\x2F + \x02\x2E + \x02\x2D + \x02\x2C + \x02\x2B + \x02\x2A + \x02\x29 + \x02\x28 + \x02\x27 + \x02\x26 + \x02\x25 + \x02\x24 + \x02\x23 + \x02\x22 + \x02\x21 + \x02\x20 + \x02\x1F + \x01\x7C + \x01\x63 + \x01\x62 + \x01\x4D + \x01\x4C + \x01\x3D +# +END CHARMAP +# +#________________________________________________________________________ diff --git a/icu4c/source/test/cintltst/nucnvtst.c b/icu4c/source/test/cintltst/nucnvtst.c index 0a6ed3c5d85..1ddcdf509f7 100644 --- a/icu4c/source/test/cintltst/nucnvtst.c +++ b/icu4c/source/test/cintltst/nucnvtst.c @@ -30,6 +30,7 @@ void TestNewConvertWithBufferSizes(int32_t osize, int32_t isize) ; void TestConverterTypesAndStarters(void); void TestAmbiguous(void); void TestUTF8(void); +void TestLMBCS(void); void TestJitterbug255(void); #define NEW_MAX_BUFFER 999 @@ -105,6 +106,7 @@ void addTestNewConvert(TestNode** root) addTest(root, &TestConverterTypesAndStarters, "tsconv/nucnvtst/TestConverterTypesAndStarters"); addTest(root, &TestAmbiguous, "tsconv/nucnvtst/TestAmbiguous"); addTest(root, &TestUTF8, "tsconv/nucnvtst/TestUTF8"); + addTest(root, &TestLMBCS, "tsconv/nucnvtst/TestLMBCS"); addTest(root, &TestJitterbug255, "tsconv/nucnvtst/TestJitterbug255"); } @@ -748,6 +750,68 @@ TestUTF8() { ucnv_close(cnv); } +void +TestLMBCS() { + /* test input */ + static const uint8_t in[]={ + 0x61, + 0x01, 0x29, + 0x81, + 0xA0, + 0x0F, 0x27, + 0x0F, 0x91, + 0x14, 0x0a, 0x74, + 0x14, 0xF6, 0x02, + 0x10, 0x88, 0xA0 + }; + + /* expected test results */ + static const uint32_t results[]={ + /* number of bytes read, code point */ + 1, 0x0061, + 2, 0x2013, + 1, 0x00FC, + 1, 0x00E1, + 2, 0x0007, + 2, 0x0091, + 3, 0x0a74, + 3, 0x0200, + 3, 0x5516 + + }; + + const char *s=(const char *)in, *s0, *limit=(const char *)in+sizeof(in); + const uint32_t *r=results; + + UErrorCode errorCode=U_ZERO_ERROR; + uint32_t c; + + UConverter *cnv=ucnv_open("LMBCS-1", &errorCode); + if(U_FAILURE(errorCode)) { + log_err("Unable to 
open a LMBCS-1 converter: %s\n", u_errorName(errorCode)); + } + else + { + + while(s