diff --git a/icu4c/source/common/ucnv_u16.c b/icu4c/source/common/ucnv_u16.c index ac5d3316c4a..927f9247438 100644 --- a/icu4c/source/common/ucnv_u16.c +++ b/icu4c/source/common/ucnv_u16.c @@ -1,6 +1,6 @@ /* ********************************************************************** -* Copyright (C) 2002-2009, International Business Machines +* Copyright (C) 2002-2010, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * file name: ucnv_u16.c @@ -1279,11 +1279,23 @@ _UTF16Reset(UConverter *cnv, UConverterResetChoice choice) { } } +static const UConverterSharedData _UTF16v2Data; + static void _UTF16Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *pErrorCode) { - if(UCNV_GET_VERSION(cnv)<=1) { + if(UCNV_GET_VERSION(cnv)<=2) { + if(UCNV_GET_VERSION(cnv)==2 && !pArgs->onlyTestIsLoadable) { + /* + * Switch implementation, and switch the staticData that's different + * and was copied into the UConverter. + * (See ucnv_createConverterFromSharedData() in ucnv_bld.c.) + * UTF-16,version=2 fromUnicode() always writes a big-endian byte stream. 
+ */ + cnv->sharedData=&_UTF16v2Data; + uprv_memcpy(cnv->subChars, _UTF16v2Data.staticData->subChar, UCNV_MAX_SUBCHAR_LEN); + } _UTF16Reset(cnv, UCNV_RESET_BOTH); } else { *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; @@ -1294,8 +1306,10 @@ static const char * _UTF16GetName(const UConverter *cnv) { if(UCNV_GET_VERSION(cnv)==0) { return "UTF-16"; - } else { + } else if(UCNV_GET_VERSION(cnv)==1) { return "UTF-16,version=1"; + } else { + return "UTF-16,version=2"; } } @@ -1303,7 +1317,7 @@ const UConverterSharedData _UTF16Data; #define IS_UTF16BE(cnv) ((cnv)->sharedData==&_UTF16BEData) #define IS_UTF16LE(cnv) ((cnv)->sharedData==&_UTF16LEData) -#define IS_UTF16(cnv) ((cnv)->sharedData==&_UTF16Data) +#define IS_UTF16(cnv) ((cnv)->sharedData==&_UTF16Data || (cnv)->sharedData==&_UTF16v2Data) static void _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, @@ -1503,4 +1517,45 @@ const UConverterSharedData _UTF16Data = { 0 }; +static const UConverterImpl _UTF16v2Impl = { + UCNV_UTF16, + + NULL, + NULL, + + _UTF16Open, + NULL, + _UTF16Reset, + + _UTF16ToUnicodeWithOffsets, + _UTF16ToUnicodeWithOffsets, + _UTF16BEFromUnicodeWithOffsets, + _UTF16BEFromUnicodeWithOffsets, + _UTF16GetNextUChar, + + NULL, /* ### TODO implement getStarters for all Unicode encodings?! 
*/ + _UTF16GetName, + NULL, + NULL, + ucnv_getNonSurrogateUnicodeSet +}; + +static const UConverterStaticData _UTF16v2StaticData = { + sizeof(UConverterStaticData), + "UTF-16,version=2", + 1204, /* CCSID for BOM sensitive UTF-16 */ + UCNV_IBM, UCNV_UTF16, 2, 2, + { 0xff, 0xfd, 0, 0 }, 2, + FALSE, FALSE, + 0, + 0, + { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ +}; + +static const UConverterSharedData _UTF16v2Data = { + sizeof(UConverterSharedData), ~((uint32_t) 0), + NULL, NULL, &_UTF16v2StaticData, FALSE, &_UTF16v2Impl, + 0 +}; + #endif diff --git a/icu4c/source/data/mappings/convrtrs.txt b/icu4c/source/data/mappings/convrtrs.txt index 256d5245f00..d047b16902b 100644 --- a/icu4c/source/data/mappings/convrtrs.txt +++ b/icu4c/source/data/mappings/convrtrs.txt @@ -1,6 +1,6 @@ # ****************************************************************************** # * -# * Copyright (C) 1995-2009, International Business Machines +# * Copyright (C) 1995-2010, International Business Machines # * Corporation and others. All Rights Reserved. # * # ****************************************************************************** @@ -271,6 +271,16 @@ UTF-16LE,version=1 UnicodeLittle { JAVA* } x-UTF-16LE-BOM { JAVA } # and a UCNV_ILLEGAL UConverterCallbackReason. UTF-16,version=1 +# This is the same as standard UTF-16 but always writes a big-endian byte stream, +# regardless of the platform endianness, as expected by the Java compatibility tests. +# See the java.nio.charset.Charset API documentation at +# http://java.sun.com/javase/6/docs/api/java/nio/charset/Charset.html +# or a newer version of this document. +# +# From Unicode: Write BE BOM and BE bytes +# To Unicode: Detects and consumes BOM. Defaults to BE. +UTF-16,version=2 + # Note: ICU does not currently support Java-specific, non-Unicode-standard UTF-32 variants. # Presumably, these behave analogously to the UTF-16 variants with similar names. 
# UTF_32BE_BOM x-UTF-32BE-BOM diff --git a/icu4c/source/test/testdata/conversion.txt b/icu4c/source/test/testdata/conversion.txt index 49d701e24b2..4dc7278050a 100644 --- a/icu4c/source/test/testdata/conversion.txt +++ b/icu4c/source/test/testdata/conversion.txt @@ -23,7 +23,7 @@ conversion:table(nofallback) { "Run intltest conversion\n" "Charset names starting with '*' are for testdata names.\n" - "Charset names starting with '+' are for charsets current not supported in ICU4J.\n" + "Charset names starting with '+' are for charsets currently not supported in ICU4J.\n" "ICU callbacks are specified as strings with pairs of characters, each optional.\n" "Callback function - '?'=Sub '0'=Skip '.'=Stop '&'=Escape\n" @@ -56,25 +56,31 @@ conversion:table(nofallback) { { "UTF-16", :bin{ feff0061 }, "a", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} } { "UTF-16", :bin{ fffe0061 }, "\u6100", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} } // Java "Unicode" requires a BOM - { "+UTF-16,version=1", :bin{ 00610062 }, "\\x00\\x61b", :intvector{ 0,0,0,0,0,0,0,0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} } - { "+UTF-16,version=1", :bin{ feff0061 }, "a", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} } - { "+UTF-16,version=1", :bin{ fffe0061 }, "\u6100", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} } + { "+UTF-16,version=1", :bin{ 00610062 }, "\\x00\\x61b", :intvector{ 0,0,0,0,0,0,0,0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} } + { "+UTF-16,version=1", :bin{ feff0061 }, "a", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} } + { "+UTF-16,version=1", :bin{ fffe0061 }, "\u6100", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} } // Standard UTF-16BE { "UTF-16BE", :bin{ 00610062 }, "ab", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} } { "UTF-16BE", :bin{ feff0061 }, "\ufeffa", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} } { "UTF-16BE", :bin{ fffe0061 }, "\ufffea", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} } // 
Java "UnicodeBig" requires a BE BOM or no BOM; it consumes the BE BOM - { "+UTF-16BE,version=1", :bin{ 00610062 }, "ab", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} } - { "+UnicodeBig", :bin{ feff0061 }, "a", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} } - { "+UnicodeBig", :bin{ fffe0061 }, "\\xFF\\xFEa", :intvector{ 0,0,0,0,0,0,0,0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} } + { "+UTF-16BE,version=1",:bin{ 00610062 }, "ab", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} } + { "+UnicodeBig", :bin{ feff0061 }, "a", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} } + { "+UnicodeBig", :bin{ fffe0061 }, "\\xFF\\xFEa", :intvector{ 0,0,0,0,0,0,0,0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} } // Standard UTF-16LE { "UTF-16LE", :bin{ 61006200 }, "ab", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} } { "UTF-16LE", :bin{ fffe6100 }, "\ufeffa", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} } { "UTF-16LE", :bin{ feff6100 }, "\ufffea", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} } // Java "UnicodeLittle" requires an LE BOM or no BOM; it consumes the LE BOM - { "+UTF-16LE,version=1", :bin{ 61006200 }, "ab", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} } - { "+UnicodeLittle", :bin{ fffe6100 }, "a", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} } - { "+x-UTF-16LE-BOM", :bin{ feff6100 }, "\\xFE\\xFFa", :intvector{ 0,0,0,0,0,0,0,0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} } + { "+UTF-16LE,version=1",:bin{ 61006200 }, "ab", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} } + { "+UnicodeLittle", :bin{ fffe6100 }, "a", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} } + { "+x-UTF-16LE-BOM", :bin{ feff6100 }, "\\xFE\\xFFa", :intvector{ 0,0,0,0,0,0,0,0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} } + + // Test ticket 7704: implement Java-compatible "UTF-16" converter. + // Same as standard UTF-16 but fromUnicode always writes a big-endian byte stream. 
+ { "+UTF-16,version=2", :bin{ 00610062 }, "ab", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} } + { "+UTF-16,version=2", :bin{ feff0061 }, "a", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} } + { "+UTF-16,version=2", :bin{ fffe0061 }, "\u6100", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} } // Test ticket 5691: consistent illegal sequences // The following test cases are for illegal character byte sequences. @@ -972,6 +978,10 @@ conversion:table(nofallback) { // Java "UnicodeLittle" writes a BOM { "+UnicodeLittle", "a", :bin{ fffe6100 }, :intvector{ -1,-1,0,0 }, :int{1}, :int{0}, "", "?", "" } + // Test ticket 7704: implement Java-compatible "UTF-16" converter. + // Same as standard UTF-16 but fromUnicode always writes a big-endian byte stream. + { "+UTF-16,version=2", "a", :bin{ feff0061 }, :intvector{ -1,-1,0,0 }, :int{1}, :int{0}, "", "?", "" } + // Test bug 6071 (1:2 Unicode:charset SBCS mapping). { "*test1bmp",