ICU-7704 add converter compatible with Java UTF-16: always writes big-endian regardless of platform

X-SVN-Rev: 28089
This commit is contained in:
Markus Scherer 2010-05-23 05:02:54 +00:00
parent 2510483f25
commit 5ab64ef670
3 changed files with 90 additions and 15 deletions
icu4c/source
common
data/mappings
test/testdata

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2002-2009, International Business Machines
* Copyright (C) 2002-2010, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: ucnv_u16.c
@ -1279,11 +1279,23 @@ _UTF16Reset(UConverter *cnv, UConverterResetChoice choice) {
}
}
static const UConverterSharedData _UTF16v2Data;
static void
_UTF16Open(UConverter *cnv,
UConverterLoadArgs *pArgs,
UErrorCode *pErrorCode) {
if(UCNV_GET_VERSION(cnv)<=1) {
if(UCNV_GET_VERSION(cnv)<=2) {
if(UCNV_GET_VERSION(cnv)==2 && !pArgs->onlyTestIsLoadable) {
/*
* Switch implementation, and switch the staticData that's different
* and was copied into the UConverter.
* (See ucnv_createConverterFromSharedData() in ucnv_bld.c.)
* UTF-16,version=2 fromUnicode() always writes a big-endian byte stream.
*/
cnv->sharedData=&_UTF16v2Data;
uprv_memcpy(cnv->subChars, _UTF16v2Data.staticData->subChar, UCNV_MAX_SUBCHAR_LEN);
}
_UTF16Reset(cnv, UCNV_RESET_BOTH);
} else {
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
@ -1294,8 +1306,10 @@ static const char *
_UTF16GetName(const UConverter *cnv) {
if(UCNV_GET_VERSION(cnv)==0) {
return "UTF-16";
} else {
} else if(UCNV_GET_VERSION(cnv)==1) {
return "UTF-16,version=1";
} else {
return "UTF-16,version=2";
}
}
@ -1303,7 +1317,7 @@ const UConverterSharedData _UTF16Data;
#define IS_UTF16BE(cnv) ((cnv)->sharedData==&_UTF16BEData)
#define IS_UTF16LE(cnv) ((cnv)->sharedData==&_UTF16LEData)
#define IS_UTF16(cnv) ((cnv)->sharedData==&_UTF16Data)
#define IS_UTF16(cnv) ((cnv)->sharedData==&_UTF16Data || (cnv)->sharedData==&_UTF16v2Data)
static void
_UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
@ -1503,4 +1517,45 @@ const UConverterSharedData _UTF16Data = {
0
};
/*
 * Function table for the "UTF-16,version=2" converter (ICU ticket 7704).
 * It uses the same _UTF16Open/_UTF16Reset/_UTF16ToUnicodeWithOffsets/
 * _UTF16GetNextUChar/_UTF16GetName functions as the other UTF-16 code in
 * this file, but plugs in _UTF16BEFromUnicodeWithOffsets so that
 * fromUnicode() always writes a big-endian byte stream.
 * NOTE(review): entries are positional; the slot order is defined by
 * UConverterImpl, which is declared outside this file -- confirm there.
 */
static const UConverterImpl _UTF16v2Impl = {
UCNV_UTF16,
NULL,
NULL,
_UTF16Open,
NULL,
_UTF16Reset,
/* to-Unicode direction: same function in both slots */
_UTF16ToUnicodeWithOffsets,
_UTF16ToUnicodeWithOffsets,
/* from-Unicode direction: big-endian writer in both slots, independent of platform endianness */
_UTF16BEFromUnicodeWithOffsets,
_UTF16BEFromUnicodeWithOffsets,
_UTF16GetNextUChar,
NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
_UTF16GetName,
NULL,
NULL,
ucnv_getNonSurrogateUnicodeSet
};
/*
 * Static converter data for "UTF-16,version=2".
 * _UTF16Open() copies the subChar bytes from this structure into
 * cnv->subChars when version 2 is selected.
 */
static const UConverterStaticData _UTF16v2StaticData = {
sizeof(UConverterStaticData),
"UTF-16,version=2",
1204, /* CCSID for BOM sensitive UTF-16 */
UCNV_IBM, UCNV_UTF16, 2, 2,
/* bytes FF FD are U+FFFD serialized big-endian; presumably the subChar array followed by its length (2) -- confirm against UConverterStaticData */
{ 0xff, 0xfd, 0, 0 }, 2,
FALSE, FALSE,
0,
0,
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
/*
 * Shared data for "UTF-16,version=2": ties together _UTF16v2StaticData and
 * _UTF16v2Impl. _UTF16Open() points cnv->sharedData at this object when
 * version 2 is requested, and the IS_UTF16() macro treats it like _UTF16Data.
 * NOTE(review): the remaining positional fields follow UConverterSharedData,
 * which is declared outside this file -- confirm their meaning there.
 */
static const UConverterSharedData _UTF16v2Data = {
sizeof(UConverterSharedData), ~((uint32_t) 0),
NULL, NULL, &_UTF16v2StaticData, FALSE, &_UTF16v2Impl,
0
};
#endif

View file

@ -1,6 +1,6 @@
# ******************************************************************************
# *
# * Copyright (C) 1995-2009, International Business Machines
# * Copyright (C) 1995-2010, International Business Machines
# * Corporation and others. All Rights Reserved.
# *
# ******************************************************************************
@ -271,6 +271,16 @@ UTF-16LE,version=1 UnicodeLittle { JAVA* } x-UTF-16LE-BOM { JAVA }
# and a UCNV_ILLEGAL UConverterCallbackReason.
UTF-16,version=1
# This is the same as standard UTF-16 but always writes a big-endian byte stream,
# regardless of the platform endianness, as expected by the Java compatibility tests.
# See the java.nio.charset.Charset API documentation at
# http://java.sun.com/javase/6/docs/api/java/nio/charset/Charset.html
# or a newer version of this document.
#
# From Unicode: Writes BE BOM and BE bytes
# To Unicode: Detects and consumes BOM. Defaults to BE.
UTF-16,version=2
# Note: ICU does not currently support Java-specific, non-Unicode-standard UTF-32 variants.
# Presumably, these behave analogously to the UTF-16 variants with similar names.
# UTF_32BE_BOM x-UTF-32BE-BOM

View file

@ -23,7 +23,7 @@ conversion:table(nofallback) {
"Run intltest conversion\n"
"Charset names starting with '*' are for testdata names.\n"
"Charset names starting with '+' are for charsets current not supported in ICU4J.\n"
"Charset names starting with '+' are for charsets currently not supported in ICU4J.\n"
"ICU callbacks are specified as strings with pairs of characters, each optional.\n"
"Callback function - '?'=Sub '0'=Skip '.'=Stop '&'=Escape\n"
@ -56,25 +56,31 @@ conversion:table(nofallback) {
{ "UTF-16", :bin{ feff0061 }, "a", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
{ "UTF-16", :bin{ fffe0061 }, "\u6100", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
// Java "Unicode" requires a BOM
{ "+UTF-16,version=1", :bin{ 00610062 }, "\\x00\\x61b", :intvector{ 0,0,0,0,0,0,0,0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
{ "+UTF-16,version=1", :bin{ feff0061 }, "a", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
{ "+UTF-16,version=1", :bin{ fffe0061 }, "\u6100", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
{ "+UTF-16,version=1", :bin{ 00610062 }, "\\x00\\x61b", :intvector{ 0,0,0,0,0,0,0,0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
{ "+UTF-16,version=1", :bin{ feff0061 }, "a", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
{ "+UTF-16,version=1", :bin{ fffe0061 }, "\u6100", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
// Standard UTF-16BE
{ "UTF-16BE", :bin{ 00610062 }, "ab", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
{ "UTF-16BE", :bin{ feff0061 }, "\ufeffa", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
{ "UTF-16BE", :bin{ fffe0061 }, "\ufffea", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
// Java "UnicodeBig" requires a BE BOM or no BOM; it consumes the BE BOM
{ "+UTF-16BE,version=1", :bin{ 00610062 }, "ab", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
{ "+UnicodeBig", :bin{ feff0061 }, "a", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
{ "+UnicodeBig", :bin{ fffe0061 }, "\\xFF\\xFEa", :intvector{ 0,0,0,0,0,0,0,0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
{ "+UTF-16BE,version=1",:bin{ 00610062 }, "ab", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
{ "+UnicodeBig", :bin{ feff0061 }, "a", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
{ "+UnicodeBig", :bin{ fffe0061 }, "\\xFF\\xFEa", :intvector{ 0,0,0,0,0,0,0,0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
// Standard UTF-16LE
{ "UTF-16LE", :bin{ 61006200 }, "ab", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
{ "UTF-16LE", :bin{ fffe6100 }, "\ufeffa", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
{ "UTF-16LE", :bin{ feff6100 }, "\ufffea", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
// Java "UnicodeLittle" requires an LE BOM or no BOM; it consumes the LE BOM
{ "+UTF-16LE,version=1", :bin{ 61006200 }, "ab", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
{ "+UnicodeLittle", :bin{ fffe6100 }, "a", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
{ "+x-UTF-16LE-BOM", :bin{ feff6100 }, "\\xFE\\xFFa", :intvector{ 0,0,0,0,0,0,0,0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
{ "+UTF-16LE,version=1",:bin{ 61006200 }, "ab", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
{ "+UnicodeLittle", :bin{ fffe6100 }, "a", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
{ "+x-UTF-16LE-BOM", :bin{ feff6100 }, "\\xFE\\xFFa", :intvector{ 0,0,0,0,0,0,0,0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
// Test ticket 7704: implement Java-compatible "UTF-16" converter.
// Same as standard UTF-16 but fromUnicode always writes a big-endian byte stream.
{ "+UTF-16,version=2", :bin{ 00610062 }, "ab", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
{ "+UTF-16,version=2", :bin{ feff0061 }, "a", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
{ "+UTF-16,version=2", :bin{ fffe0061 }, "\u6100", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
// Test ticket 5691: consistent illegal sequences
// The following test cases are for illegal character byte sequences.
@ -972,6 +978,10 @@ conversion:table(nofallback) {
// Java "UnicodeLittle" writes a BOM
{ "+UnicodeLittle", "a", :bin{ fffe6100 }, :intvector{ -1,-1,0,0 }, :int{1}, :int{0}, "", "?", "" }
// Test ticket 7704: implement Java-compatible "UTF-16" converter.
// Same as standard UTF-16 but fromUnicode always writes a big-endian byte stream.
{ "+UTF-16,version=2", "a", :bin{ feff0061 }, :intvector{ -1,-1,0,0 }, :int{1}, :int{0}, "", "?", "" }
// Test bug 6071 (1:2 Unicode:charset SBCS mapping).
{
"*test1bmp",