mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-13 08:53:20 +00:00
ICU-7704 add converter compatible with Java UTF-16: always writes big-endian regardless of platform
X-SVN-Rev: 28089
This commit is contained in:
parent
2510483f25
commit
5ab64ef670
3 changed files with 90 additions and 15 deletions
icu4c/source
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2002-2009, International Business Machines
|
||||
* Copyright (C) 2002-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* file name: ucnv_u16.c
|
||||
|
@ -1279,11 +1279,23 @@ _UTF16Reset(UConverter *cnv, UConverterResetChoice choice) {
|
|||
}
|
||||
}
|
||||
|
||||
static const UConverterSharedData _UTF16v2Data;
|
||||
|
||||
static void
|
||||
_UTF16Open(UConverter *cnv,
|
||||
UConverterLoadArgs *pArgs,
|
||||
UErrorCode *pErrorCode) {
|
||||
if(UCNV_GET_VERSION(cnv)<=1) {
|
||||
if(UCNV_GET_VERSION(cnv)<=2) {
|
||||
if(UCNV_GET_VERSION(cnv)==2 && !pArgs->onlyTestIsLoadable) {
|
||||
/*
|
||||
* Switch implementation, and switch the staticData that's different
|
||||
* and was copied into the UConverter.
|
||||
* (See ucnv_createConverterFromSharedData() in ucnv_bld.c.)
|
||||
* UTF-16,version=2 fromUnicode() always writes a big-endian byte stream.
|
||||
*/
|
||||
cnv->sharedData=&_UTF16v2Data;
|
||||
uprv_memcpy(cnv->subChars, _UTF16v2Data.staticData->subChar, UCNV_MAX_SUBCHAR_LEN);
|
||||
}
|
||||
_UTF16Reset(cnv, UCNV_RESET_BOTH);
|
||||
} else {
|
||||
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
|
@ -1294,8 +1306,10 @@ static const char *
|
|||
_UTF16GetName(const UConverter *cnv) {
|
||||
if(UCNV_GET_VERSION(cnv)==0) {
|
||||
return "UTF-16";
|
||||
} else {
|
||||
} else if(UCNV_GET_VERSION(cnv)==1) {
|
||||
return "UTF-16,version=1";
|
||||
} else {
|
||||
return "UTF-16,version=2";
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1303,7 +1317,7 @@ const UConverterSharedData _UTF16Data;
|
|||
|
||||
#define IS_UTF16BE(cnv) ((cnv)->sharedData==&_UTF16BEData)
|
||||
#define IS_UTF16LE(cnv) ((cnv)->sharedData==&_UTF16LEData)
|
||||
#define IS_UTF16(cnv) ((cnv)->sharedData==&_UTF16Data)
|
||||
#define IS_UTF16(cnv) ((cnv)->sharedData==&_UTF16Data || (cnv)->sharedData==&_UTF16v2Data)
|
||||
|
||||
static void
|
||||
_UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
|
||||
|
@ -1503,4 +1517,45 @@ const UConverterSharedData _UTF16Data = {
|
|||
0
|
||||
};
|
||||
|
||||
static const UConverterImpl _UTF16v2Impl = {
|
||||
UCNV_UTF16,
|
||||
|
||||
NULL,
|
||||
NULL,
|
||||
|
||||
_UTF16Open,
|
||||
NULL,
|
||||
_UTF16Reset,
|
||||
|
||||
_UTF16ToUnicodeWithOffsets,
|
||||
_UTF16ToUnicodeWithOffsets,
|
||||
_UTF16BEFromUnicodeWithOffsets,
|
||||
_UTF16BEFromUnicodeWithOffsets,
|
||||
_UTF16GetNextUChar,
|
||||
|
||||
NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
|
||||
_UTF16GetName,
|
||||
NULL,
|
||||
NULL,
|
||||
ucnv_getNonSurrogateUnicodeSet
|
||||
};
|
||||
|
||||
static const UConverterStaticData _UTF16v2StaticData = {
|
||||
sizeof(UConverterStaticData),
|
||||
"UTF-16,version=2",
|
||||
1204, /* CCSID for BOM sensitive UTF-16 */
|
||||
UCNV_IBM, UCNV_UTF16, 2, 2,
|
||||
{ 0xff, 0xfd, 0, 0 }, 2,
|
||||
FALSE, FALSE,
|
||||
0,
|
||||
0,
|
||||
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
|
||||
};
|
||||
|
||||
static const UConverterSharedData _UTF16v2Data = {
|
||||
sizeof(UConverterSharedData), ~((uint32_t) 0),
|
||||
NULL, NULL, &_UTF16v2StaticData, FALSE, &_UTF16v2Impl,
|
||||
0
|
||||
};
|
||||
|
||||
#endif
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# ******************************************************************************
|
||||
# *
|
||||
# * Copyright (C) 1995-2009, International Business Machines
|
||||
# * Copyright (C) 1995-2010, International Business Machines
|
||||
# * Corporation and others. All Rights Reserved.
|
||||
# *
|
||||
# ******************************************************************************
|
||||
|
@ -271,6 +271,16 @@ UTF-16LE,version=1 UnicodeLittle { JAVA* } x-UTF-16LE-BOM { JAVA }
|
|||
# and a UCNV_ILLEGAL UConverterCallbackReason.
|
||||
UTF-16,version=1
|
||||
|
||||
# This is the same as standard UTF-16 but always writes a big-endian byte stream,
|
||||
# regardless of the platform endianness, as expected by the Java compatibility tests.
|
||||
# See the java.nio.charset.Charset API documentation at
|
||||
# http://java.sun.com/javase/6/docs/api/java/nio/charset/Charset.html
|
||||
# or a newer version of this document.
|
||||
#
|
||||
# From Unicode: Writes a BE BOM and BE bytes.
|
||||
# To Unicode: Detects and consumes BOM. Defaults to BE.
|
||||
UTF-16,version=2
|
||||
|
||||
# Note: ICU does not currently support Java-specific, non-Unicode-standard UTF-32 variants.
|
||||
# Presumably, these behave analogously to the UTF-16 variants with similar names.
|
||||
# UTF_32BE_BOM x-UTF-32BE-BOM
|
||||
|
|
30
icu4c/source/test/testdata/conversion.txt
vendored
30
icu4c/source/test/testdata/conversion.txt
vendored
|
@ -23,7 +23,7 @@ conversion:table(nofallback) {
|
|||
"Run intltest conversion\n"
|
||||
|
||||
"Charset names starting with '*' are for testdata names.\n"
|
||||
"Charset names starting with '+' are for charsets current not supported in ICU4J.\n"
|
||||
"Charset names starting with '+' are for charsets currently not supported in ICU4J.\n"
|
||||
|
||||
"ICU callbacks are specified as strings with pairs of characters, each optional.\n"
|
||||
"Callback function - '?'=Sub '0'=Skip '.'=Stop '&'=Escape\n"
|
||||
|
@ -56,25 +56,31 @@ conversion:table(nofallback) {
|
|||
{ "UTF-16", :bin{ feff0061 }, "a", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
|
||||
{ "UTF-16", :bin{ fffe0061 }, "\u6100", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
|
||||
// Java "Unicode" requires a BOM
|
||||
{ "+UTF-16,version=1", :bin{ 00610062 }, "\\x00\\x61b", :intvector{ 0,0,0,0,0,0,0,0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
|
||||
{ "+UTF-16,version=1", :bin{ feff0061 }, "a", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
|
||||
{ "+UTF-16,version=1", :bin{ fffe0061 }, "\u6100", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
|
||||
{ "+UTF-16,version=1", :bin{ 00610062 }, "\\x00\\x61b", :intvector{ 0,0,0,0,0,0,0,0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
|
||||
{ "+UTF-16,version=1", :bin{ feff0061 }, "a", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
|
||||
{ "+UTF-16,version=1", :bin{ fffe0061 }, "\u6100", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
|
||||
// Standard UTF-16BE
|
||||
{ "UTF-16BE", :bin{ 00610062 }, "ab", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
|
||||
{ "UTF-16BE", :bin{ feff0061 }, "\ufeffa", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
|
||||
{ "UTF-16BE", :bin{ fffe0061 }, "\ufffea", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
|
||||
// Java "UnicodeBig" requires a BE BOM or no BOM; it consumes the BE BOM
|
||||
{ "+UTF-16BE,version=1", :bin{ 00610062 }, "ab", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
|
||||
{ "+UnicodeBig", :bin{ feff0061 }, "a", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
|
||||
{ "+UnicodeBig", :bin{ fffe0061 }, "\\xFF\\xFEa", :intvector{ 0,0,0,0,0,0,0,0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
|
||||
{ "+UTF-16BE,version=1",:bin{ 00610062 }, "ab", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
|
||||
{ "+UnicodeBig", :bin{ feff0061 }, "a", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
|
||||
{ "+UnicodeBig", :bin{ fffe0061 }, "\\xFF\\xFEa", :intvector{ 0,0,0,0,0,0,0,0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
|
||||
// Standard UTF-16LE
|
||||
{ "UTF-16LE", :bin{ 61006200 }, "ab", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
|
||||
{ "UTF-16LE", :bin{ fffe6100 }, "\ufeffa", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
|
||||
{ "UTF-16LE", :bin{ feff6100 }, "\ufffea", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
|
||||
// Java "UnicodeLittle" requires an LE BOM or no BOM; it consumes the LE BOM
|
||||
{ "+UTF-16LE,version=1", :bin{ 61006200 }, "ab", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
|
||||
{ "+UnicodeLittle", :bin{ fffe6100 }, "a", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
|
||||
{ "+x-UTF-16LE-BOM", :bin{ feff6100 }, "\\xFE\\xFFa", :intvector{ 0,0,0,0,0,0,0,0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
|
||||
{ "+UTF-16LE,version=1",:bin{ 61006200 }, "ab", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
|
||||
{ "+UnicodeLittle", :bin{ fffe6100 }, "a", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
|
||||
{ "+x-UTF-16LE-BOM", :bin{ feff6100 }, "\\xFE\\xFFa", :intvector{ 0,0,0,0,0,0,0,0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
|
||||
|
||||
// Test ticket 7704: implement Java-compatible "UTF-16" converter.
|
||||
// Same as standard UTF-16 but fromUnicode always writes a big-endian byte stream.
|
||||
{ "+UTF-16,version=2", :bin{ 00610062 }, "ab", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
|
||||
{ "+UTF-16,version=2", :bin{ feff0061 }, "a", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
|
||||
{ "+UTF-16,version=2", :bin{ fffe0061 }, "\u6100", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
|
||||
|
||||
// Test ticket 5691: consistent illegal sequences
|
||||
// The following test cases are for illegal character byte sequences.
|
||||
|
@ -972,6 +978,10 @@ conversion:table(nofallback) {
|
|||
// Java "UnicodeLittle" writes a BOM
|
||||
{ "+UnicodeLittle", "a", :bin{ fffe6100 }, :intvector{ -1,-1,0,0 }, :int{1}, :int{0}, "", "?", "" }
|
||||
|
||||
// Test ticket 7704: implement Java-compatible "UTF-16" converter.
|
||||
// Same as standard UTF-16 but fromUnicode always writes a big-endian byte stream.
|
||||
{ "+UTF-16,version=2", "a", :bin{ feff0061 }, :intvector{ -1,-1,0,0 }, :int{1}, :int{0}, "", "?", "" }
|
||||
|
||||
// Test bug 6071 (1:2 Unicode:charset SBCS mapping).
|
||||
{
|
||||
"*test1bmp",
|
||||
|
|
Loading…
Add table
Reference in a new issue