ICU-7704 add converter compatible with Java UTF-16: always writes big-endian regardless of platform

X-SVN-Rev: 28089
2025-04-13 08:53:20 +00:00 · 2010-05-23 05:02:54 +00:00 · 2010-05-23 05:02:54 +00:00 · 5ab64ef670
commit 5ab64ef670
parent 2510483f25
3 changed files with 90 additions and 15 deletions
--- a/icu4c/source/common/ucnv_u16.c
+++ b/icu4c/source/common/ucnv_u16.c
@ -1,6 +1,6 @@
 /*  
 **********************************************************************
-*   Copyright (C) 2002-2009, International Business Machines
+*   Copyright (C) 2002-2010, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 *   file name:  ucnv_u16.c
@ -1279,11 +1279,23 @@ _UTF16Reset(UConverter *cnv, UConverterResetChoice choice) {
    }
 }

+static const UConverterSharedData _UTF16v2Data;
+
 static void
 _UTF16Open(UConverter *cnv,
           UConverterLoadArgs *pArgs,
           UErrorCode *pErrorCode) {
-    if(UCNV_GET_VERSION(cnv)<=1) {
+    if(UCNV_GET_VERSION(cnv)<=2) {
+        if(UCNV_GET_VERSION(cnv)==2 && !pArgs->onlyTestIsLoadable) {
+            /*
+             * Switch the implementation, and re-copy the staticData that differs
+             * and was already copied into the UConverter (subChar).
+             * (See ucnv_createConverterFromSharedData() in ucnv_bld.c.)
+             * UTF-16,version=2 fromUnicode() always writes a big-endian byte stream.
+             */
+            cnv->sharedData=&_UTF16v2Data;
+            uprv_memcpy(cnv->subChars, _UTF16v2Data.staticData->subChar, UCNV_MAX_SUBCHAR_LEN);
+        }
        _UTF16Reset(cnv, UCNV_RESET_BOTH);
    } else {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
@ -1294,8 +1306,10 @@ static const char *
 _UTF16GetName(const UConverter *cnv) {
    if(UCNV_GET_VERSION(cnv)==0) {
        return "UTF-16";
-    } else {
+    } else if(UCNV_GET_VERSION(cnv)==1) {
        return "UTF-16,version=1";
+    } else {
+        return "UTF-16,version=2";
    }
 }

@ -1303,7 +1317,7 @@ const UConverterSharedData _UTF16Data;

 #define IS_UTF16BE(cnv) ((cnv)->sharedData==&_UTF16BEData)
 #define IS_UTF16LE(cnv) ((cnv)->sharedData==&_UTF16LEData)
-#define IS_UTF16(cnv) ((cnv)->sharedData==&_UTF16Data)
+#define IS_UTF16(cnv) ((cnv)->sharedData==&_UTF16Data || (cnv)->sharedData==&_UTF16v2Data)

 static void
 _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
@ -1503,4 +1517,45 @@ const UConverterSharedData _UTF16Data = {
    0
 };

+static const UConverterImpl _UTF16v2Impl = { /* function table for "UTF-16,version=2"; reached via _UTF16v2Data, which _UTF16Open installs */
+    UCNV_UTF16,
+
+    NULL,
+    NULL,
+
+    _UTF16Open,
+    NULL,
+    _UTF16Reset,
+
+    _UTF16ToUnicodeWithOffsets, /* to-Unicode direction; presumably the same function the other UTF-16 versions use -- confirm against _UTF16Impl */
+    _UTF16ToUnicodeWithOffsets,
+    _UTF16BEFromUnicodeWithOffsets, /* version=2: fromUnicode() always writes big-endian, regardless of platform */
+    _UTF16BEFromUnicodeWithOffsets,
+    _UTF16GetNextUChar,
+
+    NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
+    _UTF16GetName,
+    NULL,
+    NULL,
+    ucnv_getNonSurrogateUnicodeSet
+};
+
+static const UConverterStaticData _UTF16v2StaticData = { /* static data for the UTF-16 variant whose output is always big-endian */
+    sizeof(UConverterStaticData),
+    "UTF-16,version=2", /* same string that _UTF16GetName() returns for version 2 */
+    1204, /* CCSID for BOM sensitive UTF-16 */
+    UCNV_IBM, UCNV_UTF16, 2, 2,
+    { 0xff, 0xfd, 0, 0 }, 2, /* presumably subChar and its length: U+FFFD as big-endian bytes; _UTF16Open copies subChar into cnv->subChars */
+    FALSE, FALSE,
+    0,
+    0,
+    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
+};
+
+static const UConverterSharedData _UTF16v2Data = { /* definition of the object forward-declared above _UTF16Open */
+    sizeof(UConverterSharedData), ~((uint32_t) 0),
+    NULL, NULL, &_UTF16v2StaticData, FALSE, &_UTF16v2Impl, /* pairs the version=2 static data with the version=2 implementation */
+    0
+};
+
 #endif
--- a/icu4c/source/data/mappings/convrtrs.txt
+++ b/icu4c/source/data/mappings/convrtrs.txt
@ -1,6 +1,6 @@
 # ******************************************************************************
 # *
-# *   Copyright (C) 1995-2009, International Business Machines
+# *   Copyright (C) 1995-2010, International Business Machines
 # *   Corporation and others.  All Rights Reserved.
 # *
 # ******************************************************************************
@ -271,6 +271,16 @@ UTF-16LE,version=1		UnicodeLittle { JAVA* }  x-UTF-16LE-BOM { JAVA }
 #   and a UCNV_ILLEGAL UConverterCallbackReason.
 UTF-16,version=1

+# This is the same as standard UTF-16 but always writes a big-endian byte stream,
+# regardless of the platform endianness, as expected by the Java compatibility tests.
+# See the java.nio.charset.Charset API documentation at
+# http://java.sun.com/javase/6/docs/api/java/nio/charset/Charset.html
+# or a newer version of this document.
+#
+# From Unicode: Writes BE BOM and BE bytes
+# To Unicode: Detects and consumes BOM. Defaults to BE.
+UTF-16,version=2
+
 # Note: ICU does not currently support Java-specific, non-Unicode-standard UTF-32 variants.
 # Presumably, these behave analogously to the UTF-16 variants with similar names.
 # UTF_32BE_BOM  x-UTF-32BE-BOM
--- a/icu4c/source/test/testdata/conversion.txt
+++ b/icu4c/source/test/testdata/conversion.txt
@ -23,7 +23,7 @@ conversion:table(nofallback) {
      "Run intltest conversion\n"

      "Charset names starting with '*' are for testdata names.\n"
-      "Charset names starting with '+' are for charsets current not supported in ICU4J.\n"
+      "Charset names starting with '+' are for charsets currently not supported in ICU4J.\n"

      "ICU callbacks are specified as strings with pairs of characters, each optional.\n"
      "Callback function - '?'=Sub '0'=Skip '.'=Stop '&'=Escape\n"
@ -56,25 +56,31 @@ conversion:table(nofallback) {
        { "UTF-16",             :bin{ feff0061 }, "a",            :intvector{ 2 },   :int{1}, :int{0}, "", "&C", :bin{""} }
        { "UTF-16",             :bin{ fffe0061 }, "\u6100",       :intvector{ 2 },   :int{1}, :int{0}, "", "&C", :bin{""} }
        // Java "Unicode" requires a BOM
-        { "+UTF-16,version=1",   :bin{ 00610062 }, "\\x00\\x61b",  :intvector{ 0,0,0,0,0,0,0,0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
-        { "+UTF-16,version=1",   :bin{ feff0061 }, "a",            :intvector{ 2 },   :int{1}, :int{0}, "", "&C", :bin{""} }
-        { "+UTF-16,version=1",   :bin{ fffe0061 }, "\u6100",       :intvector{ 2 },   :int{1}, :int{0}, "", "&C", :bin{""} }
+        { "+UTF-16,version=1",  :bin{ 00610062 }, "\\x00\\x61b",  :intvector{ 0,0,0,0,0,0,0,0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
+        { "+UTF-16,version=1",  :bin{ feff0061 }, "a",            :intvector{ 2 },   :int{1}, :int{0}, "", "&C", :bin{""} }
+        { "+UTF-16,version=1",  :bin{ fffe0061 }, "\u6100",       :intvector{ 2 },   :int{1}, :int{0}, "", "&C", :bin{""} }
        // Standard UTF-16BE
        { "UTF-16BE",           :bin{ 00610062 }, "ab",           :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
        { "UTF-16BE",           :bin{ feff0061 }, "\ufeffa",      :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
        { "UTF-16BE",           :bin{ fffe0061 }, "\ufffea",      :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
        // Java "UnicodeBig" requires a BE BOM or no BOM; it consumes the BE BOM
-        { "+UTF-16BE,version=1", :bin{ 00610062 }, "ab",           :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
-        { "+UnicodeBig",         :bin{ feff0061 }, "a",            :intvector{ 2 },   :int{1}, :int{0}, "", "&C", :bin{""} }
-        { "+UnicodeBig",         :bin{ fffe0061 }, "\\xFF\\xFEa",  :intvector{ 0,0,0,0,0,0,0,0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
+        { "+UTF-16BE,version=1",:bin{ 00610062 }, "ab",           :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
+        { "+UnicodeBig",        :bin{ feff0061 }, "a",            :intvector{ 2 },   :int{1}, :int{0}, "", "&C", :bin{""} }
+        { "+UnicodeBig",        :bin{ fffe0061 }, "\\xFF\\xFEa",  :intvector{ 0,0,0,0,0,0,0,0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
        // Standard UTF-16LE
        { "UTF-16LE",           :bin{ 61006200 }, "ab",           :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
        { "UTF-16LE",           :bin{ fffe6100 }, "\ufeffa",      :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
        { "UTF-16LE",           :bin{ feff6100 }, "\ufffea",      :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
        // Java "UnicodeLittle" requires an LE BOM or no BOM; it consumes the LE BOM
-        { "+UTF-16LE,version=1", :bin{ 61006200 }, "ab",           :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
-        { "+UnicodeLittle",      :bin{ fffe6100 }, "a",            :intvector{ 2 },   :int{1}, :int{0}, "", "&C", :bin{""} }
-        { "+x-UTF-16LE-BOM",     :bin{ feff6100 }, "\\xFE\\xFFa",  :intvector{ 0,0,0,0,0,0,0,0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
+        { "+UTF-16LE,version=1",:bin{ 61006200 }, "ab",           :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
+        { "+UnicodeLittle",     :bin{ fffe6100 }, "a",            :intvector{ 2 },   :int{1}, :int{0}, "", "&C", :bin{""} }
+        { "+x-UTF-16LE-BOM",    :bin{ feff6100 }, "\\xFE\\xFFa",  :intvector{ 0,0,0,0,0,0,0,0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
+
+        // Test ticket 7704: implement Java-compatible "UTF-16" converter.
+        // Same as standard UTF-16 but fromUnicode always writes a big-endian byte stream.
+        { "+UTF-16,version=2",  :bin{ 00610062 }, "ab",           :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} }
+        { "+UTF-16,version=2",  :bin{ feff0061 }, "a",            :intvector{ 2 },   :int{1}, :int{0}, "", "&C", :bin{""} }
+        { "+UTF-16,version=2",  :bin{ fffe0061 }, "\u6100",       :intvector{ 2 },   :int{1}, :int{0}, "", "&C", :bin{""} }

        // Test ticket 5691: consistent illegal sequences
        // The following test cases are for illegal character byte sequences.
@ -972,6 +978,10 @@ conversion:table(nofallback) {
        // Java "UnicodeLittle" writes a BOM
        { "+UnicodeLittle", "a", :bin{ fffe6100 }, :intvector{ -1,-1,0,0 }, :int{1}, :int{0}, "", "?", "" }

+        // Test ticket 7704: implement Java-compatible "UTF-16" converter.
+        // Same as standard UTF-16 but fromUnicode always writes a big-endian byte stream.
+        { "+UTF-16,version=2", "a", :bin{ feff0061 }, :intvector{ -1,-1,0,0 }, :int{1}, :int{0}, "", "?", "" }
+
        // Test bug 6071 (1:2 Unicode:charset SBCS mapping).
        {
          "*test1bmp",