ICU-11479 precompute Java canonical names for not-only-ICU Charsets, make CharsetProviderICU all-static again (see ticket #9973), simplify some test code

X-SVN-Rev: 36966
2025-04-19 11:45:45 +00:00 · 2015-01-16 17:35:52 +00:00 · 2015-01-16 17:35:52 +00:00 · 1604e1772d
commit 1604e1772d
parent 6572a72c63
3 changed files with 204 additions and 162 deletions
--- a/icu4j/main/classes/charset/src/com/ibm/icu/charset/CharsetICU.java
+++ b/icu4j/main/classes/charset/src/com/ibm/icu/charset/CharsetICU.java
@ -1,9 +1,7 @@
 /**
 *******************************************************************************
-* Copyright (C) 2006-2014, International Business Machines Corporation and    *
-* others. All Rights Reserved.                                                *
-*******************************************************************************
-*
+* Copyright (C) 2006-2015, International Business Machines Corporation and
+* others. All Rights Reserved.
 *******************************************************************************
 */ 

@ -33,7 +31,6 @@ import com.ibm.icu.text.UnicodeSet;
 public abstract class CharsetICU extends Charset{

     String icuCanonicalName;
-     String javaCanonicalName;
     int options;

     float  maxCharsPerByte;
@ -87,7 +84,6 @@ public abstract class CharsetICU extends Charset{
        if(canonicalName.length() == 0){
            throw new IllegalCharsetNameException(canonicalName);
        }
-        this.javaCanonicalName = canonicalName;
        this.icuCanonicalName  = icuCanonicalName;
    }
    
--- a/icu4j/main/classes/charset/src/com/ibm/icu/charset/CharsetProviderICU.java
+++ b/icu4j/main/classes/charset/src/com/ibm/icu/charset/CharsetProviderICU.java
@ -1,9 +1,7 @@
 /**
 *******************************************************************************
-* Copyright (C) 2006-2014, International Business Machines Corporation and    *
-* others. All Rights Reserved.                                                *
-*******************************************************************************
-*
+* Copyright (C) 2006-2015, International Business Machines Corporation and
+* others. All Rights Reserved.
 *******************************************************************************
 */

@ -13,8 +11,11 @@ import java.io.IOException;
 import java.nio.charset.Charset;
 import java.nio.charset.UnsupportedCharsetException;
 import java.nio.charset.spi.CharsetProvider;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
 import java.util.Map;

 import com.ibm.icu.impl.InvalidFormatException;
@ -26,37 +27,86 @@ import com.ibm.icu.impl.InvalidFormatException;
 * @stable ICU 3.6
 */
 public final class CharsetProviderICU extends CharsetProvider{
-    private String optionsString;
-    
+    /**
+     * List of available ICU Charsets, empty during static initialization.
+     */
+    private static List<Charset> icuCharsets = Collections.<Charset>emptyList();
+    /**
+     * Maps uppercased Java charset names and aliases to canonical Java charset names.
+     */
+    private static final Map<String, String> javaNamesMap = new HashMap<String, String>();
+
+    static {
+        // Fill javaNamesMap from the non-ICU charsets only: Charset.availableCharsets() calls
+        // our charsets(), which returns an empty iterator at this point because
+        // loadAvailableICUCharsets() returns early while javaNamesMap is still empty.
+        // We can only open ICU charsets when we have the javaNamesMap,
+        // for getting the Java canonical name.
+        for (Map.Entry<String, Charset> nameAndCharset : Charset.availableCharsets().entrySet()) {
+            String canonicalName = nameAndCharset.getKey();
+            javaNamesMap.put(ASCII.toUpperCase(canonicalName), canonicalName);  // canonical name maps to itself
+            for (String alias : nameAndCharset.getValue().aliases()) {
+                javaNamesMap.put(ASCII.toUpperCase(alias), canonicalName);  // every alias maps to the canonical name
+            }
+        }
+    }
+
+    /**
+     * ASCII-only string helpers that are simpler/faster than ones based on Unicode data.
+     * TODO: Check whether equivalent code already exists somewhere and reuse it.
+     */
+    private static final class ASCII {
+        static String toUpperCase(String s) {  // uppercases only 'a'..'z'; all other chars are kept
+            for (int i = 0; i < s.length(); ++i) {
+                char c = s.charAt(i);
+                if ('a' <= c && c <= 'z') {  // first lowercase letter: only now build a new string
+                    StringBuilder sb = new StringBuilder(s.length());
+                    sb.append(s, 0, i).append((char)(c - 0x20));  // unchanged prefix, then c uppercased
+                    while (++i < s.length()) {  // copy the remainder, uppercasing as needed
+                        c = s.charAt(i);
+                        if ('a' <= c && c <= 'z') { c = (char)(c - 0x20); }
+                        sb.append(c);
+                    }
+                    return sb.toString();
+                }
+            }
+            return s;  // no lowercase ASCII letter found: the input itself is returned
+        }
+    }
+
    /**
     * Default constructor 
     * @stable ICU 3.6
     */
    public CharsetProviderICU() {
-        optionsString = null;
    }

    /**
-     * Constructs a charset for the given charset name. 
+     * Constructs a Charset for the given charset name. 
     * Implements the abstract method of super class.
     * @param charsetName charset name
-     * @return charset objet for the given charset name, null if unsupported
+     * @return Charset object for the given charset name, null if unsupported
     * @stable ICU 3.6
     */
    public final Charset charsetForName(String charsetName){
        try{
            // extract the options from the charset name
-            charsetName = processOptions(charsetName);
+            String optionsString = "";
+            if (charsetName.endsWith(UConverterConstants.OPTION_SWAP_LFNL_STRING)) {
+                /* Remove and save the swap lfnl option string portion of the charset name. */
+                optionsString = UConverterConstants.OPTION_SWAP_LFNL_STRING;
+                charsetName = charsetName.substring(0, charsetName.length() - optionsString.length());
+            }
            // get the canonical name
            String icuCanonicalName = getICUCanonicalName(charsetName);      
-    
-                // create the converter object and return it
+
+            // create the converter object and return it
            if(icuCanonicalName==null || icuCanonicalName.length()==0){
                // Try the original name, may be something added and not in the alias table. 
                // Will get an unsupported encoding exception if it doesn't work.
-                return getCharset(charsetName);
+                icuCanonicalName = charsetName;
            }
-            return getCharset(icuCanonicalName);
+            return getCharset(icuCanonicalName, optionsString);
        }catch(UnsupportedCharsetException ex){
        }catch(IOException ex){
        }
@ -144,19 +194,15 @@ public final class CharsetProviderICU extends CharsetProvider{
            throw new UnsupportedCharsetException(enc);
        } 
    }
-    private Charset getCharset(String icuCanonicalName) throws IOException{
-       String[] aliases = getAliases(icuCanonicalName);    
+    private static final Charset getCharset(String icuCanonicalName, String optionsString)
+            throws IOException {  // optionsString is appended as-is below; callers pass "" for no options
+       String[] aliases = getAliases(icuCanonicalName);
       String canonicalName = getJavaCanonicalName(icuCanonicalName);
-       
+
       /* Concat the option string to the icuCanonicalName so that the options can be handled properly
        * by the actual charset.
        */
-       if (optionsString != null) {
-           icuCanonicalName = icuCanonicalName.concat(optionsString);
-           optionsString = null;
-       }
-       
-       return (CharsetICU.getCharset(icuCanonicalName,canonicalName, aliases));
+       return (CharsetICU.getCharset(icuCanonicalName + optionsString, canonicalName, aliases));  // only the ICU name carries the options
    }
    /**
     * Gets the canonical name of the converter as defined by Java
@ -213,34 +259,18 @@ public final class CharsetProviderICU extends CharsetProvider{
                    cName = "x-"+ name;
                }
            }
-            /* After getting the java canonical name from ICU alias table, get the
-             * java canonical name from the current JDK. This is neccessary because
+            /* After getting the Java canonical name from the ICU alias table, get the
+             * Java canonical name from the current JDK. This is necessary because
             * different versions of the JVM (Sun and IBM) may have a different
-             * canonical name then the one given by ICU. So the java canonical name
-             * will depend on the current JVM.  Since java cannot use the ICU canonical 
-             * we have to try to use a java compatible name.
+             * canonical name than the one given by ICU. So the Java canonical name
+             * will depend on the current JVM.  Since Java cannot use the ICU canonical name
+             * we have to try to use a Java compatible name.
             */
            if (cName != null) {
-                try {
-                    if (Charset.isSupported(cName)) {
-                        String testName = Charset.forName(cName).name();
-                        /* Ensure that the java canonical name works in ICU */
-                        if (!testName.equals(cName)) {
-                            if (getICUCanonicalName(testName).length() > 0) {
-                                cName = testName;
-                            }
-                        }
-                    }
-                } catch (Exception e) {
-                    // Any exception in the try block above
-                    // must result Java's canonical name to be
-                    // null. This block is necessary to reset
-                    // gettingJavaCanonicalName to true always.
-                    // See #9966.
-                    // Note: The use of static gettingJavaCanonicalName
-                    // looks really dangerous and obviously thread unsafe.
-                    // We should revisit this code later. See #9973
-                    cName = null;
+                String testName = javaNamesMap.get(ASCII.toUpperCase(cName));
+                if (testName != null && !testName.equals(cName) &&
+                        getICUCanonicalName(testName).length() > 0) {
+                    cName = testName;
                }
            }
            return cName;
@ -283,46 +313,61 @@ public final class CharsetProviderICU extends CharsetProvider{
    
    }

-    private void putCharsets(Map<Charset, String> map){
+    /**
+     * Lazily builds the icuCharsets list; once it is non-empty, later calls return immediately.
+     * Could be done during static initialization if constructing all of the Charsets
+     * were cheap enough. See ICU ticket #11481.
+     */
+    private static final synchronized void loadAvailableICUCharsets() {
+        // Nothing to do if the list is already built. Also return while javaNamesMap is still
+        // empty, which is the case during static initialization when we are just about to build it.
+        if (!icuCharsets.isEmpty() || javaNamesMap.isEmpty()) {
+            return;
+        }
+        List<Charset> icucs = new LinkedList<Charset>();
        int num = UConverterAlias.countAvailable();
-        for(int i=0;i<num;i++) {
+        for (int i = 0; i < num; ++i) {
            String name = UConverterAlias.getAvailableName(i);
            try {
-                Charset cs = getCharset(name);
-                map.put(cs, getJavaCanonicalName(name));
-            }catch(UnsupportedCharsetException ex){
-            }catch (IOException e) {
+                Charset cs = getCharset(name, "");  // "" = no options suffix
+                icucs.add(cs);
+            } catch(UnsupportedCharsetException ex) {
+            } catch(IOException e) {
            }
            // add only charsets that can be created!
        }
+        // Unmodifiable so that remove() on the Iterator returned by charsets() cannot change it.
+        icuCharsets = Collections.unmodifiableList(icucs);
    }

    /**
-     * Returns an iterator for the available charsets.
+     * Returns an iterator for the available ICU Charsets, building the list on first use.
     * Implements the abstract method of super class.
-     * @return Iterator the charset name iterator
+     * @return the Charset iterator, backed by an unmodifiable list
     * @stable ICU 3.6
     */
-    public final Iterator<Charset> charsets(){
-        HashMap<Charset, String> map = new HashMap<Charset, String>();
-        putCharsets(map);
-        return map.keySet().iterator();
+    public final Iterator<Charset> charsets() {
+        loadAvailableICUCharsets();  // returns early if already built, or if javaNamesMap is still empty
+        return icuCharsets.iterator();
    }
-    
+
    /**
-     * Gets the canonical names of available converters 
+     * Gets the names (Charset.name()) of the available ICU Charsets, building the list on first use.
     * @return array of available converter names
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
-     public static final String[] getAvailableNames(){
-        CharsetProviderICU provider = new CharsetProviderICU();
-        HashMap<Charset, String> map = new HashMap<Charset, String>();
-        provider.putCharsets(map);
-        return map.values().toArray(new String[0]);
+     public static final String[] getAvailableNames() {
+        loadAvailableICUCharsets();
+        String[] names = new String[icuCharsets.size()];  // one slot per Charset in the list
+        int i = 0;
+        for (Charset cs : icuCharsets) {
+            names[i++] = cs.name();
+        }
+        return names;
    }
-     
+
    /**
     * Return all names available
     * @return String[] an array of all available names
@ -338,15 +383,4 @@ public final class CharsetProviderICU extends CharsetProvider{
        }
        return names;
    }
-    
-    private String processOptions(String charsetName) {
-        if (charsetName.indexOf(UConverterConstants.OPTION_SWAP_LFNL_STRING) > -1) {
-            /* Remove and save the swap lfnl option string portion of the charset name. */
-            optionsString = UConverterConstants.OPTION_SWAP_LFNL_STRING;
-            
-            charsetName = charsetName.substring(0, charsetName.indexOf(UConverterConstants.OPTION_SWAP_LFNL_STRING));
-        }
-        
-        return charsetName;
-    }
 }
--- a/icu4j/main/tests/charset/src/com/ibm/icu/dev/test/charset/TestCharset.java
+++ b/icu4j/main/tests/charset/src/com/ibm/icu/dev/test/charset/TestCharset.java
@ -1,9 +1,7 @@
 /**
 *******************************************************************************
-* Copyright (C) 2006-2014, International Business Machines Corporation and    *
-* others. All Rights Reserved.                                                *
-*******************************************************************************
-*
+* Copyright (C) 2006-2015, International Business Machines Corporation and
+* others. All Rights Reserved.
 *******************************************************************************
 */

@ -36,49 +34,6 @@ import com.ibm.icu.text.UTF16;
 import com.ibm.icu.text.UnicodeSet;

 public class TestCharset extends TestFmwk {
-    private String m_encoding = "UTF-16";
-    CharsetDecoder m_decoder = null;
-    CharsetEncoder m_encoder = null;
-    Charset m_charset =null;
-    static final String unistr = "abcd\ud800\udc00\u1234\u00a5\u3000\r\n";
-    static final byte[] byteStr ={   
-            (byte) 0x00,(byte) 'a',
-            (byte) 0x00,(byte) 'b',
-            (byte) 0x00,(byte) 'c',
-            (byte) 0x00,(byte) 'd',
-            (byte) 0xd8,(byte) 0x00,
-            (byte) 0xdc,(byte) 0x00,
-            (byte) 0x12,(byte) 0x34,
-            (byte) 0x00,(byte) 0xa5,
-            (byte) 0x30,(byte) 0x00,
-            (byte) 0x00,(byte) 0x0d,
-            (byte) 0x00,(byte) 0x0a };
-    static final byte[] expectedByteStr ={
-        (byte) 0xfe,(byte) 0xff,
-        (byte) 0x00,(byte) 'a',
-        (byte) 0x00,(byte) 'b',
-        (byte) 0x00,(byte) 'c',
-        (byte) 0x00,(byte) 'd',
-        (byte) 0xd8,(byte) 0x00,
-        (byte) 0xdc,(byte) 0x00,
-        (byte) 0x12,(byte) 0x34,
-        (byte) 0x00,(byte) 0xa5,
-        (byte) 0x30,(byte) 0x00,
-        (byte) 0x00,(byte) 0x0d,
-        (byte) 0x00,(byte) 0x0a };
-    
-    protected void init(){
-        try{
-            CharsetProviderICU provider = new CharsetProviderICU();
-            //Charset charset = CharsetICU.forName(encoding);
-            m_charset = provider.charsetForName(m_encoding);
-            m_decoder = (CharsetDecoder) m_charset.newDecoder();
-            m_encoder = (CharsetEncoder) m_charset.newEncoder();   
-        }catch(MissingResourceException ex){
-            warnln("Could not load charset data");
-        }
-    }
-    
    public static void main(String[] args) throws Exception {
        new TestCharset().run(args);
    }
@ -1180,23 +1135,63 @@ public class TestCharset extends TestFmwk {
 //    }
    

-    public void TestAPISemantics(/*String encoding*/) 
-                throws Exception {
-        int rc;
+    public void TestAPISemantics(/*String encoding*/) {
+        String encoding = "UTF-16";
+        CharsetDecoder decoder = null;
+        CharsetEncoder encoder = null;
+        try {
+            CharsetProviderICU provider = new CharsetProviderICU();
+            Charset charset = provider.charsetForName(encoding);
+            decoder = charset.newDecoder();
+            encoder = charset.newEncoder();
+        } catch(MissingResourceException ex) {
+            warnln("Could not load charset data: " + encoding);
+            return;
+        }
+
+        final String unistr = "abcd\ud800\udc00\u1234\u00a5\u3000\r\n";
+        final byte[] byteStr = {
+            (byte) 0x00,(byte) 'a',
+            (byte) 0x00,(byte) 'b',
+            (byte) 0x00,(byte) 'c',
+            (byte) 0x00,(byte) 'd',
+            (byte) 0xd8,(byte) 0x00,
+            (byte) 0xdc,(byte) 0x00,
+            (byte) 0x12,(byte) 0x34,
+            (byte) 0x00,(byte) 0xa5,
+            (byte) 0x30,(byte) 0x00,
+            (byte) 0x00,(byte) 0x0d,
+            (byte) 0x00,(byte) 0x0a
+        };
+        final byte[] expectedByteStr = {
+            (byte) 0xfe,(byte) 0xff,
+            (byte) 0x00,(byte) 'a',
+            (byte) 0x00,(byte) 'b',
+            (byte) 0x00,(byte) 'c',
+            (byte) 0x00,(byte) 'd',
+            (byte) 0xd8,(byte) 0x00,
+            (byte) 0xdc,(byte) 0x00,
+            (byte) 0x12,(byte) 0x34,
+            (byte) 0x00,(byte) 0xa5,
+            (byte) 0x30,(byte) 0x00,
+            (byte) 0x00,(byte) 0x0d,
+            (byte) 0x00,(byte) 0x0a
+        };
+
        ByteBuffer byes = ByteBuffer.wrap(byteStr);
        CharBuffer uniVal = CharBuffer.wrap(unistr);
        ByteBuffer expected = ByteBuffer.wrap(expectedByteStr);
-        
-        rc = 0;
-        if(m_decoder==null){
+
+        int rc = 0;
+        if(decoder==null){
            warnln("Could not load decoder.");
            return;
        }
-        m_decoder.reset();
+        decoder.reset();
        /* Convert the whole buffer to Unicode */
        try {
            CharBuffer chars = CharBuffer.allocate(unistr.length());
-            CoderResult result = m_decoder.decode(byes, chars, false);
+            CoderResult result = decoder.decode(byes, chars, false);

            if (result.isError()) {
                errln("ToChars encountered Error");
@ -1223,11 +1218,11 @@ public class TestCharset extends TestFmwk {
        try {
            CharBuffer chars = CharBuffer.allocate(unistr.length());
            ByteBuffer b = ByteBuffer.wrap(byteStr);
-            m_decoder.reset();
+            decoder.reset();
            CoderResult result=null;
            for (int i = 1; i <= byteStr.length; i++) {
                b.limit(i);
-                result = m_decoder.decode(b, chars, false);
+                result = decoder.decode(b, chars, false);
                if(result.isOverflow()){
                    errln("ToChars single threw an overflow exception");
                }
@ -1253,11 +1248,11 @@ public class TestCharset extends TestFmwk {
        /* Convert the buffer one at a time to Unicode */
        try {
            CharBuffer chars = CharBuffer.allocate(unistr.length());
-            m_decoder.reset();
+            decoder.reset();
            byes.rewind();
            for (int i = 1; i <= byteStr.length; i++) {
                byes.limit(i);
-                CoderResult result = m_decoder.decode(byes, chars, false);
+                CoderResult result = decoder.decode(byes, chars, false);
                if (result.isError()) {
                    errln("Error while decoding: "+result.toString());
                }
@ -1289,8 +1284,8 @@ public class TestCharset extends TestFmwk {
        /* Convert the whole buffer from unicode */
        try {
            ByteBuffer bytes = ByteBuffer.allocate(expectedByteStr.length);
-            m_encoder.reset();
-            CoderResult result = m_encoder.encode(uniVal, bytes, false);
+            encoder.reset();
+            CoderResult result = encoder.encode(uniVal, bytes, false);
            if (result.isError()) {
                errln("FromChars reported error: " + result.toString());
                rc = 1;
@ -1315,11 +1310,11 @@ public class TestCharset extends TestFmwk {
        try {
            ByteBuffer bytes = ByteBuffer.allocate(expectedByteStr.length);
            CharBuffer c = CharBuffer.wrap(unistr);
-            m_encoder.reset();
+            encoder.reset();
            CoderResult result= null;
            for (int i = 1; i <= unistr.length(); i++) {
                c.limit(i);
-                result = m_encoder.encode(c, bytes, false);
+                result = encoder.encode(c, bytes, false);
                if(result.isOverflow()){
                    errln("FromChars single threw an overflow exception");
                }
@ -1349,12 +1344,12 @@ public class TestCharset extends TestFmwk {
        /* Convert one char at a time to unicode */
        try {
            ByteBuffer bytes = ByteBuffer.allocate(expectedByteStr.length);
-            m_encoder.reset();
+            encoder.reset();
            char[] temp = unistr.toCharArray();
            CoderResult result=null;
            for (int i = 0; i <= temp.length; i++) {
                uniVal.limit(i);
-                result = m_encoder.encode(uniVal, bytes, false);
+                result = encoder.encode(uniVal, bytes, false);
                if(result.isOverflow()){
                    errln("FromChars simple threw an overflow exception");
                }
@ -1378,7 +1373,7 @@ public class TestCharset extends TestFmwk {
            rc = 9;
        }
        if (rc != 0) {
-            errln("Test Simple FromChars " + m_encoding + " --FAILED");
+            errln("Test Simple FromChars " + encoding + " --FAILED");
        }
    }

@ -1585,13 +1580,18 @@ public class TestCharset extends TestFmwk {
            '\u22B5','\u22B6','\u22B7','\u22B8','\u22B9',
            '\u22BA','\u22BB','\u22BC','\u22BD','\u22BE' 
            };
-        if(m_encoder==null){
-            warnln("Could not load encoder.");
+        String encoding = "UTF-16";
+        CharsetEncoder encoder = null;
+        try {
+            CharsetProviderICU provider = new CharsetProviderICU();
+            Charset charset = provider.charsetForName(encoding);
+            encoder = charset.newEncoder();
+        } catch(MissingResourceException ex) {
+            warnln("Could not load charset data: " + encoding);
            return;
        }
-        m_encoder.reset();
-        if (!m_encoder.canEncode(new String(mySource))) {
-            errln("Test canConvert() " + m_encoding + " failed. "+m_encoder);
+        if (!encoder.canEncode(new String(mySource))) {
+            errln("Test canConvert() " + encoding + " failed. "+encoder);
        }

    }
@ -2121,15 +2121,27 @@ public class TestCharset extends TestFmwk {
    }

    public void convertAllTest(ByteBuffer bSource, CharBuffer uSource) throws Exception {
+        String encoding = "UTF-16";
+        CharsetDecoder decoder = null;
+        CharsetEncoder encoder = null;
+        try {
+            CharsetProviderICU provider = new CharsetProviderICU();
+            Charset charset = provider.charsetForName(encoding);
+            decoder = charset.newDecoder();
+            encoder = charset.newEncoder();
+        } catch(MissingResourceException ex) {
+            warnln("Could not load charset data: " + encoding);
+            return;
+        }
        {
            try {
-                m_decoder.reset();
+                decoder.reset();
                ByteBuffer mySource = bSource.duplicate();
-                CharBuffer myTarget = m_decoder.decode(mySource);
+                CharBuffer myTarget = decoder.decode(mySource);
                if (!equals(myTarget, uSource)) {
                    errln(
                        "--Test convertAll() "
-                            + m_encoding
+                            + encoding
                            + " to Unicode  --FAILED");
                }
            } catch (Exception e) {
@ -2139,13 +2151,13 @@ public class TestCharset extends TestFmwk {
        }
        {
            try {
-                m_encoder.reset();
+                encoder.reset();
                CharBuffer mySource = CharBuffer.wrap(uSource);
-                ByteBuffer myTarget = m_encoder.encode(mySource);
+                ByteBuffer myTarget = encoder.encode(mySource);
                if (!equals(myTarget, bSource)) {
                    errln(
                        "--Test convertAll() "
-                            + m_encoding
+                            + encoding
                            + " to Unicode  --FAILED");
                }
            } catch (Exception e) {