ICU-11479 precompute Java canonical names for not-only-ICU Charsets, make CharsetProviderICU all-static again (see ticket #9973), simplify some test code

X-SVN-Rev: 36966
This commit is contained in:
Markus Scherer 2015-01-16 17:35:52 +00:00
parent 6572a72c63
commit 1604e1772d
3 changed files with 204 additions and 162 deletions

View file

@ -1,9 +1,7 @@
/**
*******************************************************************************
* Copyright (C) 2006-2014, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* Copyright (C) 2006-2015, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
@ -33,7 +31,6 @@ import com.ibm.icu.text.UnicodeSet;
public abstract class CharsetICU extends Charset{
String icuCanonicalName;
String javaCanonicalName;
int options;
float maxCharsPerByte;
@ -87,7 +84,6 @@ public abstract class CharsetICU extends Charset{
if(canonicalName.length() == 0){
throw new IllegalCharsetNameException(canonicalName);
}
this.javaCanonicalName = canonicalName;
this.icuCanonicalName = icuCanonicalName;
}

View file

@ -1,9 +1,7 @@
/**
*******************************************************************************
* Copyright (C) 2006-2014, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* Copyright (C) 2006-2015, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
@ -13,8 +11,11 @@ import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;
import java.nio.charset.spi.CharsetProvider;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import com.ibm.icu.impl.InvalidFormatException;
@ -26,37 +27,86 @@ import com.ibm.icu.impl.InvalidFormatException;
* @stable ICU 3.6
*/
public final class CharsetProviderICU extends CharsetProvider{
private String optionsString;
/**
* List of available ICU Charsets, empty during static initialization.
*/
private static List<Charset> icuCharsets = Collections.<Charset>emptyList();
/**
* Maps uppercased Java charset names and aliases to canonical Java charset names.
*/
private static final Map<String, String> javaNamesMap = new HashMap<String, String>();
static {
// This loop will exclude ICU charsets because Charset.availableCharsets() calls
// our charsets() which returns an empty iterator
// until we have tried to open all of the ICU charsets and built icuCharsets.
// We can only open ICU charsets when we have the javaNamesMap,
// for getting the Java canonical name.
for (Map.Entry<String, Charset> nameAndCharset : Charset.availableCharsets().entrySet()) {
String canonicalName = nameAndCharset.getKey();
javaNamesMap.put(ASCII.toUpperCase(canonicalName), canonicalName);
for (String alias : nameAndCharset.getValue().aliases()) {
javaNamesMap.put(ASCII.toUpperCase(alias), canonicalName);
}
}
}
/**
 * ASCII-only string helpers; simpler and faster than methods based on Unicode data.
 * TODO: There should be code like this somewhere already??
 */
private static final class ASCII {
    /**
     * Maps the ASCII letters a-z in {@code s} to A-Z and leaves every other char alone.
     *
     * @param s the string to convert
     * @return {@code s} itself if it contains no lowercase ASCII letter,
     *         otherwise a new string with those letters uppercased
     */
    static String toUpperCase(String s) {
        final int length = s.length();
        // Locate the first lowercase ASCII letter, if there is one.
        int first = 0;
        while (first < length && !isLowerAscii(s.charAt(first))) {
            ++first;
        }
        if (first == length) {
            // Nothing to change: hand back the very same instance.
            return s;
        }
        final char[] chars = s.toCharArray();
        for (int i = first; i < length; ++i) {
            if (isLowerAscii(chars[i])) {
                chars[i] -= 0x20;
            }
        }
        return new String(chars);
    }

    /** Returns true if {@code c} is one of the ASCII letters a-z. */
    private static boolean isLowerAscii(char c) {
        return 'a' <= c && c <= 'z';
    }
}
/**
 * Default constructor; starts with no saved converter option string.
 * @stable ICU 3.6
 */
public CharsetProviderICU() {
    // No option string has been extracted from a charset name yet.
    this.optionsString = null;
}
/**
* Constructs a charset for the given charset name.
* Constructs a Charset for the given charset name.
* Implements the abstract method of super class.
* @param charsetName charset name
* @return charset objet for the given charset name, null if unsupported
* @return Charset object for the given charset name, null if unsupported
* @stable ICU 3.6
*/
public final Charset charsetForName(String charsetName){
try{
// extract the options from the charset name
charsetName = processOptions(charsetName);
String optionsString = "";
if (charsetName.endsWith(UConverterConstants.OPTION_SWAP_LFNL_STRING)) {
/* Remove and save the swap lfnl option string portion of the charset name. */
optionsString = UConverterConstants.OPTION_SWAP_LFNL_STRING;
charsetName = charsetName.substring(0, charsetName.length() - optionsString.length());
}
// get the canonical name
String icuCanonicalName = getICUCanonicalName(charsetName);
// create the converter object and return it
// create the converter object and return it
if(icuCanonicalName==null || icuCanonicalName.length()==0){
// Try the original name, may be something added and not in the alias table.
// Will get an unsupported encoding exception if it doesn't work.
return getCharset(charsetName);
icuCanonicalName = charsetName;
}
return getCharset(icuCanonicalName);
return getCharset(icuCanonicalName, optionsString);
}catch(UnsupportedCharsetException ex){
}catch(IOException ex){
}
@ -144,19 +194,15 @@ public final class CharsetProviderICU extends CharsetProvider{
throw new UnsupportedCharsetException(enc);
}
}
private Charset getCharset(String icuCanonicalName) throws IOException{
String[] aliases = getAliases(icuCanonicalName);
private static final Charset getCharset(String icuCanonicalName, String optionsString)
throws IOException {
String[] aliases = getAliases(icuCanonicalName);
String canonicalName = getJavaCanonicalName(icuCanonicalName);
/* Concat the option string to the icuCanonicalName so that the options can be handled properly
* by the actual charset.
*/
if (optionsString != null) {
icuCanonicalName = icuCanonicalName.concat(optionsString);
optionsString = null;
}
return (CharsetICU.getCharset(icuCanonicalName,canonicalName, aliases));
return (CharsetICU.getCharset(icuCanonicalName + optionsString, canonicalName, aliases));
}
/**
* Gets the canonical name of the converter as defined by Java
@ -213,34 +259,18 @@ public final class CharsetProviderICU extends CharsetProvider{
cName = "x-"+ name;
}
}
/* After getting the java canonical name from ICU alias table, get the
* java canonical name from the current JDK. This is neccessary because
/* After getting the Java canonical name from the ICU alias table, get the
* Java canonical name from the current JDK. This is necessary because
* different versions of the JVM (Sun and IBM) may have a different
* canonical name then the one given by ICU. So the java canonical name
* will depend on the current JVM. Since java cannot use the ICU canonical
* we have to try to use a java compatible name.
* canonical name than the one given by ICU. So the Java canonical name
* will depend on the current JVM. Since Java cannot use the ICU canonical name
* we have to try to use a Java compatible name.
*/
if (cName != null) {
try {
if (Charset.isSupported(cName)) {
String testName = Charset.forName(cName).name();
/* Ensure that the java canonical name works in ICU */
if (!testName.equals(cName)) {
if (getICUCanonicalName(testName).length() > 0) {
cName = testName;
}
}
}
} catch (Exception e) {
// Any exception in the try block above
// must result Java's canonical name to be
// null. This block is necessary to reset
// gettingJavaCanonicalName to true always.
// See #9966.
// Note: The use of static gettingJavaCanonicalName
// looks really dangerous and obviously thread unsafe.
// We should revisit this code later. See #9973
cName = null;
String testName = javaNamesMap.get(ASCII.toUpperCase(cName));
if (testName != null && !testName.equals(cName) &&
getICUCanonicalName(testName).length() > 0) {
cName = testName;
}
}
return cName;
@ -283,46 +313,61 @@ public final class CharsetProviderICU extends CharsetProvider{
}
private void putCharsets(Map<Charset, String> map){
/**
* Lazy-init the icuCharsets list.
* Could be done during static initialization if constructing all of the Charsets
* were cheap enough. See ICU ticket #11481.
*/
private static final synchronized void loadAvailableICUCharsets() {
// The Java names Map is empty during static initialization when we are
// just about to build it.
if (!icuCharsets.isEmpty() || javaNamesMap.isEmpty()) {
return;
}
List<Charset> icucs = new LinkedList<Charset>();
int num = UConverterAlias.countAvailable();
for(int i=0;i<num;i++) {
for (int i = 0; i < num; ++i) {
String name = UConverterAlias.getAvailableName(i);
try {
Charset cs = getCharset(name);
map.put(cs, getJavaCanonicalName(name));
}catch(UnsupportedCharsetException ex){
}catch (IOException e) {
Charset cs = getCharset(name, "");
icucs.add(cs);
} catch(UnsupportedCharsetException ex) {
} catch(IOException e) {
}
// add only charsets that can be created!
}
// Unmodifiable so that charsets().next().remove() cannot change it.
icuCharsets = Collections.unmodifiableList(icucs);
}
/**
* Returns an iterator for the available charsets.
* Returns an iterator for the available ICU Charsets.
* Implements the abstract method of super class.
* @return Iterator the charset name iterator
* @return the Charset iterator
* @stable ICU 3.6
*/
public final Iterator<Charset> charsets(){
HashMap<Charset, String> map = new HashMap<Charset, String>();
putCharsets(map);
return map.keySet().iterator();
public final Iterator<Charset> charsets() {
loadAvailableICUCharsets();
return icuCharsets.iterator();
}
/**
* Gets the canonical names of available converters
* Gets the canonical names of available ICU converters
* @return array of available converter names
* @internal
* @deprecated This API is ICU internal only.
*/
@Deprecated
public static final String[] getAvailableNames(){
CharsetProviderICU provider = new CharsetProviderICU();
HashMap<Charset, String> map = new HashMap<Charset, String>();
provider.putCharsets(map);
return map.values().toArray(new String[0]);
public static final String[] getAvailableNames() {
loadAvailableICUCharsets();
String[] names = new String[icuCharsets.size()];
int i = 0;
for (Charset cs : icuCharsets) {
names[i++] = cs.name();
}
return names;
}
/**
* Return all names available
* @return String[] an array of all available names
@ -338,15 +383,4 @@ public final class CharsetProviderICU extends CharsetProvider{
}
return names;
}
/**
 * Splits the swap-LFNL option off a charset name.
 * If the option string occurs in {@code charsetName}, it is saved in the
 * {@code optionsString} field and only the part of the name before its first
 * occurrence is returned; otherwise the name is returned as is.
 *
 * @param charsetName charset name, possibly carrying the swap-LFNL option
 * @return the charset name without the option portion
 */
private String processOptions(String charsetName) {
    final String swapLfnl = UConverterConstants.OPTION_SWAP_LFNL_STRING;
    final int optionStart = charsetName.indexOf(swapLfnl);
    if (optionStart < 0) {
        // No option present: nothing to save, nothing to strip.
        return charsetName;
    }
    /* Remove and save the swap lfnl option string portion of the charset name. */
    optionsString = swapLfnl;
    return charsetName.substring(0, optionStart);
}
}

View file

@ -1,9 +1,7 @@
/**
*******************************************************************************
* Copyright (C) 2006-2014, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* Copyright (C) 2006-2015, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
@ -36,49 +34,6 @@ import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
public class TestCharset extends TestFmwk {
private String m_encoding = "UTF-16";
CharsetDecoder m_decoder = null;
CharsetEncoder m_encoder = null;
Charset m_charset =null;
static final String unistr = "abcd\ud800\udc00\u1234\u00a5\u3000\r\n";
static final byte[] byteStr ={
(byte) 0x00,(byte) 'a',
(byte) 0x00,(byte) 'b',
(byte) 0x00,(byte) 'c',
(byte) 0x00,(byte) 'd',
(byte) 0xd8,(byte) 0x00,
(byte) 0xdc,(byte) 0x00,
(byte) 0x12,(byte) 0x34,
(byte) 0x00,(byte) 0xa5,
(byte) 0x30,(byte) 0x00,
(byte) 0x00,(byte) 0x0d,
(byte) 0x00,(byte) 0x0a };
static final byte[] expectedByteStr ={
(byte) 0xfe,(byte) 0xff,
(byte) 0x00,(byte) 'a',
(byte) 0x00,(byte) 'b',
(byte) 0x00,(byte) 'c',
(byte) 0x00,(byte) 'd',
(byte) 0xd8,(byte) 0x00,
(byte) 0xdc,(byte) 0x00,
(byte) 0x12,(byte) 0x34,
(byte) 0x00,(byte) 0xa5,
(byte) 0x30,(byte) 0x00,
(byte) 0x00,(byte) 0x0d,
(byte) 0x00,(byte) 0x0a };
/**
 * Opens the charset named by {@code m_encoding} through a new CharsetProviderICU
 * and stores it, together with a new decoder and encoder, in the
 * {@code m_charset}, {@code m_decoder} and {@code m_encoder} fields.
 * A MissingResourceException is reported as a warning instead of being propagated.
 */
protected void init() {
    try {
        m_charset = new CharsetProviderICU().charsetForName(m_encoding);
        m_decoder = m_charset.newDecoder();
        m_encoder = m_charset.newEncoder();
    } catch (MissingResourceException ex) {
        // Converter data is unavailable; the fields not yet assigned keep their old values.
        warnln("Could not load charset data");
    }
}
/**
 * Command-line entry point: runs this test class with the given arguments.
 *
 * @param args arguments passed through to {@code run}
 * @throws Exception whatever {@code run} throws
 */
public static void main(String[] args) throws Exception {
    TestCharset test = new TestCharset();
    test.run(args);
}
@ -1180,23 +1135,63 @@ public class TestCharset extends TestFmwk {
// }
public void TestAPISemantics(/*String encoding*/)
throws Exception {
int rc;
public void TestAPISemantics(/*String encoding*/) {
String encoding = "UTF-16";
CharsetDecoder decoder = null;
CharsetEncoder encoder = null;
try {
CharsetProviderICU provider = new CharsetProviderICU();
Charset charset = provider.charsetForName(encoding);
decoder = charset.newDecoder();
encoder = charset.newEncoder();
} catch(MissingResourceException ex) {
warnln("Could not load charset data: " + encoding);
return;
}
final String unistr = "abcd\ud800\udc00\u1234\u00a5\u3000\r\n";
final byte[] byteStr = {
(byte) 0x00,(byte) 'a',
(byte) 0x00,(byte) 'b',
(byte) 0x00,(byte) 'c',
(byte) 0x00,(byte) 'd',
(byte) 0xd8,(byte) 0x00,
(byte) 0xdc,(byte) 0x00,
(byte) 0x12,(byte) 0x34,
(byte) 0x00,(byte) 0xa5,
(byte) 0x30,(byte) 0x00,
(byte) 0x00,(byte) 0x0d,
(byte) 0x00,(byte) 0x0a
};
final byte[] expectedByteStr = {
(byte) 0xfe,(byte) 0xff,
(byte) 0x00,(byte) 'a',
(byte) 0x00,(byte) 'b',
(byte) 0x00,(byte) 'c',
(byte) 0x00,(byte) 'd',
(byte) 0xd8,(byte) 0x00,
(byte) 0xdc,(byte) 0x00,
(byte) 0x12,(byte) 0x34,
(byte) 0x00,(byte) 0xa5,
(byte) 0x30,(byte) 0x00,
(byte) 0x00,(byte) 0x0d,
(byte) 0x00,(byte) 0x0a
};
ByteBuffer byes = ByteBuffer.wrap(byteStr);
CharBuffer uniVal = CharBuffer.wrap(unistr);
ByteBuffer expected = ByteBuffer.wrap(expectedByteStr);
rc = 0;
if(m_decoder==null){
int rc = 0;
if(decoder==null){
warnln("Could not load decoder.");
return;
}
m_decoder.reset();
decoder.reset();
/* Convert the whole buffer to Unicode */
try {
CharBuffer chars = CharBuffer.allocate(unistr.length());
CoderResult result = m_decoder.decode(byes, chars, false);
CoderResult result = decoder.decode(byes, chars, false);
if (result.isError()) {
errln("ToChars encountered Error");
@ -1223,11 +1218,11 @@ public class TestCharset extends TestFmwk {
try {
CharBuffer chars = CharBuffer.allocate(unistr.length());
ByteBuffer b = ByteBuffer.wrap(byteStr);
m_decoder.reset();
decoder.reset();
CoderResult result=null;
for (int i = 1; i <= byteStr.length; i++) {
b.limit(i);
result = m_decoder.decode(b, chars, false);
result = decoder.decode(b, chars, false);
if(result.isOverflow()){
errln("ToChars single threw an overflow exception");
}
@ -1253,11 +1248,11 @@ public class TestCharset extends TestFmwk {
/* Convert the buffer one at a time to Unicode */
try {
CharBuffer chars = CharBuffer.allocate(unistr.length());
m_decoder.reset();
decoder.reset();
byes.rewind();
for (int i = 1; i <= byteStr.length; i++) {
byes.limit(i);
CoderResult result = m_decoder.decode(byes, chars, false);
CoderResult result = decoder.decode(byes, chars, false);
if (result.isError()) {
errln("Error while decoding: "+result.toString());
}
@ -1289,8 +1284,8 @@ public class TestCharset extends TestFmwk {
/* Convert the whole buffer from unicode */
try {
ByteBuffer bytes = ByteBuffer.allocate(expectedByteStr.length);
m_encoder.reset();
CoderResult result = m_encoder.encode(uniVal, bytes, false);
encoder.reset();
CoderResult result = encoder.encode(uniVal, bytes, false);
if (result.isError()) {
errln("FromChars reported error: " + result.toString());
rc = 1;
@ -1315,11 +1310,11 @@ public class TestCharset extends TestFmwk {
try {
ByteBuffer bytes = ByteBuffer.allocate(expectedByteStr.length);
CharBuffer c = CharBuffer.wrap(unistr);
m_encoder.reset();
encoder.reset();
CoderResult result= null;
for (int i = 1; i <= unistr.length(); i++) {
c.limit(i);
result = m_encoder.encode(c, bytes, false);
result = encoder.encode(c, bytes, false);
if(result.isOverflow()){
errln("FromChars single threw an overflow exception");
}
@ -1349,12 +1344,12 @@ public class TestCharset extends TestFmwk {
/* Convert one char at a time to unicode */
try {
ByteBuffer bytes = ByteBuffer.allocate(expectedByteStr.length);
m_encoder.reset();
encoder.reset();
char[] temp = unistr.toCharArray();
CoderResult result=null;
for (int i = 0; i <= temp.length; i++) {
uniVal.limit(i);
result = m_encoder.encode(uniVal, bytes, false);
result = encoder.encode(uniVal, bytes, false);
if(result.isOverflow()){
errln("FromChars simple threw an overflow exception");
}
@ -1378,7 +1373,7 @@ public class TestCharset extends TestFmwk {
rc = 9;
}
if (rc != 0) {
errln("Test Simple FromChars " + m_encoding + " --FAILED");
errln("Test Simple FromChars " + encoding + " --FAILED");
}
}
@ -1585,13 +1580,18 @@ public class TestCharset extends TestFmwk {
'\u22B5','\u22B6','\u22B7','\u22B8','\u22B9',
'\u22BA','\u22BB','\u22BC','\u22BD','\u22BE'
};
if(m_encoder==null){
warnln("Could not load encoder.");
String encoding = "UTF-16";
CharsetEncoder encoder = null;
try {
CharsetProviderICU provider = new CharsetProviderICU();
Charset charset = provider.charsetForName(encoding);
encoder = charset.newEncoder();
} catch(MissingResourceException ex) {
warnln("Could not load charset data: " + encoding);
return;
}
m_encoder.reset();
if (!m_encoder.canEncode(new String(mySource))) {
errln("Test canConvert() " + m_encoding + " failed. "+m_encoder);
if (!encoder.canEncode(new String(mySource))) {
errln("Test canConvert() " + encoding + " failed. "+encoder);
}
}
@ -2121,15 +2121,27 @@ public class TestCharset extends TestFmwk {
}
public void convertAllTest(ByteBuffer bSource, CharBuffer uSource) throws Exception {
String encoding = "UTF-16";
CharsetDecoder decoder = null;
CharsetEncoder encoder = null;
try {
CharsetProviderICU provider = new CharsetProviderICU();
Charset charset = provider.charsetForName(encoding);
decoder = charset.newDecoder();
encoder = charset.newEncoder();
} catch(MissingResourceException ex) {
warnln("Could not load charset data: " + encoding);
return;
}
{
try {
m_decoder.reset();
decoder.reset();
ByteBuffer mySource = bSource.duplicate();
CharBuffer myTarget = m_decoder.decode(mySource);
CharBuffer myTarget = decoder.decode(mySource);
if (!equals(myTarget, uSource)) {
errln(
"--Test convertAll() "
+ m_encoding
+ encoding
+ " to Unicode --FAILED");
}
} catch (Exception e) {
@ -2139,13 +2151,13 @@ public class TestCharset extends TestFmwk {
}
{
try {
m_encoder.reset();
encoder.reset();
CharBuffer mySource = CharBuffer.wrap(uSource);
ByteBuffer myTarget = m_encoder.encode(mySource);
ByteBuffer myTarget = encoder.encode(mySource);
if (!equals(myTarget, bSource)) {
errln(
"--Test convertAll() "
+ m_encoding
+ encoding
+ " to Unicode --FAILED");
}
} catch (Exception e) {