From a40423e92aae8240cdee4c4fcc1a337ab71db61e Mon Sep 17 00:00:00 2001 From: Michael Ow Date: Mon, 6 Oct 2008 16:35:08 +0000 Subject: [PATCH] ICU-6562 Port over from ICU4C to ICU4J the EBCDIC SwapLFNL code. Allow converter with options in its name to be processed correct by the Charset. X-SVN-Rev: 24723 --- .../src/com/ibm/icu/charset/CharsetMBCS.java | 189 ++++++++++++++++-- .../ibm/icu/charset/CharsetProviderICU.java | 31 ++- .../ibm/icu/charset/UConverterConstants.java | 2 + 3 files changed, 209 insertions(+), 13 deletions(-) diff --git a/icu4j/src/com/ibm/icu/charset/CharsetMBCS.java b/icu4j/src/com/ibm/icu/charset/CharsetMBCS.java index 9b9076123d5..32e44b66230 100644 --- a/icu4j/src/com/ibm/icu/charset/CharsetMBCS.java +++ b/icu4j/src/com/ibm/icu/charset/CharsetMBCS.java @@ -156,7 +156,14 @@ class CharsetMBCS extends CharsetICU { public CharsetMBCS(String icuCanonicalName, String javaCanonicalName, String[] aliases, String classPath, ClassLoader loader) throws InvalidFormatException { super(icuCanonicalName, javaCanonicalName, aliases); - + + /* See if the icuCanonicalName contains certain option information. 
*/ + if (icuCanonicalName.contains(UConverterConstants.OPTION_SWAP_LFNL_STRING)) { + options = UConverterConstants.OPTION_SWAP_LFNL; + icuCanonicalName = icuCanonicalName.substring(0, icuCanonicalName.indexOf(UConverterConstants.OPTION_SWAP_LFNL_STRING)); + super.icuCanonicalName = icuCanonicalName; + } + // now try to load the data sharedData = loadConverter(1, icuCanonicalName, classPath, loader); @@ -169,9 +176,8 @@ class CharsetMBCS extends CharsetICU { subChar1 = sharedData.staticData.subChar1; fromUSubstitution = new byte[sharedData.staticData.subCharLen]; System.arraycopy(sharedData.staticData.subChar, 0, fromUSubstitution, 0, sharedData.staticData.subCharLen); - - // TODO: pass options - initializeConverter(0); + + initializeConverter(options); } public CharsetMBCS(String icuCanonicalName, String javaCanonicalName, String[] aliases) @@ -881,14 +887,15 @@ class CharsetMBCS extends CharsetICU { // agljport:todo umtx_unlock(NULL); if (!isCached) { - // agljport:fix if(!_EBCDICSwapLFNL(cnv->sharedData, pErrorCode)) { - // agljport:fix if(U_FAILURE(*pErrorCode)) { - // agljport:fix return; /* something went wrong */ - // agljport:fix } - - /* the option does not apply, remove it */ - // agljport:fix cnv->options=options&=~UCNV_OPTION_SWAP_LFNL; - // agljport:fix } + try { + if (!EBCDICSwapLFNL()) { + /* this option does not apply, remove it */ + this.options = myOptions &= ~UConverterConstants.OPTION_SWAP_LFNL; + } + } catch (Exception e) { + /* something went wrong. */ + return; + } } } @@ -914,6 +921,150 @@ class CharsetMBCS extends CharsetICU { } } } + /* EBCDIC swap LF<->NL--------------------------------------------------------------------------------*/ + /* + * This code modifies a standard EBCDIC<->Unicode mapping table for + * OS/390 (z/OS) Unix System Services (Open Edition). 
+ * The difference is in the mapping of Line Feed and New Line control codes: + * Standard EBCDIC maps + * + * <U000A> \x25 |0 + * <U0085> \x15 |0 + * + * but OS/390 USS EBCDIC swaps the control codes for LF and NL, + * mapping + * + * <U000A> \x15 |0 + * <U0085> \x25 |0 + * + * This code modifies a loaded standard EBCDIC<->Unicode mapping table + * by copying it into allocated memory and swapping the LF and NL values. + * It allows supporting the same EBCDIC charset in both versions without + * duplicating the entire installed table. + */ + /* standard EBCDIC codes */ + private static final short EBCDIC_LF = 0x0025; + private static final short EBCDIC_NL = 0x0015; + + /* standard EBCDIC codes with roundtrip flag as stored in Unicode-to-single-byte tables */ + private static final short EBCDIC_RT_LF = 0x0f25; + private static final short EBCDIC_RT_NL = 0x0f15; + + /* Unicode code points */ + private static final short U_LF = 0x000A; + private static final short U_NL = 0x0085; + + private boolean EBCDICSwapLFNL() throws Exception { + UConverterMBCSTable mbcsTable; + + char[] table; + byte[] results; + byte[] bytes; + + int[][] newStateTable; + byte[] newResults; + String newName; + + int stage2Entry; + int size, sizeofFromUBytes; + + mbcsTable = sharedData.mbcs; + + table = mbcsTable.fromUnicodeTable; + bytes = mbcsTable.fromUnicodeBytes; + results = bytes; + + /* + * Check that this is an EBCDIC table with SBCS portion - + * SBCS or EBCDIC with standard EBCDIC LF and NL mappings. + * + * If not, ignore the option. Options are always ignored if they do not apply. 
+ */ + if (!((mbcsTable.outputType == MBCS_OUTPUT_1 || mbcsTable.outputType == MBCS_OUTPUT_2_SISO) && + mbcsTable.stateTable[0][EBCDIC_LF] == MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF) && + mbcsTable.stateTable[0][EBCDIC_NL] == MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL))) { + return false; + } + + if (mbcsTable.outputType == MBCS_OUTPUT_1) { + if (!(EBCDIC_RT_LF == MBCS_SINGLE_RESULT_FROM_U(table, results, U_LF) && + EBCDIC_RT_NL == MBCS_SINGLE_RESULT_FROM_U(table, results, U_NL))) { + return false; + } + } else /* MBCS_OUTPUT_2_SISO */ { + stage2Entry = MBCS_STAGE_2_FROM_U(table, U_LF); + if (!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_LF) && + EBCDIC_LF == MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_LF))) { + return false; + } + + stage2Entry = MBCS_STAGE_2_FROM_U(table, U_NL); + if (!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_NL) && + EBCDIC_NL == MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_NL))) { + return false; + } + } + + if (mbcsTable.fromUBytesLength > 0) { + /* + * We _know_ the number of bytes in the fromUnicodeBytes array + * starting with header.version 4.1. + */ + sizeofFromUBytes = mbcsTable.fromUBytesLength; + } else { + /* + * Otherwise: + * There used to be code to enumerate the fromUnicode + * trie and find the highest entry, but it was removed in ICU 3.2 + * because it was not tested and caused a low code coverage number. + */ + throw new Exception("U_INVALID_FORMAT_ERROR"); + } + + /* + * The table has an appropriate format. 
+ * Allocate and build + * - a modified to-Unicode state table + * - a modified from-Unicode output array + * - a converter name string with the swap option appended + */ + size = mbcsTable.countStates * 1024 + sizeofFromUBytes + UConverterConstants.MAX_CONVERTER_NAME_LENGTH + 20; + + /* copy and modify the to-Unicode state table */ + newStateTable = new int[mbcsTable.stateTable.length][mbcsTable.stateTable[0].length]; + for (int i = 0; i < newStateTable.length; i++) { + System.arraycopy(mbcsTable.stateTable[i], 0, newStateTable[i], 0, newStateTable[i].length); + } + + newStateTable[0][EBCDIC_LF] = MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL); + newStateTable[0][EBCDIC_NL] = MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF); + + /* copy and modify the from-Unicode result table */ + newResults = new byte[sizeofFromUBytes]; + System.arraycopy(bytes, 0, newResults, 0, sizeofFromUBytes); + /* conveniently, the table access macros work on the left side of expressions */ + if (mbcsTable.outputType == MBCS_OUTPUT_1) { + MBCS_SINGLE_RESULT_FROM_U_SET(table, newResults, U_LF, EBCDIC_RT_NL); + MBCS_SINGLE_RESULT_FROM_U_SET(table, newResults, U_NL, EBCDIC_RT_LF); + } else /* MBCS_OUTPUT_2_SISO */ { + stage2Entry = MBCS_STAGE_2_FROM_U(table, U_LF); + MBCS_VALUE_2_FROM_STAGE_2_SET(newResults, stage2Entry, U_LF, EBCDIC_NL); + + stage2Entry = MBCS_STAGE_2_FROM_U(table, U_NL); + MBCS_VALUE_2_FROM_STAGE_2_SET(newResults, stage2Entry, U_NL, EBCDIC_LF); + } + + /* set the canonical converter name */ + newName = new String(icuCanonicalName); + newName.concat(UConverterConstants.OPTION_SWAP_LFNL_STRING); + + if (mbcsTable.swapLFNLStateTable == null) { + mbcsTable.swapLFNLStateTable = newStateTable; + mbcsTable.swapLFNLFromUnicodeBytes = newResults; + mbcsTable.swapLFNLName = newName; + } + return true; + } /** * MBCS output types for conversions from Unicode. 
These per-converter types determine the storage method in stage 3 @@ -1056,6 +1207,14 @@ class CharsetMBCS extends CharsetICU { int i = 2 * (table[i1] + (c & 0xf)); // used as index into byte[] array treated as char[] array return (char) (((results[i] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8) | (results[i + 1] & UConverterConstants.UNSIGNED_BYTE_MASK)); } + + /* single-byte fromUnicode: set the 16-bit result word with newValue*/ + static void MBCS_SINGLE_RESULT_FROM_U_SET(char[] table, byte[] results, int c, int newValue) { + int i1 = table[c >>> 10] + ((c >>> 4) & 0x3f); + int i = 2 * (table[i1] + (c & 0xf)); // used as index into byte[] array treated as char[] array + results[i] = (byte)((newValue >> 8) & UConverterConstants.UNSIGNED_BYTE_MASK); + results[i + 1] = (byte)(newValue & UConverterConstants.UNSIGNED_BYTE_MASK); + } /* multi-byte fromUnicode: get the 32-bit stage 2 entry */ static int MBCS_STAGE_2_FROM_U(char[] table, int c) { @@ -1073,6 +1232,12 @@ class CharsetMBCS extends CharsetICU { int i = 2 * (16 * ((char) stage2Entry & UConverterConstants.UNSIGNED_SHORT_MASK) + (c & 0xf)); return (char) (((bytes[i] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8) | (bytes[i + 1] & UConverterConstants.UNSIGNED_BYTE_MASK)); } + + static void MBCS_VALUE_2_FROM_STAGE_2_SET(byte[] bytes, int stage2Entry, int c, int newValue) { + int i = 2 * (16 * ((char) stage2Entry & UConverterConstants.UNSIGNED_SHORT_MASK) + (c & 0xf)); + bytes[i] = (byte)((newValue >> 8) & UConverterConstants.UNSIGNED_BYTE_MASK); + bytes[i + 1] = (byte)(newValue & UConverterConstants.UNSIGNED_BYTE_MASK); + } private static int MBCS_VALUE_4_FROM_STAGE_2(byte[] bytes, int stage2Entry, int c) { int i = 4 * (16 * ((char) stage2Entry & UConverterConstants.UNSIGNED_SHORT_MASK) + (c & 0xf)); diff --git a/icu4j/src/com/ibm/icu/charset/CharsetProviderICU.java b/icu4j/src/com/ibm/icu/charset/CharsetProviderICU.java index 21b225f7f9b..878e193361d 100644 --- 
a/icu4j/src/com/ibm/icu/charset/CharsetProviderICU.java +++ b/icu4j/src/com/ibm/icu/charset/CharsetProviderICU.java @@ -26,6 +26,8 @@ import com.ibm.icu.impl.InvalidFormatException; * @stable ICU 3.6 */ public final class CharsetProviderICU extends CharsetProvider{ + private static String optionsString = null; + private static boolean gettingJavaCanonicalName = false; /** * Default constructor @@ -43,6 +45,8 @@ public final class CharsetProviderICU extends CharsetProvider{ */ public final Charset charsetForName(String charsetName){ try{ + // extract the options from the charset name + charsetName = processOptions(charsetName); // get the canonical name String icuCanonicalName = getICUCanonicalName(charsetName); @@ -142,7 +146,20 @@ public final class CharsetProviderICU extends CharsetProvider{ private static final Charset getCharset(String icuCanonicalName) throws IOException{ String[] aliases = (String[])getAliases(icuCanonicalName); String canonicalName = getJavaCanonicalName(icuCanonicalName); - return (CharsetICU.getCharset(icuCanonicalName,canonicalName, aliases)); + + /* Concat the option string to the icuCanonicalName so that the options can be handled properly + * by the actual charset. + * Note: getJavaCanonicalName() may eventually call this method so skip the concatenation part + * during getJavaCanonicalName() call. + */ + if (gettingJavaCanonicalName) { + gettingJavaCanonicalName = false; + } else if (optionsString != null) { + icuCanonicalName = icuCanonicalName.concat(optionsString); + optionsString = null; + } + + return (CharsetICU.getCharset(icuCanonicalName,canonicalName, aliases)); } /** * Gets the canonical name of the converter as defined by Java @@ -206,6 +223,7 @@ public final class CharsetProviderICU extends CharsetProvider{ * we have to try to use a java compatible name. 
*/ if (cName != null) { + gettingJavaCanonicalName = true; if (Charset.isSupported(cName)) { cName = Charset.forName(cName).name(); } @@ -300,4 +318,15 @@ public final class CharsetProviderICU extends CharsetProvider{ } return names; } + + private static final String processOptions(String charsetName) { + if (charsetName.contains(UConverterConstants.OPTION_SWAP_LFNL_STRING)) { + /* Remove and save the swap lfnl option string portion of the charset name. */ + optionsString = UConverterConstants.OPTION_SWAP_LFNL_STRING; + + charsetName = charsetName.substring(0, charsetName.indexOf(UConverterConstants.OPTION_SWAP_LFNL_STRING)); + } + + return charsetName; + } } diff --git a/icu4j/src/com/ibm/icu/charset/UConverterConstants.java b/icu4j/src/com/ibm/icu/charset/UConverterConstants.java index 4c0c17e04e7..2fdf5070e47 100644 --- a/icu4j/src/com/ibm/icu/charset/UConverterConstants.java +++ b/icu4j/src/com/ibm/icu/charset/UConverterConstants.java @@ -73,6 +73,8 @@ interface UConverterConstants { static final int OPTION_SWAP_LFNL = 0x10; static final int OPTION_MAC = 0x20; //agljport:comment added for Mac ISCII encodings + static final String OPTION_SWAP_LFNL_STRING = ",swaplfnl"; + /** values for the unicodeMask */ static final int HAS_SUPPLEMENTARY = 1; static final int HAS_SURROGATES = 2;