ICU-6562 Port over from ICU4C to ICU4J the EBCDIC SwapLFNL code. Allow converter with options in its name to be processed correct by the Charset.

X-SVN-Rev: 24723
This commit is contained in:
Michael Ow 2008-10-06 16:35:08 +00:00
parent 2b50e0382e
commit a40423e92a
3 changed files with 209 additions and 13 deletions

View file

@ -156,7 +156,14 @@ class CharsetMBCS extends CharsetICU {
public CharsetMBCS(String icuCanonicalName, String javaCanonicalName, String[] aliases, String classPath,
ClassLoader loader) throws InvalidFormatException {
super(icuCanonicalName, javaCanonicalName, aliases);
/* See if the icuCanonicalName contains certain option information. */
if (icuCanonicalName.contains(UConverterConstants.OPTION_SWAP_LFNL_STRING)) {
options = UConverterConstants.OPTION_SWAP_LFNL;
icuCanonicalName = icuCanonicalName.substring(0, icuCanonicalName.indexOf(UConverterConstants.OPTION_SWAP_LFNL_STRING));
super.icuCanonicalName = icuCanonicalName;
}
// now try to load the data
sharedData = loadConverter(1, icuCanonicalName, classPath, loader);
@ -169,9 +176,8 @@ class CharsetMBCS extends CharsetICU {
subChar1 = sharedData.staticData.subChar1;
fromUSubstitution = new byte[sharedData.staticData.subCharLen];
System.arraycopy(sharedData.staticData.subChar, 0, fromUSubstitution, 0, sharedData.staticData.subCharLen);
// TODO: pass options
initializeConverter(0);
initializeConverter(options);
}
public CharsetMBCS(String icuCanonicalName, String javaCanonicalName, String[] aliases)
@ -881,14 +887,15 @@ class CharsetMBCS extends CharsetICU {
// agljport:todo umtx_unlock(NULL);
if (!isCached) {
// agljport:fix if(!_EBCDICSwapLFNL(cnv->sharedData, pErrorCode)) {
// agljport:fix if(U_FAILURE(*pErrorCode)) {
// agljport:fix return; /* something went wrong */
// agljport:fix }
/* the option does not apply, remove it */
// agljport:fix cnv->options=options&=~UCNV_OPTION_SWAP_LFNL;
// agljport:fix }
try {
if (!EBCDICSwapLFNL()) {
/* this option does not apply, remove it */
this.options = myOptions &= ~UConverterConstants.OPTION_SWAP_LFNL;
}
} catch (Exception e) {
/* something went wrong. */
return;
}
}
}
@ -914,6 +921,150 @@ class CharsetMBCS extends CharsetICU {
}
}
}
/* EBCDIC swap LF<->NL--------------------------------------------------------------------------------*/
/*
* This code modifies a standard EBCDIC<->Unicode mappling table for
* OS/390 (z/OS) Unix System Services (Open Edition).
* The difference is in the mapping of Line Feed and New Line control codes:
* Standard EBDIC maps
*
* <U000A> \x25 |0
* <U0085> \x15 |0
*
* but OS/390 USS EBCDIC swaps the control codes for LF and NL,
* mapping
*
* <U000A> \x15 |0
* <U0085> \x25 |0
*
* This code modifies a loaded standard EBCDIC<->Unicode mapping table
* by copying it into allocated memory and swapping the LF and NL values.
* It allows to support the same EBCDIC charset in both version without
* duplicating the entire installed table.
*/
/* standard EBCDIC codes */
private static final short EBCDIC_LF = 0x0025;
private static final short EBCDIC_NL = 0x0015;
/* standard EBCDIC codes with roundtrip flag as stored in Unicode-to-single-byte tables */
private static final short EBCDIC_RT_LF = 0x0f25;
private static final short EBCDIC_RT_NL = 0x0f15;
/* Unicode code points */
private static final short U_LF = 0x000A;
private static final short U_NL = 0x0085;
private boolean EBCDICSwapLFNL() throws Exception {
UConverterMBCSTable mbcsTable;
char[] table;
byte[] results;
byte[] bytes;
int[][] newStateTable;
byte[] newResults;
String newName;
int stage2Entry;
int size, sizeofFromUBytes;
mbcsTable = sharedData.mbcs;
table = mbcsTable.fromUnicodeTable;
bytes = mbcsTable.fromUnicodeBytes;
results = bytes;
/*
* Check that this is an EBCDIC table with SBCS portion -
* SBCS or EBCDIC with standard EBCDIC LF and NL mappings.
*
* If not, ignore the option Options are always ignored if they do not apply.
*/
if (!((mbcsTable.outputType == MBCS_OUTPUT_1 || mbcsTable.outputType == MBCS_OUTPUT_2_SISO) &&
mbcsTable.stateTable[0][EBCDIC_LF] == MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF) &&
mbcsTable.stateTable[0][EBCDIC_NL] == MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL))) {
return false;
}
if (mbcsTable.outputType == MBCS_OUTPUT_1) {
if (!(EBCDIC_RT_LF == MBCS_SINGLE_RESULT_FROM_U(table, results, U_LF) &&
EBCDIC_RT_NL == MBCS_SINGLE_RESULT_FROM_U(table, results, U_NL))) {
return false;
}
} else /* MBCS_OUTPUT_2_SISO */ {
stage2Entry = MBCS_STAGE_2_FROM_U(table, U_LF);
if (!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_LF) &&
EBCDIC_LF == MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_LF))) {
return false;
}
stage2Entry = MBCS_STAGE_2_FROM_U(table, U_NL);
if (!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_NL) &&
EBCDIC_NL == MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_NL))) {
return false;
}
}
if (mbcsTable.fromUBytesLength > 0) {
/*
* We _know_ the number of bytes in the fromUnicodeBytes array
* starting with header.version 4.1.
*/
sizeofFromUBytes = mbcsTable.fromUBytesLength;
} else {
/*
* Otherwise:
* There used to be code to enumerate the fromUnicode
* trie and find the highest entry, but it was removed in ICU 3.2
* because it was not tested and caused a low code coverage number.
*/
throw new Exception("U_INVALID_FORMAT_ERROR");
}
/*
* The table has an appropriate format.
* Allocate and build
* - a modified to-Unicode state table
* - a modified from-Unicode output array
* - a converter name string with the swap option appended
*/
size = mbcsTable.countStates * 1024 + sizeofFromUBytes + UConverterConstants.MAX_CONVERTER_NAME_LENGTH + 20;
/* copy and modify the to-Unicode state table */
newStateTable = new int[mbcsTable.stateTable.length][mbcsTable.stateTable[0].length];
for (int i = 0; i < newStateTable.length; i++) {
System.arraycopy(mbcsTable.stateTable[i], 0, newStateTable[i], 0, newStateTable[i].length);
}
newStateTable[0][EBCDIC_LF] = MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL);
newStateTable[0][EBCDIC_NL] = MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF);
/* copy and modify the from-Unicode result table */
newResults = new byte[sizeofFromUBytes];
System.arraycopy(bytes, 0, newResults, 0, sizeofFromUBytes);
/* conveniently, the table access macros work on the left side of expressions */
if (mbcsTable.outputType == MBCS_OUTPUT_1) {
MBCS_SINGLE_RESULT_FROM_U_SET(table, newResults, U_LF, EBCDIC_RT_NL);
MBCS_SINGLE_RESULT_FROM_U_SET(table, newResults, U_NL, EBCDIC_RT_LF);
} else /* MBCS_OUTPUT_2_SISO */ {
stage2Entry = MBCS_STAGE_2_FROM_U(table, U_LF);
MBCS_VALUE_2_FROM_STAGE_2_SET(newResults, stage2Entry, U_LF, EBCDIC_NL);
stage2Entry = MBCS_STAGE_2_FROM_U(table, U_NL);
MBCS_VALUE_2_FROM_STAGE_2_SET(newResults, stage2Entry, U_NL, EBCDIC_LF);
}
/* set the canonical converter name */
newName = new String(icuCanonicalName);
newName.concat(UConverterConstants.OPTION_SWAP_LFNL_STRING);
if (mbcsTable.swapLFNLStateTable == null) {
mbcsTable.swapLFNLStateTable = newStateTable;
mbcsTable.swapLFNLFromUnicodeBytes = newResults;
mbcsTable.swapLFNLName = newName;
}
return true;
}
/**
* MBCS output types for conversions from Unicode. These per-converter types determine the storage method in stage 3
@ -1056,6 +1207,14 @@ class CharsetMBCS extends CharsetICU {
int i = 2 * (table[i1] + (c & 0xf)); // used as index into byte[] array treated as char[] array
return (char) (((results[i] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8) | (results[i + 1] & UConverterConstants.UNSIGNED_BYTE_MASK));
}
/* single-byte fromUnicode: set the 16-bit result word with newValue*/
static void MBCS_SINGLE_RESULT_FROM_U_SET(char[] table, byte[] results, int c, int newValue) {
int i1 = table[c >>> 10] + ((c >>> 4) & 0x3f);
int i = 2 * (table[i1] + (c & 0xf)); // used as index into byte[] array treated as char[] array
results[i] = (byte)((newValue >> 8) & UConverterConstants.UNSIGNED_BYTE_MASK);
results[i + 1] = (byte)(newValue & UConverterConstants.UNSIGNED_BYTE_MASK);
}
/* multi-byte fromUnicode: get the 32-bit stage 2 entry */
static int MBCS_STAGE_2_FROM_U(char[] table, int c) {
@ -1073,6 +1232,12 @@ class CharsetMBCS extends CharsetICU {
int i = 2 * (16 * ((char) stage2Entry & UConverterConstants.UNSIGNED_SHORT_MASK) + (c & 0xf));
return (char) (((bytes[i] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8) | (bytes[i + 1] & UConverterConstants.UNSIGNED_BYTE_MASK));
}
static void MBCS_VALUE_2_FROM_STAGE_2_SET(byte[] bytes, int stage2Entry, int c, int newValue) {
int i = 2 * (16 * ((char) stage2Entry & UConverterConstants.UNSIGNED_SHORT_MASK) + (c & 0xf));
bytes[i] = (byte)((newValue >> 8) & UConverterConstants.UNSIGNED_BYTE_MASK);
bytes[i + 1] = (byte)(newValue & UConverterConstants.UNSIGNED_BYTE_MASK);
}
private static int MBCS_VALUE_4_FROM_STAGE_2(byte[] bytes, int stage2Entry, int c) {
int i = 4 * (16 * ((char) stage2Entry & UConverterConstants.UNSIGNED_SHORT_MASK) + (c & 0xf));

View file

@ -26,6 +26,8 @@ import com.ibm.icu.impl.InvalidFormatException;
* @stable ICU 3.6
*/
public final class CharsetProviderICU extends CharsetProvider{
private static String optionsString = null;
private static boolean gettingJavaCanonicalName = false;
/**
* Default constructor
@ -43,6 +45,8 @@ public final class CharsetProviderICU extends CharsetProvider{
*/
public final Charset charsetForName(String charsetName){
try{
// extract the options from the charset name
charsetName = processOptions(charsetName);
// get the canonical name
String icuCanonicalName = getICUCanonicalName(charsetName);
@ -142,7 +146,20 @@ public final class CharsetProviderICU extends CharsetProvider{
private static final Charset getCharset(String icuCanonicalName) throws IOException{
String[] aliases = (String[])getAliases(icuCanonicalName);
String canonicalName = getJavaCanonicalName(icuCanonicalName);
return (CharsetICU.getCharset(icuCanonicalName,canonicalName, aliases));
/* Concat the option string to the icuCanonicalName so that the options can be handled properly
* by the actual charset.
* Note: getJavaCanonicalName() may eventually call this method so skip the concatenation part
* during getJavaCanonicalName() call.
*/
if (gettingJavaCanonicalName) {
gettingJavaCanonicalName = false;
} else if (optionsString != null) {
icuCanonicalName = icuCanonicalName.concat(optionsString);
optionsString = null;
}
return (CharsetICU.getCharset(icuCanonicalName,canonicalName, aliases));
}
/**
* Gets the canonical name of the converter as defined by Java
@ -206,6 +223,7 @@ public final class CharsetProviderICU extends CharsetProvider{
* we have to try to use a java compatible name.
*/
if (cName != null) {
gettingJavaCanonicalName = true;
if (Charset.isSupported(cName)) {
cName = Charset.forName(cName).name();
}
@ -300,4 +318,15 @@ public final class CharsetProviderICU extends CharsetProvider{
}
return names;
}
private static final String processOptions(String charsetName) {
if (charsetName.contains(UConverterConstants.OPTION_SWAP_LFNL_STRING)) {
/* Remove and save the swap lfnl option string portion of the charset name. */
optionsString = UConverterConstants.OPTION_SWAP_LFNL_STRING;
charsetName = charsetName.substring(0, charsetName.indexOf(UConverterConstants.OPTION_SWAP_LFNL_STRING));
}
return charsetName;
}
}

View file

@ -73,6 +73,8 @@ interface UConverterConstants {
static final int OPTION_SWAP_LFNL = 0x10;
static final int OPTION_MAC = 0x20; //agljport:comment added for Mac ISCII encodings
static final String OPTION_SWAP_LFNL_STRING = ",swaplfnl";
/** values for the unicodeMask */
static final int HAS_SUPPLEMENTARY = 1;
static final int HAS_SURROGATES = 2;