From e33252c10262433d32fcecca193b1744aa76db31 Mon Sep 17 00:00:00 2001 From: Yoshito Umaoka Date: Sat, 26 Aug 2006 05:30:49 +0000 Subject: [PATCH] ICU-5018 charset conversion support X-SVN-Rev: 20172 --- icu4j/build.xml | 7 +- .../java.nio.charset.spi.CharsetProvider | 3 + .../com/ibm/icu/charset/CharsetCallback.java | 158 + .../ibm/icu/charset/CharsetDecoderICU.java | 639 +++ .../ibm/icu/charset/CharsetEncoderICU.java | 631 +++ icu4j/src/com/ibm/icu/charset/CharsetICU.java | 192 + .../ibm/icu/charset/CharsetProviderICU.java | 260 ++ icu4j/src/com/ibm/icu/impl/CharsetMBCS.java | 3568 +++++++++++++++++ icu4j/src/com/ibm/icu/impl/CharsetUTF16.java | 446 +++ .../src/com/ibm/icu/impl/CharsetUTF16LE.java | 449 +++ icu4j/src/com/ibm/icu/impl/CharsetUTF32.java | 318 ++ .../src/com/ibm/icu/impl/CharsetUTF32LE.java | 318 ++ icu4j/src/com/ibm/icu/impl/CharsetUTF8.java | 508 +++ .../ibm/icu/impl/InvalidFormatException.java | 16 + .../src/com/ibm/icu/impl/UConverterAlias.java | 789 ++++ .../icu/impl/UConverterAliasDataReader.java | 218 + .../impl/UConverterAliasesEnumeration.java | 83 + .../com/ibm/icu/impl/UConverterConstants.java | 156 + .../ibm/icu/impl/UConverterDataReader.java | 552 +++ .../ibm/icu/impl/UConverterSharedData.java | 545 +++ .../ibm/icu/impl/UConverterStaticData.java | 61 + 21 files changed, 9915 insertions(+), 2 deletions(-) create mode 100644 icu4j/src/META-INF/services/java.nio.charset.spi.CharsetProvider create mode 100644 icu4j/src/com/ibm/icu/charset/CharsetCallback.java create mode 100644 icu4j/src/com/ibm/icu/charset/CharsetDecoderICU.java create mode 100644 icu4j/src/com/ibm/icu/charset/CharsetEncoderICU.java create mode 100644 icu4j/src/com/ibm/icu/charset/CharsetICU.java create mode 100644 icu4j/src/com/ibm/icu/charset/CharsetProviderICU.java create mode 100644 icu4j/src/com/ibm/icu/impl/CharsetMBCS.java create mode 100644 icu4j/src/com/ibm/icu/impl/CharsetUTF16.java create mode 100644 icu4j/src/com/ibm/icu/impl/CharsetUTF16LE.java create mode 
100644 icu4j/src/com/ibm/icu/impl/CharsetUTF32.java create mode 100644 icu4j/src/com/ibm/icu/impl/CharsetUTF32LE.java create mode 100644 icu4j/src/com/ibm/icu/impl/CharsetUTF8.java create mode 100644 icu4j/src/com/ibm/icu/impl/InvalidFormatException.java create mode 100644 icu4j/src/com/ibm/icu/impl/UConverterAlias.java create mode 100644 icu4j/src/com/ibm/icu/impl/UConverterAliasDataReader.java create mode 100644 icu4j/src/com/ibm/icu/impl/UConverterAliasesEnumeration.java create mode 100644 icu4j/src/com/ibm/icu/impl/UConverterConstants.java create mode 100644 icu4j/src/com/ibm/icu/impl/UConverterDataReader.java create mode 100644 icu4j/src/com/ibm/icu/impl/UConverterSharedData.java create mode 100644 icu4j/src/com/ibm/icu/impl/UConverterStaticData.java diff --git a/icu4j/build.xml b/icu4j/build.xml index 4daefcdba0a..023256ac715 100644 --- a/icu4j/build.xml +++ b/icu4j/build.xml @@ -177,7 +177,7 @@ - + + + diff --git a/icu4j/src/META-INF/services/java.nio.charset.spi.CharsetProvider b/icu4j/src/META-INF/services/java.nio.charset.spi.CharsetProvider new file mode 100644 index 00000000000..ca798e7dd4a --- /dev/null +++ b/icu4j/src/META-INF/services/java.nio.charset.spi.CharsetProvider @@ -0,0 +1,3 @@ +# Copyright (C) 2006, International Business Machines Corporation and others. All Rights Reserved. +# icu4j converters +com.ibm.icu.charset.CharsetProviderICU diff --git a/icu4j/src/com/ibm/icu/charset/CharsetCallback.java b/icu4j/src/com/ibm/icu/charset/CharsetCallback.java new file mode 100644 index 00000000000..9a2d14bf9e2 --- /dev/null +++ b/icu4j/src/com/ibm/icu/charset/CharsetCallback.java @@ -0,0 +1,158 @@ +/** +******************************************************************************* +* Copyright (C) 2006, International Business Machines Corporation and * +* others. All Rights Reserved. 
* +******************************************************************************* +* +******************************************************************************* +*/ + +package com.ibm.icu.charset; + +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.IntBuffer; +import java.nio.charset.CoderResult; + + +/*public*/ class CharsetCallback { + /** + * FROM_U, TO_U context options for sub callback + * @draft ICU 3.6 + */ + /*public*/ static final String SUB_STOP_ON_ILLEGAL = "i"; + + /** + * FROM_U, TO_U context options for skip callback + * @draft ICU 3.6 + */ + /*public*/ static final String SKIP_STOP_ON_ILLEGAL = "i"; + + /** + * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to ICU (%UXXXX) + * @draft ICU 3.6 + */ + /*public*/ static final String ESCAPE_ICU = null; + /** + * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to JAVA (\\uXXXX) + * @draft ICU 3.6 + */ + /*public*/ static final String ESCAPE_JAVA = "J"; + /** + * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to C (\\uXXXX \\UXXXXXXXX) + * TO_U_CALLBACK_ESCAPE option to escape the character value according to C (\\xXXXX) + * @draft ICU 3.6 + */ + /*public*/ static final String ESCAPE_C = "C"; + /** + * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to XML Decimal escape \htmlonly(&#DDDD;)\endhtmlonly + * TO_U_CALLBACK_ESCAPE context option to escape the character value according to XML Decimal escape \htmlonly(&#DDDD;)\endhtmlonly + * @draft ICU 3.6 + */ + /*public*/ static final String ESCAPE_XML_DEC = "D"; + /** + * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to XML Hex escape \htmlonly(&#xXXXX;)\endhtmlonly + * TO_U_CALLBACK_ESCAPE context option to escape the character value according to XML Hex escape \htmlonly(&#xXXXX;)\endhtmlonly + * @draft ICU 3.6 + */ + /*public*/ static final String ESCAPE_XML_HEX = "X"; + /** + * FROM_U_CALLBACK_ESCAPE context
option to escape the code unit according to Unicode (U+XXXXX) + * @draft ICU 3.6 + */ + /*public*/ static final String ESCAPE_UNICODE = "U"; + + public interface Decoder { + public CoderResult call(CharsetDecoderICU decoder, Object context, + ByteBuffer source, CharBuffer target, IntBuffer offsets, + char[] buffer, int length, CoderResult cr); + } + + public interface Encoder { + public CoderResult call(CharsetEncoderICU encoder, Object context, + CharBuffer source, ByteBuffer target, IntBuffer offsets, + char[] buffer, int length, int cp, CoderResult cr); + } + public static final Encoder FROM_U_CALLBACK_SKIP = new Encoder() { + public CoderResult call(CharsetEncoderICU encoder, Object context, + CharBuffer source, ByteBuffer target, IntBuffer offsets, + char[] buffer, int length, int cp, CoderResult cr){ + if(context==null){ + return CoderResult.UNDERFLOW; + }else if(((String)context).equals(SUB_STOP_ON_ILLEGAL)){ + if(!cr.isUnmappable()){ + return cr; + }else{ + return CoderResult.UNDERFLOW; + } + } + return cr; + } + }; + public static final Decoder TO_U_CALLBACK_SKIP = new Decoder() { + public CoderResult call(CharsetDecoderICU decoder, Object context, + ByteBuffer source, CharBuffer target, IntBuffer offsets, + char[] buffer, int length, CoderResult cr){ + if(context==null){ + return CoderResult.UNDERFLOW; + }else if(((String)context).equals(SUB_STOP_ON_ILLEGAL)){ + if(!cr.isUnmappable()){ + return cr; + }else{ + return CoderResult.UNDERFLOW; + } + } + return cr; + } + }; + + public static final Encoder FROM_U_CALLBACK_SUBSTITUTE = new Encoder(){ + public CoderResult call(CharsetEncoderICU encoder, Object context, + CharBuffer source, ByteBuffer target, IntBuffer offsets, + char[] buffer, int length, int cp, CoderResult cr){ + if(context==null){ + return encoder.cbFromUWriteSub(encoder, source, target, offsets); + }else if(((String)context).equals(SUB_STOP_ON_ILLEGAL)){ + if(!cr.isUnmappable()){ + return cr; + }else{ + return encoder.cbFromUWriteSub(encoder,
source, target, offsets); + } + } + return cr; + } + }; + + public static final Decoder TO_U_CALLBACK_SUBSTITUTE = new Decoder() { + public CoderResult call(CharsetDecoderICU decoder, Object context, + ByteBuffer source, CharBuffer target, IntBuffer offsets, + char[] buffer, int length, CoderResult cr){ + + if(context==null){ + return decoder.cbToUWriteSub(decoder, source, target, offsets); + }else if(((String)context).equals(SUB_STOP_ON_ILLEGAL)){ + if(!cr.isUnmappable()){ + return cr; + }else{ + return decoder.cbToUWriteSub(decoder, source, target, offsets); + } + } + return cr; + } + }; + + public static final Encoder FROM_U_CALLBACK_STOP = new Encoder() { + public CoderResult call(CharsetEncoderICU encoder, Object context, + CharBuffer source, ByteBuffer target, IntBuffer offsets, + char[] buffer, int length, int cp, CoderResult cr){ + return cr; + } + }; + public static final Decoder TO_U_CALLBACK_STOP = new Decoder() { + public CoderResult call(CharsetDecoderICU decoder, Object context, + ByteBuffer source, CharBuffer target, IntBuffer offsets, + char[] buffer, int length, CoderResult cr){ + return cr; + } + }; +} diff --git a/icu4j/src/com/ibm/icu/charset/CharsetDecoderICU.java b/icu4j/src/com/ibm/icu/charset/CharsetDecoderICU.java new file mode 100644 index 00000000000..09f791f9021 --- /dev/null +++ b/icu4j/src/com/ibm/icu/charset/CharsetDecoderICU.java @@ -0,0 +1,639 @@ +/** +******************************************************************************* +* Copyright (C) 2006, International Business Machines Corporation and * +* others. All Rights Reserved. 
* +******************************************************************************* +* +******************************************************************************* +*/ + +package com.ibm.icu.charset; + +import java.nio.BufferOverflowException; +import java.nio.CharBuffer; +import java.nio.IntBuffer; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CoderResult; +import java.nio.charset.CodingErrorAction; +import java.nio.charset.MalformedInputException; +import java.nio.ByteBuffer; + +import com.ibm.icu.impl.Assert; + +public abstract class CharsetDecoderICU extends CharsetDecoder{ + + protected int toUnicodeStatus; + protected byte[] toUBytesArray = new byte[128]; + protected int toUBytesBegin = 0; + protected int toULength; + protected char[] charErrorBufferArray = new char[128]; + protected int charErrorBufferLength; + protected int charErrorBufferBegin; + protected char[] invalidCharBuffer = new char[128]; + protected int invalidCharLength; + + /* store previous UChars/chars to continue partial matches */ + protected byte[] preToUArray; + protected int preToUBegin; + protected int preToULength; /* negative: replay */ + protected int preToUFirstLength; /* length of first character */ + + protected Object toUContext = null; + private CharsetCallback.Decoder onUnmappableInput = CharsetCallback.TO_U_CALLBACK_STOP; + private CharsetCallback.Decoder onMalformedInput = CharsetCallback.TO_U_CALLBACK_STOP; + protected CharsetCallback.Decoder toCharErrorBehaviour= new CharsetCallback.Decoder(){ + public CoderResult call(CharsetDecoderICU decoder, Object context, + ByteBuffer source, CharBuffer target, IntBuffer offsets, + char[] buffer, int length, CoderResult cr) { + if(cr.isUnmappable()){ + return onUnmappableInput.call(decoder, context, + source, target, offsets, + buffer, length, cr); + }else if(cr.isMalformed()){ + return onMalformedInput.call(decoder, context, + source, target, offsets, + buffer, length, cr); + } + return 
CharsetCallback.TO_U_CALLBACK_STOP.call(decoder, context, + source, target, offsets, + buffer, length, cr); + } + }; + + protected CharsetDecoderICU(CharsetICU cs) { + super(cs, (float) (1/(float)cs.maxCharsPerByte), cs.maxCharsPerByte); + } + + + /** + * Sets the action to be taken if an illegal sequence is encountered + * @param newAction action to be taken + * @exception IllegalArgumentException + * @draft ICU 3.6 + */ + protected final void implOnMalformedInput(CodingErrorAction newAction) { + onMalformedInput = getCallback(newAction); + } + + /** + * Sets the action to be taken if an unmappable character is encountered + * @param newAction action to be taken + * @exception IllegalArgumentException + * @draft ICU 3.6 + */ + protected final void implOnUnmappableCharacter(CodingErrorAction newAction) { + onUnmappableInput = getCallback(newAction); + } + private static CharsetCallback.Decoder getCallback(CodingErrorAction action){ + if(action==CodingErrorAction.REPLACE){ + return CharsetCallback.TO_U_CALLBACK_SUBSTITUTE; + }else if(action==CodingErrorAction.IGNORE){ + return CharsetCallback.TO_U_CALLBACK_SKIP; + }else if(action==CodingErrorAction.REPORT){ + return CharsetCallback.TO_U_CALLBACK_STOP; + } + return CharsetCallback.TO_U_CALLBACK_STOP; + } + /** + * Flushes any characters saved in the converter's internal buffer and + * resets the converter. + * @param out the output character buffer + * @return result of flushing action and completes the decoding all input. + * Returns CoderResult.UNDERFLOW if the action succeeds.
+ * @draft ICU 3.6 + */ + protected final CoderResult implFlush(CharBuffer out) { + return CoderResult.UNDERFLOW; + } + + /** + * Resets the to Unicode mode of converter + * @draft ICU 3.6 + */ + protected void implReset() { + toUnicodeStatus = 0 ; + toULength = 0; + charErrorBufferLength = 0; + charErrorBufferBegin = 0; + + /* store previous UChars/chars to continue partial matches */ + preToUBegin = 0; + preToULength = 0; /* negative: replay */ + preToUFirstLength = 0; + } + + /** + * Decodes one or more bytes. The default behaviour of the converter + * is stop and report if an error in input stream is encountered. + * To set different behaviour use @see CharsetDecoder.onMalformedInput() + * This method allows a buffer by buffer conversion of a data stream. + * The state of the conversion is saved between calls to convert. + * Among other things, this means multibyte input sequences can be + * split between calls. If a call to convert results in an Error, the + * conversion may be continued by calling convert again with suitably + * modified parameters.All conversions should be finished with a call to + * the flush method. + * @param in buffer to decode + * @param out buffer to populate with decoded result + * @return result of decoding action. Returns CoderResult.UNDERFLOW if the decoding + * action succeeds or more input is needed for completing the decoding action. 
+ * @draft ICU 3.6 + */ + protected CoderResult decodeLoop(ByteBuffer in,CharBuffer out){ + if(!in.hasRemaining()){ + return CoderResult.UNDERFLOW; + } + in.position(in.position()+toUCountPending()); + /* do the conversion */ + CoderResult ret = decode(in, out, null, false); + + setSourcePosition(in); + return ret; + } + + /** + * Implements the ICU semantic for decode operation + * @param in + * @param out + * @return + */ + protected abstract CoderResult decodeLoop(ByteBuffer in, CharBuffer out, IntBuffer offsets); + + /** + * Implements the ICU semantic for decode operation + * @param source + * @param target + * @param offsets + * @param flush + * @return + * @throws MalformedInputException + */ + protected final CoderResult decode(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) { + + /* check parameters */ + if(target==null || source==null) { + throw new IllegalArgumentException(); + } + /* + * Make sure that the buffer sizes do not exceed the number range for + * int32_t because some functions use the size (in units or bytes) + * rather than comparing pointers, and because offsets are int32_t values. + * + * size_t is guaranteed to be unsigned and large enough for the job. + * + * Return with an error instead of adjusting the limits because we would + * not be able to maintain the semantics that either the source must be + * consumed or the target filled (unless an error occurs). + * An adjustment would be sourceLimit=t+0x7fffffff; for example. 
+ */ + /*agljport:fix + if( + ((size_t)(sourceLimit-s)>(size_t)0x7fffffff && sourceLimit>s) || + ((size_t)(targetLimit-t)>(size_t)0x3fffffff && targetLimit>t) + ) { + *err=U_ILLEGAL_ARGUMENT_ERROR; + return; + } + */ + + /* flush the target overflow buffer */ + if(charErrorBufferLength>0) { + char[] overflow = null; + int i, length; + + overflow=charErrorBufferArray; + length=charErrorBufferLength; + i=0; + do { + if(target.remaining()<0) { + /* the overflow buffer contains too much, keep the rest */ + int j=0; + + do { + overflow[j++]=overflow[i++]; + } while(i=0) { + /* the overflow buffer is emptied and there is no new input: we are done */ + return CoderResult.UNDERFLOW; + } + + /* + * Do not simply return with a buffer overflow error if + * !flush && t==targetLimit + * because it is possible that the source will not generate any output. + * For example, the skip callback may be called; + * it does not output anything. + */ + + return toUnicodeWithCallback(source, target, offsets, flush); + } + + /* maximum number of indexed bytes */ + private static final int EXT_MAX_BYTES = 0x1f; + private void updateOffsets(IntBuffer offsets,int length, int sourceIndex, int errorInputLength) { + int limit; + int delta, offset; + + if(sourceIndex>=0) { + /* + * adjust each offset by adding the previous sourceIndex + * minus the length of the input sequence that caused an + * error, if any + */ + delta=sourceIndex-errorInputLength; + } else { + /* + * set each offset to -1 because this conversion function + * does not handle offsets + */ + delta=-1; + } + limit=offsets.position()+length; + if(delta==0) { + /* most common case, nothing to do */ + } else if(delta>0) { + /* add the delta to each offset (but not if the offset is <0) */ + while(offsets.position()=0) { + offsets.put(offset+delta); + } + //FIXME: ++offsets; + } + } else /* delta<0 */ { + /* + * set each offset to -1 because this conversion function + * does not handle offsets + * or the error input sequence started 
in a previous buffer + */ + while(offsets.position()=0) { + /* normal mode */ + } else { + /* + * Previous m:n conversion stored source units from a partial match + * and failed to consume all of them. + * We need to "replay" them from a temporary buffer and convert them first. + */ + realSource=source; + realFlush=flush; + realSourceIndex=sourceIndex; + //UConverterUtility.uprv_memcpy(replayArray, replayBegin, preToUArray, preToUBegin, -preToULength); + replayArray.put(preToUArray,0, -preToULength); + source=replayArray; + source.position(0); + source.limit(replayArrayIndex-preToULength); + flush=false; + sourceIndex=-1; + preToULength=0; + } + + /* + * loop for conversion and error handling + * + * loop { + * convert + * loop { + * update offsets + * handle end of input + * handle errors/call callback + * } + * } + */ + for(;;) { + if(cr.isUnderflow()) { + /* convert */ + cr = decodeLoop(source, target, offsets); + + /* + * set a flag for whether the converter + * successfully processed the end of the input + * + * need not check cnv->preToULength==0 because a replay (<0) will cause + * s0) { + updateOffsets(offsets, length, sourceIndex, errorInputLength); + + + /* + * if a converter handles offsets and updates the offsets + * pointer at the end, then pArgs->offset should not change + * here; + * however, some converters do not handle offsets at all + * (sourceIndex<0) or may not update the offsets pointer + */ + //TODO: pArgs->offsets=offsets+=length; + } + + if(sourceIndex>=0) { + sourceIndex+=(source.position()-s); + } + + } + + if(preToULength<0) { + /* + * switch the source to new replay units (cannot occur while replaying) + * after offset handling and before end-of-input and callback handling + */ + if(realSource==null) + { + realSource=source; + realFlush=flush; + realSourceIndex=sourceIndex; + + //UConverterUtility.uprv_memcpy(replayArray, replayBegin, preToUArray, preToUBegin, -preToULength); + replayArray.put(preToUArray,0, -preToULength); + + 
source=replayArray; + source.limit(replayArrayIndex-preToULength); + flush=false; + if((sourceIndex+=preToULength)<0) { + sourceIndex=-1; + } + + preToULength=0; + } else { + /* see implementation note before _fromUnicodeWithCallback() */ + //agljport:todo U_ASSERT(realSource==NULL); + Assert.assrt(realSource==null); + } + } + + /* update pointers */ + s=source.position(); + t=target.position(); + + if(cr.isUnderflow()) { + if(s0) { + /* + * the entire input stream is consumed + * and there is a partial, truncated input sequence left + */ + + /* inject an error and continue with callback handling */ + cr = CoderResult.malformedForLength(toULength); + calledCallback=false; /* new error condition */ + } else { + /* input consumed */ + if(flush) { + /* + * return to the conversion loop once more if the flush + * flag is set and the conversion function has not + * successfully processed the end of the input yet + * + * (continue converting by breaking out of only the inner loop) + */ + if(!converterSawEndOfInput) { + break; + } + + /* reset the converter without calling the callback function */ + implReset(); + } + + /* done successfully */ + return cr; + } + } + + /* U_FAILURE(*err) */ + { + + if( calledCallback || cr.isOverflow() || + (cr.isMalformed() && cr.isUnmappable()) + ) { + /* + * the callback did not or cannot resolve the error: + * set output pointers and return + * + * the check for buffer overflow is redundant but it is + * a high-runner case and hopefully documents the intent + * well + * + * if we were replaying, then the replay buffer must be + * copied back into the UConverter + * and the real arguments must be restored + */ + if(realSource!=null) { + int length; + Assert.assrt(preToULength==0); + length=(int)(source.limit()-source.position()); + if(length>0) { + //UConverterUtility.uprv_memcpy(preToUArray, preToUBegin, pArgs.sourceArray, pArgs.sourceBegin, length); + source.get(preToUArray, preToUBegin, length); + preToULength=(byte)-length; + } + + 
source=realSource; + flush=realFlush; + } + return cr; + } + } + + /* copy toUBytes[] to invalidCharBuffer[] */ + errorInputLength=invalidCharLength=toULength; + if(errorInputLength>0) { + copy(toUBytesArray, 0, invalidCharBuffer, 0, errorInputLength); + } + + /* set the converter state to deal with the next character */ + toULength=0; + + /* call the callback function */ + cr = toCharErrorBehaviour.call(this, toUContext, source, target, offsets, invalidCharBuffer, errorInputLength, cr); + /* + * loop back to the offset handling + * + * this flag will indicate after offset handling + * that a callback was called; + * if the callback did not resolve the error, then we return + */ + calledCallback=true; + } + } + } + /** + * Releases the system resources by cleanly closing ICU converter opened + * @draft ICU 3.6 + */ + protected void finalize()throws Throwable{ + } + + /** + * Returns the number of chars held in the converter's internal state + * because more input is needed for completing the conversion. This function is + * useful for mapping semantics of ICU's converter interface to those of iconv, + * and this information is not needed for normal conversion. + * @param cnv The converter in which the input is held as internal state + * @param status ICU error code in/out parameter. + * Must fulfill U_SUCCESS before the function call. + * @return The number of chars in the state. -1 if an error is encountered. + * @draft ICU 3.4 + */ + /*public*/ int toUCountPending() { + if(preToULength > 0){ + return preToULength ; + }else if(preToULength < 0){ + return -preToULength; + }else if(toULength > 0){ + return toULength; + } + return 0; + } + + + private final void setSourcePosition(ByteBuffer source){ + // ok was there input held in the previous invocation of decodeLoop + // that resulted in output in this invocation? 
+ source.position(source.position() - toUCountPending()); + + } + private void copy(byte[] src, int srcOffset, char[] dst, int dstOffset, int length) { + for(int i=srcOffset; i0) { + target.put(ucharsArray[ucharsBegin++]); + --length; + } + }catch(BufferOverflowException ex){ + cr = CoderResult.OVERFLOW; + } + } else { + /* output with offsets */ + try{ + while(length>0) { + target.put(ucharsArray[ucharsBegin++]); + offsets.put(sourceIndex); + --length; + } + }catch(BufferOverflowException ex){ + cr = CoderResult.OVERFLOW; + } + } + /* write overflow */ + if(length>0) { + cnv.charErrorBufferLength= length; + do { + cnv.charErrorBufferArray[cnv.charErrorBufferBegin++]=ucharsArray[ucharsBegin++]; + } while(--length>0); + } + return cr; + } + /** + * Sub classes to override this method if required + * @param decoder + * @param source + * @param target + * @param offsets + * @return + */ + protected CoderResult cbToUWriteSub(CharsetDecoderICU decoder, + ByteBuffer source, CharBuffer target, + IntBuffer offsets){ + String sub = decoder.replacement(); + CharsetICU cs = (CharsetICU) decoder.charset(); + if (decoder.invalidCharLength==1 && cs.subChar1 != 0x00) { + char[] subArr = new char[] { 0x1a }; + return CharsetDecoderICU.toUWriteUChars(decoder, subArr, 0, sub + .length(), target, offsets, source.position()); + } else { + return CharsetDecoderICU.toUWriteUChars(decoder, sub.toCharArray(), + 0, sub.length(), target, offsets, source.position()); + + } + } +} diff --git a/icu4j/src/com/ibm/icu/charset/CharsetEncoderICU.java b/icu4j/src/com/ibm/icu/charset/CharsetEncoderICU.java new file mode 100644 index 00000000000..ab26d564191 --- /dev/null +++ b/icu4j/src/com/ibm/icu/charset/CharsetEncoderICU.java @@ -0,0 +1,631 @@ +/** +******************************************************************************* +* Copyright (C) 2006, International Business Machines Corporation and * +* others. All Rights Reserved. 
* +******************************************************************************* +* +******************************************************************************* +*/ + +package com.ibm.icu.charset; + +import java.nio.BufferOverflowException; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.IntBuffer; +import java.nio.charset.CharsetEncoder; +import java.nio.charset.CoderResult; +import java.nio.charset.CodingErrorAction; +import java.nio.charset.MalformedInputException; + +import com.ibm.icu.impl.Assert; +import com.ibm.icu.text.UTF16; + + +public abstract class CharsetEncoderICU extends CharsetEncoder { + + protected byte[] errorBuffer = new byte[30]; + protected int errorBufferLength = 0; + + /** these are for encodeLoopICU */ + protected int fromUnicodeStatus; + protected int fromUChar32; + protected boolean useSubChar1; + + /* store previous UChars/chars to continue partial matches */ + protected int preFromUFirstCP; /* >=0: partial match */ + protected char[] preFromUArray; + protected int preFromUBegin; + protected int preFromULength; /* negative: replay */ + + protected char[] invalidUCharBuffer = new char[2]; + protected int invalidUCharLength; + protected Object fromUContext; + private CharsetCallback.Encoder onUnmappableInput = CharsetCallback.FROM_U_CALLBACK_STOP; + private CharsetCallback.Encoder onMalformedInput = CharsetCallback.FROM_U_CALLBACK_STOP; + protected CharsetCallback.Encoder fromCharErrorBehaviour = new CharsetCallback.Encoder(){ + public CoderResult call(CharsetEncoderICU encoder, Object context, + CharBuffer source, ByteBuffer target, IntBuffer offsets, + char[] buffer, int length, int cp, CoderResult cr) { + if(cr.isUnmappable()){ + return onUnmappableInput.call(encoder, context, + source, target, offsets, + buffer, length, cp, cr); + }else if(cr.isMalformed()){ + return onMalformedInput.call(encoder, context, + source, target, offsets, + buffer, length, cp, cr); + } + return 
CharsetCallback.FROM_U_CALLBACK_STOP.call(encoder, context, + source, target, offsets, + buffer, length, cp, cr); + + } + }; + + /** + * Constructs a new encoder for the given charset + * @param cs for which the encoder is created + * @param cHandle the address of ICU converter + * @param replacement the substitution bytes + * @draft ICU 3.6 + */ + protected CharsetEncoderICU(CharsetICU cs, byte[] replacement) { + super(cs, (cs.minBytesPerChar+cs.maxBytesPerChar)/2, cs.maxBytesPerChar, replacement); + } + + /** + * Sets the action to be taken if an illegal sequence is encountered + * @param newAction action to be taken + * @exception IllegalArgumentException + * @draft ICU 3.6 + */ + protected void implOnMalformedInput(CodingErrorAction newAction) { + onMalformedInput = getCallback(newAction); + } + + /** + * Sets the action to be taken if an unmappable character is encountered + * @param newAction action to be taken + * @exception IllegalArgumentException + * @draft ICU 3.6 + */ + protected void implOnUnmappableCharacter(CodingErrorAction newAction) { + onUnmappableInput = getCallback(newAction); + } + + private static CharsetCallback.Encoder getCallback(CodingErrorAction action){ + if(action==CodingErrorAction.REPLACE){ + return CharsetCallback.FROM_U_CALLBACK_SUBSTITUTE; + }else if(action==CodingErrorAction.IGNORE){ + return CharsetCallback.FROM_U_CALLBACK_SKIP; + }else if(action==CodingErrorAction.REPORT){ + return CharsetCallback.FROM_U_CALLBACK_STOP; + } + return CharsetCallback.FROM_U_CALLBACK_STOP; + } + + /** + * Flushes any characters saved in the converter's internal buffer and + * resets the converter. + * @param out the output byte buffer + * @return result of flushing action and completes the decoding all input. + * Returns CoderResult.UNDERFLOW if the action succeeds.
+ * @draft ICU 3.6 + */ + protected CoderResult implFlush(ByteBuffer out) { + return CoderResult.UNDERFLOW; + } + + /** + * Resets the from Unicode mode of converter + * @draft ICU 3.6 + */ + protected void implReset() { + errorBufferLength=0; + fromUChar32=0; + fromUnicodeStatus = 0; + preFromUBegin = 0; + preFromUFirstCP = 0; + preFromULength = 0; + } + + /** + * Encodes one or more chars. The default behaviour of the + * converter is stop and report if an error in input stream is encountered. + * To set different behaviour use @see CharsetEncoder.onMalformedInput() + * @param in buffer to decode + * @param out buffer to populate with decoded result + * @return result of decoding action. Returns CoderResult.UNDERFLOW if the decoding + * action succeeds or more input is needed for completing the decoding action. + * @draft ICU 3.6 + */ + protected CoderResult encodeLoop(CharBuffer in, ByteBuffer out) { + if(!in.hasRemaining()){ + return CoderResult.UNDERFLOW; + } + in.position(in.position()+fromUCountPending()); + /* do the conversion */ + CoderResult ret = encode(in, out, null, false); + setSourcePosition(in); + return ret; + } + /** + * Implements ICU semantics of buffer management + * @param source + * @param target + * @param offsets + * @return + * @throws MalformedInputException + */ + protected abstract CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets); + + /** + * Implements ICU semantics for encoding the buffer + * @param in + * @param out + * @return + */ + protected final CoderResult encode(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush){ + + + /* check parameters */ + if(target==null || source==null) { + throw new IllegalArgumentException(); + } + + /* + * Make sure that the buffer sizes do not exceed the number range for + * int32_t because some functions use the size (in units or bytes) + * rather than comparing pointers, and because offsets are int32_t values. 
+ * + * size_t is guaranteed to be unsigned and large enough for the job. + * + * Return with an error instead of adjusting the limits because we would + * not be able to maintain the semantics that either the source must be + * consumed or the target filled (unless an error occurs). + * An adjustment would be targetLimit=t+0x7fffffff; for example. + */ + //Ram: not required + //if( ((long)(sourceLimit-sArrayIndex)>(long)0x3fffffff && sourceLimit>sArrayIndex) || ((long)(targetLimit-tArrayIndex)>(long)0x7fffffff && targetLimit>tArrayIndex)) { + // err[0]=ErrorCode.U_ILLEGAL_ARGUMENT_ERROR; + // return; + //} + + /* flush the target overflow buffer */ + if(errorBufferLength>0) { + byte[] overflowArray; + int i, length; + + overflowArray=errorBuffer; + length=errorBufferLength; + i=0; + do { + if(target.remaining()==0) { + /* the overflow buffer contains too much, keep the rest */ + int j=0; + + do { + overflowArray[j++]=overflowArray[i++]; + } while(i=0) { + /* the overflow buffer is emptied and there is no new input: we are done */ + return CoderResult.UNDERFLOW; + } + + /* + * Do not simply return with a buffer overflow error if + * !flush && t==targetLimit + * because it is possible that the source will not generate any output. + * For example, the skip callback may be called; + * it does not output anything. 
+ */ + + return fromUnicodeWithCallback(source, target, offsets, flush); + + } + /* maximum number of indexed UChars */ + public static final int EXT_MAX_UCHARS = 19; + + protected final CoderResult fromUnicodeWithCallback(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush){ + int sBufferIndex; + int sourceIndex; + int errorInputLength; + boolean converterSawEndOfInput, calledCallback; + + + /* variables for m:n conversion */ + CharBuffer replayArray = CharBuffer.allocate(EXT_MAX_UCHARS); + int replayArrayIndex=0; + CharBuffer realSource; + boolean realFlush; + + CoderResult cr = CoderResult.UNDERFLOW; + + /* get the converter implementation function */ + sourceIndex=0; + + if(preFromULength>=0) { + /* normal mode */ + realSource=null; + realFlush=false; + } else { + /* + * Previous m:n conversion stored source units from a partial match + * and failed to consume all of them. + * We need to "replay" them from a temporary buffer and convert them first. + */ + realSource=source; + realFlush = flush; + + //UConverterUtility.uprv_memcpy(replayArray, replayArrayIndex, preFromUArray, 0, -preFromULength*UMachine.U_SIZEOF_UCHAR); + replayArray.put(preFromUArray,0, -preFromULength); + source.position(replayArrayIndex); + source.limit(replayArrayIndex-preFromULength); //preFromULength is negative, see declaration + source=replayArray; + flush=false; + + preFromULength=0; + } + + /* + * loop for conversion and error handling + * + * loop { + * convert + * loop { + * update offsets + * handle end of input + * handle errors/call callback + * } + * } + */ + for(;;) { + /* convert */ + cr = encodeLoop(source, target, offsets); + /* + * set a flag for whether the converter + * successfully processed the end of the input + * + * need not check cnv.preFromULength==0 because a replay (<0) will cause + * s0) { + + /* + * if a converter handles offsets and updates the offsets + * pointer at the end, then offset should not change + * here; + * however, some 
converters do not handle offsets at all + * (sourceIndex<0) or may not update the offsets pointer + */ + offsets.position(offsets.position()+length); + } + + if(sourceIndex>=0) { + sourceIndex+=(int)(source.position()); + } + } + + if(preFromULength<0) { + /* + * switch the source to new replay units (cannot occur while replaying) + * after offset handling and before end-of-input and callback handling + */ + if(realSource==null) { + realSource=source; + realFlush=flush; + + //UConverterUtility.uprv_memcpy(replayArray, replayArrayIndex, preFromUArray, 0, -preFromULength*UMachine.U_SIZEOF_UCHAR); + replayArray.put(preFromUArray,0, -preFromULength); + + source=replayArray; + source.position(replayArrayIndex); + source.limit(replayArrayIndex-preFromULength); + flush=false; + if((sourceIndex+=preFromULength)<0) { + sourceIndex=-1; + } + + preFromULength=0; + } else { + /* see implementation note before _fromUnicodeWithCallback() */ + //agljport:todo U_ASSERT(realSource==NULL); + Assert.assrt(realSource==null); + } + } + + /* update pointers */ + sBufferIndex=source.position(); + if(cr.isUnderflow()) { + if(sBufferIndex0) { + //UConverterUtility.uprv_memcpy(preFromUArray, 0, sourceArray, pArgs.sourceBegin, length*UMachine.U_SIZEOF_UCHAR); + source.get(preFromUArray, 0, length ); + preFromULength=(byte)-length; + } + source=realSource; + flush=realFlush; + } + return cr; + } + } + + /* callback handling */ + { + /* get and write the code point */ + errorInputLength = UTF16.append(invalidUCharBuffer, 0, fromUChar32); + invalidUCharLength = errorInputLength; + + /* set the converter state to deal with the next character */ + fromUChar32=0; + + /* call the callback function */ + cr = fromCharErrorBehaviour.call(this, fromUContext, source, target, offsets, invalidUCharBuffer, invalidUCharLength, fromUChar32, cr); + } + + /* + * loop back to the offset handling + * + * this flag will indicate after offset handling + * that a callback was called; + * if the callback did not 
resolve the error, then we return + */ + calledCallback=true; + } + } + } + /** + * Ascertains if a given Unicode code point (32bit value for handling surrogates) + * can be converted to the target encoding. If the caller wants to test if a + * surrogate pair can be converted to target encoding then the + * responsibility of assembling the int value lies with the caller. + * For assembling a code point the caller can use UTF16 class of ICU4J and do something like: + *
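+	 * <pre>
+	 * while(i<mySource.length){
+	 *	if(UTF16.isLeadSurrogate(mySource[i])&& i+1< mySource.length){
+	 *	    if(UTF16.isTrailSurrogate(mySource[i+1])){
+	 *	        int temp = UTF16.charAt(mySource,i,i+1,0);
+	 *	        if(!((CharsetEncoderICU) myConv).canEncode(temp)){
+	 *	            passed=false;
+	 *	        }
+	 *	        i++;
+	 *	        i++;
+	 *	    }
+	 *	}
+	 * }
+	 * </pre>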
+	 * or
+	 * <pre>
+	 * String src = new String(mySource);
+	 * int i,codepoint;
+	 * boolean passed = false;
+	 * while(i<src.length()){
+	 *	codepoint = UTF16.charAt(src,i);
+	 *	i+= (codepoint>0xffff)? 2:1;
+	 *	if(!((CharsetEncoderICU) myConv).canEncode(codepoint)){
+	 *	    passed = false;
+	 *	}
+	 * }
+	 * </pre>
+ * + * @param codepoint Unicode code point as int value + * @return true if a character can be converted + * @draft ICU 3.6 + * + */ + public boolean canEncode(int codepoint) { + return true; + } + + public boolean isLegalReplacement(byte[] repl){ + return true; + } + + /** + * Releases the system resources by cleanly closing ICU converter opened + * @exception Throwable exception thrown by super class' finalize method + * @draft ICU 3.6 + */ + protected void finalize() throws Throwable { + } + + protected static final CoderResult fromUWriteBytes(CharsetEncoderICU cnv, + byte[] bytesArray, int bytesBegin, int bytesLength, + ByteBuffer out, IntBuffer offsets, int sourceIndex){ + + //write bytes + int obl = bytesLength; + CoderResult cr = CoderResult.UNDERFLOW; + int bytesLimit = bytesBegin + bytesLength; + try{ + for (;bytesBegin< bytesLimit;){ + out.put(bytesArray[bytesBegin]); + bytesBegin++; + } + // success + bytesLength=0; + }catch( BufferOverflowException ex){ + cr = CoderResult.OVERFLOW; + } + + + if(offsets!=null) { + while(obl>bytesLength) { + offsets.put(sourceIndex); + --obl; + } + } + //write overflow + cnv.errorBufferLength = bytesLimit - bytesBegin; + if(cnv.errorBufferLength >0) { + if(cnv!=null) { + int index = 0; + while(bytesBegin 0){ + return UTF16.getCharCount(preFromUFirstCP)+preFromULength ; + }else if(preFromULength < 0){ + return -preFromULength ; + }else if(fromUChar32 > 0){ + return 1; + }else if(preFromUFirstCP >0){ + return UTF16.getCharCount(preFromUFirstCP); + } + return 0; + } + /** + * + * @param source + */ + private final void setSourcePosition(CharBuffer source){ + + // ok was there input held in the previous invocation of decodeLoop + // that resulted in output in this invocation? + source.position(source.position() - fromUCountPending()); + } + /** + * Write the codepage substitution character. + * Subclasses to override this method. 
+ * For stateful converters, it is typically necessary to handle this + * specificially for the converter in order to properly maintain the state. + */ + protected CoderResult cbFromUWriteSub (CharsetEncoderICU encoder, + CharBuffer source, ByteBuffer target, + IntBuffer offsets){ + CharsetICU cs = (CharsetICU) encoder.charset(); + byte[] sub = encoder.replacement(); + if (cs.subChar1 != 0 && encoder.invalidUCharBuffer[0] <= 0xff) { + return CharsetEncoderICU.fromUWriteBytes(encoder, + new byte[] { cs.subChar1 }, 0, 1, target, offsets, source + .position()); + } else { + return CharsetEncoderICU.fromUWriteBytes(encoder, sub, 0, + sub.length, target, offsets, source.position()); + } + } +} diff --git a/icu4j/src/com/ibm/icu/charset/CharsetICU.java b/icu4j/src/com/ibm/icu/charset/CharsetICU.java new file mode 100644 index 00000000000..400101db16c --- /dev/null +++ b/icu4j/src/com/ibm/icu/charset/CharsetICU.java @@ -0,0 +1,192 @@ +/** +******************************************************************************* +* Copyright (C) 2006, International Business Machines Corporation and * +* others. All Rights Reserved. 
*
+*******************************************************************************
+*
+*******************************************************************************
+*/
+
+package com.ibm.icu.charset;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStreamReader;
+import java.lang.reflect.Constructor;
+import java.lang.reflect.InvocationTargetException;
+import java.nio.charset.Charset;
+import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.UnsupportedCharsetException;
+import java.util.HashMap;
+
+import com.ibm.icu.lang.UCharacter;
+
+
+
+/**
+ * Common base class of the ICU charset implementations.
+ * Besides holding the names and per-converter settings shared by all
+ * implementations, it maps an ICU canonical converter name to the
+ * implementation class that is instantiated reflectively by
+ * {@link #getCharset(String, String, String[])}.
+ */
+public abstract class CharsetICU extends Charset{
+
+    /** ICU canonical converter name, as passed to the constructor. */
+    protected String icuCanonicalName;
+    /** Java canonical charset name, as passed to the constructor and to {@link Charset}. */
+    protected String javaCanonicalName;
+    protected int options;
+
+    protected int maxBytesPerChar;
+    protected int minBytesPerChar;
+    protected float maxCharsPerByte;
+    protected byte subChar1 = 0x00;
+
+    protected int mode;
+    protected boolean flush;
+    /** Consulted by {@link #isFromUUseFallback(int)}. */
+    protected boolean useFallback;
+
+    /**
+     * Creates a charset with the given names.
+     *
+     * @param icuCanonicalName ICU canonical name of the converter
+     * @param canonicalName Java canonical name of the charset; must not be empty
+     * @param aliases aliases of the charset, passed on to {@link Charset}
+     * @throws IllegalCharsetNameException if <code>canonicalName</code> is empty
+     * @draft ICU 3.6
+     */
+    protected CharsetICU(String icuCanonicalName, String canonicalName, String[] aliases) {
+        super(canonicalName,aliases);
+        if(canonicalName.length() == 0){
+            throw new IllegalCharsetNameException(canonicalName);
+        }
+        this.javaCanonicalName = canonicalName;
+        this.icuCanonicalName = icuCanonicalName;
+    }
+
+    /**
+     * Ascertains if a charset is a sub set of this charset.
+     * This implementation only recognizes a charset that is equal to this one.
+     *
+     * @param cs charset to test
+     * @return true if the given charset is non-null and equal to this charset
+     */
+    public boolean contains(Charset cs){
+        return cs != null && this.equals(cs);
+    }
+
+    /** Maps ICU canonical converter names to the names of their implementation classes. */
+    private static final HashMap algorithmicCharsets = new HashMap();
+    static{
+        algorithmicCharsets.put("BOCU-1",                "com.ibm.icu.impl.CharsetBOCU1" );
+        algorithmicCharsets.put("CESU-8",                "com.ibm.icu.impl.CharsetCESU8" );
+        algorithmicCharsets.put("HZ",                    "com.ibm.icu.impl.CharsetHZ" );
+        algorithmicCharsets.put("imapmailboxname",       "com.ibm.icu.impl.CharsetIMAP" );
+        algorithmicCharsets.put("ISCII",                 "com.ibm.icu.impl.CharsetISCII" );
+        algorithmicCharsets.put("iso2022",               "com.ibm.icu.impl.CharsetISO2022" );
+        // NOTE(review): iso88591 is mapped to the BOCU-1 class name; this looks like a
+        // copy/paste slip, but no replacement class is visible here -- confirm before changing.
+        algorithmicCharsets.put("iso88591",              "com.ibm.icu.impl.CharsetBOCU1" );
+        algorithmicCharsets.put("lmbcs1",                "com.ibm.icu.impl.CharsetLMBCS1" );
+        algorithmicCharsets.put("lmbcs11",               "com.ibm.icu.impl.CharsetLMBCS11" );
+        algorithmicCharsets.put("lmbcs16",               "com.ibm.icu.impl.CharsetLMBCS16" );
+        algorithmicCharsets.put("lmbcs17",               "com.ibm.icu.impl.CharsetLMBCS17" );
+        algorithmicCharsets.put("lmbcs18",               "com.ibm.icu.impl.CharsetLMBCS18" );
+        algorithmicCharsets.put("lmbcs19",               "com.ibm.icu.impl.CharsetLMBCS19" );
+        algorithmicCharsets.put("lmbcs2",                "com.ibm.icu.impl.CharsetLMBCS2" );
+        algorithmicCharsets.put("lmbcs3",                "com.ibm.icu.impl.CharsetLMBCS3" );
+        algorithmicCharsets.put("lmbcs4",                "com.ibm.icu.impl.CharsetLMBCS4" );
+        algorithmicCharsets.put("lmbcs5",                "com.ibm.icu.impl.CharsetLMBCS5" );
+        algorithmicCharsets.put("lmbcs6",                "com.ibm.icu.impl.CharsetLMBCS6" );
+        algorithmicCharsets.put("lmbcs8",                "com.ibm.icu.impl.CharsetLMBCS8" );
+        algorithmicCharsets.put("scsu",                  "com.ibm.icu.impl.CharsetSCSU" );
+        algorithmicCharsets.put("usascii",               "com.ibm.icu.impl.CharsetUSASCII" );
+        algorithmicCharsets.put("UTF-16",                "com.ibm.icu.impl.CharsetUTF16" );
+        algorithmicCharsets.put("UTF-16BE",              "com.ibm.icu.impl.CharsetUTF16" );
+        algorithmicCharsets.put("UTF-16LE",              "com.ibm.icu.impl.CharsetUTF16LE" );
+        algorithmicCharsets.put("UTF16_OppositeEndian",  "com.ibm.icu.impl.CharsetUTF16LE" );
+        algorithmicCharsets.put("UTF16_PlatformEndian",  "com.ibm.icu.impl.CharsetUTF16" );
+        algorithmicCharsets.put("UTF-32",                "com.ibm.icu.impl.CharsetUTF32" );
+        algorithmicCharsets.put("UTF-32BE",              "com.ibm.icu.impl.CharsetUTF32" );
+        algorithmicCharsets.put("UTF-32LE",              "com.ibm.icu.impl.CharsetUTF32LE" );
+        // Platform/opposite endianness must agree with the UTF-16 entries above:
+        // "platform" is the class that also serves the BE name, "opposite" the LE one.
+        algorithmicCharsets.put("UTF32_PlatformEndian",  "com.ibm.icu.impl.CharsetUTF32" );
+        algorithmicCharsets.put("UTF32_OppositeEndian",  "com.ibm.icu.impl.CharsetUTF32LE" );
+        algorithmicCharsets.put("UTF-7",                 "com.ibm.icu.impl.CharsetUTF7" );
+        algorithmicCharsets.put("UTF-8",                 "com.ibm.icu.impl.CharsetUTF8" );
+    }
+
+    /**
+     * Instantiates the charset implementation for an ICU converter name.
+     * The implementation class is looked up in the table of algorithmic
+     * charsets; names not listed there are loaded as
+     * <code>com.ibm.icu.impl.CharsetMBCS</code>. The class is created through
+     * its <code>(String, String, String[])</code> constructor.
+     *
+     * @param icuCanonicalName ICU canonical name of the converter
+     * @param javaCanonicalName Java canonical name handed to the constructor
+     * @param aliases aliases handed to the constructor
+     * @return the newly created charset
+     * @throws UnsupportedCharsetException if the implementation class cannot be
+     *         found, constructed, or is not a <code>CharsetICU</code>; the
+     *         underlying failure, when there is one, is attached as the cause
+     */
+    /*public*/ static final Charset getCharset(String icuCanonicalName, String javaCanonicalName, String[] aliases){
+        String className = (String) algorithmicCharsets.get(icuCanonicalName);
+        if(className==null){
+            //all the cnv files are loaded as MBCS
+            className = "com.ibm.icu.impl.CharsetMBCS";
+        }
+        // remembers why loading failed so that the final exception is diagnosable
+        Throwable failure = null;
+        try{
+            Class cs = Class.forName(className);
+            Class[] paramTypes = new Class[]{ String.class, String.class, String[].class};
+            final Constructor c = cs.getConstructor(paramTypes);
+            Object[] params = new Object[]{ icuCanonicalName, javaCanonicalName, aliases};
+
+            java.security.AccessController.doPrivileged
+                (new java.security.PrivilegedAction() {
+                        public Object run() {
+                            c.setAccessible(true);
+                            return null;
+                        }
+                    });
+
+            // Run constructor
+            try {
+                Object obj = c.newInstance(params);
+                if(obj instanceof CharsetICU){
+                    return (CharsetICU)obj;
+                }
+            }catch (InvocationTargetException e) {
+                throw unsupported( icuCanonicalName+": "+"Could not load " + className+ ". Exception:" + e.getTargetException(), e.getTargetException());
+            }
+        }catch(ClassNotFoundException ex){
+            failure = ex;
+        }catch(NoSuchMethodException ex){
+            failure = ex;
+        }catch (IllegalAccessException ex){
+            failure = ex;
+        }catch (InstantiationException ex){
+            failure = ex;
+        }
+        throw unsupported( icuCanonicalName+": "+"Could not load " + className, failure);
+    }
+
+    /** Builds an UnsupportedCharsetException with the given text and, if non-null, the given cause. */
+    private static UnsupportedCharsetException unsupported(String message, Throwable cause){
+        UnsupportedCharsetException e = new UnsupportedCharsetException(message);
+        if(cause!=null){
+            e.initCause(cause);
+        }
+        return e;
+    }
+
+    /** Always use fallbacks from codepage to Unicode */
+    protected final boolean isToUUseFallback() {
+        return true;
+    }
+
+    /** Use fallbacks from Unicode to codepage when useFallback or for private-use code points */
+    protected final boolean isFromUUseFallback(int c) {
+        return (useFallback) || isPrivateUse(c);
+    }
+
+    /**
+     * Returns the encoding name reported by an <code>InputStreamReader</code>
+     * that is created without an explicit charset.
+     *
+     * @return the encoding name used by such a reader
+     */
+    public static final String getDefaultCharsetName(){
+        String defaultEncoding = new InputStreamReader(new ByteArrayInputStream(new byte[0])).getEncoding();
+        return defaultEncoding;
+    }
+
+    /** Returns true if the code point has the general category PRIVATE_USE. */
+    /*public*/ static final boolean isPrivateUse(int c) {
+        return (UCharacter.getType(c) == UCharacter.PRIVATE_USE);
+    }
+
+    /**
+     * Returns a charset object for the named charset.
+     * This method guarantees that the ICU charset is returned when
+     * available.  If the ICU charset provider does not support
+     * the specified charset, then other charset providers are tried,
+     * including the standard Java charset provider.
+     *
+     * @param charsetName The name of the requested charset,
+     * may be either a canonical name or an alias
+     * @return A charset object for the named charset
+     * @throws IllegalCharsetNameException If the given charset name
+     * is illegal
+     * @throws UnsupportedCharsetException If no support for the
+     * named charset is available in this instance of the Java
+     * virtual machine
+     */
+    public static Charset forNameICU(String charsetName) throws IllegalCharsetNameException, UnsupportedCharsetException {
+        CharsetProviderICU icuProvider = new CharsetProviderICU();
+        Charset cs = icuProvider.charsetForName(charsetName);
+        if (cs != null) {
+            return cs;
+        }
+        return Charset.forName(charsetName);
+    }
+}
+
diff --git a/icu4j/src/com/ibm/icu/charset/CharsetProviderICU.java b/icu4j/src/com/ibm/icu/charset/CharsetProviderICU.java
new file mode 100644
index 00000000000..8ae7d1bdb33
--- /dev/null
+++ b/icu4j/src/com/ibm/icu/charset/CharsetProviderICU.java
@@ -0,0 +1,260 @@
+/**
+*******************************************************************************
+* Copyright (C) 2006, International Business Machines Corporation and
+* others. All Rights Reserved.
* +******************************************************************************* +* +******************************************************************************* +*/ + +package com.ibm.icu.charset; + +import java.io.IOException; +import java.nio.charset.Charset; +import java.nio.charset.UnsupportedCharsetException; +import java.nio.charset.spi.CharsetProvider; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.NoSuchElementException; + +import com.ibm.icu.impl.UConverterAlias; + +public final class CharsetProviderICU extends CharsetProvider{ + + /** + * Constructs a CharsetProviderICU object + * @stable ICU 2.4 + */ + public CharsetProviderICU(){ + } + + /** + * Constructs a charset for the given charset name + * @param charsetName charset name + * @return charset objet for the given charset name, null if unsupported + * @stable ICU 2.4 + */ + public final Charset charsetForName(String charsetName){ + try{ + // get the canonical name + String icuCanonicalName = getICUCanonicalName(charsetName); + + // create the converter object and return it + if(icuCanonicalName==null || icuCanonicalName.length()==0){ + // this would make the Charset API to throw + // unsupported encoding exception + return null; + } + return getCharset(icuCanonicalName); + }catch(UnsupportedCharsetException ex){ + }catch(IOException ex){ + } + return null; + } + /** + * Gets the canonical name of the converter as defined by Java + * @param enc converter name + * @return canonical name of the converter + * @internal ICU 3.4 + */ + public static final String getICUCanonicalName(String enc) + throws UnsupportedCharsetException{ + String canonicalName = null; + String ret = null; + try{ + if(enc!=null){ + if((canonicalName = UConverterAlias.getCanonicalName(enc, "MIME"))!=null){ + ret = canonicalName; + }else if((canonicalName = UConverterAlias.getCanonicalName(enc, "IANA"))!=null){ + ret = canonicalName; + }else if((canonicalName = 
UConverterAlias.getCanonicalName(enc, ""))!=null){ + ret = canonicalName; + }else if((canonicalName = UConverterAlias.getAlias(enc, 0))!=null){ + /* we have some aliases in the form x-blah .. match those first */ + ret = canonicalName; + }else if(enc.indexOf("x-")==0){ + /* TODO: Match with getJavaCanonicalName method */ + /* + char temp[ UCNV_MAX_CONVERTER_NAME_LENGTH] = {0}; + strcpy(temp, encName+2); + */ + ret = enc.substring(2); + }else{ + /* unsupported encoding */ + ret = ""; + } + } + return ret; + }catch(IOException ex){ + throw new UnsupportedCharsetException(enc); + } + } + private static final Charset getCharset(String icuCanonicalName) throws IOException{ + String[] aliases = (String[])getAliases(icuCanonicalName); + String canonicalName = getJavaCanonicalName(icuCanonicalName); + return (CharsetICU.getCharset(icuCanonicalName,canonicalName, aliases)); + } + /** + * Gets the canonical name of the converter as defined by Java + * @param icuCanonicalName converter name + * @return canonical name of the converter + * @internal ICU 3.4 + */ + + private static String getJavaCanonicalName(String icuCanonicalName){ + /* + If a charset listed in the IANA Charset Registry is supported by an implementation + of the Java platform then its canonical name must be the name listed in the registry. + Many charsets are given more than one name in the registry, in which case the registry + identifies one of the names as MIME-preferred. If a charset has more than one registry + name then its canonical name must be the MIME-preferred name and the other names in + the registry must be valid aliases. If a supported charset is not listed in the IANA + registry then its canonical name must begin with one of the strings "X-" or "x-". 
+ */ + if(icuCanonicalName==null ){ + return null; + } + try{ + String cName = null; + /* find out the alias with MIME tag */ + if((cName=UConverterAlias.getStandardName(icuCanonicalName, "MIME"))!=null){ + /* find out the alias with IANA tag */ + }else if((cName=UConverterAlias.getStandardName(icuCanonicalName, "IANA"))!=null){ + }else { + /* + check to see if an alias already exists with x- prefix, if yes then + make that the canonical name + */ + int aliasNum = UConverterAlias.countAliases(icuCanonicalName); + String name; + for(int i=0;i=0;) { + ret[j] = aliasArray[j]; + } + + } + return (ret); + + } + + /** + * Class that implements the iterator for charsets + * @stable ICU 2.4 + */ + protected final class CharsetIterator implements Iterator{ + private String[] names; + private int currentIndex; + protected CharsetIterator(String[] strs){ + names = strs; + currentIndex=0; + } + public boolean hasNext(){ + return (currentIndex< names.length); + } + public Object next(){ + if(currentIndex>>8; + //if(offset!=0 && mbcsTable.outputType == MBCS_OUTPUT_EXT_ONLY) { + if(mbcsTable.outputType == MBCS_OUTPUT_EXT_ONLY) { + try { + baseNameString = reader.readBaseTableName(); + if(offset != 0) { + //agljport:commment subtract 32 for sizeof(_MBCSHeader) and length of baseNameString and 1 null terminator byte all already read; + mbcsTable.extIndexes=reader.readExtIndexes(offset - 32 - baseNameString.length() - 1); + } + } + catch(IOException e) { + throw new InvalidFormatException(); + } + } + /* + if(offset != 0) { + try { + //agljport:commment subtract 32 for sizeof(_MBCSHeader) and length of baseNameString and 1 null terminator byte all already read; + int namelen = baseNameString != null? 
baseNameString.length() + 1: 0; + mbcsTable.extIndexes=dataReader.readExtIndexes(offset - 32 - namelen); + + } + catch(IOException e) { + if(debug) System.err.println("Caught IOException: " + e.getMessage()); + pErrorCode[0] = UErrorCode.U_INVALID_FORMAT_ERROR; + return; + } + } + */ + //agljport:add this would be unnecessary if extIndexes were memory mapped + if(mbcsTable.extIndexes != null) { + /* + try { + //int nbytes = mbcsTable.extIndexes[UConverterExt.UCNV_EXT_TO_U_LENGTH]*4 + mbcsTable.extIndexes[UConverterExt.UCNV_EXT_TO_U_UCHARS_LENGTH]*2 + mbcsTable.extIndexes[UConverterExt.UCNV_EXT_FROM_U_LENGTH]*6 + mbcsTable.extIndexes[UConverterExt.UCNV_EXT_FROM_U_BYTES_LENGTH] + mbcsTable.extIndexes[UConverterExt.UCNV_EXT_FROM_U_STAGE_12_LENGTH]*2 + mbcsTable.extIndexes[UConverterExt.UCNV_EXT_FROM_U_STAGE_3_LENGTH]*2 + mbcsTable.extIndexes[UConverterExt.UCNV_EXT_FROM_U_STAGE_3B_LENGTH]*4; + //int nbytes = mbcsTable.extIndexes[UConverterExt.UCNV_EXT_SIZE] + //byte[] extTables = dataReader.readExtTables(nbytes); + //mbcsTable.extTables = ByteBuffer.wrap(extTables); + } + catch(IOException e) { + System.err.println("Caught IOException: " + e.getMessage()); + pErrorCode[0] = UErrorCode.U_INVALID_FORMAT_ERROR; + return; + } + */ + } + + if(mbcsTable.outputType==MBCS_OUTPUT_EXT_ONLY) { + UConverterSharedData baseSharedData = null; + ByteBuffer extIndexes; + String baseName; + + /* extension-only file, load the base table and set values appropriately */ + if((extIndexes=mbcsTable.extIndexes)==null) { + /* extension-only file without extension */ + throw new InvalidFormatException(); + } + + if(args.nestedLoads!=1) { + /* an extension table must not be loaded as a base table */ + throw new InvalidFormatException(); + } + + /* load the base table */ + baseName=baseNameString; + if(baseName.equals(staticData.name)) { + /* forbid loading this same extension-only file */ + throw new InvalidFormatException(); + } + + /* TODO parse package name out of the prefix of the base name 
in the extension .cnv file? */ + //agljport:fix args.size=sizeof(UConverterLoadArgs); + LoadArguments args2 = new LoadArguments(2, baseName); + baseSharedData=loadConverter(args2); + + if( baseSharedData.staticData.conversionType!=UConverterType.MBCS || + baseSharedData.mbcs.baseSharedData!=null + ) { + //agljport:fix ucnv_unload(baseSharedData); + throw new InvalidFormatException(); + } + + /* copy the base table data */ + //agljport:comment deep copy in C changes mbcs through local reference mbcsTable; in java we probably don't need the deep copy so can just make sure mbcs and its local reference both refer to the same new object + mbcsTable = data.mbcs = baseSharedData.mbcs; + + /* overwrite values with relevant ones for the extension converter */ + mbcsTable.baseSharedData=baseSharedData; + mbcsTable.extIndexes=extIndexes; + + /* + * It would be possible to share the swapLFNL data with a base converter, + * but the generated name would have to be different, and the memory + * would have to be free'd only once. + * It is easier to just create the data for the extension converter + * separately when it is requested. + */ + mbcsTable.swapLFNLStateTable=null; + mbcsTable.swapLFNLFromUnicodeBytes=null; + mbcsTable.swapLFNLName=null; + + /* + * Set a special, runtime-only outputType if the extension converter + * is a DBCS version of a base converter that also maps single bytes. 
+ */ + if(staticData.conversionType==UConverterType.DBCS || + (staticData.conversionType==UConverterType.MBCS && staticData.minBytesPerChar>=2)){ + + if(baseSharedData.mbcs.outputType==MBCS_OUTPUT_2_SISO) { + /* the base converter is SI/SO-stateful */ + int entry; + + /* get the dbcs state from the state table entry for SO=0x0e */ + entry=mbcsTable.stateTable[0][0xe]; + if( MBCS_ENTRY_IS_FINAL(entry) && + MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_CHANGE_ONLY && + MBCS_ENTRY_FINAL_STATE(entry)!=0 + ) { + mbcsTable.dbcsOnlyState=(byte)MBCS_ENTRY_FINAL_STATE(entry); + + mbcsTable.outputType=MBCS_OUTPUT_DBCS_ONLY; + } + } + else if(baseSharedData.staticData.conversionType==UConverterType.MBCS && + baseSharedData.staticData.minBytesPerChar==1 && + baseSharedData.staticData.maxBytesPerChar==2 && + mbcsTable.countStates<=127){ + + /* non-stateful base converter, need to modify the state table */ + int newStateTable[][/*256*/]; + int state[]; // this works because java 2-D array is array of references and we can have state = newStateTable[i]; + int i, count; + + /* allocate a new state table and copy the base state table contents */ + count=mbcsTable.countStates; + newStateTable=new int[(count+1)*1024][256]; + + for(i = 0; i < mbcsTable.stateTable.length; ++i) + System.arraycopy(mbcsTable.stateTable[i], 0, newStateTable[i], 0, mbcsTable.stateTable[i].length); + + /* change all final single-byte entries to go to a new all-illegal state */ + state=newStateTable[0]; + for(i=0; i<256; ++i) { + if(MBCS_ENTRY_IS_FINAL(state[i])) { + state[i]=MBCS_ENTRY_TRANSITION(count, 0); + } + } + + /* build the new all-illegal state */ + state=newStateTable[count]; + for(i=0; i<256; ++i) { + state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0); + } + mbcsTable.stateTable=newStateTable; + mbcsTable.countStates=(byte)(count+1); + mbcsTable.stateTableOwned=true; + + mbcsTable.outputType=MBCS_OUTPUT_DBCS_ONLY; + } + } + + /* + * unlike below for files with base tables, do not get the unicodeMask 
+ * from the sharedData; instead, use the base table's unicodeMask, + * which we copied in the memcpy above; + * this is necessary because the static data unicodeMask, especially + * the UCNV_HAS_SUPPLEMENTARY flag, is part of the base table data + */ + } + else { + /* conversion file with a base table; an additional extension table is optional */ + /* make sure that the output type is known */ + switch(mbcsTable.outputType) { + case MBCS_OUTPUT_1: + case MBCS_OUTPUT_2: + case MBCS_OUTPUT_3: + case MBCS_OUTPUT_4: + case MBCS_OUTPUT_3_EUC: + case MBCS_OUTPUT_4_EUC: + case MBCS_OUTPUT_2_SISO: + /* OK */ + break; + default: + throw new InvalidFormatException(); + } + + stateTableArray = new int[header.countStates][256]; + toUFallbacksArray = new MBCSToUFallback[header.countToUFallbacks]; + for(int i = 0; i < toUFallbacksArray.length; ++i) + toUFallbacksArray[i] = data.new MBCSToUFallback(); + unicodeCodeUnitsArray = new char[(header.offsetFromUTable - header.offsetToUCodeUnits)/2]; + fromUnicodeTableArray = new char[(header.offsetFromUBytes - header.offsetFromUTable)/2]; + fromUnicodeBytesArray = new byte[header.fromUBytesLength]; + try { + reader.readMBCSTable(stateTableArray, toUFallbacksArray, unicodeCodeUnitsArray, fromUnicodeTableArray, fromUnicodeBytesArray); + } + catch(IOException e) { + throw new InvalidFormatException(); + } + + mbcsTable.countStates=(byte)header.countStates; + mbcsTable.countToUFallbacks=header.countToUFallbacks; + mbcsTable.stateTable=stateTableArray; + mbcsTable.toUFallbacks=toUFallbacksArray; + mbcsTable.unicodeCodeUnits=unicodeCodeUnitsArray; + + mbcsTable.fromUnicodeTable=fromUnicodeTableArray; + mbcsTable.fromUnicodeBytes=fromUnicodeBytesArray; + mbcsTable.fromUBytesLength=header.fromUBytesLength; + + /* + * converter versions 6.1 and up contain a unicodeMask that is + * used here to select the most efficient function implementations + */ + //agljport:fix info.size=sizeof(UDataInfo); + //agljport:fix udata_getInfo((UDataMemory 
*)sharedData->dataMemory, &info); + //agljport:fix if(info.formatVersion[0]>6 || (info.formatVersion[0]==6 && info.formatVersion[1]>=1)) { + /* mask off possible future extensions to be safe */ + mbcsTable.unicodeMask=(short)(staticData.unicodeMask&3); + //agljport:fix } else { + /* for older versions, assume worst case: contains anything possible (prevent over-optimizations) */ + //agljport:fix mbcsTable->unicodeMask=UCNV_HAS_SUPPLEMENTARY|UCNV_HAS_SURROGATES; + //agljport:fix } + if(offset != 0) { + try { + //agljport:commment subtract 32 for sizeof(_MBCSHeader) and length of baseNameString and 1 null terminator byte all already read; + //int namelen = baseNameString != null? baseNameString.length() + 1: 0; + //mbcsTable.extIndexes=dataReader.readExtIndexes(offset - 32 - namelen); + mbcsTable.extIndexes=reader.readExtIndexes(0); + } + catch(IOException e) { + throw new InvalidFormatException(); + } + } + } + return data; + } + + protected void initializeConverter(int options) + { + UConverterMBCSTable mbcsTable; + ByteBuffer extIndexes; + short outputType; + byte maxBytesPerUChar; + + mbcsTable=sharedData.mbcs; + outputType=mbcsTable.outputType; + + if(outputType==MBCS_OUTPUT_DBCS_ONLY) { + /* the swaplfnl option does not apply, remove it */ + this.options=options&=~UConverterConstants.OPTION_SWAP_LFNL; + } + + if((options&UConverterConstants.OPTION_SWAP_LFNL)!=0) { + /* do this because double-checked locking is broken */ + boolean isCached; + + //agljport:todo umtx_lock(NULL); + isCached=mbcsTable.swapLFNLStateTable!=null; + //agljport:todo umtx_unlock(NULL); + + if(!isCached) { + //agljport:fix if(!_EBCDICSwapLFNL(cnv->sharedData, pErrorCode)) { + //agljport:fix if(U_FAILURE(*pErrorCode)) { + //agljport:fix return; /* something went wrong */ + //agljport:fix } + + /* the option does not apply, remove it */ + //agljport:fix cnv->options=options&=~UCNV_OPTION_SWAP_LFNL; + //agljport:fix } + } + } + + if(icuCanonicalName.toLowerCase().indexOf("gb18030") >= 0) { + 
/* set a flag for GB 18030 mode, which changes the callback behavior */ + this.options|=MBCS_OPTION_GB18030; + } + + /* fix maxBytesPerUChar depending on outputType and options etc. */ + if(outputType==MBCS_OUTPUT_2_SISO) { + maxBytesPerChar=3; /* SO+DBCS */ + } + + extIndexes=mbcsTable.extIndexes; + if(extIndexes!=null) { + maxBytesPerUChar=(byte)GET_MAX_BYTES_PER_UCHAR(extIndexes); + if(outputType==MBCS_OUTPUT_2_SISO) { + ++maxBytesPerUChar; /* SO + multiple DBCS */ + } + + if(maxBytesPerUChar>maxBytesPerChar) { + maxBytesPerChar=maxBytesPerUChar; + } + } + } + + /** + * MBCS output types for conversions from Unicode. + * These per-converter types determine the storage method in stage 3 of the lookup table, + * mostly how many bytes are stored per entry. + */ + protected static final int MBCS_OUTPUT_1 = 0; /* 0 */ + protected static final int MBCS_OUTPUT_2 = MBCS_OUTPUT_1 + 1; /* 1 */ + protected static final int MBCS_OUTPUT_3 = MBCS_OUTPUT_2 + 1; /* 2 */ + protected static final int MBCS_OUTPUT_4 = MBCS_OUTPUT_3 + 1; /* 3 */ + protected static final int MBCS_OUTPUT_3_EUC=8; /* 8 */ + protected static final int MBCS_OUTPUT_4_EUC = MBCS_OUTPUT_3_EUC + 1; /* 9 */ + protected static final int MBCS_OUTPUT_2_SISO=12; /* c */ + protected static final int MBCS_OUTPUT_2_HZ = MBCS_OUTPUT_2_SISO + 1; /* d */ + protected static final int MBCS_OUTPUT_EXT_ONLY = MBCS_OUTPUT_2_HZ + 1; /* e */ + protected static final int MBCS_OUTPUT_COUNT = MBCS_OUTPUT_EXT_ONLY + 1; + protected static final int MBCS_OUTPUT_DBCS_ONLY=0xdb; /* runtime-only type for DBCS-only handling of SISO tables */ + + /* GB 18030 data ------------------------------------------------------------ */ + + /* helper macros for linear values for GB 18030 four-byte sequences */ + protected static long LINEAR_18030(long a, long b, long c, long d) {return ((((a)*10+(b))*126L+(c))*10L+(d));} + + protected static long LINEAR_18030_BASE = LINEAR_18030(0x81, 0x30, 0x81, 0x30); + + protected static long LINEAR(long x) 
{return LINEAR_18030(x>>>24, (x>>>16)&0xff, (x>>>8)&0xff, x&0xff);} + + /* + * Some ranges of GB 18030 where both the Unicode code points and the + * GB four-byte sequences are contiguous and are handled algorithmically by + * the special callback functions below. + * The values are start & end of Unicode & GB codes. + * + * Note that single surrogates are not mapped by GB 18030 + * as of the re-released mapping tables from 2000-nov-30. + */ + protected static final long gb18030Ranges[][] = new long[/*13*/][/*4*/]{ + {0x10000L, 0x10FFFFL, LINEAR(0x90308130L), LINEAR(0xE3329A35L)}, + {0x9FA6L, 0xD7FFL, LINEAR(0x82358F33L), LINEAR(0x8336C738L)}, + {0x0452L, 0x200FL, LINEAR(0x8130D330L), LINEAR(0x8136A531L)}, + {0xE865L, 0xF92BL, LINEAR(0x8336D030L), LINEAR(0x84308534L)}, + {0x2643L, 0x2E80L, LINEAR(0x8137A839L), LINEAR(0x8138FD38L)}, + {0xFA2AL, 0xFE2FL, LINEAR(0x84309C38L), LINEAR(0x84318537L)}, + {0x3CE1L, 0x4055L, LINEAR(0x8231D438L), LINEAR(0x8232AF32L)}, + {0x361BL, 0x3917L, LINEAR(0x8230A633L), LINEAR(0x8230F237L)}, + {0x49B8L, 0x4C76L, LINEAR(0x8234A131L), LINEAR(0x8234E733L)}, + {0x4160L, 0x4336L, LINEAR(0x8232C937L), LINEAR(0x8232F837L)}, + {0x478EL, 0x4946L, LINEAR(0x8233E838L), LINEAR(0x82349638L)}, + {0x44D7L, 0x464BL, LINEAR(0x8233A339L), LINEAR(0x8233C931L)}, + {0xFFE6L, 0xFFFFL, LINEAR(0x8431A234L), LINEAR(0x8431A439L)} + }; + + /* bit flag for UConverter.options indicating GB 18030 special handling */ + protected static final int MBCS_OPTION_GB18030 = 0x8000; + + /** + * MBCS action codes for conversions to Unicode. + * These values are in bits 23..20 of the state table entries. 
+ */ + protected static final int MBCS_STATE_VALID_DIRECT_16 = 0; + protected static final int MBCS_STATE_VALID_DIRECT_20 = MBCS_STATE_VALID_DIRECT_16 + 1; + protected static final int MBCS_STATE_FALLBACK_DIRECT_16 = MBCS_STATE_VALID_DIRECT_20 + 1; + protected static final int MBCS_STATE_FALLBACK_DIRECT_20 = MBCS_STATE_FALLBACK_DIRECT_16 + 1; + protected static final int MBCS_STATE_VALID_16 = MBCS_STATE_FALLBACK_DIRECT_20 + 1; + protected static final int MBCS_STATE_VALID_16_PAIR = MBCS_STATE_VALID_16 + 1; + protected static final int MBCS_STATE_UNASSIGNED = MBCS_STATE_VALID_16_PAIR + 1; + protected static final int MBCS_STATE_ILLEGAL = MBCS_STATE_UNASSIGNED + 1; + protected static final int MBCS_STATE_CHANGE_ONLY = MBCS_STATE_ILLEGAL + 1; + + /* Methods for state table entries */ + protected static int MBCS_ENTRY_TRANSITION(int state, int offset) {return (state<<24L)|offset; } + protected static int MBCS_ENTRY_FINAL(int state, int action, int value) {return (int)(0x80000000|((int)(state)<<24L)|((action)<<20L)|(value));} + protected static boolean MBCS_ENTRY_IS_TRANSITION(int entry) {return (entry)>=0; } + protected static boolean MBCS_ENTRY_IS_FINAL(int entry) {return (entry)<0;} + protected static int MBCS_ENTRY_TRANSITION_STATE(int entry) {return ((entry)>>>24);} + protected static int MBCS_ENTRY_TRANSITION_OFFSET(int entry) {return ((entry)&0xffffff);} + protected static int MBCS_ENTRY_FINAL_STATE(int entry) {return ((entry)>>>24)&0x7f;} + protected static boolean MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(int entry) {return ((entry)<0x80100000);} + protected static int MBCS_ENTRY_FINAL_ACTION(int entry) {return ((entry)>>>20)&0xf;} + protected static int MBCS_ENTRY_FINAL_VALUE(int entry) {return ((entry)&0xfffff); } + protected static char MBCS_ENTRY_FINAL_VALUE_16(int entry) {return (char)(entry);} + + /** + * This macro version of _MBCSSingleSimpleGetNextUChar() gets a code point from a byte. 
+ * It works for single-byte, single-state codepages that only map + * to and from BMP code points, and it always + * returns fallback values. + */ + protected static char MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(UConverterMBCSTable mbcs, final int b) + { + return MBCS_ENTRY_FINAL_VALUE_16(mbcs.stateTable[0][b & UConverterConstants.UNSIGNED_BYTE_MASK]); + } + + /* single-byte fromUnicode: get the 16-bit result word */ + protected static char MBCS_SINGLE_RESULT_FROM_U(char[] table, byte[] results, int c) + { + int i1 = table[c>>>10] +((c>>>4)&0x3f); + int i = 2* (table[i1] +(c&0xf)); // used as index into byte[] array treated as char[] array + return (char)(((results[i] & UConverterConstants.UNSIGNED_BYTE_MASK) <<8) | (results[i+1] & UConverterConstants.UNSIGNED_BYTE_MASK)); + } + + /* multi-byte fromUnicode: get the 32-bit stage 2 entry */ + protected static int MBCS_STAGE_2_FROM_U(char[] table, int c) + { + int i = 2 * (table[(c)>>>10] +((c>>>4)&0x3f)); // 2x because used as index into char[] array treated as int[] array + return ((table[i] & UConverterConstants.UNSIGNED_SHORT_MASK) <<16) | (table[i+1] & UConverterConstants.UNSIGNED_SHORT_MASK); + } + + protected static boolean MBCS_FROM_U_IS_ROUNDTRIP(int stage2Entry, int c) {return ( ((stage2Entry) & (1<< (16+((c)&0xf)) )) !=0);} + + protected static char MBCS_VALUE_2_FROM_STAGE_2(byte[] bytes, int stage2Entry, int c) + { + int i = 2 * (16*((char)stage2Entry & UConverterConstants.UNSIGNED_SHORT_MASK)+(c&0xf)); + return (char)(((bytes[i] & UConverterConstants.UNSIGNED_BYTE_MASK) <<8) | (bytes[i+1] & UConverterConstants.UNSIGNED_BYTE_MASK)); + } + + protected static int MBCS_VALUE_4_FROM_STAGE_2(byte[] bytes, int stage2Entry, int c) + { + int i = 4 * (16*((char)stage2Entry & UConverterConstants.UNSIGNED_SHORT_MASK)+(c&0xf)); + return ((bytes[i] & UConverterConstants.UNSIGNED_BYTE_MASK) <<24) | + ((bytes[i+1] & UConverterConstants.UNSIGNED_BYTE_MASK) <<16) | + ((bytes[i+2] & UConverterConstants.UNSIGNED_BYTE_MASK) <<8) | + 
(bytes[i+3] & UConverterConstants.UNSIGNED_BYTE_MASK); + } + + protected static int MBCS_POINTER_3_FROM_STAGE_2(byte[] bytes, int stage2Entry, int c) + { + return ((16*((char)(stage2Entry) & UConverterConstants.UNSIGNED_SHORT_MASK)+((c)&0xf))*3); + } + + //------------UConverterExt------------------------------------------------------- + + protected static final int INDEXES_LENGTH = 0; /* 0 */ + + protected static final int TO_U_INDEX = INDEXES_LENGTH + 1; /* 1 */ + protected static final int TO_U_LENGTH = TO_U_INDEX + 1; + protected static final int TO_U_UCHARS_INDEX = TO_U_LENGTH + 1; + protected static final int TO_U_UCHARS_LENGTH = TO_U_UCHARS_INDEX + 1; + + protected static final int FROM_U_UCHARS_INDEX = TO_U_UCHARS_LENGTH + 1; /* 5 */ + protected static final int FROM_U_VALUES_INDEX = FROM_U_UCHARS_INDEX + 1; + protected static final int FROM_U_LENGTH = FROM_U_VALUES_INDEX + 1; + protected static final int FROM_U_BYTES_INDEX = FROM_U_LENGTH + 1; + protected static final int FROM_U_BYTES_LENGTH = FROM_U_BYTES_INDEX + 1; + + protected static final int FROM_U_STAGE_12_INDEX = FROM_U_BYTES_LENGTH + 1; /* 10 */ + protected static final int FROM_U_STAGE_1_LENGTH = FROM_U_STAGE_12_INDEX + 1; + protected static final int FROM_U_STAGE_12_LENGTH = FROM_U_STAGE_1_LENGTH + 1; + protected static final int FROM_U_STAGE_3_INDEX = FROM_U_STAGE_12_LENGTH + 1; + protected static final int FROM_U_STAGE_3_LENGTH = FROM_U_STAGE_3_INDEX + 1; + protected static final int FROM_U_STAGE_3B_INDEX = FROM_U_STAGE_3_LENGTH + 1; + protected static final int FROM_U_STAGE_3B_LENGTH = FROM_U_STAGE_3B_INDEX + 1; + + protected static final int COUNT_BYTES = FROM_U_STAGE_3B_LENGTH + 1; /* 17 */ + protected static final int COUNT_UCHARS = COUNT_BYTES + 1; + protected static final int FLAGS = COUNT_UCHARS + 1; + + protected static final int RESERVED_INDEX = FLAGS + 1; /* 20, moves with additional indexes */ + + protected static final int SIZE=31; + protected static final int 
INDEXES_MIN_LENGTH=32; + + /* toUnicode helpers -------------------------------------------------------- */ + + protected static final int TO_U_BYTE_SHIFT = 24; + protected static final int TO_U_VALUE_MASK = 0xffffff; + protected static final int TO_U_MIN_CODE_POINT = 0x1f0000; + protected static final int TO_U_MAX_CODE_POINT = 0x2fffff; + protected static final int TO_U_ROUNDTRIP_FLAG = (1<<23); + protected static final int TO_U_INDEX_MASK = 0x3ffff; + protected static final int TO_U_LENGTH_SHIFT = 18; + protected static final int TO_U_LENGTH_OFFSET = 12; + + /* maximum number of indexed UChars */ + protected static final int MAX_UCHARS = 19; + + protected static int TO_U_GET_BYTE(int word) + { + return word>>>TO_U_BYTE_SHIFT; + } + + protected static int TO_U_GET_VALUE(int word) + { + return word&TO_U_VALUE_MASK; + } + + protected static boolean TO_U_IS_ROUNDTRIP(int value) + { + return (value&TO_U_ROUNDTRIP_FLAG)!=0; + } + + protected static boolean TO_U_IS_PARTIAL(int value) + { + return (value&UConverterConstants.UNSIGNED_INT_MASK)>>TO_U_LENGTH_SHIFT)-TO_U_LENGTH_OFFSET; + } + + /* fromUnicode helpers ------------------------------------------------------ */ + + /* most trie constants are shared with ucnvmbcs.h */ + protected static final int STAGE_2_LEFT_SHIFT = 2; + protected static final int STAGE_3_GRANULARITY = 4; + + /* trie access, returns the stage 3 value=index to stage 3b; s1Index=c>>10 */ + protected static int FROM_U(CharBuffer stage12, CharBuffer stage3, int s1Index, int c) + { + return stage3.get(stage3.position() + ((int)stage12.get( stage12.position() + (stage12.get(stage12.position()+s1Index) +((c>>>4)&0x3f)) )< (impossible roundtrip to 0 bytes, value 01) */ + protected static final int FROM_U_SUBCHAR1 = 0x80000001; + + /* at most 3 bytes in the lower part of the value */ + protected static final int FROM_U_MAX_DIRECT_LENGTH = 3; + + /* maximum number of indexed bytes */ + protected static final int MAX_BYTES = 0x1f; + + protected static 
boolean FROM_U_IS_PARTIAL(int value) {return (value>>>FROM_U_LENGTH_SHIFT)==0;} + protected static int FROM_U_GET_PARTIAL_INDEX(int value) {return value;} + + protected static boolean FROM_U_IS_ROUNDTRIP(int value) {return (value&FROM_U_ROUNDTRIP_FLAG)!=0;} + protected static int FROM_U_MASK_ROUNDTRIP(int value) {return value&~FROM_U_ROUNDTRIP_FLAG;} + + /* use after masking off the roundtrip flag */ + protected static int FROM_U_GET_LENGTH(int value) {return (value>>>FROM_U_LENGTH_SHIFT)&MAX_BYTES;} + + /* get bytes or bytes index */ + protected static int FROM_U_GET_DATA(int value) {return value&FROM_U_DATA_MASK;} + + /* get the pointer to an extension array from indexes[index] */ + protected static Buffer ARRAY(ByteBuffer indexes, int index, Class itemType) + { + int oldpos = indexes.position(); + Buffer b; + + indexes.position(indexes.getInt(index*4)); + if(itemType == int.class) + b = indexes.asIntBuffer(); + else if(itemType == short.class) + b = indexes.asShortBuffer(); + else if(itemType == byte.class) + b = indexes.slice(); + else if(itemType == char.class) + b = indexes.asCharBuffer(); + else + b = indexes.slice(); + indexes.position(oldpos); + return b; + } + + protected static int GET_MAX_BYTES_PER_UCHAR(ByteBuffer indexes) + { + indexes.position(0); + IntBuffer a = indexes.asIntBuffer(); + int n; + if(a.hasArray()) + n = a.array()[COUNT_BYTES]; + else + n = a.get(COUNT_BYTES); + + return indexes.getInt(4*COUNT_BYTES)&0xff; + } + + /* + * @return index of the UChar, if found; else <0 + */ + protected static int findFromU(CharBuffer fromUSection, int length, char u) + { + int i, start, limit; + + /* binary search */ + start=0; + limit=length; + for(;;) { + i=limit-start; + if(i<=1) { + break; /* done */ + } + /* startmode==0 is equivalent to firstLength==1. + */ + protected static int SISO_STATE(UConverterSharedData sharedData, int mode) + { + return sharedData.mbcs.outputType==MBCS_OUTPUT_2_SISO ? 
(byte)mode : + sharedData.mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ? 1 : -1; + } + + class CharsetDecoderMBCS extends CharsetDecoderICU{ + + public CharsetDecoderMBCS(CharsetICU cs) { + super(cs); + } + + protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets){ + CoderResult cr = CoderResult.UNDERFLOW; + CoderResult[] crArray = {cr}; + + int sourceArrayIndex; + int stateTable[][/*256*/]; + char[] unicodeCodeUnits; + + int offset; + byte state; + int byteIndex; + byte[] bytes; + + int sourceIndex, nextSourceIndex; + + int entry = 0; + char c; + byte action; + + if(preToULength>0) { + /* + * pass sourceIndex=-1 because we continue from an earlier buffer + * in the future, this may change with continuous offsets + */ + cr = continueMatchToU(source, target, offsets, -1); + + if(cr.isError() || preToULength<0) { + return cr; + } + } + + if(sharedData.mbcs.countStates==1) { + if((sharedData.mbcs.unicodeMask&UConverterConstants.HAS_SUPPLEMENTARY) == 0) { + cr = cnvMBCSSingleToBMPWithOffsets(source, target, offsets); + } + else { + cr = cnvMBCSSingleToUnicodeWithOffsets(source, target, offsets); + } + return cr; + } + + /* set up the local pointers */ + sourceArrayIndex = source.position(); + + if((options&UConverterConstants.OPTION_SWAP_LFNL)!=0) { + stateTable = sharedData.mbcs.swapLFNLStateTable; + } + else { + stateTable = sharedData.mbcs.stateTable; + } + unicodeCodeUnits = sharedData.mbcs.unicodeCodeUnits; + + /* get the converter state from UConverter */ + offset = (int)toUnicodeStatus; + byteIndex = toULength; + bytes = toUBytesArray; + + /* + * if we are in the SBCS state for a DBCS-only converter, + * then load the DBCS state from the MBCS data + * (dbcsOnlyState==0 if it is not a DBCS-only converter) + */ + if((state=(byte)(mode))==0) { + state = sharedData.mbcs.dbcsOnlyState; + } + + /* sourceIndex=-1 if the current character began in the previous buffer */ + sourceIndex = byteIndex==0 ? 
0 : -1; + nextSourceIndex = 0; + + /* conversion loop */ + while(sourceArrayIndex=source.limit()) { + break; + } + if(!target.hasRemaining()) { + /* target is full */ + cr = CoderResult.OVERFLOW; + break; + } + + ++nextSourceIndex; + bytes[byteIndex++] = source.get(sourceArrayIndex++); + } + else /* byteIndex>0 */ { + ++nextSourceIndex; + entry = stateTable[state][(bytes[byteIndex++] = source.get(sourceArrayIndex++)) & UConverterConstants.UNSIGNED_BYTE_MASK]; + } + + if(MBCS_ENTRY_IS_TRANSITION(entry)) { + state = (byte)MBCS_ENTRY_TRANSITION_STATE(entry); + offset += MBCS_ENTRY_TRANSITION_OFFSET(entry); + continue; + } + + /* save the previous state for proper extension mapping with SI/SO-stateful converters */ + mode = state; + + /* set the next state early so that we can reuse the entry variable */ + state = (byte)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ + + /* + * An if-else-if chain provides more reliable performance for + * the most common cases compared to a switch. + */ + action = (byte)(MBCS_ENTRY_FINAL_ACTION(entry)); + if(action==MBCS_STATE_VALID_16) { + offset += MBCS_ENTRY_FINAL_VALUE_16(entry); + c = unicodeCodeUnits[offset]; + if(c<0xfffe) { + /* output BMP code point */ + target.put(c); + if(offsets!=null) { + offsets.put(sourceIndex); + } + byteIndex = 0; + } + else if(c==0xfffe) { + if(isToUUseFallback() && (entry=(int)getFallback(sharedData.mbcs, offset))!=0xfffe) { + /* output fallback BMP code point */ + target.put((char)entry); + if(offsets!=null) { + offsets.put(sourceIndex); + } + byteIndex = 0; + } + } + else { + /* callback(illegal) */ + cr = CoderResult.malformedForLength(byteIndex); + } + } + else if(action==MBCS_STATE_VALID_DIRECT_16) { + /* output BMP code point */ + target.put((char)MBCS_ENTRY_FINAL_VALUE_16(entry)); + if(offsets!=null) { + offsets.put(sourceIndex); + } + byteIndex = 0; + } + else if(action==MBCS_STATE_VALID_16_PAIR) { + offset += MBCS_ENTRY_FINAL_VALUE_16(entry); + c = unicodeCodeUnits[offset++]; + 
if(c<0xd800) { + /* output BMP code point below 0xd800 */ + target.put(c); + if(offsets!=null) { + offsets.put(sourceIndex); + } + byteIndex = 0; + } + else if(isToUUseFallback() ? c<=0xdfff : c<=0xdbff) { + /* output roundtrip or fallback surrogate pair */ + target.put((char)(c&0xdbff)); + if(offsets!=null) { + offsets.put(sourceIndex); + } + byteIndex = 0; + if(target.hasRemaining()) { + target.put(unicodeCodeUnits[offset]); + if(offsets!=null) { + offsets.put(sourceIndex); + } + } + else { + /* target overflow */ + charErrorBufferArray[0] = unicodeCodeUnits[offset]; + charErrorBufferLength = 1; + cr = CoderResult.OVERFLOW; + + offset = 0; + break; + } + } + else if(isToUUseFallback() ? (c&0xfffe)==0xe000 : c==0xe000) { + /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ + target.put(unicodeCodeUnits[offset]); + if(offsets!=null) { + offsets.put(sourceIndex); + } + byteIndex = 0; + } + else if(c==0xffff) { + /* callback(illegal) */ + cr = CoderResult.malformedForLength(byteIndex); + } + } + else if(action==MBCS_STATE_VALID_DIRECT_20 || + (action==MBCS_STATE_FALLBACK_DIRECT_20 && isToUUseFallback())) { + entry = MBCS_ENTRY_FINAL_VALUE(entry); + /* output surrogate pair */ + target.put((char)(0xd800|(char)(entry>>10))); + if(offsets!=null) { + offsets.put(sourceIndex); + } + byteIndex = 0; + c = (char)(0xdc00|(char)(entry&0x3ff)); + if(target.hasRemaining()) { + target.put(c); + if(offsets!=null) { + offsets.put(sourceIndex); + } + } + else { + /* target overflow */ + charErrorBufferArray[0]=c; + charErrorBufferLength=1; + cr = CoderResult.OVERFLOW; + + offset = 0; + break; + } + } + else if(action==MBCS_STATE_CHANGE_ONLY) { + /* + * This serves as a state change without any output. + * It is useful for reading simple stateful encodings, + * for example using just Shift-In/Shift-Out codes. + * The 21 unused bits may later be used for more sophisticated + * state transitions. 
+ */ + if(sharedData.mbcs.dbcsOnlyState==0) { + byteIndex = 0; + } + else { + /* SI/SO are illegal for DBCS-only conversion */ + state = (byte)(mode); /* restore the previous state */ + + /* callback(illegal) */ + cr = CoderResult.malformedForLength(byteIndex); + } + } + else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { + if(isToUUseFallback()) { + /* output BMP code point */ + target.put((char)MBCS_ENTRY_FINAL_VALUE_16(entry)); + if(offsets!=null) { + offsets.put(sourceIndex); + } + byteIndex = 0; + } + } + else if(action==MBCS_STATE_UNASSIGNED) { + /* just fall through */ + } + else if(action==MBCS_STATE_ILLEGAL) { + /* callback(illegal) */ + cr = CoderResult.malformedForLength(byteIndex); + } + else { + /* reserved, must never occur */ + byteIndex = 0; + } + + /* end of action codes: prepare for a new character */ + offset=0; + + if(byteIndex==0) { + sourceIndex = nextSourceIndex; + } + else if(cr.isError()) { + /* callback(illegal) */ + break; + } + else /* unassigned sequences indicated with byteIndex>0 */ { + /* try an extension mapping */ + int sourceBeginIndex = sourceArrayIndex; + source.position(sourceArrayIndex); + byteIndex = toU(byteIndex, source, target, offsets, sourceIndex, crArray); + sourceArrayIndex = source.position(); + sourceIndex = nextSourceIndex+(int)(sourceArrayIndex-sourceBeginIndex); + + if(cr.isError()) { + /* not mappable or buffer overflow */ + break; + } + } + } + + /* set the converter state back into UConverter */ + toUnicodeStatus = offset; + mode = state; + toULength = byteIndex; + + /* write back the updated pointers */ + source.position(sourceArrayIndex); + + return cr; + } + + /* + * continue partial match with new input + * never called for simple, single-character conversion + */ + protected CoderResult continueMatchToU(ByteBuffer source, CharBuffer target, IntBuffer offsets, int srcIndex) + { + CoderResult cr = CoderResult.UNDERFLOW; + + int[] value = new int[1]; + int match, length; + + match = 
matchToU((byte)SISO_STATE(sharedData, mode), preToUArray, preToUBegin, preToULength, source, value); + + if(match>0) { + if(match>=preToULength) { + /* advance src pointer for the consumed input */ + source.position(source.position()+match-preToULength); + preToULength = 0; + } + else { + /* the match did not use all of preToU[] - keep the rest for replay */ + length = preToULength - match; + System.arraycopy(preToUArray, preToUBegin+match, preToUArray, preToUBegin, length); + preToULength=(byte)-length; + } + + /* write result */ + cr = writeToU(value[0], target, offsets, srcIndex); + } + else if(match<0) { + /* save state for partial match */ + int j, sArrayIndex; + + /* just _append_ the newly consumed input to preToU[] */ + sArrayIndex = source.position(); + match =- match; + for(j=preToULength; j0) { + System.arraycopy(preToUArray, preToUBegin+preToUFirstLength, preToUArray, preToUBegin, length); + } + + /* mark preToU for replay */ + preToULength = (byte)-length; + + /* set the error code for unassigned */ + cr = CoderResult.unmappableForLength(preToUFirstLength); + } + return cr; + } + + /* + * this works like natchFromU() except + * - the first character is in pre + * - no trie is used + * - the returned matchLength is not offset by 2 + */ + protected int matchToU(byte sisoState, byte[] preArray, int preArrayBegin, int preLength, ByteBuffer source, int[] pMatchValue) + { + ByteBuffer cx = sharedData.mbcs.extIndexes; + IntBuffer toUTable, toUSection; + + int value, matchValue, srcLength; + int i, j, index, length, matchLength; + short b; + + if(cx==null || cx.asIntBuffer().get(TO_U_LENGTH)<=0) { + return 0; /* no extension data, no match */ + } + + /* initialize */ + toUTable = (IntBuffer)ARRAY(cx, TO_U_INDEX, int.class); + index = 0; + + matchValue = 0; + i = j = matchLength=0; + srcLength = source.remaining(); + + if(sisoState==0) { + /* SBCS state of an SI/SO stateful converter, look at only exactly 1 byte */ + if(preLength>1) { + return 0; /* no match of 
a DBCS sequence in SBCS mode */ + } + else if(preLength==1) { + srcLength = 0; + } + else /* preLength==0 */ { + if(srcLength>1) { + srcLength = 1; + } + } + flush = true; + } + + /* we must not remember fallback matches when not using fallbacks */ + + /* match input units until there is a full match or the input is consumed */ + for(;;) { + /* go to the next section */ + int oldpos = toUTable.position(); + toUSection=((IntBuffer)toUTable.position(index)).slice(); + toUTable.position(oldpos); + + /* read first pair of the section */ + value = toUSection.get(); + length = TO_U_GET_BYTE(value); + value =TO_U_GET_VALUE(value); + if(value!=0 && + (TO_U_IS_ROUNDTRIP(value) || isToUUseFallback()) && + TO_U_VERIFY_SISO_MATCH(sisoState, i+j)) { + /* remember longest match so far */ + matchValue=value; + matchLength=i+j; + } + + /* match pre[] then src[] */ + if(iMAX_BYTES) { + /* + * end of the entire input stream, stop with the longest match so far + * or: partial match must not be longer than UCNV_EXT_MAX_BYTES + * because it must fit into state buffers + */ + break; + } + else { + /* continue with more input next time */ + return -length; + } + } + + /* search for the current UChar */ + value = findToU(toUSection, length, b); + if(value==0) { + /* no match here, stop with the longest match so far */ + break; + } else { + if(TO_U_IS_PARTIAL(value)) { + /* partial match, continue */ + index = TO_U_GET_PARTIAL_INDEX(value); + } else { + if((TO_U_IS_ROUNDTRIP(value) || isToUUseFallback()) && + TO_U_VERIFY_SISO_MATCH(sisoState, i+j)) { + /* full match, stop with result */ + matchValue = value; + matchLength = i+j; + } + else { + /* full match on fallback not taken, stop with the longest match so far */ + } + break; + } + } + } + + if(matchLength==0) { + /* no match at all */ + return 0; + } + + /* return result */ + pMatchValue[0] = TO_U_MASK_ROUNDTRIP(matchValue); + return matchLength; + } + + protected CoderResult writeToU(int value, CharBuffer target, IntBuffer offsets, 
int srcIndex) + { + ByteBuffer cx = sharedData.mbcs.extIndexes; + /* output the result */ + if(TO_U_IS_CODE_POINT(value)) { + /* output a single code point */ + return toUWriteCodePoint(TO_U_GET_CODE_POINT(value), target, offsets, srcIndex); + } else { + /* output a string - with correct data we have resultLength>0 */ + + char[] a = new char[TO_U_GET_LENGTH(value)]; + CharBuffer cb = ((CharBuffer)ARRAY(cx, TO_U_UCHARS_INDEX, char.class)); + cb.position(TO_U_GET_INDEX(value)); + cb.get(a, 0, a.length); + return toUWriteUChars(this, a, 0, a.length, target, offsets, srcIndex); + } + } + + protected CoderResult toUWriteCodePoint(int c, CharBuffer target, IntBuffer offsets, int sourceIndex) + { + CoderResult cr = CoderResult.UNDERFLOW; + int tBeginIndex = target.position(); + + if(target.hasRemaining()) { + if(c<=0xffff) { + target.put((char)c); + c = UConverterConstants.U_SENTINEL; + } else /* c is a supplementary code point */ { + target.put(UTF16.getLeadSurrogate(c)); + c = UTF16.getTrailSurrogate(c); + if(target.hasRemaining()) { + target.put((char)c); + c = UConverterConstants.U_SENTINEL; + } + } + + /* write offsets */ + if(offsets!=null) { + offsets.put(sourceIndex); + if((tBeginIndex+1)=0) { + charErrorBufferLength = UTF16.append(charErrorBufferArray, 0, c); + cr = CoderResult.OVERFLOW; + } + + return cr; + } + + /* + * Input sequence: cnv->toUBytes[0..length[ + * @return if(U_FAILURE) return the length (toULength, byteIndex) for the input + * else return 0 after output has been written to the target + */ + protected int toU(int length, ByteBuffer source, CharBuffer target, IntBuffer offsets, int sourceIndex, CoderResult[] crArray) + { + //ByteBuffer cx; + + if(sharedData.mbcs.extIndexes!=null && + initialMatchToU(length, source, target, offsets, sourceIndex, crArray)) { + return 0; /* an extension mapping handled the input */ + } + + /* GB 18030 */ + if(length==4 && (options&MBCS_OPTION_GB18030)!=0) { + long[] range; + long linear; + int i; + + linear = 
LINEAR_18030(toUBytesArray[0], toUBytesArray[1], toUBytesArray[2], toUBytesArray[3]); + range = gb18030Ranges[0]; + for(i=0; i0) { + /* advance src pointer for the consumed input */ + source.position(source.position()+match-firstLength); + + /* write result to target */ + crArray[0] = writeToU(value[0], target, offsets, srcIndex); + return true; + } + else if(match<0) { + /* save state for partial match */ + byte[] sArray; + int sArrayIndex; + int j; + + /* copy the first code point */ + sArray = toUBytesArray; + sArrayIndex = toUBytesBegin; + preToUFirstLength = (byte)firstLength; + for(j=0; j0) { + entry = stateTable[0][source.get(sourceArrayIndex++) & UConverterConstants.UNSIGNED_BYTE_MASK]; + /* MBCS_ENTRY_IS_FINAL(entry) */ + + /* test the most common case first */ + if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { + /* output BMP code point */ + target.put((char)MBCS_ENTRY_FINAL_VALUE_16(entry)); + --targetCapacity; + continue; + } + + /* + * An if-else-if chain provides more reliable performance for + * the most common cases compared to a switch. 
+ */ + action = (byte)(MBCS_ENTRY_FINAL_ACTION(entry)); + if(action==MBCS_STATE_FALLBACK_DIRECT_16) { + if(isToUUseFallback()) { + /* output BMP code point */ + target.put((char)MBCS_ENTRY_FINAL_VALUE_16(entry)); + --targetCapacity; + continue; + } + } + else if(action==MBCS_STATE_UNASSIGNED) { + /* just fall through */ + } + else if(action==MBCS_STATE_ILLEGAL) { + /* callback(illegal) */ + cr = CoderResult.malformedForLength(sourceArrayIndex-lastSource); + } else { + /* reserved, must never occur */ + continue; + } + + /* set offsets since the start or the last extension */ + if(offsets!=null) { + int count = sourceArrayIndex-lastSource; + + /* predecrement: do not set the offset for the callback-causing character */ + while(--count>0) { + offsets.put(sourceIndex++); + } + /* offset and sourceIndex are now set for the current character */ + } + + if(cr.isError()) { + /* callback(illegal) */ + break; + } + else /* unassigned sequences indicated with byteIndex>0 */ { + /* try an extension mapping */ + lastSource = sourceArrayIndex; + toUBytesArray[0]=source.get(sourceArrayIndex-1); + source.position(sourceArrayIndex); + toULength = toU((byte)1, source, target, offsets, sourceIndex, crArray); + sourceArrayIndex = source.position(); + sourceIndex += 1+(int)(sourceArrayIndex-lastSource); + + if(cr.isError()) { + /* not mappable or buffer overflow */ + break; + } + + /* recalculate the targetCapacity after an extension mapping */ + targetCapacity = target.remaining(); + length = source.remaining(); + if(length0) { + offsets.put(sourceIndex++); + --count; + } + } + + /* write back the updated pointers */ + source.position(sourceArrayIndex); + + return cr; + } + + /* This version of cnvMBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages. 
*/ + protected CoderResult cnvMBCSSingleToUnicodeWithOffsets(ByteBuffer source, CharBuffer target, IntBuffer offsets) + { + CoderResult cr = CoderResult.UNDERFLOW; + CoderResult[] crArray = {cr}; + + int sourceArrayIndex; + int[][] stateTable; + + int sourceIndex; + + int entry; + char c; + byte action; + + /* set up the local pointers */ + sourceArrayIndex = source.position(); + + if((options&UConverterConstants.OPTION_SWAP_LFNL)!=0) { + stateTable = sharedData.mbcs.swapLFNLStateTable; + } + else { + stateTable = sharedData.mbcs.stateTable; + } + + /* sourceIndex=-1 if the current character began in the previous buffer */ + sourceIndex = 0; + + /* conversion loop */ + while(sourceArrayIndex>>10))); + if(offsets!=null) { + offsets.put(sourceIndex); + } + c = (char)(0xdc00|(char)(entry&0x3ff)); + if(target.hasRemaining()) { + target.put(c); + if(offsets!=null) { + offsets.put(sourceIndex); + } + } + else { + /* target overflow */ + charErrorBufferArray[0]=c; + charErrorBufferLength=1; + cr = CoderResult.OVERFLOW; + break; + } + + ++sourceIndex; + continue; + } + else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { + if(isToUUseFallback()) { + /* output BMP code point */ + target.put((char)MBCS_ENTRY_FINAL_VALUE_16(entry)); + if(offsets!=null) { + offsets.put(sourceIndex); + } + + ++sourceIndex; + continue; + } + } + else if(action==MBCS_STATE_UNASSIGNED) { + /* just fall through */ + } + else if(action==MBCS_STATE_ILLEGAL) { + /* callback(illegal) */ + cr = CoderResult.malformedForLength(1); + } + else { + /* reserved, must never occur */ + ++sourceIndex; + continue; + } + + if(cr.isError()) { + /* callback(illegal) */ + break; + } + else /* unassigned sequences indicated with byteIndex>0 */ { + /* try an extension mapping */ + int sourceBeginIndex = sourceArrayIndex; + toUBytesArray[0] = source.get(sourceArrayIndex-1); + source.position(sourceArrayIndex); + toULength = toU((byte)1, source, target, offsets, sourceIndex, crArray); + sourceArrayIndex = source.position(); 
+ sourceIndex += 1+(int)(sourceArrayIndex-sourceBeginIndex); + + if(cr.isError()) { + /* not mappable or buffer overflow */ + break; + } + } + } + + /* write back the updated pointers */ + source.position(sourceArrayIndex); + + return cr; + } + + protected int getFallback(UConverterMBCSTable mbcsTable, int offset) + { + MBCSToUFallback[] toUFallbacks; + int i, start, limit; + + limit = mbcsTable.countToUFallbacks; + if(limit>0) { + /* do a binary search for the fallback mapping */ + toUFallbacks = mbcsTable.toUFallbacks; + start = 0; + while(start=0) { + /* + * pass sourceIndex=-1 because we continue from an earlier buffer + * in the future, this may change with continuous offsets + */ + cr = continueMatchFromU(source, target, offsets, -1); + + if(cr.isError() || preFromULength<0) { + return cr; + } + } + + /* use optimized function if possible */ + outputType = sharedData.mbcs.outputType; + unicodeMask = sharedData.mbcs.unicodeMask; + if(outputType==MBCS_OUTPUT_1 && (unicodeMask&UConverterConstants.HAS_SURROGATES) == 0) { + if((unicodeMask&UConverterConstants.HAS_SUPPLEMENTARY) == 0) { + cr = cnvMBCSSingleFromBMPWithOffsets(source, target, offsets); + } else { + cr = cnvMBCSSingleFromUnicodeWithOffsets(source, target, offsets); + } + return cr; + } else if(outputType==MBCS_OUTPUT_2) { + cr = cnvMBCSDoubleFromUnicodeWithOffsets(source, target, offsets); + return cr; + } + + table = sharedData.mbcs.fromUnicodeTable; + sourceArrayIndex = source.position(); + + if((options&UConverterConstants.OPTION_SWAP_LFNL)!=0) { + bytes = sharedData.mbcs.swapLFNLFromUnicodeBytes; + } else { + bytes = sharedData.mbcs.fromUnicodeBytes; + } + + /* get the converter state from UConverter */ + c = fromUChar32; + + if(outputType==MBCS_OUTPUT_2_SISO) { + prevLength=(int)fromUnicodeStatus; + if(prevLength==0) { + /* set the real value */ + prevLength=1; + } + } else { + /* prevent fromUnicodeStatus from being set to something non-0 */ + prevLength=0; + } + + /* sourceIndex=-1 if the 
current character began in the previous buffer */ + prevSourceIndex=-1; + sourceIndex= c==0 ? 0 : -1; + nextSourceIndex=0; + + /* conversion loop */ + /* + * This is another piece of ugly code: + * A goto into the loop if the converter state contains a first surrogate + * from the previous function call. + * It saves me to check in each loop iteration a check of if(c==0) + * and duplicating the trail-surrogate-handling code in the else + * branch of that check. + * I could not find any other way to get around this other than + * using a function call for the conversion and callback, which would + * be even more inefficient. + * + * Markus Scherer 2000-jul-19 + */ + boolean doloop = true; + if(c!=0 && target.hasRemaining()) { + SideEffects x = new SideEffects(c, sourceArrayIndex, sourceIndex, nextSourceIndex, prevSourceIndex, prevLength); + doloop = getTrail(source, target, unicodeMask, x, crArray); + c = x.c; + sourceArrayIndex = x.sourceArrayIndex; + sourceIndex = x.sourceIndex; + nextSourceIndex = x.nextSourceIndex; + prevSourceIndex = x.prevSourceIndex; + prevLength = x.prevLength; + cr = crArray[0]; + } + + if(doloop) { + while(sourceArrayIndex0 */ + if(length<=target.remaining()) { + if(offsets==null) { + switch(length) { + /* each branch falls through to the next one */ + case 4: + target.put((byte)(value>>>24)); + case 3: + target.put((byte)(value>>>16)); + case 2: + target.put((byte)(value>>>8)); + case 1: + target.put((byte)value); + default: + /* will never occur */ + break; + } + } + else { + switch(length) { + /* each branch falls through to the next one */ + case 4: + target.put((byte)(value>>>24)); + offsets.put(sourceIndex); + case 3: + target.put((byte)(value>>>16)); + offsets.put(sourceIndex); + case 2: + target.put((byte)(value>>>8)); + offsets.put(sourceIndex); + case 1: + target.put((byte)value); + offsets.put(sourceIndex); + default: + /* will never occur */ + break; + } + } + } + else { + int errorBufferArrayIndex; + + /* + * We actually do 
this backwards here: + * In order to save an intermediate variable, we output + * first to the overflow buffer what does not fit into the + * regular target. + */ + /* we know that 1<=targetCapacity>>16); + case 2: + errorBuffer[errorBufferArrayIndex++]=(byte)(value>>>8); + case 1: + errorBuffer[errorBufferArrayIndex]=(byte)value; + default: + /* will never occur */ + break; + } + errorBufferLength = (byte)length; + + /* now output what fits into the regular target */ + value>>>=8*length; /* length was reduced by targetCapacity */ + switch(target.remaining()) { + /* each branch falls through to the next one */ + case 3: + target.put((byte)(value>>>16)); + if(offsets!=null) { + offsets.put(sourceIndex); + } + case 2: + target.put((byte)(value>>>8)); + if(offsets!=null) { + offsets.put(sourceIndex); + } + case 1: + target.put((byte)value); + if(offsets!=null) { + offsets.put(sourceIndex); + } + default: + /* will never occur */ + break; + } + + /* target overflow */ + cr = CoderResult.OVERFLOW; + c=0; + break; + } + + /* normal end of conversion: prepare for a new character */ + c=0; + if(offsets!=null) { + prevSourceIndex=sourceIndex; + sourceIndex=nextSourceIndex; + } + continue; + } + else { + /* target is full */ + cr = CoderResult.OVERFLOW; + break; + } + } + } + + /* + * the end of the input stream and detection of truncated input + * are handled by the framework, but for EBCDIC_STATEFUL conversion + * we need to emit an SI at the very end + * + * conditions: + * successful + * EBCDIC_STATEFUL in DBCS mode + * end of input and no truncated input + */ + if(outputType==MBCS_OUTPUT_2_SISO && prevLength==2 && + flush && sourceArrayIndex>=source.limit() && c==0){ + + /* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */ + if(target.hasRemaining()) { + target.put((byte)UConverterConstants.SI); + if(offsets!=null) { + /* set the last source character's index (sourceIndex points at sourceLimit now) */ + offsets.put(prevSourceIndex); + } 
+ } + else { + /* target is full */ + errorBuffer[0]=(byte)UConverterConstants.SI; + errorBufferLength=1; + cr = CoderResult.OVERFLOW; + } + prevLength=1; /* we switched into SBCS */ + } + + /* set the converter state back into UConverter */ + fromUChar32=c; + fromUnicodeStatus=prevLength; + + source.position(sourceArrayIndex); + } + catch(BufferOverflowException ex){ + cr = CoderResult.OVERFLOW; + } + + return cr; + } + + /* + * continue partial match with new input, requires cnv->preFromUFirstCP>=0 + * never called for simple, single-character conversion + */ + protected CoderResult continueMatchFromU(CharBuffer source, ByteBuffer target, IntBuffer offsets, int srcIndex) + { + CoderResult cr = CoderResult.UNDERFLOW; + int[] value = new int[1]; + int match; + + match = matchFromU(preFromUFirstCP, preFromUArray, preFromUBegin, preFromULength, source, target, value); + if(match>=2) { + match-=2; /* remove 2 for the initial code point */ + + if(match>=preFromULength) { + /* advance src pointer for the consumed input */ + source.position(source.position()+match-preFromULength); + preFromULength=0; + } else { + /* the match did not use all of preFromU[] - keep the rest for replay */ + int length = preFromULength-match; + System.arraycopy(preFromUArray, preFromUBegin+match, preFromUArray, preFromUBegin, length); + preFromULength=(byte)-length; + } + + /* finish the partial match */ + preFromUFirstCP = UConverterConstants.U_SENTINEL; + + /* write result */ + writeFromU(value[0], target, offsets, srcIndex); + } + else if(match<0) { + /* save state for partial match */ + int sArrayIndex; + int j; + + /* just _append_ the newly consumed input to preFromU[] */ + sArrayIndex = source.position(); + match =- match-2; /* remove 2 for the initial code point */ + for(j=preFromULength; j */ + useSubChar1=true; + } + + /* move the first code point to the error field */ + fromUChar32 = preFromUFirstCP; + preFromUFirstCP = UConverterConstants.U_SENTINEL; + + /* mark preFromU for 
replay */ + preFromULength = (byte) - preFromULength; + + /* set the error code for unassigned */ + cr = CoderResult.unmappableForLength(source.position()); + } + return cr; + } + + /* + * @param cx pointer to extension data; if NULL, returns 0 + * @param firstCP the first code point before all the other UChars + * @param pre UChars that must match; !initialMatch: partial match with them + * @param preLength length of pre, >=0 + * @param src UChars that can be used to complete a match + * @param srcLength length of src, >=0 + * @param pMatchValue [out] output result value for the match from the data structure + * @param useFallback "use fallback" flag, usually from cnv->useFallback + * @param flush TRUE if the end of the input stream is reached + * @return >1: matched, return value=total match length (number of input units matched) + * 1: matched, no mapping but request for + * (only for the first code point) + * 0: no match + * <0: partial match, return value=negative total match length + * (partial matches are never returned for flush==TRUE) + * (partial matches are never returned as being longer than UCNV_EXT_MAX_UCHARS) + * the matchLength is 2 if only firstCP matched, and >2 if firstCP and + * further code units matched + */ + //static int32_t ucnv_extMatchFromU(const int32_t *cx, UChar32 firstCP, const UChar *pre, int32_t preLength, const UChar *src, int32_t srcLength, uint32_t *pMatchValue, UBool useFallback, UBool flush) + protected int matchFromU(int firstCP, char[] preArray, int preArrayBegin, int preLength, CharBuffer source, ByteBuffer target, int[] pMatchValue) + { + ByteBuffer cx = sharedData.mbcs.extIndexes; + + CharBuffer stage12, stage3; + IntBuffer stage3b; + + CharBuffer fromUTableUChars, fromUSectionUChars; + IntBuffer fromUTableValues, fromUSectionValues; + + int value, matchValue; + int i, j, index, length, matchLength; + char c; + + if(cx==null) { + return 0; /* no extension data, no match */ + } + + /* trie lookup of firstCP */ + 
index=firstCP>>>10; /* stage 1 index */ + if(index>=cx.asIntBuffer().get(FROM_U_STAGE_1_LENGTH)) { + return 0; /* the first code point is outside the trie */ + } + + stage12 = (CharBuffer)ARRAY(cx, FROM_U_STAGE_12_INDEX, char.class); + stage3 = (CharBuffer)ARRAY(cx, FROM_U_STAGE_3_INDEX, char.class); + index = FROM_U(stage12, stage3, index, firstCP); + + stage3b = (IntBuffer)ARRAY(cx, FROM_U_STAGE_3B_INDEX, int.class); + value = stage3b.get(stage3b.position() + index); + if(value==0) { + return 0; + } + + if(TO_U_IS_PARTIAL(value)) { + /* partial match, enter the loop below */ + index = FROM_U_GET_PARTIAL_INDEX(value); + + /* initialize */ + fromUTableUChars = (CharBuffer)ARRAY(cx, FROM_U_UCHARS_INDEX, char.class); + fromUTableValues = (IntBuffer)ARRAY(cx, FROM_U_VALUES_INDEX, int.class); + + matchValue=0; + i=j=matchLength=0; + + /* we must not remember fallback matches when not using fallbacks */ + + /* match input units until there is a full match or the input is consumed */ + for(;;) { + /* go to the next section */ + int oldpos = fromUTableUChars.position(); + fromUSectionUChars = ((CharBuffer)fromUTableUChars.position(index)).slice(); + fromUTableUChars.position(oldpos); + oldpos = fromUTableValues.position(); + fromUSectionValues = ((IntBuffer)fromUTableValues.position(index)).slice(); + fromUTableValues.position(oldpos); + + /* read first pair of the section */ + length = fromUSectionUChars.get(); + value = fromUSectionValues.get(); + if( value!=0 && + (FROM_U_IS_ROUNDTRIP(value) || + isFromUUseFallback(firstCP)) + ) { + /* remember longest match so far */ + matchValue = value; + matchLength = 2+i+j; + } + + /* match pre[] then src[] */ + if(iMAX_UCHARS) { + /* + * end of the entire input stream, stop with the longest match so far + * or: partial match must not be longer than UCNV_EXT_MAX_UCHARS + * because it must fit into state buffers + */ + break; + } else { + /* continue with more input next time */ + return -(2+length); + } + } + + /* search for the 
current UChar */ + index = findFromU(fromUSectionUChars, length, c); + if(index<0) { + /* no match here, stop with the longest match so far */ + break; + } else { + value = fromUSectionValues.get(fromUSectionValues.position() + index); + if(FROM_U_IS_PARTIAL(value)) { + /* partial match, continue */ + index = FROM_U_GET_PARTIAL_INDEX(value); + } else { + if( FROM_U_IS_ROUNDTRIP(value) || + isFromUUseFallback(firstCP) + ) { + /* full match, stop with result */ + matchValue=value; + matchLength=2+i+j; + } else { + /* full match on fallback not taken, stop with the longest match so far */ + } + break; + } + } + } + + if(matchLength==0) { + /* no match at all */ + return 0; + } + } else /* result from firstCP trie lookup */ { + if( FROM_U_IS_ROUNDTRIP(value) || + isFromUUseFallback(firstCP) + ) { + /* full match, stop with result */ + matchValue=value; + matchLength=2; + } else { + /* fallback not taken */ + return 0; + } + } + + if((matchValue&FROM_U_RESERVED_MASK) != 0) { + /* do not interpret values with reserved bits used, for forward compatibility */ + return 0; + } + + /* return result */ + if(matchValue==FROM_U_SUBCHAR1) { + return 1; /* assert matchLength==2 */ + } + + pMatchValue[0]=FROM_U_MASK_ROUNDTRIP(matchValue); + return matchLength; + } + + protected CoderResult writeFromU(int value, ByteBuffer target, IntBuffer offsets, int srcIndex) + { + ByteBuffer cx = sharedData.mbcs.extIndexes; + + byte bufferArray[] = new byte[1+MAX_BYTES]; + int bufferArrayIndex = 0; + byte[] resultArray; + int resultArrayIndex; + int length, prevLength; + + length = FROM_U_GET_LENGTH(value); + value = FROM_U_GET_DATA(value); + + /* output the result */ + if(length<=FROM_U_MAX_DIRECT_LENGTH) { + /* + * Generate a byte array and then write it below. + * This is not the fastest possible way, but it should be ok for + * extension mappings, and it is much simpler. + * Offset and overflow handling are only done once this way. 
+ */ + int p = bufferArrayIndex+1; /* reserve buffer[0] for shiftByte below */ + switch(length) { + case 3: + bufferArray[p++] = (byte)(value>>>16); + case 2: + bufferArray[p++] = (byte)(value>>>8); + case 1: + bufferArray[p++] = (byte)value; + default: + break; /* will never occur */ + } + resultArray = bufferArray; + resultArrayIndex = bufferArrayIndex+1; + } + else { + byte[] slice = new byte[length]; + + ByteBuffer bb = ((ByteBuffer)ARRAY(cx, FROM_U_BYTES_INDEX, byte.class)); + bb.position(value); + bb.get(slice, 0, slice.length); + + resultArray = slice; + resultArrayIndex = 0; + } + + /* with correct data we have length>0 */ + + if((prevLength=(int)fromUnicodeStatus)!=0) { + /* handle SI/SO stateful output */ + byte shiftByte; + + if(prevLength>1 && length==1) { + /* change from double-byte mode to single-byte */ + shiftByte = (byte)UConverterConstants.SI; + fromUnicodeStatus = 1; + } + else if(prevLength==1 && length>1) { + /* change from single-byte mode to double-byte */ + shiftByte = (byte)UConverterConstants.SO; + fromUnicodeStatus = 2; + } + else { + shiftByte = 0; + } + + if(shiftByte!=0) { + /* prepend the shift byte to the result bytes */ + bufferArray[0] = shiftByte; + if(resultArray!=bufferArray || resultArrayIndex!=bufferArrayIndex+1) { + System.arraycopy(resultArray, resultArrayIndex, bufferArray, bufferArrayIndex+1, length); + } + resultArray = bufferArray; + resultArrayIndex = bufferArrayIndex; + ++length; + } + } + + return fromUWriteBytes(this, resultArray, resultArrayIndex, length, target, offsets, srcIndex); + } + + /* + * @return if(U_FAILURE) return the code point for cnv->fromUChar32 + * else return 0 after output has been written to the target + */ + protected int fromU(int cp_, CharBuffer source, ByteBuffer target, IntBuffer offsets, int sourceIndex, CoderResult[] crArray) + { + //ByteBuffer cx; + long cp = cp_ & UConverterConstants.UNSIGNED_INT_MASK; + + useSubChar1=false; + + if( sharedData.mbcs.extIndexes!=null && 
initialMatchFromU((int)cp, source, target, offsets, sourceIndex, crArray)) { + return 0; /* an extension mapping handled the input */ + } + + /* GB 18030 */ + if((options&MBCS_OPTION_GB18030)!=0) { + long[] range; + int i; + + range = gb18030Ranges[0]; + for(i=0; i=2 && + !(FROM_U_GET_LENGTH(value[0])==1 && + sharedData.mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY) + ) { + /* advance src pointer for the consumed input */ + source.position(source.position()+match-2); /* remove 2 for the initial code point */ + + /* write result to target */ + crArray[0] = writeFromU(value[0], target, offsets, srcIndex); + return true; + } else if(match<0) { + /* save state for partial match */ + int sArrayIndex; + int j; + + /* copy the first code point */ + preFromUFirstCP=cp; + + /* now copy the newly consumed input */ + sArrayIndex = source.position(); + match =- match-2; /* remove 2 for the initial code point */ + for(j=0; j */ + useSubChar1=true; + return false; + } else /* match==0 no match */ { + return false; + } + } + + /* + * This version of ucnv_MBCSFromUnicode() is optimized for single-byte codepages + * that map only to and from the BMP. + * In addition to single-byte/state optimizations, the offset calculations + * become much easier. + */ + protected CoderResult cnvMBCSSingleFromBMPWithOffsets(CharBuffer source, ByteBuffer target, IntBuffer offsets) + { + CoderResult cr = CoderResult.UNDERFLOW; + CoderResult[] crArray = {cr}; + + int sourceArrayIndex, lastSource; + int targetCapacity, length; + char[] table; + byte[] results; + + int c, sourceIndex; + char value, minValue; + + /* set up the local pointers */ + sourceArrayIndex = source.position(); + targetCapacity = target.remaining(); + table = sharedData.mbcs.fromUnicodeTable; + + if((options&UConverterConstants.OPTION_SWAP_LFNL)!=0) { + results = sharedData.mbcs.swapLFNLFromUnicodeBytes; //agljport:comment should swapLFNLFromUnicodeBytes be a ByteBuffer so results can be a 16-bit view of it? 
+ } + else { + results = sharedData.mbcs.fromUnicodeBytes; //agljport:comment should swapLFNLFromUnicodeBytes be a ByteBuffer so results can be a 16-bit view of it? + } + + if(useFallback) { + /* use all roundtrip and fallback results */ + minValue = 0x800; + } + else { + /* use only roundtrips and fallbacks from private-use characters */ + minValue = 0xc00; + } + + /* get the converter state from UConverter */ + c = fromUChar32; + + /* sourceIndex=-1 if the current character began in the previous buffer */ + sourceIndex = c==0 ? 0 : -1; + lastSource = sourceArrayIndex; + + /* + * since the conversion here is 1:1 UChar:uint8_t, we need only one counter + * for the minimum of the sourceLength and targetCapacity + */ + length = source.limit()-sourceArrayIndex; + if(length0) { + SideEffectsSingleBMP x = new SideEffectsSingleBMP(c, sourceArrayIndex); + doloop = getTrailSingleBMP(source, x, cr); + c = x.c; + sourceArrayIndex = x.sourceArrayIndex; + } + + if(doloop) { + while(targetCapacity>0) { + /* + * Get a correct Unicode code point: + * a single UChar for a BMP code point or + * a matched surrogate pair for a "supplementary code point". + */ + c = source.get(sourceArrayIndex++); + /* + * Do not immediately check for single surrogates: + * Assume that they are unassigned and check for them in that case. + * This speeds up the conversion of assigned characters. + */ + /* convert the Unicode code point in c into codepage bytes */ + value = MBCS_SINGLE_RESULT_FROM_U(table, results, c); + + /* is this code point assigned, or do we use fallbacks? 
*/ + if(value>=minValue) { + /* assigned, write the output character bytes from value and length */ + /* length==1 */ + /* this is easy because we know that there is enough space */ + target.put((byte)value); + --targetCapacity; + + /* normal end of conversion: prepare for a new character */ + c=0; + continue; + } + else if(!UTF16.isSurrogate((char)c)) { + /* normal, unassigned BMP character */ + } + else if(UTF16.isLeadSurrogate((char)c)) { + //getTrail: + SideEffectsSingleBMP x = new SideEffectsSingleBMP(c, sourceArrayIndex); + doloop = getTrailSingleBMP(source, x, cr); + c = x.c; + sourceArrayIndex = x.sourceArrayIndex; + if(!doloop) + break; + } + else { + /* this is an unmatched trail code unit (2nd surrogate) */ + /* callback(illegal) */ + cr = CoderResult.malformedForLength(1); + break; + } + + /* c does not have a mapping */ + + /* get the number of code units for c to correctly advance sourceIndex */ + length = UTF16.getCharCount(c); + + /* set offsets since the start or the last extension */ + if(offsets!=null) { + int count = sourceArrayIndex-lastSource; + + /* do not set the offset for this character */ + count -= length; + + while(count>0) { + offsets.put(sourceIndex++); + --count; + } + /* offsets and sourceIndex are now set for the current character */ + } + + /* try an extension mapping */ + lastSource = sourceArrayIndex; + source.position(sourceArrayIndex); + c = fromU(c, source, target, offsets, sourceIndex, crArray); + sourceArrayIndex = source.position(); + sourceIndex += length+(sourceArrayIndex-lastSource); + lastSource = sourceArrayIndex; + + if(cr.isError()) { + /* not mappable or buffer overflow */ + break; + } else { + /* a mapping was written to the target, continue */ + + /* recalculate the targetCapacity after an extension mapping */ + targetCapacity = target.remaining(); + length = source.limit() - sourceArrayIndex; + if(length0) { + offsets.put(sourceIndex++); + --count; + } + } + + /* set the converter state back into UConverter */ + 
fromUChar32=c; + + /* write back the updated pointers */ + source.position(sourceArrayIndex); + + return cr; + } + + /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for single-byte codepages. */ + protected CoderResult cnvMBCSSingleFromUnicodeWithOffsets(CharBuffer source, ByteBuffer target, IntBuffer offsets) + { + CoderResult cr = CoderResult.UNDERFLOW; + CoderResult[] crArray = {cr}; + + int sourceArrayIndex; + + char[] table; + byte[] results; //agljport:comment results is used to to get 16-bit values out of byte[] array + + int c; + int sourceIndex, nextSourceIndex; + + char value, minValue; + + /* set up the local pointers */ + short unicodeMask; + sourceArrayIndex = source.position(); + + table = sharedData.mbcs.fromUnicodeTable; + + if((options&UConverterConstants.OPTION_SWAP_LFNL)!=0) { + results = sharedData.mbcs.swapLFNLFromUnicodeBytes; //agljport:comment should swapLFNLFromUnicodeBytes be a ByteBuffer so results can be a 16-bit view of it? + } + else { + results = sharedData.mbcs.fromUnicodeBytes; //agljport:comment should swapLFNLFromUnicodeBytes be a ByteBuffer so results can be a 16-bit view of it? + } + + if(useFallback) { + /* use all roundtrip and fallback results */ + minValue = 0x800; + } + else { + /* use only roundtrips and fallbacks from private-use characters */ + minValue = 0xc00; + } + //agljport:comment hasSupplementary only used in getTrail block which now simply repeats the mask operation + unicodeMask = sharedData.mbcs.unicodeMask; + + /* get the converter state from UConverter */ + c = fromUChar32; + + /* sourceIndex=-1 if the current character began in the previous buffer */ + sourceIndex= c==0 ? 
0 : -1; + nextSourceIndex=0; + + boolean doloop = true; + if(c!=0 && target.hasRemaining()) { + SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex, nextSourceIndex); + doloop = getTrailDouble(source, target, unicodeMask, x, crArray); + c = x.c; + sourceArrayIndex = x.sourceArrayIndex; + sourceIndex = x.sourceIndex; + nextSourceIndex = x.nextSourceIndex; + } + + if(doloop) { + while(sourceArrayIndex=minValue) { + /* assigned, write the output character bytes from value and length */ + /* length==1 */ + /* this is easy because we know that there is enough space */ + target.put((byte)value); + if(offsets!=null) { + offsets.put(sourceIndex); + } + + /* normal end of conversion: prepare for a new character */ + c=0; + sourceIndex = nextSourceIndex; + } + else { /* unassigned */ + /* try an extension mapping */ + SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex, nextSourceIndex); + doloop = unassignedDouble(source, target, x, crArray); + c = x.c; + sourceArrayIndex = x.sourceArrayIndex; + sourceIndex = x.sourceIndex; + nextSourceIndex = x.nextSourceIndex; + if(!doloop) + break; + } + } + else { + /* target is full */ + cr = CoderResult.OVERFLOW; + break; + } + } + } + + /* set the converter state back into UConverter */ + fromUChar32=c; + + /* write back the updated pointers */ + source.position(sourceArrayIndex); + + return cr; + } + + /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for double-byte codepages. 
*/ + protected CoderResult cnvMBCSDoubleFromUnicodeWithOffsets(CharBuffer source, ByteBuffer target, IntBuffer offsets) + { + CoderResult cr = CoderResult.UNDERFLOW; + CoderResult[] crArray = {cr}; + + int sourceArrayIndex; + + char[] table; + byte[] bytes; + + int c, sourceIndex, nextSourceIndex; + + int stage2Entry; + int value; + int length; + short unicodeMask; + + /* use optimized function if possible */ + unicodeMask = sharedData.mbcs.unicodeMask; + + /* set up the local pointers */ + sourceArrayIndex = source.position(); + + table = sharedData.mbcs.fromUnicodeTable; + + if((options&UConverterConstants.OPTION_SWAP_LFNL)!=0) { + bytes = sharedData.mbcs.swapLFNLFromUnicodeBytes; + } else { + bytes = sharedData.mbcs.fromUnicodeBytes; + } + + /* get the converter state from UConverter */ + c = fromUChar32; + + /* sourceIndex=-1 if the current character began in the previous buffer */ + sourceIndex= c==0 ? 0 : -1; + nextSourceIndex=0; + + /* conversion loop */ + boolean doloop = true; + if(c!=0 && target.hasRemaining()) { + SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex, nextSourceIndex); + doloop = getTrailDouble(source, target, unicodeMask, x, crArray); + c = x.c; + sourceArrayIndex = x.sourceArrayIndex; + sourceIndex = x.sourceIndex; + nextSourceIndex = x.nextSourceIndex; + } + + if(doloop) { + while(sourceArrayIndex0 */ + if(length==1) { + /* this is easy because we know that there is enough space */ + target.put((byte)value); + if(offsets!=null) { + offsets.put(sourceIndex); + } + } + else /* length==2 */ { + target.put((byte)(value>>>8)); + if(2<=target.remaining()) { + target.put((byte)value); + if(offsets!=null) { + offsets.put(sourceIndex); + offsets.put(sourceIndex); + } + } + else { + if(offsets!=null) { + offsets.put(sourceIndex); + } + errorBuffer[0]=(byte)value; + errorBufferLength=1; + + /* target overflow */ + cr = CoderResult.OVERFLOW; + c=0; + break; + } + } + + /* normal end of conversion: prepare for a new 
character */ + c=0; + sourceIndex=nextSourceIndex; + continue; + } + else { + /* target is full */ + cr = CoderResult.OVERFLOW; + break; + } + } + } + + /* set the converter state back into UConverter */ + fromUChar32=c; + + /* write back the updated pointers */ + source.position(sourceArrayIndex); + + return cr; + } + + protected final class SideEffectsSingleBMP { + int c, sourceArrayIndex; + public SideEffectsSingleBMP(int c_, int sourceArrayIndex_) + { + c = c_; + sourceArrayIndex = sourceArrayIndex_; + } + } + + // function made out of block labeled getTrail in ucnv_MBCSDoubleFromUnicodeWithOffsets + // assumes input c is lead surrogate + protected final boolean getTrailSingleBMP(CharBuffer source, SideEffectsSingleBMP x, CoderResult cr) + { + if(x.sourceArrayIndex=1) { + target.put(trail); + if(offsets!=null) { + offsets.put(-1); + offsets.put(-1); + } + } else /* targetCapacity==1 */ { + charErrorBufferArray[charErrorBufferBegin+0]=trail; + charErrorBufferLength=1; + throw new BufferOverflowException(); + } + count=0; + c=0; + break; + } else { + /* unmatched lead surrogate, handle here for consistent toUBytes[] */ + + /* back out reading the code unit after it */ + if((source.position()-sourceArrayIndex)>=2) { + sourceArrayIndex-=2; + } else { + /* + * if the trail unit's first byte was in a previous buffer, then + * we need to put it into a special place because toUBytes[] will be + * used for the lead unit's bytes + */ + toUnicodeStatus=0x100|pArray[pArrayIndex+2]; + --sourceArrayIndex; + } + toULength=2; + cr = CoderResult.malformedForLength(sourceArrayIndex);; + } + } + } while(length>0); + toULength=(byte)count; + } + + /* copy an even number of bytes for complete UChars */ + count=2*target.remaining(); + if(count>length) { + count=length&~1; + } + if(c==0 && count>0) { + length-=count; + count>>=1; + //targetCapacity-=count; + if(offsets==null) { + do { + 
c=(char)(((source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK)); + sourceArrayIndex+=2; + if(!UTF16.isSurrogate(c)) { + target.put(c); + } else if(UTF16.isLeadSurrogate(c) && count>=2 && + UTF16.isTrailSurrogate(trail=(char)(((source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK))) + ) { + sourceArrayIndex+=2; + --count; + target.put(c); + target.put(trail); + } else { + break; + } + } while(--count>0); + } else { + do { + c=(char)(((source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK)); + sourceArrayIndex+=2; + if(!UTF16.isSurrogate(c)) { + target.put(c); + offsets.put(sourceIndex); + sourceIndex+=2; + } else if(UTF16.isLeadSurrogate(c) && count>=2 && + UTF16.isTrailSurrogate(trail=(char)(((source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK))) + ) { + sourceArrayIndex+=2; + --count; + target.put(c); + target.put(trail); + offsets.put(sourceIndex); + offsets.put(sourceIndex); + sourceIndex+=4; + } else { + break; + } + } while(--count>0); + } + + if(count==0) { + /* done with the loop for complete UChars */ + c=0; + } else { + /* keep c for surrogate handling, trail will be set there */ + length+=2*(count-1); /* one more byte pair was consumed than count decremented */ + } + } + + if(c!=0) { + /* + * c is a surrogate, and + * - source or target too short + * - or the surrogate is unmatched + */ + toUBytesArray[toUBytesBegin+0]=(byte)(c>>>8); + toUBytesArray[toUBytesBegin+1]=(byte)c; + toULength=2; + + if(UTF16.isLeadSurrogate(c)) { + if(length>=2) { + 
if(UTF16.isTrailSurrogate(trail=(char)(((source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK)))) { + /* output the surrogate pair, will overflow (see conditions comment above) */ + sourceArrayIndex+=2; + length-=2; + target.put(c); + if(offsets!=null) { + offsets.put(sourceIndex); + } + charErrorBufferArray[charErrorBufferBegin+0]=trail; + charErrorBufferLength=1; + toULength=0; + cr = CoderResult.OVERFLOW; + } else { + /* unmatched lead surrogate */ + cr = CoderResult.malformedForLength(sourceArrayIndex); + } + } else { + /* see if the trail surrogate is in the next buffer */ + } + } else { + /* unmatched trail surrogate */ + cr = CoderResult.malformedForLength(sourceArrayIndex); + } + } + + + /* check for a remaining source byte */ + if(length>0) { + if(!target.hasRemaining()) { + cr = CoderResult.OVERFLOW; + } else { + /* it must be length==1 because otherwise the above would have copied more */ + toUBytesArray[toULength++]=source.get(sourceArrayIndex++); + } + } + + source.position(sourceArrayIndex); + }catch(BufferOverflowException ex){ + cr = CoderResult.OVERFLOW; + } + return cr; + } + + } + class CharsetEncoderUTF16 extends CharsetEncoderICU{ + + public CharsetEncoderUTF16(CharsetICU cs) { + super(cs, fromUSubstitution); + implReset(); + } + + private final static int NEED_TO_WRITE_BOM = 1; + + protected void implReset() { + super.implReset(); + fromUnicodeStatus = NEED_TO_WRITE_BOM; + } + + protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets){ + CoderResult cr = CoderResult.UNDERFLOW; + if(!source.hasRemaining()) { + /* no input, nothing to do */ + return cr; + } + char c; + /* write the BOM if necessary */ + if(fromUnicodeStatus==NEED_TO_WRITE_BOM) { + byte bom[]={ (byte)0xfe, (byte)0xff }; + cr = fromUWriteBytes(this,bom, 0, bom.length, target, offsets, -1); + if(cr.isError()){ + return cr; + } + fromUnicodeStatus=0; + } + + 
if(!target.hasRemaining()) { + return CoderResult.OVERFLOW; + } + + int sourceIndex = 0; + char trail = 0; + int length = source.remaining(); + + try{ + /* c!=0 indicates in several places outside the main loops that a surrogate was found */ + + if((c=(char)fromUChar32)!=0 && UTF16.isTrailSurrogate(trail=source.get(sourceIndex)) && target.remaining()>=4) { + /* the last buffer ended with a lead surrogate, output the surrogate pair */ + ++sourceIndex; + --length; + target.put((byte)(c>>>8)); + target.put((byte)c); + target.put((byte)(trail>>>8)); + target.put((byte)trail); + if(offsets!=null && offsets.remaining()>=4) { + offsets.put(-1); + offsets.put(-1); + offsets.put(-1); + offsets.put(-1); + } + sourceIndex=1; + fromUChar32=c=0; + } + byte overflow[/*4*/] = new byte[4]; + int sourceArrayIndex = source.position(); + + if(c==0) { + /* copy an even number of bytes for complete UChars */ + int count=2*length; + int targetCapacity = target.limit(); + if(count>targetCapacity) { + count=targetCapacity&~1; + } + /* count is even */ + targetCapacity-=count; + count>>=1; + length-=count; + + if(offsets==null) { + while(count>0) { + c= source.get(sourceArrayIndex++); + if(!UTF16.isSurrogate(c)) { + target.put((byte)(c>>>8)); + target.put((byte)c); + + } else if(UTF16.isLeadSurrogate(c) && count>=2 && UTF16.isTrailSurrogate(trail=source.get(sourceArrayIndex))) { + ++sourceArrayIndex; + --count; + target.put((byte)(c>>>8)); + target.put((byte)c); + target.put((byte)(trail>>>8)); + target.put((byte)trail); + } else { + break; + } + --count; + } + } else { + while(count>0) { + c=source.get(sourceArrayIndex++); + if(!UTF16.isSurrogate(c)) { + target.put((byte)(c>>>8)); + target.put((byte)c); + offsets.put(sourceIndex); + offsets.put(sourceIndex++); + } else if(UTF16.isLeadSurrogate(c) && count>=2 && UTF16.isTrailSurrogate(trail=source.get(sourceArrayIndex))) { + ++sourceArrayIndex; + --count; + target.put((byte)(c>>>8)); + target.put((byte)c); + target.put((byte)(trail>>>8)); 
+ target.put((byte)trail); + offsets.put(sourceIndex); + offsets.put(sourceIndex); + offsets.put(sourceIndex); + offsets.put(sourceIndex); + sourceIndex+=2; + } else { + break; + } + --count; + } + } + + if(count==0) { + /* done with the loop for complete UChars */ + if(length>0 && targetCapacity>0) { + /* + * there is more input and some target capacity - + * it must be targetCapacity==1 because otherwise + * the above would have copied more; + * prepare for overflow output + */ + if(!UTF16.isSurrogate(c=source.get(sourceArrayIndex++))) { + overflow[0]=(byte)(c>>>8); + overflow[1]=(byte)c; + length=2; /* 2 bytes to output */ + c=0; + /* } else { keep c for surrogate handling, length will be set there */ + } + } else { + length=0; + c=0; + } + } else { + /* keep c for surrogate handling, length will be set there */ + targetCapacity+=2*count; + } + } else { + length=0; /* from here on, length counts the bytes in overflow[] */ + } + + if(c!=0) { + /* + * c is a surrogate, and + * - source or target too short + * - or the surrogate is unmatched + */ + length=0; + if(UTF16.isLeadSurrogate(c)) { + if(sourceArrayIndex>>8); + overflow[1]=(byte)c; + overflow[2]=(byte)(trail>>>8); + overflow[3]=(byte)trail; + length=4; /* 4 bytes to output */ + c=0; + } else { + /* unmatched lead surrogate */ + //pErrorCode[0]=ErrorCode.U_ILLEGAL_CHAR_FOUND; + cr = CoderResult.malformedForLength(sourceArrayIndex); + } + } else { + /* see if the trail surrogate is in the next buffer */ + } + } else { + /* unmatched trail surrogate */ + //pErrorCode[0]=ErrorCode.U_ILLEGAL_CHAR_FOUND; + } + fromUChar32=c; + } + source.position(sourceArrayIndex); + if(length>0) { + /* output length bytes with overflow (length>targetCapacity>0) */ + fromUWriteBytes(this, overflow, 0, length, target, offsets, sourceIndex); + } + }catch(BufferOverflowException ex){ + cr = CoderResult.OVERFLOW; + } + return cr; + } + } + public CharsetDecoder newDecoder() { + return new CharsetDecoderUTF16(this); + } + + public 
CharsetEncoder newEncoder() { + return new CharsetEncoderUTF16(this); + } +} diff --git a/icu4j/src/com/ibm/icu/impl/CharsetUTF16LE.java b/icu4j/src/com/ibm/icu/impl/CharsetUTF16LE.java new file mode 100644 index 00000000000..f372592e936 --- /dev/null +++ b/icu4j/src/com/ibm/icu/impl/CharsetUTF16LE.java @@ -0,0 +1,449 @@ +/** +******************************************************************************* +* Copyright (C) 2006, International Business Machines Corporation and * +* others. All Rights Reserved. * +******************************************************************************* +* +******************************************************************************* +*/ +package com.ibm.icu.impl; + +import java.nio.BufferOverflowException; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.IntBuffer; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CharsetEncoder; +import java.nio.charset.CoderResult; + +import com.ibm.icu.charset.CharsetDecoderICU; +import com.ibm.icu.charset.CharsetEncoderICU; +import com.ibm.icu.charset.CharsetICU; +import com.ibm.icu.text.UTF16; + +/** + * @author Niti Hantaweepant + */ +public class CharsetUTF16LE extends CharsetICU { + protected byte[] fromUSubstitution = new byte[]{(byte)0xfd, (byte)0xff}; + public CharsetUTF16LE(String icuCanonicalName, String javaCanonicalName, String[] aliases){ + super(icuCanonicalName, javaCanonicalName, aliases); + maxBytesPerChar = 4; + minBytesPerChar = 2; + maxCharsPerByte = 1; + } + class CharsetDecoderUTF16LE extends CharsetDecoderICU{ + + public CharsetDecoderUTF16LE(CharsetICU cs) { + super(cs); + } + + protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets){ + CoderResult cr = CoderResult.UNDERFLOW; + if(!source.hasRemaining() && toUnicodeStatus==0) { + /* no input, nothing to do */ + return cr; + } + if(!target.hasRemaining()) { + return CoderResult.OVERFLOW; + } + + int sourceIndex=0, count=0, length, 
sourceArrayIndex; + char c=0, trail; + length = source.remaining(); + sourceArrayIndex = source.position(); + try{ + /* complete a partial UChar or pair from the last call */ + if(toUnicodeStatus!=0) { + /* + * special case: single byte from a previous buffer, + * where the byte turned out not to belong to a trail surrogate + * and the preceding, unmatched lead surrogate was put into toUBytes[] + * for error handling + */ + toUBytesArray[toUBytesBegin+0]=(byte)toUnicodeStatus; + toULength=1; + toUnicodeStatus=0; + } + if((count=toULength)!=0) { + byte[] pArray=toUBytesArray; + int pArrayIndex = toUBytesBegin; + do { + pArray[count++]=source.get(sourceArrayIndex++); + ++sourceIndex; + --length; + if(count==2) { + c=(char)(((pArray[pArrayIndex+1]&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(pArray[pArrayIndex+0]&UConverterConstants.UNSIGNED_BYTE_MASK)); + if(!UTF16.isSurrogate(c)) { + /* output the BMP code point */ + target.put(c); + if(offsets!=null) { + offsets.put(-1); + } + count=0; + c=0; + break; + } else if(UTF16.isLeadSurrogate(c)) { + /* continue collecting bytes for the trail surrogate */ + c=0; /* avoid unnecessary surrogate handling below */ + } else { + /* fall through to error handling for an unmatched trail surrogate */ + break; + } + } else if(count==4) { + c=(char)(((pArray[pArrayIndex+1]&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(pArray[pArrayIndex+0]&UConverterConstants.UNSIGNED_BYTE_MASK)); + trail=(char)(((pArray[pArrayIndex+3]&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(pArray[pArrayIndex+2]&UConverterConstants.UNSIGNED_BYTE_MASK)); + if(UTF16.isTrailSurrogate(trail)) { + /* output the surrogate pair */ + target.put(c); + if(target.remaining()>=1) { + target.put(trail); + if(offsets!=null) { + offsets.put(-1); + offsets.put(-1); + } + } else /* targetCapacity==1 */ { + charErrorBufferArray[charErrorBufferBegin+0]=trail; + charErrorBufferLength=1; + throw new BufferOverflowException(); + } + count=0; + c=0; + break; + } else { + /* unmatched 
lead surrogate, handle here for consistent toUBytes[] */ + + /* back out reading the code unit after it */ + if((source.position()-sourceArrayIndex)>=2) { + sourceArrayIndex-=2; + } else { + /* + * if the trail unit's first byte was in a previous buffer, then + * we need to put it into a special place because toUBytes[] will be + * used for the lead unit's bytes + */ + toUnicodeStatus=0x100|pArray[pArrayIndex+2]; + --sourceArrayIndex; + } + toULength=2; + cr = CoderResult.malformedForLength(sourceArrayIndex);; + } + } + } while(length>0); + toULength=(byte)count; + } + + /* copy an even number of bytes for complete UChars */ + count=2*target.remaining(); + if(count>length) { + count=length&~1; + } + if(c==0 && count>0) { + length-=count; + count>>=1; + //targetCapacity-=count; + if(offsets==null) { + do { + c=(char)(((source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK)); + sourceArrayIndex+=2; + if(!UTF16.isSurrogate(c)) { + target.put(c); + } else if(UTF16.isLeadSurrogate(c) && count>=2 && + UTF16.isTrailSurrogate(trail=(char)(((source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK))) + ) { + sourceArrayIndex+=2; + --count; + target.put(c); + target.put(trail); + } else { + break; + } + } while(--count>0); + } else { + do { + c=(char)(((source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK)); + sourceArrayIndex+=2; + if(!UTF16.isSurrogate(c)) { + target.put(c); + offsets.put(sourceIndex); + sourceIndex+=2; + } else if(UTF16.isLeadSurrogate(c) && count>=2 && + UTF16.isTrailSurrogate(trail=(char)(((source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK))) + ) { + sourceArrayIndex+=2; + --count; + 
target.put(c); + target.put(trail); + offsets.put(sourceIndex); + offsets.put(sourceIndex); + sourceIndex+=4; + } else { + break; + } + } while(--count>0); + } + + if(count==0) { + /* done with the loop for complete UChars */ + c=0; + } else { + /* keep c for surrogate handling, trail will be set there */ + length+=2*(count-1); /* one more byte pair was consumed than count decremented */ + } + } + + if(c!=0) { + /* + * c is a surrogate, and + * - source or target too short + * - or the surrogate is unmatched + */ + toUBytesArray[toUBytesBegin+0]=(byte)c; + toUBytesArray[toUBytesBegin+1]=(byte)(c>>>8); + toULength=2; + + if(UTF16.isLeadSurrogate(c)) { + if(length>=2) { + if(UTF16.isTrailSurrogate(trail=(char)(((source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK)))) { + /* output the surrogate pair, will overflow (see conditions comment above) */ + sourceArrayIndex+=2; + length-=2; + target.put(c); + if(offsets!=null) { + offsets.put(sourceIndex); + } + charErrorBufferArray[charErrorBufferBegin+0]=trail; + charErrorBufferLength=1; + toULength=0; + cr = CoderResult.OVERFLOW; + } else { + /* unmatched lead surrogate */ + cr = CoderResult.malformedForLength(sourceArrayIndex); + } + } else { + /* see if the trail surrogate is in the next buffer */ + } + } else { + /* unmatched trail surrogate */ + cr = CoderResult.malformedForLength(sourceArrayIndex); + } + } + + + /* check for a remaining source byte */ + if(length>0) { + if(!target.hasRemaining()) { + cr = CoderResult.OVERFLOW; + } else { + /* it must be length==1 because otherwise the above would have copied more */ + toUBytesArray[toULength++]=source.get(sourceArrayIndex++); + } + } + + source.position(sourceArrayIndex); + }catch(BufferOverflowException ex){ + cr = CoderResult.OVERFLOW; + } + return cr; + } + + } + class CharsetEncoderUTF16LE extends CharsetEncoderICU{ + + public CharsetEncoderUTF16LE(CharsetICU cs) { + 
super(cs, fromUSubstitution); + implReset(); + } + + private final static int NEED_TO_WRITE_BOM = 1; + + protected void implReset() { + super.implReset(); + fromUnicodeStatus = NEED_TO_WRITE_BOM; + } + + protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets){ + CoderResult cr = CoderResult.UNDERFLOW; + if(!source.hasRemaining()) { + /* no input, nothing to do */ + return cr; + } + char c; + /* write the BOM if necessary */ + if(fromUnicodeStatus==NEED_TO_WRITE_BOM) { + byte bom[]={ (byte)0xff, (byte)0xfe }; + cr = fromUWriteBytes(this,bom, 0, bom.length, target, offsets, -1); + if(cr.isError()){ + return cr; + } + fromUnicodeStatus=0; + } + + if(!target.hasRemaining()) { + return CoderResult.OVERFLOW; + } + + int sourceIndex = 0; + char trail = 0; + int length = source.remaining(); + + try{ + /* c!=0 indicates in several places outside the main loops that a surrogate was found */ + + if((c=(char)fromUChar32)!=0 && UTF16.isTrailSurrogate(trail=source.get(sourceIndex)) && target.remaining()>=4) { + /* the last buffer ended with a lead surrogate, output the surrogate pair */ + ++sourceIndex; + --length; + target.put((byte)c); + target.put((byte)(c>>>8)); + target.put((byte)trail); + target.put((byte)(trail>>>8)); + if(offsets!=null && offsets.remaining()>=4) { + offsets.put(-1); + offsets.put(-1); + offsets.put(-1); + offsets.put(-1); + } + sourceIndex=1; + fromUChar32=c=0; + } + byte overflow[/*4*/] = new byte[4]; + int sourceArrayIndex = source.position(); + + if(c==0) { + /* copy an even number of bytes for complete UChars */ + int count=2*length; + int targetCapacity = target.limit(); + if(count>targetCapacity) { + count=targetCapacity&~1; + } + /* count is even */ + targetCapacity-=count; + count>>=1; + length-=count; + + if(offsets==null) { + while(count>0) { + c= source.get(sourceArrayIndex++); + if(!UTF16.isSurrogate(c)) { + target.put((byte)c); + target.put((byte)(c>>>8)); + + } else if(UTF16.isLeadSurrogate(c) && count>=2 
&& UTF16.isTrailSurrogate(trail=source.get(sourceArrayIndex))) { + ++sourceArrayIndex; + --count; + target.put((byte)c); + target.put((byte)(c>>>8)); + target.put((byte)trail); + target.put((byte)(trail>>>8)); + } else { + break; + } + --count; + } + } else { + while(count>0) { + c=source.get(sourceArrayIndex++); + if(!UTF16.isSurrogate(c)) { + target.put((byte)c); + target.put((byte)(c>>>8)); + offsets.put(sourceIndex); + offsets.put(sourceIndex++); + } else if(UTF16.isLeadSurrogate(c) && count>=2 && UTF16.isTrailSurrogate(trail=source.get(sourceArrayIndex))) { + ++sourceArrayIndex; + --count; + target.put((byte)c); + target.put((byte)(c>>>8)); + target.put((byte)trail); + target.put((byte)(trail>>>8)); + offsets.put(sourceIndex); + offsets.put(sourceIndex); + offsets.put(sourceIndex); + offsets.put(sourceIndex); + sourceIndex+=2; + } else { + break; + } + --count; + } + } + + if(count==0) { + /* done with the loop for complete UChars */ + if(length>0 && targetCapacity>0) { + /* + * there is more input and some target capacity - + * it must be targetCapacity==1 because otherwise + * the above would have copied more; + * prepare for overflow output + */ + if(!UTF16.isSurrogate(c=source.get(sourceArrayIndex++))) { + overflow[0]=(byte)c; + overflow[1]=(byte)(c>>>8); + length=2; /* 2 bytes to output */ + c=0; + /* } else { keep c for surrogate handling, length will be set there */ + } + } else { + length=0; + c=0; + } + } else { + /* keep c for surrogate handling, length will be set there */ + targetCapacity+=2*count; + } + } else { + length=0; /* from here on, length counts the bytes in overflow[] */ + } + + if(c!=0) { + /* + * c is a surrogate, and + * - source or target too short + * - or the surrogate is unmatched + */ + length=0; + if(UTF16.isLeadSurrogate(c)) { + if(sourceArrayIndex>>8); + overflow[2]=(byte)trail; + overflow[3]=(byte)(trail>>>8); + length=4; /* 4 bytes to output */ + c=0; + } else { + /* unmatched lead surrogate */ + 
//pErrorCode[0]=ErrorCode.U_ILLEGAL_CHAR_FOUND; + cr = CoderResult.malformedForLength(sourceArrayIndex); + } + } else { + /* see if the trail surrogate is in the next buffer */ + } + } else { + /* unmatched trail surrogate */ + //pErrorCode[0]=ErrorCode.U_ILLEGAL_CHAR_FOUND; + } + fromUChar32=c; + } + source.position(sourceArrayIndex); + if(length>0) { + /* output length bytes with overflow (length>targetCapacity>0) */ + fromUWriteBytes(this, overflow, 0, length, target, offsets, sourceIndex); + } + }catch(BufferOverflowException ex){ + cr = CoderResult.OVERFLOW; + } + return cr; + } + } + public CharsetDecoder newDecoder() { + return new CharsetDecoderUTF16LE(this); + } + + public CharsetEncoder newEncoder() { + return new CharsetEncoderUTF16LE(this); + } +} diff --git a/icu4j/src/com/ibm/icu/impl/CharsetUTF32.java b/icu4j/src/com/ibm/icu/impl/CharsetUTF32.java new file mode 100644 index 00000000000..48727d999f9 --- /dev/null +++ b/icu4j/src/com/ibm/icu/impl/CharsetUTF32.java @@ -0,0 +1,318 @@ +/** +******************************************************************************* +* Copyright (C) 2006, International Business Machines Corporation and * +* others. All Rights Reserved. 
* +******************************************************************************* +* +******************************************************************************* +*/ +package com.ibm.icu.impl; + +import java.nio.BufferOverflowException; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.IntBuffer; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CharsetEncoder; +import java.nio.charset.CoderResult; + +import com.ibm.icu.charset.CharsetDecoderICU; +import com.ibm.icu.charset.CharsetEncoderICU; +import com.ibm.icu.charset.CharsetICU; +import com.ibm.icu.text.UTF16; +/** + * @author Niti Hantaweepant + */ +public class CharsetUTF32 extends CharsetICU { + protected byte[] fromUSubstitution = new byte[]{(byte)0, (byte)0, (byte)0xff, (byte)0xfd}; + public CharsetUTF32(String icuCanonicalName, String javaCanonicalName, String[] aliases){ + super(icuCanonicalName, javaCanonicalName, aliases); + maxBytesPerChar = 4; + minBytesPerChar = 4; + maxCharsPerByte = 1; + } + class CharsetDecoderUTF32 extends CharsetDecoderICU{ + + public CharsetDecoderUTF32(CharsetICU cs) { + super(cs); + } + + protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets){ + CoderResult cr = CoderResult.UNDERFLOW; + + int sourceArrayIndex = source.position(); + int ch, i; + + try{ + donefornow: + { + /* UTF-8 returns here for only non-offset, this needs to change.*/ + if (toUnicodeStatus != 0 && target.hasRemaining()) { + i = toULength; /* restore # of bytes consumed */ + + ch = (int)(toUnicodeStatus - 1);/*Stores the previously calculated ch from a previous call*/ + toUnicodeStatus = 0; + + while (i < 4) { + if (sourceArrayIndex < source.limit()) { + ch = (ch << 8) | ((byte)(source.get(sourceArrayIndex)) & UConverterConstants.UNSIGNED_BYTE_MASK); + toUBytesArray[i++] = (byte) source.get(sourceArrayIndex++); + } + else { + /* stores a partially calculated target*/ + /* + 1 to make 0 a valid character */ + toUnicodeStatus = ch 
+ 1; + toULength = (byte) i; + break donefornow; + } + } + + if (ch <= UConverterSharedData.MAXIMUM_UTF && !UTF16.isSurrogate((char)ch)) { + /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ + if (ch <= UConverterSharedData.MAXIMUM_UCS2) + { + /* fits in 16 bits */ + target.put((char)ch); + } + else { + /* write out the surrogates */ + target.put(UTF16.getLeadSurrogate(ch)); + ch = UTF16.getTrailSurrogate(ch); + if (target.hasRemaining()) { + target.put((char)ch); + } + else { + /* Put in overflow buffer (not handled here) */ + charErrorBufferArray[0] = (char) ch; + charErrorBufferLength = 1; + throw new BufferOverflowException(); + } + } + } + else { + toULength = (byte)i; + cr = CoderResult.malformedForLength(sourceArrayIndex); + break donefornow; + } + } + + while (sourceArrayIndex < source.limit() && target.hasRemaining()) { + i = 0; + ch = 0; + + while (i < 4) { + if (sourceArrayIndex < source.limit()) { + ch = (ch << 8) | ((byte)(source.get(sourceArrayIndex)) & UConverterConstants.UNSIGNED_BYTE_MASK); + toUBytesArray[i++] = (byte) source.get(sourceArrayIndex++); + } + else { + /* stores a partially calculated target*/ + /* + 1 to make 0 a valid character */ + toUnicodeStatus = ch + 1; + toULength = (byte) i; + break donefornow; + } + } + + if (ch <= UConverterSharedData.MAXIMUM_UTF && !UTF16.isSurrogate((char)ch)) { + /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ + if (ch <= UConverterSharedData.MAXIMUM_UCS2) + { + /* fits in 16 bits */ + target.put((char) ch); + } + else { + /* write out the surrogates */ + target.put(UTF16.getLeadSurrogate(ch)); + ch = UTF16.getTrailSurrogate(ch); + if (target.hasRemaining()) { + target.put((char)ch); + } + else { + /* Put in overflow buffer (not handled here) */ + charErrorBufferArray[0] = (char) ch; + charErrorBufferLength = 1; + throw new BufferOverflowException(); + } + } + } + else { + toULength = (byte)i; + cr = 
CoderResult.malformedForLength(sourceArrayIndex); + break; + } + } + } + + if (sourceArrayIndex < source.limit() && !target.hasRemaining()) { + /* End of target buffer */ + cr = CoderResult.OVERFLOW; + } + + source.position(sourceArrayIndex); + }catch(BufferOverflowException ex){ + cr = CoderResult.OVERFLOW; + } + return cr; + } + } + + class CharsetEncoderUTF32 extends CharsetEncoderICU{ + + public CharsetEncoderUTF32(CharsetICU cs) { + super(cs, fromUSubstitution); + implReset(); + } + + private final static int NEED_TO_WRITE_BOM = 1; + + protected void implReset() { + super.implReset(); + fromUnicodeStatus = NEED_TO_WRITE_BOM; + } + + protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets){ + CoderResult cr = CoderResult.UNDERFLOW; + if(!source.hasRemaining()) { + /* no input, nothing to do */ + return cr; + } + + /* write the BOM if necessary */ + if(fromUnicodeStatus==NEED_TO_WRITE_BOM) { + byte[] bom={ 0, 0, (byte)0xfe, (byte)0xff }; + cr = fromUWriteBytes(this, bom, 0, bom.length, target, offsets, -1); + if(cr.isError()){ + return cr; + } + fromUnicodeStatus=0; + } + + int ch, ch2; + int indexToWrite; + byte temp[] = new byte[4]; + temp[0] = 0; + int sourceArrayIndex = source.position(); + + try{ + boolean doloop = true; + if (fromUChar32 != 0) { + ch = fromUChar32; + fromUChar32 = 0; + //lowsurogate: + if (sourceArrayIndex < source.limit()) { + ch2 = source.get(sourceArrayIndex); + if (UTF16.isTrailSurrogate((char)ch2)) { + ch = ((ch - UConverterSharedData.SURROGATE_HIGH_START) << UConverterSharedData.HALF_SHIFT) + ch2 + UConverterSharedData.SURROGATE_LOW_BASE; + sourceArrayIndex++; + } + else { + /* this is an unmatched trail code unit (2nd surrogate) */ + /* callback(illegal) */ + fromUChar32 = ch; + cr = CoderResult.malformedForLength(sourceArrayIndex); + doloop = false; + } + } + else { + /* ran out of source */ + fromUChar32 = ch; + if (flush) { + /* this is an unmatched trail code unit (2nd surrogate) */ + /* 
callback(illegal) */ + cr = CoderResult.malformedForLength(sourceArrayIndex); + } + doloop = false; + } + + /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */ + temp[1] = (byte) (ch >>> 16 & 0x1F); + temp[2] = (byte) (ch >>> 8); /* unsigned cast implicitly does (ch & FF) */ + temp[3] = (byte) (ch); /* unsigned cast implicitly does (ch & FF) */ + + for (indexToWrite = 0; indexToWrite <= 3; indexToWrite++) { + if (target.hasRemaining()) { + target.put(temp[indexToWrite]); + } + else { + errorBuffer[errorBufferLength++] = temp[indexToWrite]; + cr = CoderResult.OVERFLOW; + } + } + } + + if(doloop) { + while (sourceArrayIndex < source.limit() && target.hasRemaining()) { + ch = source.get(sourceArrayIndex++); + + if (UTF16.isSurrogate((char)ch)) { + if (UTF16.isLeadSurrogate((char)ch)) { + //lowsurogate: + if (sourceArrayIndex < source.limit()) { + ch2 = source.get(sourceArrayIndex); + if (UTF16.isTrailSurrogate((char)ch2)) { + ch = ((ch - UConverterSharedData.SURROGATE_HIGH_START) << UConverterSharedData.HALF_SHIFT) + ch2 + UConverterSharedData.SURROGATE_LOW_BASE; + sourceArrayIndex++; + } + else { + /* this is an unmatched trail code unit (2nd surrogate) */ + /* callback(illegal) */ + fromUChar32 = ch; + cr = CoderResult.OVERFLOW; + break; + } + } + else { + /* ran out of source */ + fromUChar32 = ch; + if (flush) { + /* this is an unmatched trail code unit (2nd surrogate) */ + /* callback(illegal) */ + cr = CoderResult.malformedForLength(sourceArrayIndex); + } + break; + } + } + else { + fromUChar32 = ch; + cr = CoderResult.malformedForLength(sourceArrayIndex); + break; + } + } + + /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */ + temp[1] = (byte) (ch >>> 16 & 0x1F); + temp[2] = (byte) (ch >>> 8); /* unsigned cast implicitly does (ch & FF) */ + temp[3] = (byte) (ch); /* unsigned cast implicitly does (ch & FF) */ + + for (indexToWrite = 0; indexToWrite <= 3; indexToWrite++) { + if (target.hasRemaining()) { + 
target.put(temp[indexToWrite]); + } + else { + errorBuffer[errorBufferLength++] = temp[indexToWrite]; + cr = CoderResult.OVERFLOW; + } + } + } + } + + if (sourceArrayIndex < source.limit() && !target.hasRemaining()) { + cr = CoderResult.OVERFLOW; + } + source.position(sourceArrayIndex); + + }catch(BufferOverflowException ex){ + cr = CoderResult.OVERFLOW; + } + return cr; + } + } + public CharsetDecoder newDecoder() { + return new CharsetDecoderUTF32(this); + } + + public CharsetEncoder newEncoder() { + return new CharsetEncoderUTF32(this); + } +} diff --git a/icu4j/src/com/ibm/icu/impl/CharsetUTF32LE.java b/icu4j/src/com/ibm/icu/impl/CharsetUTF32LE.java new file mode 100644 index 00000000000..eac8634b6d9 --- /dev/null +++ b/icu4j/src/com/ibm/icu/impl/CharsetUTF32LE.java @@ -0,0 +1,318 @@ +/** +******************************************************************************* +* Copyright (C) 2006, International Business Machines Corporation and * +* others. All Rights Reserved. * +******************************************************************************* +* +******************************************************************************* +*/ +package com.ibm.icu.impl; + +import java.nio.BufferOverflowException; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.IntBuffer; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CharsetEncoder; +import java.nio.charset.CoderResult; + +import com.ibm.icu.charset.CharsetDecoderICU; +import com.ibm.icu.charset.CharsetEncoderICU; +import com.ibm.icu.charset.CharsetICU; +import com.ibm.icu.text.UTF16; +/** + * @author Niti Hantaweepant + */ +public class CharsetUTF32LE extends CharsetICU { + protected byte[] fromUSubstitution = new byte[]{(byte)0xfd, (byte)0xff, (byte)0, (byte)0}; + public CharsetUTF32LE(String icuCanonicalName, String javaCanonicalName, String[] aliases){ + super(icuCanonicalName, javaCanonicalName, aliases); + maxBytesPerChar = 4; + minBytesPerChar = 4; + 
maxCharsPerByte = 1; + } + class CharsetDecoderUTF32LE extends CharsetDecoderICU{ + + public CharsetDecoderUTF32LE(CharsetICU cs) { + super(cs); + } + + protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets){ + CoderResult cr = CoderResult.UNDERFLOW; + + int sourceArrayIndex = source.position(); + int ch, i; + + try{ + donefornow: + { + /* UTF-8 returns here for only non-offset, this needs to change.*/ + if (toUnicodeStatus != 0 && target.hasRemaining()) { + i = toULength; /* restore # of bytes consumed */ + + ch = (int)(toUnicodeStatus - 1);/*Stores the previously calculated ch from a previous call*/ + toUnicodeStatus = 0; + + while (i < 4) { + if (sourceArrayIndex < source.limit()) { + ch |= (source.get(sourceArrayIndex) & UConverterConstants.UNSIGNED_BYTE_MASK) << (i * 8); + toUBytesArray[i++] = (byte) source.get(sourceArrayIndex++); + } + else { + /* stores a partially calculated target*/ + /* + 1 to make 0 a valid character */ + toUnicodeStatus = ch + 1; + toULength = (byte) i; + break donefornow; + } + } + + if (ch <= UConverterSharedData.MAXIMUM_UTF && !UTF16.isSurrogate((char)ch)) { + /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ + if (ch <= UConverterSharedData.MAXIMUM_UCS2) + { + /* fits in 16 bits */ + target.put((char)ch); + } + else { + /* write out the surrogates */ + target.put(UTF16.getLeadSurrogate(ch)); + ch = UTF16.getTrailSurrogate(ch); + if (target.hasRemaining()) { + target.put((char)ch); + } + else { + /* Put in overflow buffer (not handled here) */ + charErrorBufferArray[0] = (char) ch; + charErrorBufferLength = 1; + throw new BufferOverflowException(); + } + } + } + else { + toULength = (byte)i; + cr = CoderResult.malformedForLength(sourceArrayIndex); + break donefornow; + } + } + + while (sourceArrayIndex < source.limit() && target.hasRemaining()) { + i = 0; + ch = 0; + + while (i < 4) { + if (sourceArrayIndex < source.limit()) { + ch |= (source.get(sourceArrayIndex) & 
UConverterConstants.UNSIGNED_BYTE_MASK) << (i * 8); + toUBytesArray[i++] = (byte) source.get(sourceArrayIndex++); + } + else { + /* stores a partially calculated target*/ + /* + 1 to make 0 a valid character */ + toUnicodeStatus = ch + 1; + toULength = (byte) i; + break donefornow; + } + } + + if (ch <= UConverterSharedData.MAXIMUM_UTF && !UTF16.isSurrogate((char)ch)) { + /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ + if (ch <= UConverterSharedData.MAXIMUM_UCS2) + { + /* fits in 16 bits */ + target.put((char) ch); + } + else { + /* write out the surrogates */ + target.put(UTF16.getLeadSurrogate(ch)); + ch = UTF16.getTrailSurrogate(ch); + if (target.hasRemaining()) { + target.put((char)ch); + } + else { + /* Put in overflow buffer (not handled here) */ + charErrorBufferArray[0] = (char) ch; + charErrorBufferLength = 1; + throw new BufferOverflowException(); + } + } + } + else { + toULength = (byte)i; + cr = CoderResult.malformedForLength(sourceArrayIndex); + break; + } + } + } + + if (sourceArrayIndex < source.limit() && !target.hasRemaining()) { + /* End of target buffer */ + cr = CoderResult.OVERFLOW; + } + + source.position(sourceArrayIndex); + }catch(BufferOverflowException ex){ + cr = CoderResult.OVERFLOW; + } + return cr; + } + } + + class CharsetEncoderUTF32LE extends CharsetEncoderICU{ + + public CharsetEncoderUTF32LE(CharsetICU cs) { + super(cs, fromUSubstitution); + implReset(); + } + + private final static int NEED_TO_WRITE_BOM = 1; + + protected void implReset() { + super.implReset(); + fromUnicodeStatus = NEED_TO_WRITE_BOM; + } + + protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets){ + CoderResult cr = CoderResult.UNDERFLOW; + if(!source.hasRemaining()) { + /* no input, nothing to do */ + return cr; + } + + /* write the BOM if necessary */ + if(fromUnicodeStatus==NEED_TO_WRITE_BOM) { + byte[] bom={ (byte)0xff, (byte)0xfe, 0, 0 }; + cr = fromUWriteBytes(this, bom, 0, bom.length, 
target, offsets, -1); + if(cr.isError()){ + return cr; + } + fromUnicodeStatus=0; + } + + int ch, ch2; + int indexToWrite; + byte temp[] = new byte[4]; + temp[3] = 0; + int sourceArrayIndex = source.position(); + + try{ + boolean doloop = true; + if (fromUChar32 != 0) { + ch = fromUChar32; + fromUChar32 = 0; + //lowsurogate: + if (sourceArrayIndex < source.limit()) { + ch2 = source.get(sourceArrayIndex); + if (UTF16.isTrailSurrogate((char)ch2)) { + ch = ((ch - UConverterSharedData.SURROGATE_HIGH_START) << UConverterSharedData.HALF_SHIFT) + ch2 + UConverterSharedData.SURROGATE_LOW_BASE; + sourceArrayIndex++; + } + else { + /* this is an unmatched trail code unit (2nd surrogate) */ + /* callback(illegal) */ + fromUChar32 = ch; + cr = CoderResult.malformedForLength(sourceArrayIndex); + doloop = false; + } + } + else { + /* ran out of source */ + fromUChar32 = ch; + if (flush) { + /* this is an unmatched trail code unit (2nd surrogate) */ + /* callback(illegal) */ + cr = CoderResult.malformedForLength(sourceArrayIndex); + } + doloop = false; + } + + /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */ + temp[2] = (byte) (ch >>> 16 & 0x1F); + temp[1] = (byte) (ch >>> 8); /* unsigned cast implicitly does (ch & FF) */ + temp[0] = (byte) (ch); /* unsigned cast implicitly does (ch & FF) */ + + for (indexToWrite = 0; indexToWrite <= 3; indexToWrite++) { + if (target.hasRemaining()) { + target.put(temp[indexToWrite]); + } + else { + errorBuffer[errorBufferLength++] = temp[indexToWrite]; + cr = CoderResult.OVERFLOW; + } + } + } + + if(doloop) { + while (sourceArrayIndex < source.limit() && target.hasRemaining()) { + ch = source.get(sourceArrayIndex++); + + if (UTF16.isSurrogate((char)ch)) { + if (UTF16.isLeadSurrogate((char)ch)) { + //lowsurogate: + if (sourceArrayIndex < source.limit()) { + ch2 = source.get(sourceArrayIndex); + if (UTF16.isTrailSurrogate((char)ch2)) { + ch = ((ch - UConverterSharedData.SURROGATE_HIGH_START) << 
UConverterSharedData.HALF_SHIFT) + ch2 + UConverterSharedData.SURROGATE_LOW_BASE; + sourceArrayIndex++; + } + else { + /* this is an unmatched trail code unit (2nd surrogate) */ + /* callback(illegal) */ + fromUChar32 = ch; + cr = CoderResult.OVERFLOW; + break; + } + } + else { + /* ran out of source */ + fromUChar32 = ch; + if (flush) { + /* this is an unmatched trail code unit (2nd surrogate) */ + /* callback(illegal) */ + cr = CoderResult.malformedForLength(sourceArrayIndex); + } + break; + } + } + else { + fromUChar32 = ch; + cr = CoderResult.malformedForLength(sourceArrayIndex); + break; + } + } + + /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */ + temp[2] = (byte) (ch >>> 16 & 0x1F); + temp[1] = (byte) (ch >>> 8); /* unsigned cast implicitly does (ch & FF) */ + temp[0] = (byte) (ch); /* unsigned cast implicitly does (ch & FF) */ + + for (indexToWrite = 0; indexToWrite <= 3; indexToWrite++) { + if (target.hasRemaining()) { + target.put(temp[indexToWrite]); + } + else { + errorBuffer[errorBufferLength++] = temp[indexToWrite]; + cr = CoderResult.OVERFLOW; + } + } + } + } + + if (sourceArrayIndex < source.limit() && !target.hasRemaining()) { + cr = CoderResult.OVERFLOW; + } + source.position(sourceArrayIndex); + + }catch(BufferOverflowException ex){ + cr = CoderResult.OVERFLOW; + } + return cr; + } + } + public CharsetDecoder newDecoder() { + return new CharsetDecoderUTF32LE(this); + } + + public CharsetEncoder newEncoder() { + return new CharsetEncoderUTF32LE(this); + } +} diff --git a/icu4j/src/com/ibm/icu/impl/CharsetUTF8.java b/icu4j/src/com/ibm/icu/impl/CharsetUTF8.java new file mode 100644 index 00000000000..241b4bd249d --- /dev/null +++ b/icu4j/src/com/ibm/icu/impl/CharsetUTF8.java @@ -0,0 +1,508 @@ +/** +******************************************************************************* +* Copyright (C) 2006, International Business Machines Corporation and * +* others. All Rights Reserved. 
* +******************************************************************************* +* +******************************************************************************* +*/ + +package com.ibm.icu.impl; + +import java.nio.BufferOverflowException; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.IntBuffer; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CharsetEncoder; +import java.nio.charset.CoderResult; + +import com.ibm.icu.charset.CharsetDecoderICU; +import com.ibm.icu.charset.CharsetEncoderICU; +import com.ibm.icu.charset.CharsetICU; +import com.ibm.icu.text.UTF16; +/** + * @author Niti Hantaweepant + */ +public class CharsetUTF8 extends CharsetICU { + protected byte[] fromUSubstitution = new byte[]{(byte)0xef, (byte)0xbf, (byte)0xbd}; + public CharsetUTF8(String icuCanonicalName, String javaCanonicalName, String[] aliases){ + super(icuCanonicalName, javaCanonicalName, aliases); + maxBytesPerChar = 4; + minBytesPerChar = 1; + maxCharsPerByte = 1; + } + + /* UTF-8 Conversion DATA + * for more information see Unicode Strandard 2.0 , Transformation Formats Appendix A-9 + */ + private static final long OFFSETS_FROM_UTF8[] = {0, + 0x00000000L, 0x00003080L, 0x000E2080L, + 0x03C82080L, 0xFA082080L, 0x82082080L}; + + private static final byte BYTES_FROM_UTF8[] = + { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 + }; + + /* + * Starting with Unicode 3.0.1: + * UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N]; + * byte sequences with more than 4 bytes are illegal in UTF-8, + * which is tested with impossible values for them + */ + private static final long UTF8_MIN_CHAR32[] = { 0L, 0L, 0x80L, 0x800L, 0x10000L, 0xffffffffL, 0xffffffffL }; + + class CharsetDecoderUTF8 extends CharsetDecoderICU{ + + public CharsetDecoderUTF8(CharsetICU cs) { + super(cs); + } + + protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets){ + CoderResult cr = CoderResult.UNDERFLOW; + + int sourceArrayIndex = source.position(); + + // Todo: CESU8 implementation + // boolean isCESU8 = args.converter.sharedData == _CESU8Data; + boolean isCESU8 = (UConverterSharedData._CESU8Data != null); + int ch, ch2 = 0; + int i, inBytes; + + try{ + + donefornow: + { + if (toUnicodeStatus!=0 && target.hasRemaining()) + { + inBytes = mode; /* restore # of bytes to consume */ + i = toULength; /* restore # of bytes consumed */ + + ch = toUnicodeStatus; /*Stores the previously calculated ch from a previous call*/ + toUnicodeStatus = 0; + + while (i < inBytes) + { + if (sourceArrayIndex=5 it is 0x10ffff= UTF8_MIN_CHAR32[i] && (isCESU8 ? 
i <= 3 : !UTF16.isSurrogate((char)ch))) + { + /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ + toULength = 0; + if (ch <= UConverterSharedData.MAXIMUM_UCS2) + { + /* fits in 16 bits */ + target.put((char)ch); + } + else + { + /* write out the surrogates */ + ch -= UConverterSharedData.HALF_BASE; + target.put((char) ((ch >> UConverterSharedData.HALF_SHIFT) + UConverterSharedData.SURROGATE_HIGH_START)); + ch = (ch & UConverterSharedData.HALF_MASK) + UConverterSharedData.SURROGATE_LOW_START; + if(target.hasRemaining()) { + target.put((char)ch); + + } else /* targetCapacity==1 */ { + charErrorBufferArray[charErrorBufferBegin+0]=(char)ch; + charErrorBufferLength=1; + throw new BufferOverflowException(); + } + } + } + else + { + toULength = (byte)i; + cr = CoderResult.malformedForLength(sourceArrayIndex); + break donefornow; + } + } + + while (sourceArrayIndex < source.limit() && target.hasRemaining()) + { + ch = source.get(sourceArrayIndex++) & UConverterConstants.UNSIGNED_BYTE_MASK; + if (ch < 0x80) /* Simple case */ + { + target.put((char)ch); + } + else + { + /* store the first char */ + toUBytesArray[0] = (byte)ch; + inBytes = BYTES_FROM_UTF8[(int)ch]; /* lookup current sequence length */ + i = 1; + + while (i < inBytes) + { + if (sourceArrayIndex < source.limit()) + { + toUBytesArray[i] = (byte) (ch2 = source.get(sourceArrayIndex) & UConverterConstants.UNSIGNED_BYTE_MASK); + if (!isTrail((byte)ch2)) + { + break; /* i < inBytes */ + } + ch = (ch << 6) + ch2; + ++sourceArrayIndex; + i++; + } + else + { + /* stores a partially calculated target*/ + toUnicodeStatus = ch; + mode = inBytes; + toULength = (byte) i; + break donefornow; + } + } + + /* Remove the accumulated high bits */ + ch -= OFFSETS_FROM_UTF8[inBytes]; + + /* + * Legal UTF-8 byte sequences in Unicode 3.0.1 and up: + * - use only trail bytes after a lead byte (checked above) + * - use the right number of trail bytes for a given lead byte + * - encode a code point <= 
U+10ffff + * - use the fewest possible number of bytes for their code points + * - use at most 4 bytes (for i>=5 it is 0x10ffff= UTF8_MIN_CHAR32[i] && (isCESU8 ? i <= 3 : !UTF16.isSurrogate((char)ch))) + { + /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ + toULength = 0; + if (ch <= UConverterSharedData.MAXIMUM_UCS2) + { + /* fits in 16 bits */ + target.put((char) ch); + } + else + { + /* write out the surrogates */ + ch -= UConverterSharedData.HALF_BASE; + target.put((char) ((ch >>> UConverterSharedData.HALF_SHIFT) + UConverterSharedData.SURROGATE_HIGH_START)); + ch = (ch & UConverterSharedData.HALF_MASK) + UConverterSharedData.SURROGATE_LOW_START; + if (target.hasRemaining()) + { + target.put((char)ch); + } + else + { + /* Put in overflow buffer (not handled here) */ + charErrorBufferArray[charErrorBufferBegin+0]=(char)ch; + charErrorBufferLength=1; + throw new BufferOverflowException(); + } + } + } + else + { + toULength = (byte)i; + cr = CoderResult.malformedForLength(sourceArrayIndex); + break; + } + } + } + } + + if (sourceArrayIndex < source.limit() && !target.hasRemaining()) + { + /* End of target buffer */ + cr = CoderResult.OVERFLOW; + } + + source.position(sourceArrayIndex); + + }catch(BufferOverflowException ex){ + cr = CoderResult.OVERFLOW; + } + return cr; + } + + } + class CharsetEncoderUTF8 extends CharsetEncoderICU{ + + public CharsetEncoderUTF8(CharsetICU cs) { + super(cs, fromUSubstitution); + implReset(); + } + + protected void implReset() { + super.implReset(); + } + + protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets){ + CoderResult cr = CoderResult.UNDERFLOW; + + int sourceArrayIndex = source.position(); + + // Todo: CESU8 implementation + // boolean isCESU8 = args.converter.sharedData == _CESU8Data; + boolean isCESU8 = (UConverterSharedData._CESU8Data != null); + + int ch; + short indexToWrite; + byte temp[] = new byte[4]; + boolean doloop = true; + + try{ + + if 
(fromUChar32 != 0 && target.hasRemaining()) + { + ch = fromUChar32; + fromUChar32 = 0; + + if (sourceArrayIndex < source.limit()) { + /* test the following code unit */ + char trail = source.get(sourceArrayIndex); + if(UTF16.isTrailSurrogate(trail)) { + ++sourceArrayIndex; + ch = UTF16.getCodePoint((char)ch, trail); + /* convert this supplementary code point */ + /* exit this condition tree */ + } else { + /* this is an unmatched lead code unit (1st surrogate) */ + /* callback(illegal) */ + fromUChar32 = (int)ch; + cr = CoderResult.malformedForLength(sourceArrayIndex); + doloop = false; + } + } else { + /* no more input */ + fromUChar32 = (int)ch; + doloop = false; + } + + if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) + { + indexToWrite = 2; + temp[2] = (byte) ((ch >>> 12) | 0xe0); + } + else + { + indexToWrite = 3; + temp[3] = (byte) ((ch >>> 18) | 0xf0); + temp[2] = (byte) (((ch >>> 12) & 0x3f) | 0x80); + } + temp[1] = (byte) (((ch >>> 6) & 0x3f) | 0x80); + temp[0] = (byte) ((ch & 0x3f) | 0x80); + + for (; indexToWrite >= 0; indexToWrite--) + { + if (target.hasRemaining()) + { + target.put(temp[indexToWrite]); + } + else + { + errorBuffer[errorBufferLength++] = temp[indexToWrite]; + cr = CoderResult.OVERFLOW; + } + } + } + + if(doloop) { + while (sourceArrayIndex < source.limit() && target.hasRemaining()) + { + ch = source.get(sourceArrayIndex++); + + if (ch < 0x80) /* Single byte */ + { + target.put((byte)ch); + } + else if (ch < 0x800) /* Double byte */ + { + target.put((byte) ((ch >>> 6) | 0xc0)); + if (target.hasRemaining()) + { + target.put((byte) ((ch & 0x3f) | 0x80)); + } + else + { + errorBuffer[0] = (byte) ((ch & 0x3f) | 0x80); + errorBufferLength = 1; + throw new BufferOverflowException(); + } + } + else + /* Check for surrogates */ + { + if(UTF16.isSurrogate((char)ch) && !isCESU8) { + if(UTF16.isLeadSurrogate((char)ch)) { + + if (sourceArrayIndex < source.limit()) { + /* test the following code unit */ + char trail = source.get(sourceArrayIndex); + 
if(UTF16.isTrailSurrogate(trail)) { + ++sourceArrayIndex; + ch = UTF16.getCodePoint((char)ch, trail); + //ch2 = 0; + /* convert this supplementary code point */ + /* exit this condition tree */ + } + else { + /* this is an unmatched lead code unit (1st surrogate) */ + /* callback(illegal) */ + fromUChar32 = ch; + cr = CoderResult.malformedForLength(sourceArrayIndex);; + break; + } + } + else { + /* no more input */ + fromUChar32 = ch; + break; + } + } + else { + fromUChar32 = (int)ch; + cr = CoderResult.malformedForLength(sourceArrayIndex); + break; + } + } + + if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) + { + indexToWrite = 2; + temp[2] = (byte) ((ch >>> 12) | 0xe0); + } + else + { + indexToWrite = 3; + temp[3] = (byte) ((ch >>> 18) | 0xf0); + temp[2] = (byte) (((ch >>> 12) & 0x3f) | 0x80); + } + temp[1] = (byte) (((ch >>> 6) & 0x3f) | 0x80); + temp[0] = (byte) ((ch & 0x3f) | 0x80); + + for (; indexToWrite >= 0; indexToWrite--) + { + if (target.hasRemaining()) + { + target.put(temp[indexToWrite]); + } + else + { + errorBuffer[errorBufferLength++] = temp[indexToWrite]; + cr = CoderResult.OVERFLOW; + } + } + } + } + } + + if (sourceArrayIndex < source.limit() && !target.hasRemaining()) + { + cr = CoderResult.OVERFLOW; + } + + source.position(sourceArrayIndex); + + }catch(BufferOverflowException ex){ + cr = CoderResult.OVERFLOW; + } + return cr; + } + } + + /* single-code point definitions -------------------------------------------- */ + + /** + * Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)? + * @param c 8-bit code unit (byte) + * @return TRUE or FALSE + * @stable ICU 2.4 + */ + public static boolean isSingle(byte c) {return (((c)&0x80)==0);} + + /** + * Is this code unit (byte) a UTF-8 lead byte? 
+ * @param c 8-bit code unit (byte) + * @return TRUE or FALSE + * @stable ICU 2.4 + */ + public static boolean isLead(byte c) {return ((((c)-0xc0) & UConverterConstants.UNSIGNED_BYTE_MASK)<0x3e);} + + /** + * Is this code unit (byte) a UTF-8 trail byte? + * @param c 8-bit code unit (byte) + * @return TRUE or FALSE + * @stable ICU 2.4 + */ + public static boolean isTrail(byte c) {return (((c)&0xc0)==0x80);} + + /** + * How many code units (bytes) are used for the UTF-8 encoding + * of this Unicode code point? + * @param c 32-bit code point + * @return 1..4, or 0 if c is a surrogate or not a Unicode code point + * @stable ICU 2.4 + */ + public static final int length(int c) + { + long uc = c & UConverterConstants.UNSIGNED_INT_MASK; + return + (uc<=0x7f ? 1 : + (uc<=0x7ff ? 2 : + (uc<=0xd7ff ? 3 : + (uc<=0xdfff || uc>0x10ffff ? 0 : + (uc<=0xffff ? 3 : 4) + ) + ) + ) + ); + } + + public CharsetDecoder newDecoder() { + return new CharsetDecoderUTF8(this); + } + + public CharsetEncoder newEncoder() { + return new CharsetEncoderUTF8(this); + } +} diff --git a/icu4j/src/com/ibm/icu/impl/InvalidFormatException.java b/icu4j/src/com/ibm/icu/impl/InvalidFormatException.java new file mode 100644 index 00000000000..c827977081b --- /dev/null +++ b/icu4j/src/com/ibm/icu/impl/InvalidFormatException.java @@ -0,0 +1,16 @@ +/** +******************************************************************************* +* Copyright (C) 2006, International Business Machines Corporation and * +* others. All Rights Reserved. 
* +******************************************************************************* +* +******************************************************************************* +*/ +package com.ibm.icu.impl; + +public class InvalidFormatException extends Exception { + public InvalidFormatException(){} + public InvalidFormatException(String message){ + super(message); + } +} diff --git a/icu4j/src/com/ibm/icu/impl/UConverterAlias.java b/icu4j/src/com/ibm/icu/impl/UConverterAlias.java new file mode 100644 index 00000000000..4cd7337cfa5 --- /dev/null +++ b/icu4j/src/com/ibm/icu/impl/UConverterAlias.java @@ -0,0 +1,789 @@ +/** +******************************************************************************* +* Copyright (C) 2006, International Business Machines Corporation and * +* others. All Rights Reserved. * +******************************************************************************* +* +******************************************************************************* +*/ +package com.ibm.icu.impl; + +import java.io.IOException; +import java.io.BufferedInputStream; +import java.io.InputStream; +import java.nio.ByteBuffer; + +import com.ibm.icu.charset.CharsetICU; + +public final class UConverterAlias { + /** The largest value a 32 bit unsigned integer can hold @draft ICU 3.6 */ + public static final long UINT32_MAX = 4294967295L; + + public static final int AMBIGUOUS_ALIAS_MAP_BIT = 0x8000; + + public static final int CONVERTER_INDEX_MASK = 0xFFF; + + public static final int NUM_RESERVED_TAGS = 2; + + public static final int NUM_HIDDEN_TAGS = 1; + + static int[] gConverterListArray = null; + + static int gConverterListArrayIndex; + + static int[] gTagListArray = null; + + static int gTagListArrayIndex; + + static int[] gAliasListArray = null; + + static int gAliasListArrayIndex; + + static int[] gUntaggedConvArrayArray = null; + + static int gUntaggedConvArrayArrayIndex; + + static int[] gTaggedAliasArrayArray = null; + + static int gTaggedAliasArrayArrayIndex; + + static 
int[] gTaggedAliasListsArray = null; + + static int gTaggedAliasListsArrayIndex; + + static byte[] gStringTableArray = null; + + static int gStringTableArrayIndex; + + static long gConverterListSize; + + static long gTagListSize; + + static long gAliasListSize; + + static long gUntaggedConvArraySize; + + static long gTaggedAliasArraySize; + + static long gTaggedAliasListsSize; + + static long gStringTableSize; + + static final String GET_STRING(int idx) { + return new String(gStringTableArray, 2 * idx, (int) strlen(gStringTableArray, 2 * idx)); + } + + public static final int strlen(byte[] sArray, int sBegin) + { + int i = sBegin; + while(i < sArray.length && sArray[i++] != 0) {} + return i - sBegin - 1; + } + + public static final int tocLengthIndex = 0; + + public static final int converterListIndex = 1; + + public static final int tagListIndex = 2; + + public static final int aliasListIndex = 3; + + public static final int untaggedConvArrayIndex = 4; + + public static final int taggedAliasArrayIndex = 5; + + public static final int taggedAliasListsIndex = 6; + + public static final int reservedIndex1 = 7; + + public static final int stringTableIndex = 8; + + public static final int minTocLength = 8; /* + * min. tocLength in the file, + * does not count the + * tocLengthIndex! 
+ */ + + public static final int offsetsCount = minTocLength + 1; /* + * length of the + * swapper's + * temporary + * offsets[] + */ + + static ByteBuffer gAliasData = null; + + private static final boolean isAlias(String alias) { + if (alias == null) { + throw new IllegalArgumentException("Alias param is null!"); + } else if (alias.length() == 0) { + return false; + } else { + return true; + } + } + + private static final String CNVALIAS_DATA_FILE_NAME = ICUResourceBundle.ICU_BUNDLE + "/cnvalias.icu"; + + /** + * Default buffer size of datafile + */ + private static final int CNVALIAS_DATA_BUFFER_SIZE = 25000; + + private static final synchronized boolean haveAliasData() + throws IOException{ + boolean needInit; + + // agljport:todo umtx_lock(NULL); + needInit = gAliasData == null; + + /* load converter alias data from file if necessary */ + if (needInit) { + ByteBuffer data = null; + long[] tableArray = null; + long tableStart; + long reservedSize1; + byte[] reservedBytes = null; + + // agljport:fix data = udata_openChoice(NULL, DATA_TYPE, DATA_NAME, + // isAcceptable, NULL, pErrorCode); + // data = udata_openChoice(null, DATA_TYPE, DATA_NAME, 0, + // isAcceptable, null, pErrorCode); + InputStream i = ICUData.getRequiredStream(CNVALIAS_DATA_FILE_NAME); + BufferedInputStream b = new BufferedInputStream(i, CNVALIAS_DATA_BUFFER_SIZE); + UConverterAliasDataReader reader = new UConverterAliasDataReader(b); + tableArray = reader.readToc(offsetsCount); + + tableStart = tableArray[0]; + if (tableStart < minTocLength) { + throw new IOException("Invalid data format."); + } + gConverterListSize = tableArray[1]; + gTagListSize = tableArray[2]; + gAliasListSize = tableArray[3]; + gUntaggedConvArraySize = tableArray[4]; + gTaggedAliasArraySize = tableArray[5]; + gTaggedAliasListsSize = tableArray[6]; + reservedSize1 = tableArray[7] * 2; + gStringTableSize = tableArray[8] * 2; + + gConverterListArray = new int[(int) gConverterListSize]; + gTagListArray = new int[(int) 
gTagListSize]; + gAliasListArray = new int[(int) gAliasListSize]; + gUntaggedConvArrayArray = new int[(int) gUntaggedConvArraySize]; + gTaggedAliasArrayArray = new int[(int) gTaggedAliasArraySize]; + gTaggedAliasListsArray = new int[(int) gTaggedAliasListsSize]; + reservedBytes = new byte[(int) reservedSize1]; + gStringTableArray = new byte[(int) gStringTableSize]; + + reader.read(gConverterListArray, gTagListArray, + gAliasListArray, gUntaggedConvArrayArray, + gTaggedAliasArrayArray, gTaggedAliasListsArray, + reservedBytes, gStringTableArray); + data = ByteBuffer.allocate(0); // dummy UDataMemory object in absence + // of memory mapping + + // agljport:todo umtx_lock(NULL); + if (gAliasData == null) { + gAliasData = data; + data = null; + + // agljport:fix ucln_common_registerCleanup(UCLN_COMMON_IO, + // io_cleanup); + } + // agljport:todo umtx_unlock(NULL); + + /* if a different thread set it first, then close the extra data */ + if (data != null) { + // agljport:fix udata_close(data); /* NULL if it was set + // correctly */ + } + } + + return true; + } + + // U_CFUNC const char * io_getConverterName(const char *alias, UErrorCode + // *pErrorCode) + public static final String io_getConverterName(String alias) + throws IOException{ + if (haveAliasData() && isAlias(alias)) { + boolean[] isAmbigous = new boolean[1]; + long convNum = findConverter(alias, isAmbigous); + if (convNum < gConverterListSize) { + return GET_STRING(gConverterListArray[(int) convNum]); + } + /* else converter not found */ + } + return null; + } + + /* + * search for an alias return the converter number index for gConverterList + */ + // static U_INLINE uint32_t findConverter(const char *alias, UErrorCode + // *pErrorCode) + private static final long findConverter(String alias, boolean[] isAmbigous) { + long mid, start, limit; + long lastMid; + long result; + + /* do a binary search for the alias */ + start = 0; + limit = gUntaggedConvArraySize; + mid = limit; + lastMid = UINT32_MAX; + + for 
(;;) { + mid = (start + limit) / 2; + if (lastMid == mid) { /* Have we moved? */ + break; /* We haven't moved, and it wasn't found. */ + } + lastMid = mid; + result = compareNames(alias, GET_STRING(gAliasListArray[(int) mid])); + + if (result < 0) { + limit = mid; + } else if (result > 0) { + start = mid; + } else { + /* + * Since the gencnval tool folds duplicates into one entry, this + * alias in gAliasList is unique, but different standards may + * map an alias to different converters. + */ + if ((gUntaggedConvArrayArray[(int) mid] & AMBIGUOUS_ALIAS_MAP_BIT) != 0) { + isAmbigous[0]=true; + } + return gUntaggedConvArrayArray[(int) mid] & CONVERTER_INDEX_MASK; + } + } +// public static final long UINT32_MAX = 4294967295L; + return Long.MAX_VALUE; + } + + /** + * \var io_stripForCompare Remove the underscores, dashes and spaces from + * the name, and convert the name to lower case. + * + * @param dst + * The destination buffer, which is <= the buffer of name. + * @param dst + * The destination buffer, which is <= the buffer of name. + * @return the destination buffer. + */ + public static final StringBuffer io_stripForCompare(StringBuffer dst, String name) { + return io_stripASCIIForCompare(dst, name); + } + + /* @see compareNames */ + private static final StringBuffer io_stripASCIIForCompare(StringBuffer dst, String name) { + name = name.concat("\000"); + int nameIndex = 0; + char c1 = name.charAt(0); + int dstItr = 0; + + while (c1 != 0) { + /* Ignore delimiters '-', '_', and ' ' */ + while ((c1 = name.charAt(nameIndex)) == 0x2d || c1 == 0x5f + || c1 == 0x20) { + ++nameIndex; + } + + /* lowercase for case-insensitive comparison */ + dst.append(Character.toLowerCase(c1)); + ++dstItr; + ++nameIndex; + } + if (dst.length() > 0) + dst.deleteCharAt(dst.length() - 1); + return dst; + } + + /** + * Do a fuzzy compare of a two converter/alias names. The comparison is + * case-insensitive. It also ignores the characters '-', '_', and ' ' (dash, + * underscore, and space). 
Thus the strings "UTF-8", "utf_8", and "Utf 8" + * are exactly equivalent. + * + * This is a symmetrical (commutative) operation; order of arguments is + * insignificant. This is an important property for sorting the list (when + * the list is preprocessed into binary form) and for performing binary + * searches on it at run time. + * + * @param name1 + * a converter name or alias, zero-terminated + * @param name2 + * a converter name or alias, zero-terminated + * @return 0 if the names match, or a negative value if the name1 lexically + * precedes name2, or a positive value if the name1 lexically + * follows name2. + * + * @see io_stripForCompare + */ + public static int compareNames(String name1, String name2){ + int result = 0; + int i1 = 0; + int i2 = 0; + while (true) { + char ch1 = 0; + char ch2 = 0; + // Ignore delimiters '-', '_', and ASCII White_Space + if (i1 < name1.length()) { + ch1 = name1.charAt(i1 ++); + } + while (ch1 == '-' || ch1 == '_' || ch1 == ' ' ) { + if (i1 < name1.length()) { + ch1 = name1.charAt(i1 ++); + } + else { + ch1 = 0; + } + } + if (i2 < name2.length()) { + ch2 = name2.charAt(i2 ++); + } + while (ch2 == '-' || ch2 == '_' || ch2 == ' ' ) { + if (i2 < name2.length()) { + ch2 = name2.charAt(i2 ++); + } + else { + ch2 = 0; + } + } + + // If we reach the ends of both strings then they match + if (ch1 == 0 && ch2 == 0) { + return 0; + } + + // Case-insensitive comparison + if (ch1 != ch2) { + result = Character.toLowerCase(ch1)- Character.toLowerCase(ch2); + if (result != 0) { + return result; + } + } + } + } + + public static int io_countAliases(String alias) + throws IOException{ + if (haveAliasData() && isAlias(alias)) { + boolean[] isAmbigous = new boolean[1]; + long convNum = findConverter(alias, isAmbigous); + if (convNum < gConverterListSize) { + /* tagListNum - 1 is the ALL tag */ + int listOffset = gTaggedAliasArrayArray[(int) ((gTagListSize - 1) + * gConverterListSize + convNum)]; + + if (listOffset != 0) { + return 
gTaggedAliasListsArray[listOffset]; + } + /* else this shouldn't happen. internal program error */ + } + /* else converter not found */ + } + return 0; + } + + /** + * Return the number of all aliases (and converter names). + * + * @param pErrorCode + * The error code + * @return the number of all aliases + */ + // U_CFUNC uint16_t io_countTotalAliases(UErrorCode *pErrorCode); + public static int io_countTotalAliases() throws IOException{ + if (haveAliasData()) { + return (int) gAliasListSize; + } + return 0; + } + + // U_CFUNC const char * io_getAlias(const char *alias, uint16_t n, + // UErrorCode *pErrorCode) + public static String io_getAlias(String alias, int n) throws IOException{ + if (haveAliasData() && isAlias(alias)) { + boolean[] isAmbigous = new boolean[1]; + long convNum = findConverter(alias,isAmbigous); + if (convNum < gConverterListSize) { + /* tagListNum - 1 is the ALL tag */ + int listOffset = gTaggedAliasArrayArray[(int) ((gTagListSize - 1) + * gConverterListSize + convNum)]; + + if (listOffset != 0) { + //long listCount = gTaggedAliasListsArray[listOffset]; + /* +1 to skip listCount */ + int[] currListArray = gTaggedAliasListsArray; + int currListArrayIndex = listOffset + 1; + + return GET_STRING(currListArray[currListArrayIndex + n]); + + } + /* else this shouldn't happen. 
internal program error */ + } + /* else converter not found */ + } + return null; + } + + // U_CFUNC uint16_t io_countStandards(UErrorCode *pErrorCode) { + public static int io_countStandards() throws IOException{ + if (haveAliasData()) { + return (int) (gTagListSize - NUM_HIDDEN_TAGS); + } + return 0; + } + + // U_CAPI const char * U_EXPORT2getStandard(uint16_t n, UErrorCode + // *pErrorCode) + public static String getStandard(int n) throws IOException{ + if (haveAliasData()) { + return GET_STRING(gTagListArray[n]); + } + return null; + } + + // U_CAPI const char * U_EXPORT2 getStandardName(const char *alias, const + // char *standard, UErrorCode *pErrorCode) + public static final String getStandardName(String alias, String standard)throws IOException { + if (haveAliasData() && isAlias(alias)) { + long listOffset = findTaggedAliasListsOffset(alias, standard); + + if (0 < listOffset && listOffset < gTaggedAliasListsSize) { + int[] currListArray = gTaggedAliasListsArray; + long currListArrayIndex = listOffset + 1; + if (currListArray[0] != 0) { + return GET_STRING(currListArray[(int) currListArrayIndex]); + } + } + } + return null; + } + + // U_CAPI uint16_t U_EXPORT2 countAliases(const char *alias, UErrorCode + // *pErrorCode) + public static int countAliases(String alias) throws IOException{ + return io_countAliases(alias); + } + + // U_CAPI const char* U_EXPORT2 getAlias(const char *alias, uint16_t n, + // UErrorCode *pErrorCode) + public static String getAlias(String alias, int n) throws IOException{ + return io_getAlias(alias, n); + } + + // U_CFUNC uint16_t countStandards(void) + public static int countStandards()throws IOException{ + return io_countStandards(); + } + + /*returns a single Name from the list, will return NULL if out of bounds + */ + public static String getAvailableName (int n){ + try{ + if (0 <= n && n <= 0xffff) { + String name = bld_getAvailableConverter(n); + return name; + } + }catch(IOException ex){ + //throw away exception + } + return 
null; + } + // U_CAPI const char * U_EXPORT2 getCanonicalName(const char *alias, const + // char *standard, UErrorCode *pErrorCode) { + public static String getCanonicalName(String alias, String standard) throws IOException{ + if (haveAliasData() && isAlias(alias)) { + long convNum = findTaggedConverterNum(alias, standard); + + if (convNum < gConverterListSize) { + return GET_STRING(gConverterListArray[(int) convNum]); + } + } + + return null; + } + public static int countAvailable (){ + try{ + return bld_countAvailableConverters(); + }catch(IOException ex){ + //throw away exception + } + return -1; + } + + // U_CAPI UEnumeration * U_EXPORT2 openStandardNames(const char *convName, + // const char *standard, UErrorCode *pErrorCode) + public static final UConverterAliasesEnumeration openStandardNames(String convName, String standard)throws IOException { + UConverterAliasesEnumeration aliasEnum = null; + if (haveAliasData() && isAlias(convName)) { + long listOffset = findTaggedAliasListsOffset(convName, standard); + + /* + * When listOffset == 0, we want to acknowledge that the converter + * name and standard are okay, but there is nothing to enumerate. 
+ */ + if (listOffset < gTaggedAliasListsSize) { + + UConverterAliasesEnumeration.UAliasContext context = new UConverterAliasesEnumeration.UAliasContext(listOffset, 0); + aliasEnum = new UConverterAliasesEnumeration(); + aliasEnum.setContext(context); + } + /* else converter or tag not found */ + } + return aliasEnum; + } + + // static uint32_t getTagNumber(const char *tagname) + private static long getTagNumber(String tagName) { + if (gTagListArray != null) { + long tagNum; + for (tagNum = 0; tagNum < gTagListSize; tagNum++) { + if (tagName.equals(GET_STRING(gTagListArray[(int) tagNum]))) { + return tagNum; + } + } + } + + return UINT32_MAX; + } + + // static uint32_t findTaggedAliasListsOffset(const char *alias, const char + // *standard, UErrorCode *pErrorCode) + private static long findTaggedAliasListsOffset(String alias, String standard) { + long idx; + long listOffset; + long convNum; + long tagNum = getTagNumber(standard); + boolean[] isAmbigous = new boolean[1]; + /* Make a quick guess. Hopefully they used a TR22 canonical alias. */ + convNum = findConverter(alias, isAmbigous); + + if (tagNum < (gTagListSize - NUM_HIDDEN_TAGS) + && convNum < gConverterListSize) { + listOffset = gTaggedAliasArrayArray[(int) (tagNum + * gConverterListSize + convNum)]; + if (listOffset != 0 + && gTaggedAliasListsArray[(int) listOffset + 1] != 0) { + return listOffset; + } + if (isAmbigous[0]==true) { + /* + * Uh Oh! They used an ambiguous alias. We have to search the + * whole swiss cheese starting at the highest standard affinity. + * This may take a while. 
+ */ + + for (idx = 0; idx < gTaggedAliasArraySize; idx++) { + listOffset = gTaggedAliasArrayArray[(int) idx]; + if (listOffset != 0 && isAliasInList(alias, listOffset)) { + long currTagNum = idx / gConverterListSize; + long currConvNum = (idx - currTagNum + * gConverterListSize); + long tempListOffset = gTaggedAliasArrayArray[(int) (tagNum + * gConverterListSize + currConvNum)]; + if (tempListOffset != 0 + && gTaggedAliasListsArray[(int) tempListOffset + 1] != 0) { + return tempListOffset; + } + /* + * else keep on looking We could speed this up by + * starting on the next row because an alias is unique + * per row, right now. This would change if alias + * versioning appears. + */ + } + } + /* The standard doesn't know about the alias */ + } + /* else no default name */ + return 0; + } + /* else converter or tag not found */ + + return UINT32_MAX; + } + + /* Return the canonical name */ + // static uint32_t findTaggedConverterNum(const char *alias, const char + // *standard, UErrorCode *pErrorCode) + private static long findTaggedConverterNum(String alias, String standard) { + long idx; + long listOffset; + long convNum; + long tagNum = getTagNumber(standard); + boolean[] isAmbigous = new boolean[1]; + + /* Make a quick guess. Hopefully they used a TR22 canonical alias. */ + convNum = findConverter(alias, isAmbigous); + + if (tagNum < (gTagListSize - NUM_HIDDEN_TAGS) + && convNum < gConverterListSize) { + listOffset = gTaggedAliasArrayArray[(int) (tagNum + * gConverterListSize + convNum)]; + if (listOffset != 0 && isAliasInList(alias, listOffset)) { + return convNum; + } + if (isAmbigous[0] == true) { + /* + * Uh Oh! They used an ambiguous alias. We have to search one + * slice of the swiss cheese. We search only in the requested + * tag, not the whole thing. This may take a while. 
+ */ + long convStart = (tagNum) * gConverterListSize; + long convLimit = (tagNum + 1) * gConverterListSize; + for (idx = convStart; idx < convLimit; idx++) { + listOffset = gTaggedAliasArrayArray[(int) idx]; + if (listOffset != 0 && isAliasInList(alias, listOffset)) { + return idx - convStart; + } + } + /* The standard doesn't know about the alias */ + } + /* else no canonical name */ + } + /* else converter or tag not found */ + + return UINT32_MAX; + } + + // static U_INLINE UBool isAliasInList(const char *alias, uint32_t + // listOffset) + private static boolean isAliasInList(String alias, long listOffset) { + if (listOffset != 0) { + long currAlias; + long listCount = gTaggedAliasListsArray[(int) listOffset]; + /* +1 to skip listCount */ + int[] currList = gTaggedAliasListsArray; + long currListArrayIndex = listOffset + 1; + for (currAlias = 0; currAlias < listCount; currAlias++) { + if (currList[(int) (currAlias + currListArrayIndex)] != 0 + && compareNames( + alias, + GET_STRING(currList[(int) (currAlias + currListArrayIndex)])) == 0) { + return true; + } + } + } + return false; + } + + // begin bld.c + static String[] gAvailableConverters = null; + + static int gAvailableConverterCount = 0; + + static byte[] gDefaultConverterNameBuffer; // [MAX_CONVERTER_NAME_LENGTH + + // 1]; /* +1 for NULL */ + + static String gDefaultConverterName = null; + + // static UBool haveAvailableConverterList(UErrorCode *pErrorCode) + static boolean haveAvailableConverterList() throws IOException{ + if (gAvailableConverters == null) { + int idx; + int localConverterCount; + String converterName; + String[] localConverterList; + + if (!haveAliasData()) { + return false; + } + + /* We can't have more than "*converterTable" converters to open */ + localConverterList = new String[(int) gConverterListSize]; + + localConverterCount = 0; + + for (idx = 0; idx < gConverterListSize; idx++) { + converterName = GET_STRING(gConverterListArray[idx]); + //UConverter cnv = 
UConverter.open(converterName);
+                //TODO: Fix me
+                localConverterList[localConverterCount++] = converterName;
+
+            }
+
+            // agljport:todo umtx_lock(NULL);
+            if (gAvailableConverters == null) {
+                gAvailableConverters = localConverterList;
+                gAvailableConverterCount = localConverterCount;
+                /* haveData should have already registered the cleanup function */
+            } else {
+                // agljport:todo free((char **)localConverterList);
+            }
+            // agljport:todo umtx_unlock(NULL);
+        }
+        return true;
+    }
+
+    // U_CFUNC uint16_t bld_countAvailableConverters(UErrorCode *pErrorCode)
+    /**
+     * Returns the number of entries in the available-converter list,
+     * building the list first if necessary.
+     *
+     * @return the converter count, or 0 when the list could not be built
+     * @throws IOException propagated from haveAvailableConverterList()
+     */
+    public static int bld_countAvailableConverters() throws IOException{
+        if (haveAvailableConverterList()) {
+            return gAvailableConverterCount;
+        }
+        return 0;
+    }
+
+    // U_CFUNC const char * bld_getAvailableConverter(uint16_t n, UErrorCode
+    // *pErrorCode)
+    /**
+     * Returns the name of the n-th available converter.
+     *
+     * @param n zero-based index into the available-converter list
+     * @return the converter name, or null when the list could not be built
+     *         or n is outside 0..count-1
+     * @throws IOException propagated from haveAvailableConverterList()
+     */
+    public static String bld_getAvailableConverter(int n) throws IOException{
+        // n was an unsigned uint16_t in the C original. A Java int can be
+        // negative, so reject it here instead of letting the array access
+        // throw ArrayIndexOutOfBoundsException.
+        if (haveAvailableConverterList()
+                && n >= 0 && n < gAvailableConverterCount) {
+            return gAvailableConverters[n];
+        }
+        return null;
+    }
+
+    /* default converter name --------------------------------------------------- */
+
+    /*
+     * In order to be really thread-safe, the get function would have to take
+     * a buffer parameter and copy the current string inside a mutex block.
+     * This implementation only tries to be really thread-safe while
+     * setting the name.
+     * It assumes that setting a pointer is atomic.
+     */
+
+    // U_CFUNC const char * getDefaultName()
+    /**
+     * Returns the default converter name, resolving and caching it on first
+     * use. The name is taken from CharsetICU.getDefaultCharsetName(); when
+     * that yields null or an empty string, "US-ASCII" is used instead.
+     *
+     * @return the default converter name; never null
+     */
+    public static final synchronized String getDefaultName() {
+        /* local variable to be thread-safe */
+        String name;
+
+        //agljport:todo umtx_lock(null);
+        name = gDefaultConverterName;
+        //agljport:todo umtx_unlock(null);
+
+        if (name == null) {
+            name = CharsetICU.getDefaultCharsetName();
+
+            /* if the name is there, test it out and get the canonical name with options */
+            // cnv = UConverter.open(name);
+            // name = cnv.getName(cnv);
+            // TODO: fix me
+
+            /*
+             * The C original also rejected names too long for
+             * gDefaultConverterNameBuffer. That buffer is not allocated in
+             * the code visible here and the name is never copied into it,
+             * so dereferencing it could only fail; the check is dropped.
+             */
+            if (name == null || name.length() == 0) {
+                /* Panic time, let's use a fallback. */
+                name = "US-ASCII";
+            }
+
+            /*
+             * Cache the resolved name. This assignment used to be reversed
+             * (name = gDefaultConverterName), which discarded the result
+             * and made the method return null.
+             */
+            gDefaultConverterName = name;
+        }
+
+        return name;
+    }
+
+    //end bld.c
+}
\ No newline at end of file
diff --git a/icu4j/src/com/ibm/icu/impl/UConverterAliasDataReader.java b/icu4j/src/com/ibm/icu/impl/UConverterAliasDataReader.java
new file mode 100644
index 00000000000..660ef0b5b72
--- /dev/null
+++ b/icu4j/src/com/ibm/icu/impl/UConverterAliasDataReader.java
@@ -0,0 +1,218 @@
+/**
+*******************************************************************************
+* Copyright (C) 2006, International Business Machines Corporation and    *
+* others. All Rights Reserved.                                           *
+*******************************************************************************
+*
+*******************************************************************************
+*/
+
+package com.ibm.icu.impl;
+import java.io.*;
+import com.ibm.icu.impl.ICUDebug;
+
+/* Format of cnvalias.icu -----------------------------------------------------
+ *
+ * cnvalias.icu is a binary, memory-mappable form of convrtrs.txt.
+ * This binary form contains several tables. All indexes are to uint16_t
+ * units, and not to the bytes (uint8_t units).
Addressing everything on + * 16-bit boundaries allows us to store more information with small index + * numbers, which are also 16-bit in size. The majority of the table (except + * the string table) are 16-bit numbers. + * + * First there is the size of the Table of Contents (TOC). The TOC + * entries contain the size of each section. In order to find the offset + * you just need to sum up the previous offsets. + * The TOC length and entries are an array of uint32_t values. + * The first section after the TOC starts immediately after the TOC. + * + * 1) This section contains a list of converters. This list contains indexes + * into the string table for the converter name. The index of this list is + * also used by other sections, which are mentioned later on. + * This list is not sorted. + * + * 2) This section contains a list of tags. This list contains indexes + * into the string table for the tag name. The index of this list is + * also used by other sections, which are mentioned later on. + * This list is in priority order of standards. + * + * 3) This section contains a list of sorted unique aliases. This + * list contains indexes into the string table for the alias name. The + * index of this list is also used by other sections, like the 4th section. + * The index for the 3rd and 4th section is used to get the + * alias -> converter name mapping. Section 3 and 4 form a two column table. + * + * 4) This section contains a list of mapped converter names. Consider this + * as a table that maps the 3rd section to the 1st section. This list contains + * indexes into the 1st section. The index of this list is the same index in + * the 3rd section. There is also some extra information in the high bits of + * each converter index in this table. Currently it's only used to say that + * an alias mapped to this converter is ambiguous. See UCNV_CONVERTER_INDEX_MASK + * and UCNV_AMBIGUOUS_ALIAS_MAP_BIT for more information. 
This section is + * the predigested form of the 5th section so that an alias lookup can be fast. + * + * 5) This section contains a 2D array with indexes to the 6th section. This + * section is the full form of all alias mappings. The column index is the + * index into the converter list (column header). The row index is the index + * to tag list (row header). This 2D array is the top part a 3D array. The + * third dimension is in the 6th section. + * + * 6) This is blob of variable length arrays. Each array starts with a size, + * and is followed by indexes to alias names in the string table. This is + * the third dimension to the section 5. No other section should be referencing + * this section. + * + * 7) Reserved at this time (There is no information). This _usually_ has a + * size of 0. Future versions may add more information here. + * + * 8) This is the string table. All strings are indexed on an even address. + * There are two reasons for this. First many chip architectures locate strings + * faster on even address boundaries. Second, since all indexes are 16-bit + * numbers, this string table can be 128KB in size instead of 64KB when we + * only have strings starting on an even address. + * + * + * Here is the concept of section 5 and 6. It's a 3D cube. Each tag + * has a unique alias among all converters. That same alias can + * be mentioned in other standards on different converters, + * but only one alias per tag can be unique. + * + * + * Converter Names (Usually in TR22 form) + * -------------------------------------------. + * T / /| + * a / / | + * g / / | + * s / / | + * / / | + * ------------------------------------------/ | + * A | | | + * l | | | + * i | | / + * a | | / + * s | | / + * e | | / + * s | |/ + * ------------------------------------------- + * + * + * + * Here is what it really looks like. It's like swiss cheese. + * There are holes. 
Some converters aren't recognized by + * a standard, or they are really old converters that the + * standard doesn't recognize anymore. + * + * Converter Names (Usually in TR22 form) + * -------------------------------------------. + * T /##########################################/| + * a / # # /# + * g / # ## ## ### # ### ### ### #/ + * s / # ##### #### ## ## #/# + * / ### # # ## # # # ### # # #/## + * ------------------------------------------/# # + * A |### # # ## # # # ### # # #|# # + * l |# # # # # ## # #|# # + * i |# # # # # # #|# + * a |# #|# + * s | #|# + * e + * s + * + */ + +final class UConverterAliasDataReader implements ICUBinary.Authenticate { + private final static boolean debug = ICUDebug.enabled("UConverterAliasDataReader"); + + /** + *

Protected constructor.

+     * @param inputStream ICU cnvalias.icu file input stream
+     * @exception IOException thrown if data file fails authentication
+     * @draft 2.1
+     */
+    protected UConverterAliasDataReader(InputStream inputStream)
+                                        throws IOException{
+        if(debug) System.out.println("Bytes in inputStream " + inputStream.available());
+
+        unicodeVersion = ICUBinary.readHeader(inputStream, DATA_FORMAT_ID, this);
+
+        if(debug) System.out.println("Bytes left in inputStream " +inputStream.available());
+
+        dataInputStream = new DataInputStream(inputStream);
+
+        if(debug) System.out.println("Bytes left in dataInputStream " +dataInputStream.available());
+    }
+
+    // protected methods -------------------------------------------------
+
+    /**
+     * Reads the table of contents.
+     *
+     * @param n number of 32-bit TOC entries to read
+     * @return the entries, each widened to a non-negative long
+     * @exception IOException thrown if the stream cannot be read
+     */
+    protected long[] readToc(int n)throws IOException
+    {
+        long[] toc = new long[n];
+        //Read the toc
+        for (int i = 0; i < n ; ++i) {
+            toc[i] = dataInputStream.readInt() & UNSIGNED_INT_MASK;
+        }
+        return toc;
+    }
+
+    /**
+     * Fills the caller-supplied arrays from the stream, in the order the
+     * parameters are listed. Each int array receives one unsigned 16-bit
+     * value per element; the two byte arrays are filled completely.
+     * The arrays must already be sized by the caller; their lengths
+     * determine how much is read.
+     *
+     * @param convList          converter list
+     * @param tagList           tag list
+     * @param aliasList         alias list
+     * @param untaggedConvArray untagged converter array
+     * @param taggedAliasArray  tagged alias array
+     * @param taggedAliasLists  tagged alias lists
+     * @param reservedBytes     reserved section
+     * @param stringTable       string table
+     * @exception IOException thrown if the stream fails or ends before
+     *            every array has been filled
+     */
+    protected void read(int[] convList, int[] tagList, int[] aliasList, int[]untaggedConvArray, int[] taggedAliasArray, int[] taggedAliasLists, byte[] reservedBytes, byte[] stringTable) throws IOException{
+        int i;
+        //int listnum = 1;
+        //long listsize;
+
+        for(i = 0; i < convList.length; ++i)
+            convList[i] = dataInputStream.readUnsignedShort();
+
+        for(i = 0; i < tagList.length; ++i)
+            tagList[i] = dataInputStream.readUnsignedShort();
+
+        for(i = 0; i < aliasList.length; ++i)
+            aliasList[i] = dataInputStream.readUnsignedShort();
+
+        for(i = 0; i < untaggedConvArray.length; ++i)
+            untaggedConvArray[i] = dataInputStream.readUnsignedShort();
+
+        for(i = 0; i < taggedAliasArray.length; ++i)
+            taggedAliasArray[i] = dataInputStream.readUnsignedShort();
+
+        for(i = 0; i < taggedAliasLists.length; ++i)
+            taggedAliasLists[i] = dataInputStream.readUnsignedShort();
+
+        // read(byte[]) may return after filling only part of the buffer.
+        // readFully keeps reading until the buffer is full and throws
+        // EOFException on a short stream, so a truncated file is reported
+        // instead of silently leaving zero bytes in the string table.
+        dataInputStream.readFully(reservedBytes);
+        dataInputStream.readFully(stringTable);
+    }
+
+    public byte[] getDataFormatVersion(){
+        return DATA_FORMAT_VERSION;
+    }
+
+    public boolean isDataVersionAcceptable(byte
version[]) + { + return version[0] == DATA_FORMAT_VERSION[0]; + } + + public byte[] getUnicodeVersion(){ + return unicodeVersion; + } + // private data members ------------------------------------------------- + + + /** + * ICU data file input stream + */ + private DataInputStream dataInputStream; + + private byte[] unicodeVersion; + + /** + * File format version that this class understands. + * No guarantees are made if a older version is used + * see store.c of gennorm for more information and values + */ + // DATA_FORMAT_ID_ values taken from icu4c isAcceptable (ucnv_io.c) + private static final byte DATA_FORMAT_ID[] = {(byte)0x43, (byte)0x76, (byte)0x41, (byte)0x6c}; // dataFormat="CvAl" + private static final byte DATA_FORMAT_VERSION[] = {(byte)0x3}; + + //private static final int UNSIGNED_SHORT_MASK = 0xffff; + private static final long UNSIGNED_INT_MASK = 0xffffffffL; + +} diff --git a/icu4j/src/com/ibm/icu/impl/UConverterAliasesEnumeration.java b/icu4j/src/com/ibm/icu/impl/UConverterAliasesEnumeration.java new file mode 100644 index 00000000000..0fc292f22a3 --- /dev/null +++ b/icu4j/src/com/ibm/icu/impl/UConverterAliasesEnumeration.java @@ -0,0 +1,83 @@ +/** +******************************************************************************* +* Copyright (C) 2006, International Business Machines Corporation and * +* others. All Rights Reserved. 
* +******************************************************************************* +* +******************************************************************************* +*/ +package com.ibm.icu.impl; + +import java.util.Enumeration; + + +/** + * Enumeration for Converter Aliases + */ + +public class UConverterAliasesEnumeration implements Enumeration { + + private UAliasContext context; + + /* Set alias context + */ + public void setContext(UAliasContext context){ + this.context = context; + } + + public int count() { + int value = 0; + + if (context.listOffset!=0) { + value = UConverterAlias.gTaggedAliasListsArray[(int)context.listOffset]; + } + return value; + } + + public Object nextElement() { + + if (context.listOffset!=0) { + long listCount = UConverterAlias.gTaggedAliasListsArray[(int)context.listOffset]; + int[] currListArray = UConverterAlias.gTaggedAliasListsArray; + long currListArrayIndex = context.getListOffset() + 1; + + if (context.getListIdx() < listCount) { + String str = UConverterAlias.GET_STRING(currListArray[(int)(context.listIdx+currListArrayIndex)]); + context.listIdx++; + return str; + } + } + /* Either we accessed a zero length list, or we enumerated too far. 
*/
+        throw new IndexOutOfBoundsException();
+    }
+
+    /**
+     * Restarts the enumeration at the first alias of the current list.
+     */
+    public void reset() {
+        context.listIdx = 0;
+    }
+
+    /**
+     * Class to store context for alias
+     */
+    public static class UAliasContext{
+        private long listOffset;
+        private long listIdx;
+
+        public UAliasContext(long listOffset, long listIdx){
+            this.listOffset = listOffset;
+            this.listIdx = listIdx;
+        }
+
+        public long getListOffset(){
+            return listOffset;
+        }
+
+        public long getListIdx(){
+            return listIdx;
+        }
+    }
+
+    /**
+     * Tells whether nextElement() can return another alias.
+     *
+     * @return true if the current list has an alias at the current index
+     */
+    public boolean hasMoreElements() {
+        // count() and nextElement() both treat a list offset of 0 as
+        // "no list". Apply the same guard here so this method never
+        // promises an element that nextElement() would refuse to return.
+        if (context.listOffset == 0) {
+            return false;
+        }
+        long listCount = UConverterAlias.gTaggedAliasListsArray[(int)context.listOffset];
+        return (context.getListIdx() < listCount);
+    }
+}
diff --git a/icu4j/src/com/ibm/icu/impl/UConverterConstants.java b/icu4j/src/com/ibm/icu/impl/UConverterConstants.java
new file mode 100644
index 00000000000..a4bcdff5857
--- /dev/null
+++ b/icu4j/src/com/ibm/icu/impl/UConverterConstants.java
@@ -0,0 +1,156 @@
+/**
+*******************************************************************************
+* Copyright (C) 2006, International Business Machines Corporation and    *
+* others. All Rights Reserved.                                           *
+*******************************************************************************
+*
+*******************************************************************************
+*/
+package com.ibm.icu.impl;
+
+public interface UConverterConstants {
+
+    public static final short UNSIGNED_BYTE_MASK = 0xff;
+    public static final int UNSIGNED_SHORT_MASK = 0xffff;
+    public static final long UNSIGNED_INT_MASK = 0xffffffffL;
+
+    public static final int U_IS_BIG_ENDIAN = 0;
+
+    /**
+     * Useful constant for the maximum size of the whole locale ID
+     * (including the terminating NULL).
+     * @draft ICU 3.6
+     */
+    public static final int ULOC_FULLNAME_CAPACITY = 56;
+
+    /**
+     * This value is intended for sentinel values for APIs that
+     * (take or) return single code points (UChar32).
+     * It is outside of the Unicode code point range 0..0x10ffff.
+ * + * For example, a "done" or "error" value in a new API + * could be indicated with U_SENTINEL. + * + * ICU APIs designed before ICU 2.4 usually define service-specific "done" + * values, mostly 0xffff. + * Those may need to be distinguished from + * actual U+ffff text contents by calling functions like + * CharacterIterator::hasNext() or UnicodeString::length(). + * @draft ICU 2.4 + */ + public static final int U_SENTINEL = -1; + + //end utf.h + + //begin ucnv.h + /** + * Character that separates converter names from options and options from each other. + * @see open + * @draft ICU 3.6 + */ + static final byte OPTION_SEP_CHAR = ','; + + /** Maximum length of a converter name including the terminating NULL @draft ICU 3.6 */ + public static final int MAX_CONVERTER_NAME_LENGTH = 60; + /** Maximum length of a converter name including path and terminating NULL @draft ICU 3.6 */ + public static final int MAX_FULL_FILE_NAME_LENGTH = (600+MAX_CONVERTER_NAME_LENGTH); + + /** Shift in for EBDCDIC_STATEFUL and iso2022 states @draft ICU 3.6 */ + public static final int SI = 0x0F; + /** Shift out for EBDCDIC_STATEFUL and iso2022 states @draft ICU 3.6 */ + public static final int SO = 0x0E; + + //end ucnv.h + + // begin bld.h + /* size of the overflow buffers in UConverter, enough for escaping callbacks */ + //#define ERROR_BUFFER_LENGTH 32 + public static final int ERROR_BUFFER_LENGTH = 32; + + /* at most 4 bytes per substitution character (part of .cnv file format! 
see UConverterStaticData) */ + public static final int MAX_SUBCHAR_LEN = 4; + + /* at most 8 bytes per character in toUBytes[] (UTF-8 uses up to 6) */ + public static final int MAX_CHAR_LEN = 8; + + /* converter options bits */ + public static final int OPTION_VERSION = 0xf; + public static final int OPTION_SWAP_LFNL = 0x10; + public static final int OPTION_MAC = 0x20; //agljport:comment added for Mac ISCII encodings + + /** values for the unicodeMask */ + public static final int HAS_SUPPLEMENTARY = 1; + public static final int HAS_SURROGATES = 2; + // end bld.h + + // begin cnv.h + /* this is used in fromUnicode DBCS tables as an "unassigned" marker */ + public static final int missingCharMarker = 0xFFFF; + + public final class UConverterResetChoice { + public static final int RESET_BOTH = 0; + public static final int RESET_TO_UNICODE = RESET_BOTH + 1; + public static final int RESET_FROM_UNICODE = RESET_TO_UNICODE + 1; + } + + // begin utf16.h + /** + * The maximum number of 16-bit code units per Unicode code point (U+0000..U+10ffff). + * @return 2 + * @draft ICU 2.4 + */ + public static final int U16_MAX_LENGTH = 2; + // end utf16.h + + // begin err.h + /** + * FROM_U, TO_U context options for sub callback + * @draft ICU 3.6 + */ + public static byte[] SUB_STOP_ON_ILLEGAL = {'i'}; + + /** + * FROM_U, TO_U context options for skip callback + * @draft ICU 3.6 + */ + public static byte[] SKIP_STOP_ON_ILLEGAL = {'i'}; + + /** + * The process condition code to be used with the callbacks. + * Codes which are greater than IRREGULAR should be + * passed on to any chained callbacks. + * @draft ICU 3.6 + */ + public static final class UConverterCallbackReason { + public static final int UNASSIGNED = 0; /**< The code point is unassigned. + The error code U_INVALID_CHAR_FOUND will be set. */ + public static final int ILLEGAL = 1; /**< The code point is illegal. For example, + \\x81\\x2E is illegal in SJIS because \\x2E + is not a valid trail byte for the \\x81 + lead byte. 
+ Also, starting with Unicode 3.0.1, non-shortest byte sequences + in UTF-8 (like \\xC1\\xA1 instead of \\x61 for U+0061) + are also illegal, not just irregular. + The error code U_ILLEGAL_CHAR_FOUND will be set. */ + public static final int IRREGULAR = 2; /**< The codepoint is not a regular sequence in + the encoding. For example, \\xED\\xA0\\x80..\\xED\\xBF\\xBF + are irregular UTF-8 byte sequences for single surrogate + code points. + The error code U_INVALID_CHAR_FOUND will be set. */ + public static final int RESET = 3; /**< The callback is called with this reason when a + 'reset' has occured. Callback should reset all + state. */ + public static final int CLOSE = 4; /**< Called when the converter is closed. The + callback should release any allocated memory.*/ + public static final int CLONE = 5; /**< Called when safeClone() is called on the + converter. the pointer available as the + 'context' is an alias to the original converters' + context pointer. If the context must be owned + by the new converter, the callback must clone + the data and call setFromUCallback + (or setToUCallback) with the correct pointer. + @draft ICU 2.2 + */ + } + //end err.h +} diff --git a/icu4j/src/com/ibm/icu/impl/UConverterDataReader.java b/icu4j/src/com/ibm/icu/impl/UConverterDataReader.java new file mode 100644 index 00000000000..48ab399cf6e --- /dev/null +++ b/icu4j/src/com/ibm/icu/impl/UConverterDataReader.java @@ -0,0 +1,552 @@ +/** +******************************************************************************* +* Copyright (C) 2006, International Business Machines Corporation and * +* others. All Rights Reserved. 
* +******************************************************************************* +* +******************************************************************************* +*/ + +package com.ibm.icu.impl; + +import com.ibm.icu.impl.ICUDebug; + +import java.io.IOException; +import java.io.InputStream; +import java.io.DataInputStream; +import java.nio.ByteBuffer; + +/** + * ucnvmbcs.h + * + * ICU conversion (.cnv) data file structure, following the usual UDataInfo + * header. + * + * Format version: 6.2 + * + * struct UConverterStaticData -- struct containing the converter name, IBM CCSID, + * min/max bytes per character, etc. + * see ucnv_bld.h + * + * -------------------- + * + * The static data is followed by conversionType-specific data structures. + * At the moment, there are only variations of MBCS converters. They all have + * the same toUnicode structures, while the fromUnicode structures for SBCS + * differ from those for other MBCS-style converters. + * + * _MBCSHeader.version 4.2 adds an optional conversion extension data structure. + * If it is present, then an ICU version reading header versions 4.0 or 4.1 + * will be able to use the base table and ignore the extension. + * + * The unicodeMask in the static data is part of the base table data structure. + * Especially, the UCNV_HAS_SUPPLEMENTARY flag determines the length of the + * fromUnicode stage 1 array. + * The static data unicodeMask refers only to the base table's properties if + * a base table is included. + * In an extension-only file, the static data unicodeMask is 0. + * The extension data indexes have a separate field with the unicodeMask flags. + * + * MBCS-style data structure following the static data. + * Offsets are counted in bytes from the beginning of the MBCS header structure. + * Details about usage in comments in ucnvmbcs.c. 
+ * + * struct _MBCSHeader (see the definition in this header file below) + * contains 32-bit fields as follows: + * 8 values: + * 0 uint8_t[4] MBCS version in UVersionInfo format (currently 4.2.0.0) + * 1 uint32_t countStates + * 2 uint32_t countToUFallbacks + * 3 uint32_t offsetToUCodeUnits + * 4 uint32_t offsetFromUTable + * 5 uint32_t offsetFromUBytes + * 6 uint32_t flags, bits: + * 31.. 8 offsetExtension -- _MBCSHeader.version 4.2 (ICU 2.8) and higher + * 0 for older versions and if + * there is not extension structure + * 7.. 0 outputType + * 7 uint32_t fromUBytesLength -- _MBCSHeader.version 4.1 (ICU 2.4) and higher + * counts bytes in fromUBytes[] + * + * if(outputType==MBCS_OUTPUT_EXT_ONLY) { + * -- base table name for extension-only table + * char baseTableName[variable]; -- with NUL plus padding for 4-alignment + * + * -- all _MBCSHeader fields except for version and flags are 0 + * } else { + * -- normal base table with optional extension + * + * int32_t stateTable[countStates][256]; + * + * struct _MBCSToUFallback { (fallbacks are sorted by offset) + * uint32_t offset; + * UChar32 codePoint; + * } toUFallbacks[countToUFallbacks]; + * + * uint16_t unicodeCodeUnits[(offsetFromUTable-offsetToUCodeUnits)/2]; + * (padded to an even number of units) + * + * -- stage 1 tables + * if(staticData.unicodeMask&UCNV_HAS_SUPPLEMENTARY) { + * -- stage 1 table for all of Unicode + * uint16_t fromUTable[0x440]; (32-bit-aligned) + * } else { + * -- BMP-only tables have a smaller stage 1 table + * uint16_t fromUTable[0x40]; (32-bit-aligned) + * } + * + * -- stage 2 tables + * length determined by top of stage 1 and bottom of stage 3 tables + * if(outputType==MBCS_OUTPUT_1) { + * -- SBCS: pure indexes + * uint16_t stage 2 indexes[?]; + * } else { + * -- DBCS, MBCS, EBCDIC_STATEFUL, ...: roundtrip flags and indexes + * uint32_t stage 2 flags and indexes[?]; + * } + * + * -- stage 3 tables with byte results + * if(outputType==MBCS_OUTPUT_1) { + * -- SBCS: each 16-bit result 
contains flags and the result byte, see ucnvmbcs.c + * uint16_t fromUBytes[fromUBytesLength/2]; + * } else { + * -- DBCS, MBCS, EBCDIC_STATEFUL, ... 2/3/4 bytes result, see ucnvmbcs.c + * uint8_t fromUBytes[fromUBytesLength]; or + * uint16_t fromUBytes[fromUBytesLength/2]; or + * uint32_t fromUBytes[fromUBytesLength/4]; + * } + * } + * + * -- extension table, details see ucnv_ext.h + * int32_t indexes[>=32]; ... + */ +/* + * ucnv_ext.h + * + * See icuhtml/design/conversion/conversion_extensions.html + * + * Conversion extensions serve two purposes: + * 1. They support m:n mappings. + * 2. They support extension-only conversion files that are used together + * with the regular conversion data in base files. + * + * A base file may contain an extension table (explicitly requested or + * implicitly generated for m:n mappings), but its extension table is not + * used when an extension-only file is used. + * + * It is an error if a base file contains any regular (not extension) mapping + * from the same sequence as a mapping in the extension file + * because the base mapping would hide the extension mapping. + * + * + * Data for conversion extensions: + * + * One set of data structures per conversion direction (to/from Unicode). + * The data structures are sorted by input units to allow for binary search. + * Input sequences of more than one unit are handled like contraction tables + * in collation: + * The lookup value of a unit points to another table that is to be searched + * for the next unit, recursively. + * + * For conversion from Unicode, the initial code point is looked up in + * a 3-stage trie for speed, + * with an additional table of unique results to save space. + * + * Long output strings are stored in separate arrays, with length and index + * in the lookup tables. + * Output results also include a flag distinguishing roundtrip from + * (reverse) fallback mappings. 
+ * + * Input Unicode strings must not begin or end with unpaired surrogates + * to avoid problems with matches on parts of surrogate pairs. + * + * Mappings from multiple characters (code points or codepage state + * table sequences) must be searched preferring the longest match. + * For this to work and be efficient, the variable-width table must contain + * all mappings that contain prefixes of the multiple characters. + * If an extension table is built on top of a base table in another file + * and a base table entry is a prefix of a multi-character mapping, then + * this is an error. + * + * + * Implementation note: + * + * Currently, the parser and several checks in the code limit the number + * of UChars or bytes in a mapping to + * UCNV_EXT_MAX_UCHARS and UCNV_EXT_MAX_BYTES, respectively, + * which are output value limits in the data structure. + * + * For input, this is not strictly necessary - it is a hard limit only for the + * buffers in UConverter that are used to store partial matches. + * + * Input sequences could otherwise be arbitrarily long if partial matches + * need not be stored (i.e., if a sequence does not span several buffers with too + * many units before the last buffer), although then results would differ + * depending on whether partial matches exceed the limits or not, + * which depends on the pattern of buffer sizes. + * + * + * Data structure: + * + * int32_t indexes[>=32]; + * + * Array of indexes and lengths etc. The length of the array is at least 32. + * The actual length is stored in indexes[0] to be forward compatible. + * + * Each index to another array is the number of bytes from indexes[]. + * Each length of an array is the number of array base units in that array. + * + * Some of the structures may not be present, in which case their indexes + * and lengths are 0. 
+ * + * Usage of indexes[i]: + * [0] length of indexes[] + * + * // to Unicode table + * [1] index of toUTable[] (array of uint32_t) + * [2] length of toUTable[] + * [3] index of toUUChars[] (array of UChar) + * [4] length of toUUChars[] + * + * // from Unicode table, not for the initial code point + * [5] index of fromUTableUChars[] (array of UChar) + * [6] index of fromUTableValues[] (array of uint32_t) + * [7] length of fromUTableUChars[] and fromUTableValues[] + * [8] index of fromUBytes[] (array of char) + * [9] length of fromUBytes[] + * + * // from Unicode trie for initial-code point lookup + * [10] index of fromUStage12[] (combined array of uint16_t for stages 1 & 2) + * [11] length of stage 1 portion of fromUStage12[] + * [12] length of fromUStage12[] + * [13] index of fromUStage3[] (array of uint16_t indexes into fromUStage3b[]) + * [14] length of fromUStage3[] + * [15] index of fromUStage3b[] (array of uint32_t like fromUTableValues[]) + * [16] length of fromUStage3b[] + * + * [17] Bit field containing numbers of bytes: + * 31..24 reserved, 0 + * 23..16 maximum input bytes + * 15.. 8 maximum output bytes + * 7.. 0 maximum bytes per UChar + * + * [18] Bit field containing numbers of UChars: + * 31..24 reserved, 0 + * 23..16 maximum input UChars + * 15.. 8 maximum output UChars + * 7.. 0 maximum UChars per byte + * + * [19] Bit field containing flags: + * (extension table unicodeMask) + * 1 UCNV_HAS_SURROGATES flag for the extension table + * 0 UCNV_HAS_SUPPLEMENTARY flag for the extension table + * + * [20]..[30] reserved, 0 + * [31] number of bytes for the entire extension structure + * [>31] reserved; there are indexes[0] indexes + * + * + * uint32_t toUTable[]; + * + * Array of byte/value pairs for lookups for toUnicode conversion. + * The array is partitioned into sections like collation contraction tables. + * Each section contains one word with the number of following words and + * a default value for when the lookup in this section yields no match. 
+ * + * A section is sorted in ascending order of input bytes, + * allowing for fast linear or binary searches. + * The builder may store entries for a contiguous range of byte values + * (compare difference between the first and last one with count), + * which then allows for direct array access. + * The builder should always do this for the initial table section. + * + * Entries may have 0 values, see below. + * No two entries in a section have the same byte values. + * + * Each uint32_t contains an input byte value in bits 31..24 and the + * corresponding lookup value in bits 23..0. + * Interpret the value as follows: + * if(value==0) { + * no match, see below + * } else if(value<0x1f0000) { + * partial match - use value as index to the next toUTable section + * and match the next unit; (value indexes toUTable[value]) + * } else { + * if(bit 23 set) { + * roundtrip; + * } else { + * fallback; + * } + * unset value bit 23; + * if(value<=0x2fffff) { + * (value-0x1f0000) is a code point; (BMP: value<=0x1fffff) + * } else { + * bits 17..0 (value&0x3ffff) is an index to + * the result UChars in toUUChars[]; (0 indexes toUUChars[0]) + * length of the result=((value>>18)-12); (length=0..19) + * } + * } + * + * The first word in a section contains the number of following words in the + * input byte position (bits 31..24, number=1..0xff). + * The value of the initial word is used when the current byte is not found + * in this section. + * If the value is not 0, then it represents a result as above. + * If the value is 0, then the search has to return a shorter match with an + * earlier default value as the result, or result in "unmappable" even for the + * initial bytes. + * If the value is 0 for the initial toUTable entry, then the initial byte + * does not start any mapping input. + * + * + * UChar toUUChars[]; + * + * Contains toUnicode mapping results, stored as sequences of UChars. + * Indexes and lengths stored in the toUTable[]. 
+ * + * + * UChar fromUTableUChars[]; + * uint32_t fromUTableValues[]; + * + * The fromUTable is split into two arrays, but works otherwise much like + * the toUTable. The array is partitioned into sections like collation + * contraction tables and toUTable. + * A row in the table consists of same-index entries in fromUTableUChars[] + * and fromUTableValues[]. + * + * Interpret a value as follows: + * if(value==0) { + * no match, see below + * } else if(value<=0xffffff) { (bits 31..24 are 0) + * partial match - use value as index to the next fromUTable section + * and match the next unit; (value indexes fromUTable[value]) + * } else { + * if(value==0x80000001) { + * return no mapping, but request for <subchar1>; + * } + * if(bit 31 set) { + * roundtrip; + * } else { + * fallback; + * } + * // bits 30..29 reserved, 0 + * length=(value>>24)&0x1f; (bits 28..24) + * if(length==1..3) { + * bits 23..0 contain 1..3 bytes, padded with 00s on the left; + * } else { + * bits 23..0 (value&0xffffff) is an index to + * the result bytes in fromUBytes[]; (0 indexes fromUBytes[0]) + * } + * } + * + * The first pair in a section contains the number of following pairs in the + * UChar position (16 bits, number=1..0xffff). + * The value of the initial pair is used when the current UChar is not found + * in this section. + * If the value is not 0, then it represents a result as above. + * If the value is 0, then the search has to return a shorter match with an + * earlier default value as the result, or result in "unmappable" even for the + * initial UChars. + * + * If the from Unicode trie is present, then the from Unicode search tables + * are not used for initial code points. + * In this case, the first entries (index 0) in the tables are not used + * (reserved, set to 0) because a value of 0 is used in trie results + * to indicate no mapping. + * + * + * uint16_t fromUStage12[]; + * + * Stages 1 & 2 of a trie that maps an initial code point.
+ * Indexes in stage 1 are all offset by the length of stage 1 so that the + * same array pointer can be used for both stages. + * If (c>>10)>=(length of stage 1) then c does not start any mapping. + * Same bit distribution as for regular conversion tries. + * + * + * uint16_t fromUStage3[]; + * uint32_t fromUStage3b[]; + * + * Stage 3 of the trie. The first array simply contains indexes to the second, + * which contains words in the same format as fromUTableValues[]. + * Use a stage 3 granularity of 4, which allows for 256k stage 3 entries, + * and 16-bit entries in stage 3 allow for 64k stage 3b entries. + * The stage 3 granularity means that the stage 2 entry needs to be left-shifted. + * + * Two arrays are used because it is expected that more than half of the stage 3 + * entries will be zero. The 16-bit index stage 3 array saves space even + * considering storing a total of 6 bytes per non-zero entry in both arrays + * together. + * Using a stage 3 granularity of >1 diminishes the compactability in that stage + * but provides a larger effective addressing space in stage 2. + * All but the final result stage use 16-bit entries to save space. + * + * fromUStage3b[] contains a zero for "no mapping" at its index 0, + * and may contain UCNV_EXT_FROM_U_SUBCHAR1 at index 1 for "<subchar1> SUB mapping" + * (i.e., "no mapping" with preference for <subchar1> rather than <subchar>), + * and all other items are unique non-zero results. + * + * The default value of a fromUTableValues[] section that is referenced + * _directly_ from a fromUStage3b[] item may also be UCNV_EXT_FROM_U_SUBCHAR1, + * but this value must not occur anywhere else in fromUTableValues[] + * because "no mapping" is always a property of a single code point, + * never of multiple. + * + * + * char fromUBytes[]; + * + * Contains fromUnicode mapping results, stored as sequences of chars. + * Indexes and lengths stored in the fromUTableValues[].
+ */ + +public final class UConverterDataReader implements ICUBinary.Authenticate { + private final static boolean debug = ICUDebug.enabled("UConverterDataReader"); + + /* + * public UConverterDataReader(UConverterDataReader r) + { + dataInputStream = new DataInputStream(r.dataInputStream); + unicodeVersion = r.unicodeVersion; + } + */ + + /** + *

Protected constructor.

+ * @param inputStream ICU uprop.dat file input stream + * @exception IOException throw if data file fails authentication + * @draft 2.1 + */ + protected UConverterDataReader(InputStream inputStream) + throws IOException{ + if(debug) System.out.println("Bytes in inputStream " + inputStream.available()); + + unicodeVersion = ICUBinary.readHeader(inputStream, DATA_FORMAT_ID, this); + + if(debug) System.out.println("Bytes left in inputStream " +inputStream.available()); + + dataInputStream = new DataInputStream(inputStream); + + if(debug) System.out.println("Bytes left in dataInputStream " +dataInputStream.available()); + } + + // protected methods ------------------------------------------------- + + protected void readStaticData(UConverterStaticData sd) throws IOException + { + sd.structSize = dataInputStream.readInt(); + byte[] name = new byte[UConverterConstants.MAX_CONVERTER_NAME_LENGTH]; + int length = dataInputStream.read(name); + sd.name = new String(name, 0, length); + sd.codepage = dataInputStream.readInt(); + sd.platform = dataInputStream.readByte(); + sd.conversionType = dataInputStream.readByte(); + sd.minBytesPerChar = dataInputStream.readByte(); + sd.maxBytesPerChar = dataInputStream.readByte(); + dataInputStream.read(sd.subChar); + sd.subCharLen = dataInputStream.readByte(); + sd.hasToUnicodeFallback = dataInputStream.readByte(); + sd.hasFromUnicodeFallback = dataInputStream.readByte(); + sd.unicodeMask = (short)dataInputStream.readUnsignedByte(); + sd.subChar1 = dataInputStream.readByte(); + dataInputStream.read(sd.reserved); + } + + protected void readMBCSHeader(UConverterSharedData.MBCSHeader h) throws IOException + { + dataInputStream.read(h.version); + h.countStates = dataInputStream.readInt(); + h.countToUFallbacks = dataInputStream.readInt(); + h.offsetToUCodeUnits = dataInputStream.readInt(); + h.offsetFromUTable = dataInputStream.readInt(); + h.offsetFromUBytes = dataInputStream.readInt(); + h.flags = dataInputStream.readInt(); + 
h.fromUBytesLength = dataInputStream.readInt(); + } + + protected void readMBCSTable(int[][] stateTableArray, UConverterSharedData.MBCSToUFallback[] toUFallbacksArray, char[] unicodeCodeUnitsArray, char[] fromUnicodeTableArray, byte[] fromUnicodeBytesArray) throws IOException + { + int i, j; + for(i = 0; i < stateTableArray.length; ++i) + for(j = 0; j < stateTableArray[i].length; ++j) + stateTableArray[i][j] = dataInputStream.readInt(); + for(i = 0; i < toUFallbacksArray.length; ++i) { + toUFallbacksArray[i].offset = dataInputStream.readInt(); + toUFallbacksArray[i].codePoint = dataInputStream.readInt(); + } + for(i = 0; i < unicodeCodeUnitsArray.length; ++i) + unicodeCodeUnitsArray[i] = dataInputStream.readChar(); + for(i = 0; i < fromUnicodeTableArray.length; ++i) + fromUnicodeTableArray[i] = dataInputStream.readChar(); + for(i = 0; i < fromUnicodeBytesArray.length; ++i) + fromUnicodeBytesArray[i] = dataInputStream.readByte(); + } + + protected String readBaseTableName() throws IOException + { + char c; + StringBuffer name = new StringBuffer(); + while((c = (char)dataInputStream.readByte()) != 0) + name.append(c); + return name.toString(); + } + + //protected int[] readExtIndexes(int skip) throws IOException + protected ByteBuffer readExtIndexes(int skip) throws IOException + { + dataInputStream.skipBytes(skip); + + int n = dataInputStream.readInt(); + int[] indexes = new int[n]; + indexes[0] = n; + for(int i = 1; i < n; ++i) { + indexes[i] = dataInputStream.readInt(); + } + //return indexes; + + ByteBuffer b = ByteBuffer.allocate(indexes[31]); + for(int i = 0; i < n; ++i) { + b.putInt(indexes[i]); + } + dataInputStream.read(b.array(), b.position(), b.remaining()); + return b; + } + + protected byte[] readExtTables(int n) throws IOException + { + byte[] tables = new byte[n]; + dataInputStream.read(tables); + return tables; + } + + public byte[] getDataFormatVersion(){ + return DATA_FORMAT_VERSION; + } + + public boolean isDataVersionAcceptable(byte version[]) + { 
+ return version[0] == DATA_FORMAT_VERSION[0]; + } + + public byte[] getUnicodeVersion(){ + return unicodeVersion; + } + // private data members ------------------------------------------------- + + /** + * ICU data file input stream + */ + private DataInputStream dataInputStream; + + private byte[] unicodeVersion; + + /** + * File format version that this class understands. + * No guarantees are made if a older version is used + * see store.c of gennorm for more information and values + */ + // DATA_FORMAT_ID_ values taken from icu4c isCnvAcceptable (ucnv_bld.c) + private static final byte DATA_FORMAT_ID[] = {(byte)0x63, (byte)0x6e, (byte)0x76, (byte)0x74}; // dataFormat="cnvt" + private static final byte DATA_FORMAT_VERSION[] = {(byte)0x6}; + +} + diff --git a/icu4j/src/com/ibm/icu/impl/UConverterSharedData.java b/icu4j/src/com/ibm/icu/impl/UConverterSharedData.java new file mode 100644 index 00000000000..f1cb9f76874 --- /dev/null +++ b/icu4j/src/com/ibm/icu/impl/UConverterSharedData.java @@ -0,0 +1,545 @@ +/** +******************************************************************************* +* Copyright (C) 2006, International Business Machines Corporation and * +* others. All Rights Reserved. * +******************************************************************************* +* +******************************************************************************* +*/ +package com.ibm.icu.impl; + +import java.nio.ByteBuffer; + + +/* + * Defines the UConverterSharedData struct, + * the immutable, shared part of UConverter. 
+ */
+public class UConverterSharedData {
+    //uint32_t structSize; /* Size of this structure */
+    public int structSize; /* Size of this structure */
+    //uint32_t referenceCounter; /* used to count number of clients, 0xffffffff for static SharedData */
+    public int referenceCounter; /* used to count number of clients, 0xffffffff for static SharedData */
+    public static final int MAX_VERSION_LENGTH=4;
+    //agljport:todo const void *dataMemory; /* from udata_openChoice() - for cleanup */
+    //agljport:todo void *table; /* Unused. This used to be a UConverterTable - Pointer to conversion data - see mbcs below */
+
+    //const UConverterStaticData *staticData; /* pointer to the static (non changing) data. */
+    public UConverterStaticData staticData; /* pointer to the static (non changing) data. */
+
+    //UBool sharedDataCached; /* TRUE: shared data is in cache, don't destroy on close() if 0 ref. FALSE: shared data isn't in the cache, do attempt to clean it up if the ref is 0 */
+    public boolean sharedDataCached; /* TRUE: shared data is in cache, don't destroy on close() if 0 ref. FALSE: shared data isn't in the cache, do attempt to clean it up if the ref is 0 */
+    /*UBool staticDataOwned; TRUE if static data owned by shared data & should be freed with it, NEVER true for udata() loaded statics. This ignored variable was removed to make space for sharedDataCached.
+     */
+
+    //const UConverterImpl *impl; /* vtable-style struct of mostly function pointers */
+    //public UConverterImpl impl; /* vtable-style struct of mostly function pointers */
+
+    /*initial values of some members of the mutable part of object */
+    //uint32_t toUnicodeStatus;
+    public long toUnicodeStatus;
+
+    /*
+     * Shared data structures currently come in two flavors:
+     * - readonly for built-in algorithmic converters
+     * - allocated for MBCS, with a pointer to an allocated UConverterTable
+     * which always has a UConverterMBCSTable
+     *
+     * To eliminate one allocation, I am making the UConverterMBCSTable
+     * a member of the shared data. It is the last member so that static
+     * definitions of UConverterSharedData work as before.
+     * The table field above also remains to avoid updating all static
+     * definitions, but is now unused.
+     *
+     * markus 2003-nov-07
+     */
+    public UConverterMBCSTable mbcs;
+
+    /* Creates shared data with a fresh, empty MBCS table; all other fields keep their Java default values. */
+    public UConverterSharedData()
+    {
+        mbcs = new UConverterMBCSTable();
+    }
+
+    /* Creates shared data with a fresh, empty MBCS table and the given field values. */
+    public UConverterSharedData(int structSize_, int referenceCounter_, UConverterStaticData staticData_, boolean sharedDataCached_,/* UConverterImpl impl_,*/ long toUnicodeStatus_)
+    {
+        this();
+        structSize = structSize_;
+        referenceCounter = referenceCounter_;
+        staticData = staticData_;
+        sharedDataCached = sharedDataCached_;
+        //impl = impl_;
+        toUnicodeStatus = toUnicodeStatus_;
+    }
+
+    /**
+     * UConverterImpl contains all the data and functions for a converter type.
+     * Its function pointers work much like a C++ vtable.
+     * Many converter types need to define only a subset of the functions;
+     * when a function pointer is NULL, then a default action will be performed.
+     *
+     * Every converter type must implement toUnicode, fromUnicode, and getNextUChar,
+     * otherwise the converter may crash.
+     * Every converter type that has variable-length codepage sequences should
+     * also implement toUnicodeWithOffsets and fromUnicodeWithOffsets for
+     * correct offset handling.
+     * All other functions may or may not be implemented - it depends only on
+     * whether the converter type needs them.
+     *
+     * When open() fails, then close() will be called, if present.
+     */
+    //public class UConverterImpl {
+        //UConverterType type;
+        //UConverterToUnicode toUnicode;
+/*  protected void doToUnicode(UConverterToUnicodeArgs args, int[] pErrorCode)
+    {
+    }
+
+    public final void toUnicode(UConverterToUnicodeArgs args, int[] pErrorCode)
+    {
+        doToUnicode(args, pErrorCode);
+    }
+
+    //UConverterFromUnicode fromUnicode;
+    protected void doFromUnicode(UConverterFromUnicodeArgs args, int[] pErrorCode)
+    {
+    }
+
+    public final void fromUnicode(UConverterFromUnicodeArgs args, int[] pErrorCode)
+    {
+        doFromUnicode(args, pErrorCode);
+    }
+
+    protected int doGetNextUChar(UConverterToUnicodeArgs args, int[] pErrorCode)
+    {
+        return 0;
+    }
+
+    //UConverterGetNextUChar getNextUChar;
+    public final int getNextUChar(UConverterToUnicodeArgs args, int[] pErrorCode)
+    {
+        return doGetNextUChar(args, pErrorCode);
+    }
+
+    //public interface UConverterImplLoadable extends UConverterImpl
+    protected void doLoad(UConverterLoadArgs pArgs, short[] raw, int[] pErrorCode)
+    {
+    }
+
+*/
+    /* Empty hook. It is the only member of the UConverterImpl port around it that is not commented out. */
+    protected void doUnload()
+    {
+    }
+
+    /*
+    //public interface UConverterImplOpenable extends UConverterImpl
+    protected void doOpen(UConverter cnv, String name, String locale, long options, int[] pErrorCode)
+    {
+    }
+
+    //UConverterOpen open;
+    public final void open(UConverter cnv, String name, String locale, long options, int[] pErrorCode)
+    {
+        doOpen(cnv, name, locale, options, pErrorCode);
+    }
+
+    protected void doClose(UConverter cnv)
+    {
+    }
+
+    //UConverterClose close;
+    public final void close(UConverter cnv)
+    {
+        doClose(cnv);
+    }
+
+    protected void doReset(UConverter cnv, int choice)
+    {
+    }
+
+    //typedef void (*UConverterReset) (UConverter *cnv, UConverterResetChoice choice);
+    //UConverterReset reset;
+    public final void reset(UConverter cnv, int choice)
+    {
+        doReset(cnv, choice);
+    }
+
+    //public interface UConverterImplVariableLength extends UConverterImpl
+    protected void doToUnicodeWithOffsets(UConverterToUnicodeArgs args, int[] pErrorCode)
+    {
+    }
+
+    //UConverterToUnicode toUnicodeWithOffsets;
+    public final void toUnicodeWithOffsets(UConverterToUnicodeArgs args, int[] pErrorCode)
+    {
+        doToUnicodeWithOffsets(args, pErrorCode);
+    }
+
+    protected void doFromUnicodeWithOffsets(UConverterFromUnicodeArgs args, int[] pErrorCode)
+    {
+    }
+
+    //UConverterFromUnicode fromUnicodeWithOffsets;
+    public final void fromUnicodeWithOffsets(UConverterFromUnicodeArgs args, int[] pErrorCode)
+    {
+        doFromUnicodeWithOffsets(args, pErrorCode);
+    }
+
+    //public interface UConverterImplMisc extends UConverterImpl
+    protected void doGetStarters(UConverter converter, boolean starters[], int[] pErrorCode)
+    {
+    }
+
+    //UConverterGetStarters getStarters;
+    public final void getStarters(UConverter converter, boolean starters[], int[] pErrorCode)
+    {
+        doGetStarters(converter, starters, pErrorCode);
+    }
+
+    protected String doGetName(UConverter cnv)
+    {
+        return "";
+    }
+
+    //UConverterGetName getName;
+    public final String getName(UConverter cnv)
+    {
+        return doGetName(cnv);
+    }
+
+    protected void doWriteSub(UConverterFromUnicodeArgs pArgs, long offsetIndex, int[] pErrorCode)
+    {
+    }
+
+    //UConverterWriteSub writeSub;
+    public final void writeSub(UConverterFromUnicodeArgs pArgs, long offsetIndex, int[] pErrorCode)
+    {
+        doWriteSub(pArgs, offsetIndex, pErrorCode);
+    }
+
+    protected UConverter doSafeClone(UConverter cnv, byte[] stackBuffer, int[] pBufferSize, int[] status)
+    {
+        return new UConverter();
+    }
+
+    //UConverterSafeClone safeClone;
+    public final UConverter safeClone(UConverter cnv, byte[] stackBuffer, int[] pBufferSize, int[] status)
+    {
+        return doSafeClone(cnv, stackBuffer, pBufferSize, status);
+    }
+
+    protected void doGetUnicodeSet(UConverter cnv, UnicodeSet /*USetAdder* / sa, int /*UConverterUnicodeSet* / which, int[] pErrorCode)
+    {
+    }
+
+    //UConverterGetUnicodeSet getUnicodeSet;
+    //public final void getUnicodeSet(UConverter cnv, UnicodeSet /*USetAdder* / sa, int /*UConverterUnicodeSet* / which, int[] pErrorCode)
+    //{
+    // doGetUnicodeSet(cnv, sa, which, pErrorCode);
+    //}
+
+    //}
+
+    static final String DATA_TYPE = "cnv";
+    private static final int CNV_DATA_BUFFER_SIZE = 25000;
+    public static final int sizeofUConverterSharedData = 100;
+
+    //static UDataMemoryIsAcceptable isCnvAcceptable;
+
+    /**
+     * Load a non-algorithmic converter.
+     * If pkg==NULL, then this function must be called inside umtx_lock(&cnvCacheMutex).
+
+    // UConverterSharedData * load(UConverterLoadArgs *pArgs, UErrorCode *err)
+    public static final UConverterSharedData load(UConverterLoadArgs pArgs, int[] err)
+    {
+        UConverterSharedData mySharedConverterData = null;
+
+        if(err == null || ErrorCode.isFailure(err[0])) {
+            return null;
+        }
+
+        if(pArgs.pkg != null && pArgs.pkg.length() != 0) {
+             application-provided converters are not currently cached
+            return UConverterSharedData.createConverterFromFile(pArgs, err);
+        }
+
+        //agljport:fix mySharedConverterData = getSharedConverterData(pArgs.name);
+        if (mySharedConverterData == null)
+        {
+             Not cached, we need to stream it in from file
+            mySharedConverterData = UConverterSharedData.createConverterFromFile(pArgs, err);
+            if (ErrorCode.isFailure(err[0]) || (mySharedConverterData == null))
+            {
+                return null;
+            }
+            else
+            {
+                 share it with other library clients
+                //agljport:fix shareConverterData(mySharedConverterData);
+            }
+        }
+        else
+        {
+             The data for this converter was already in the cache.
+             Update the reference counter on the shared data: one more client
+            mySharedConverterData.referenceCounter++;
+        }
+
+        return mySharedConverterData;
+    }
+
+     Takes an alias name gets an actual converter file name
+     *goes to disk and opens it.
+     *allocates the memory and returns a new UConverter object
+
+    //static UConverterSharedData *createConverterFromFile(UConverterLoadArgs *pArgs, UErrorCode * err)
+    public static final UConverterSharedData createConverterFromFile(UConverterLoadArgs pArgs, int[] err)
+    {
+        UDataMemory data = null;
+        UConverterSharedData sharedData = null;
+
+        //agljport:todo UTRACE_ENTRY_OC(UTRACE_LOAD);
+
+        if (err == null || ErrorCode.isFailure(err[0])) {
+            //agljport:todo UTRACE_EXIT_STATUS(*err);
+            return null;
+        }
+
+        //agljport:todo UTRACE_DATA2(UTRACE_OPEN_CLOSE, "load converter %s from package %s", pArgs->name, pArgs->pkg);
+
+        //agljport:fix data = udata_openChoice(pArgs.pkgArray, DATA_TYPE.getBytes(), pArgs.name, isCnvAcceptable, null, err);
+        if(ErrorCode.isFailure(err[0]))
+        {
+            //agljport:todo UTRACE_EXIT_STATUS(*err);
+            return null;
+        }
+
+        sharedData = data_unFlattenClone(pArgs, data, err);
+        if(ErrorCode.isFailure(err[0]))
+        {
+            //agljport:fix udata_close(data);
+            //agljport:todo UTRACE_EXIT_STATUS(*err);
+            return null;
+        }
+
+
+         * TODO Store pkg in a field in the shared data so that delta-only converters
+         * can load base converters from the same package.
+         * If the pkg name is longer than the field, then either do not load the converter
+         * in the first place, or just set the pkg field to "".
+
+
+        return sharedData;
+    }
+*/
+    /* Reader for converter data; stays null unless assigned from outside (no assignment is visible in this class). */
+    UConverterDataReader dataReader = null;
+
+
+
+    /*returns a converter type from a string
+     */
+    /*
+     * Strips realName with UConverterAlias.io_stripForCompare(), then binary-searches
+     * cnvNameType by name using String.compareTo() ordering, so the table must be
+     * sorted in that order for the search to find entries. On a match it returns
+     * converterData[type] of the matching entry; otherwise it returns null.
+     * The loop ends without a match once the midpoint stops changing.
+     *
+     * NOTE(review): cnvNameType and converterData are only declared, never assigned,
+     * in the code visible here. Unless they are populated elsewhere before this call,
+     * this method throws NullPointerException - confirm against the callers.
+     */
+    // static const UConverterSharedData * getAlgorithmicTypeFromName(const char *realName)
+    public static final UConverterSharedData getAlgorithmicTypeFromName(String realName)
+    {
+        long mid, start, limit;
+        long lastMid;
+        int result;
+        StringBuffer strippedName = new StringBuffer(UConverterConstants.MAX_CONVERTER_NAME_LENGTH);
+
+        /* Lower case and remove ignoreable characters. */
+        UConverterAlias.io_stripForCompare(strippedName, realName);
+
+        /* do a binary search for the alias */
+        start = 0;
+        limit = cnvNameType.length;
+        mid = limit;
+        lastMid = UConverterAlias.UINT32_MAX; /* presumably a sentinel that no real index equals - confirm */
+
+        for (;;) {
+            mid = (long)((start + limit) / 2);
+            if (lastMid == mid) { /* Have we moved? */
+                break; /* We haven't moved, and it wasn't found. */
+            }
+            lastMid = mid;
+            result = strippedName.substring(0).compareTo(cnvNameType[(int)mid].name);
+
+            if (result < 0) {
+                limit = mid;
+            } else if (result > 0) {
+                start = mid;
+            } else {
+                return converterData[cnvNameType[(int)mid].type];
+            }
+        }
+
+        return null;
+    }
+
+    /**
+     * Fallbacks to Unicode are stored outside the normal state table and code point structures
+     * in a vector of items of this type. They are sorted by offset.
+     */
+    /* Non-static inner class: an instance can only be created relative to an enclosing UConverterSharedData. */
+    public final class MBCSToUFallback {
+        int offset;
+        int codePoint;
+    }
+
+    /**
+     * This is the MBCS part of the UConverterTable union (a runtime data structure).
+     * It keeps all the per-converter data and points into the loaded mapping tables.
+     */
+    public final class UConverterMBCSTable {
+        /* toUnicode */
+        short countStates;
+        byte dbcsOnlyState;
+        boolean stateTableOwned;
+        int countToUFallbacks;
+
+        int stateTable[/*countStates*/][/*256*/];
+        int swapLFNLStateTable[/*countStates*/][/*256*/]; /* for swaplfnl */
+        char unicodeCodeUnits[/*countUnicodeResults*/];
+        MBCSToUFallback toUFallbacks[/*countToUFallbacks*/];
+
+        /* fromUnicode */
+        char fromUnicodeTable[];
+        byte fromUnicodeBytes[];
+        byte swapLFNLFromUnicodeBytes[]; /* for swaplfnl */
+        int fromUBytesLength;
+        short outputType, unicodeMask;
+
+        /* converter name for swaplfnl */
+        String swapLFNLName;
+
+        /* extension data */
+        UConverterSharedData baseSharedData;
+        //int extIndexes[];
+        ByteBuffer extIndexes; // create int[] view etc. as needed
+
+        /* Creates an empty table; every field keeps its Java default value. */
+        UConverterMBCSTable()
+        {
+        }
+
+        /* Shallow copy: array and object fields are shared with t, not cloned. */
+        UConverterMBCSTable(UConverterMBCSTable t)
+        {
+            countStates = t.countStates;
+            dbcsOnlyState = t.dbcsOnlyState;
+            stateTableOwned = t.stateTableOwned;
+            countToUFallbacks = t.countToUFallbacks;
+            stateTable = t.stateTable;
+            swapLFNLStateTable = t.swapLFNLStateTable;
+            unicodeCodeUnits = t.unicodeCodeUnits;
+            toUFallbacks = t.toUFallbacks;
+            fromUnicodeTable = t.fromUnicodeTable;
+            fromUnicodeBytes = t.fromUnicodeBytes;
+            swapLFNLFromUnicodeBytes = t.swapLFNLFromUnicodeBytes;
+            fromUBytesLength = t.fromUBytesLength;
+            outputType = t.outputType;
+            unicodeMask = t.unicodeMask;
+            swapLFNLName = t.swapLFNLName;
+            baseSharedData = t.baseSharedData;
+            extIndexes = t.extIndexes;
+        }
+    }
+
+    /**
+     * MBCS data header. See data format description above.
+     */
+    public final class MBCSHeader {
+        byte version[/*U_MAX_VERSION_LENGTH*/];
+        int countStates, countToUFallbacks, offsetToUCodeUnits, offsetFromUTable, offsetFromUBytes;
+        int flags;
+        int fromUBytesLength;
+
+        /* Allocates the version field with MAX_VERSION_LENGTH bytes. */
+        public MBCSHeader()
+        {
+            version = new byte[MAX_VERSION_LENGTH];
+        }
+    }
+
+    /**
+     * Enum for specifying basic types of converters
+     * @see getType
+     * @draft ICU 3.6
+     */
+    public static final class UConverterType {
+        public static final int UNSUPPORTED_CONVERTER = -1;
+        public static final int SBCS = 0;
+        public static final int DBCS = 1;
+        public static final int MBCS = 2;
+        public static final int LATIN_1 = 3;
+        public static final int UTF8 = 4;
+        public static final int UTF16_BigEndian = 5;
+        public static final int UTF16_LittleEndian = 6;
+        public static final int UTF32_BigEndian = 7;
+        public static final int UTF32_LittleEndian = 8;
+        public static final int EBCDIC_STATEFUL = 9;
+        public static final int ISO_2022 = 10;
+
+        public static final int LMBCS_1 = 11;
+        public static final int LMBCS_2 = LMBCS_1 + 1; //12
+        public static final int LMBCS_3 = LMBCS_2 + 1; //13
+        public static final int LMBCS_4 = LMBCS_3 + 1; //14
+        public static final int LMBCS_5 = LMBCS_4 + 1; //15
+        public static final int LMBCS_6 = LMBCS_5 + 1; //16
+        public static final int LMBCS_8 = LMBCS_6 + 1; //17
+        public static final int LMBCS_11 = LMBCS_8 + 1; //18
+        public static final int LMBCS_16 = LMBCS_11 + 1; //19
+        public static final int LMBCS_17 = LMBCS_16 + 1; //20
+        public static final int LMBCS_18 = LMBCS_17 + 1; //21
+        public static final int LMBCS_19 = LMBCS_18 + 1; //22
+        public static final int LMBCS_LAST = LMBCS_19; //22
+        public static final int HZ =LMBCS_LAST + 1; //23
+        public static final int SCSU = HZ + 1; //24
+        public static final int ISCII = SCSU + 1; //25
+        public static final int US_ASCII = ISCII + 1; //26
+        public static final int UTF7 = US_ASCII + 1; //27
+        public static final int BOCU1 = UTF7 + 1; //28
+        public static final int UTF16 = BOCU1 + 1; //29
+        public static final int UTF32 = UTF16 + 1; //30
+        public static final int CESU8 = UTF32 + 1; //31
+        public static final int IMAP_MAILBOX = CESU8 + 1; //32
+        public static final int MAC_ARABIC = IMAP_MAILBOX + 1; //33
+        public static final int MAC_HEBREW = MAC_ARABIC + 1; //34
+
+        /* Number of converter types for which we have conversion routines. */
+        public static final int NUMBER_OF_SUPPORTED_CONVERTER_TYPES = MAC_HEBREW + 1;
+
+    }
+
+    /**
+     * Enum for specifying which platform a converter ID refers to.
+     * The use of platform/CCSID is not recommended. See openCCSID().
+     * @draft ICU 3.6
+     */
+    public static final class UConverterPlatform {
+        public static final int UNKNOWN = -1;
+        public static final int IBM = 0;
+    }
+
+    /* Placeholders for per-type shared data; all null here, and the commented-out names are not declared at all. */
+    static UConverterSharedData _MBCSData = null, /*_Latin1Data = null,*/ /*_UTF8Data = null,*/ /*_UTF16BEData = null,*/ /*_UTF16LEData = null,*/ /*_UTF32BEData = null,*/ /*_UTF32LEData = null,*/ /*_ISO2022Data = null,*/ _LMBCSData1 = null,_LMBCSData2 = null, _LMBCSData3 = null, _LMBCSData4 = null, _LMBCSData5 = null, _LMBCSData6 = null, _LMBCSData8 = null,_LMBCSData11 = null,_LMBCSData16 = null,_LMBCSData17 = null,_LMBCSData18 = null,_LMBCSData19 = null, _HZData = null, _SCSUData = null, /*_ISCIIData = null,*/ /*_ASCIIData = null,*/ _UTF7Data = null, _Bocu1Data = null, /*_UTF16Data = null, _UTF32Data = null,*/ _CESU8Data = null, _IMAPData = null;
+    /* Indexed by the type of a cnvNameType entry in getAlgorithmicTypeFromName(); not assigned in this class. */
+    static UConverterSharedData[] converterData;
+    /* Pairs a stripped converter name with an int type (presumably a UConverterType value - confirm). */
+    static class cnvNameTypeClass {
+        String name;
+        int type;
+        cnvNameTypeClass(String name_, int type_) { name = name_; type = type_; }
+    }
+
+    /* Table searched by getAlgorithmicTypeFromName(); not assigned in this class. */
+    static cnvNameTypeClass cnvNameType[];
+
+    static final String DATA_TYPE = "cnv";
+    static final int CNV_DATA_BUFFER_SIZE = 25000;
+    static final int SIZE_OF_UCONVERTER_SHARED_DATA = 100;
+
+    static final int MAXIMUM_UCS2 = 0x0000FFFF;
+    static final int MAXIMUM_UTF = 0x0010FFFF;
+    static final int MAXIMUM_UCS4 = 0x7FFFFFFF;
+    static final int HALF_SHIFT = 10;
+    static final int HALF_BASE = 0x0010000;
+    static final int HALF_MASK = 0x3FF;
+    static final int SURROGATE_HIGH_START = 0xD800;
+    static final int SURROGATE_HIGH_END = 0xDBFF;
+    static final int SURROGATE_LOW_START = 0xDC00;
+    static final int SURROGATE_LOW_END = 0xDFFF;
+
+    /* -SURROGATE_LOW_START + HALF_BASE */
+    static final int SURROGATE_LOW_BASE = 9216;
+}
diff --git a/icu4j/src/com/ibm/icu/impl/UConverterStaticData.java b/icu4j/src/com/ibm/icu/impl/UConverterStaticData.java
new file mode 100644
index 00000000000..63c193b546d
--- /dev/null
+++ b/icu4j/src/com/ibm/icu/impl/UConverterStaticData.java
@@ -0,0 +1,61 @@
+/**
+******************************************************************************* +* Copyright (C) 2006, International Business Machines Corporation and * +* others. All Rights Reserved. * +******************************************************************************* +* +******************************************************************************* +*/ +package com.ibm.icu.impl; + +public final class UConverterStaticData { /* +offset: size */ + public int structSize; /* +0: 4 Size of this structure */ + + public String name; /* +4: 60 internal name of the converter- invariant chars */ + + public int codepage; /* +64: 4 codepage # (now IBM-$codepage) */ + + public byte platform; /* +68: 1 platform of the converter (only IBM now) */ + public byte conversionType; /* +69: 1 conversion type */ + + public byte minBytesPerChar; /* +70: 1 Minimum # bytes per char in this codepage */ + public byte maxBytesPerChar; /* +71: 1 Maximum # bytes output per UChar in this codepage */ + + public byte subChar[/*UCNV_MAX_SUBCHAR_LEN*/]; /* +72: 4 [note: 4 and 8 byte boundary] */ + public byte subCharLen; /* +76: 1 */ + + public byte hasToUnicodeFallback; /* +77: 1 UBool needs to be changed to UBool to be consistent across platform */ + public byte hasFromUnicodeFallback; /* +78: 1 */ + public short unicodeMask; /* +79: 1 bit 0: has supplementary bit 1: has single surrogates */ + public byte subChar1; /* +80: 1 single-byte substitution character for IBM MBCS (0 if none) */ + public byte reserved[/*19*/]; /* +81: 19 to round out the structure */ + /* total size: 100 */ + public UConverterStaticData() + { + subChar = new byte[UConverterConstants.MAX_SUBCHAR_LEN]; + reserved = new byte[19]; + } + + public UConverterStaticData(int structSize_, String name_, int codepage_, byte platform_, byte conversionType_, byte minBytesPerChar_, byte maxBytesPerChar_, byte[] subChar_, byte subCharLen_, byte hasToUnicodeFallback_, byte hasFromUnicodeFallback_, short unicodeMask_, byte subChar1_, byte[] 
reserved_) + { + structSize = structSize_; + name = name_; + codepage = codepage_; + platform = platform_; + conversionType = conversionType_; + minBytesPerChar = minBytesPerChar_; + maxBytesPerChar = maxBytesPerChar_; + subChar = new byte[UConverterConstants.MAX_SUBCHAR_LEN]; + System.arraycopy(subChar_, 0, subChar, 0, (subChar.length < subChar_.length? subChar.length : subChar_.length)); + subCharLen = subCharLen_; + hasToUnicodeFallback = hasToUnicodeFallback_; + hasFromUnicodeFallback = hasFromUnicodeFallback_; + unicodeMask = unicodeMask_; + subChar1 = subChar1_; + reserved = new byte[19]; + System.arraycopy(reserved_, 0, reserved, 0, (reserved.length < reserved_.length? reserved.length : reserved_.length)); + } + + public static final int sizeofUConverterStaticData = 100; +} +