ICU-5018 charset conversion support

X-SVN-Rev: 20172
This commit is contained in:
Yoshito Umaoka 2006-08-26 05:30:49 +00:00
parent d2841a5885
commit e33252c102
21 changed files with 9915 additions and 2 deletions

View file

@ -177,7 +177,7 @@
<!-- core does not build richedit or tests -->
<target name="core" depends="init,coreData,icudata" description="build core classes and data">
<javac includes="com/ibm/icu/util/**/*.java,com/ibm/icu/text/**/*.java,com/ibm/icu/math/**/*.java,com/ibm/icu/impl/**/*.java,com/ibm/icu/lang/*.java"
<javac includes="com/ibm/icu/util/**/*.java,com/ibm/icu/text/**/*.java,com/ibm/icu/math/**/*.java,com/ibm/icu/impl/**/*.java,com/ibm/icu/lang/*.java,com/ibm/icu/charset/**/*.java"
excludes="**/CVS/**/*"
srcdir="${src.dir}"
destdir="${build.dir}"
@ -431,9 +431,12 @@
<target name="jarRelease" depends="jar,jarSrc,jarDocs"/>
<target name="jar" depends="core,indices" description="build full 'icu4j.jar' jar file">
<copy todir="${build.dir}/META-INF">
<fileset dir="${src.dir}/META-INF" includes="**/*"/>
</copy>
<jar jarfile="${jar.file}"
compress="true"
includes="com/ibm/icu/util/**/*,com/ibm/icu/text/**/*,com/ibm/icu/math/**/*,com/ibm/icu/impl/**/*,com/ibm/icu/lang/**/*"
includes="com/ibm/icu/util/**/*,com/ibm/icu/text/**/*,com/ibm/icu/math/**/*,com/ibm/icu/impl/**/*,com/ibm/icu/lang/**/*,META-INF/services/*"
basedir="${build.dir}"
manifest="${icu4j.manifest}"/>
</target>

View file

@ -0,0 +1,3 @@
# Copyright (C) 2006, International Business Machines Corporation and others. All Rights Reserved.
# icu4j converters
com.ibm.icu.charset.CharsetProviderICU

View file

@ -0,0 +1,158 @@
/**
*******************************************************************************
* Copyright (C) 2006, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
*******************************************************************************
*/
package com.ibm.icu.charset;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.IntBuffer;
import java.nio.charset.CoderResult;
/**
 * Predefined error callbacks used by {@link CharsetDecoderICU} and
 * {@link CharsetEncoderICU}, together with the context option strings that
 * select a callback's behaviour.
 *
 * A callback receives the {@link CoderResult} describing the error and returns
 * the result the conversion loop should continue with:
 * {@link CoderResult#UNDERFLOW} means "error resolved, keep converting";
 * returning the incoming result unchanged means "error not resolved".
 */
/*public*/ class CharsetCallback {
    /**
     * FROM_U, TO_U context option for the substitute callback: substitute only
     * unmappable input and leave every other error unresolved.
     * @draft ICU 3.6
     */
    /*public*/ static final String SUB_STOP_ON_ILLEGAL = "i";
    /**
     * FROM_U, TO_U context option for the skip callback: skip only unmappable
     * input and leave every other error unresolved.
     * @draft ICU 3.6
     */
    /*public*/ static final String SKIP_STOP_ON_ILLEGAL = "i";
    /**
     * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to ICU (%UXXXX)
     * @draft ICU 3.6
     */
    /*public*/ static final String ESCAPE_ICU = null;
    /**
     * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to JAVA (\\uXXXX)
     * @draft ICU 3.6
     */
    /*public*/ static final String ESCAPE_JAVA = "J";
    /**
     * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to C (\\uXXXX \\UXXXXXXXX)
     * TO_U_CALLBACK_ESCAPE option to escape the character value according to C (\\xXXXX)
     * @draft ICU 3.6
     */
    /*public*/ static final String ESCAPE_C = "C";
    /**
     * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to XML Decimal escape \htmlonly(&amp;#DDDD;)\endhtmlonly
     * TO_U_CALLBACK_ESCAPE context option to escape the character value according to XML Decimal escape \htmlonly(&amp;#DDDD;)\endhtmlonly
     * @draft ICU 3.6
     */
    /*public*/ static final String ESCAPE_XML_DEC = "D";
    /**
     * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to XML Hex escape \htmlonly(&amp;#xXXXX;)\endhtmlonly
     * TO_U_CALLBACK_ESCAPE context option to escape the character value according to XML Hex escape \htmlonly(&amp;#xXXXX;)\endhtmlonly
     * @draft ICU 3.6
     */
    /*public*/ static final String ESCAPE_XML_HEX = "X";
    /**
     * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to Unicode (U+XXXXX)
     * @draft ICU 3.6
     */
    /*public*/ static final String ESCAPE_UNICODE = "U";

    /**
     * Callback invoked by a decoder when the byte input cannot be converted.
     */
    public interface Decoder {
        /**
         * @param decoder the decoder that hit the error
         * @param context callback option (one of the context strings above) or null
         * @param source  byte input being decoded
         * @param target  char output
         * @param offsets optional offsets buffer, may be null
         * @param buffer  the offending input units
         * @param length  number of valid entries in buffer
         * @param cr      the error being reported
         * @return the result the conversion loop should continue with
         */
        public CoderResult call(CharsetDecoderICU decoder, Object context,
                ByteBuffer source, CharBuffer target, IntBuffer offsets,
                char[] buffer, int length, CoderResult cr);
    }

    /**
     * Callback invoked by an encoder when the char input cannot be converted.
     */
    public interface Encoder {
        /**
         * @param encoder the encoder that hit the error
         * @param context callback option (one of the context strings above) or null
         * @param source  char input being encoded
         * @param target  byte output
         * @param offsets optional offsets buffer, may be null
         * @param buffer  the offending input units
         * @param length  number of valid entries in buffer
         * @param cp      the offending code point
         * @param cr      the error being reported
         * @return the result the conversion loop should continue with
         */
        public CoderResult call(CharsetEncoderICU encoder, Object context,
                CharBuffer source, ByteBuffer target, IntBuffer offsets,
                char[] buffer, int length, int cp, CoderResult cr);
    }

    /**
     * Encoder callback that drops the offending input. With a null context
     * every error is skipped; with {@link #SKIP_STOP_ON_ILLEGAL} only
     * unmappable input is skipped; any other context leaves the error as is.
     */
    public static final Encoder FROM_U_CALLBACK_SKIP = new Encoder() {
        public CoderResult call(CharsetEncoderICU encoder, Object context,
                CharBuffer source, ByteBuffer target, IntBuffer offsets,
                char[] buffer, int length, int cp, CoderResult cr){
            if(context==null){
                return CoderResult.UNDERFLOW;
            }
            /* constant.equals(context) is also safe for a non-String context */
            if(SKIP_STOP_ON_ILLEGAL.equals(context)){
                return cr.isUnmappable() ? CoderResult.UNDERFLOW : cr;
            }
            return cr;
        }
    };

    /**
     * Decoder callback that drops the offending input. With a null context
     * every error is skipped; with {@link #SKIP_STOP_ON_ILLEGAL} only
     * unmappable input is skipped; any other context leaves the error as is.
     */
    public static final Decoder TO_U_CALLBACK_SKIP = new Decoder() {
        public CoderResult call(CharsetDecoderICU decoder, Object context,
                ByteBuffer source, CharBuffer target, IntBuffer offsets,
                char[] buffer, int length, CoderResult cr){
            if(context==null){
                return CoderResult.UNDERFLOW;
            }
            if(SKIP_STOP_ON_ILLEGAL.equals(context)){
                return cr.isUnmappable() ? CoderResult.UNDERFLOW : cr;
            }
            return cr;
        }
    };

    /**
     * Encoder callback that writes the encoder's substitution sequence via
     * cbFromUWriteSub. With a null context every error is substituted; with
     * {@link #SUB_STOP_ON_ILLEGAL} only unmappable input is substituted; any
     * other context leaves the error as is.
     */
    public static final Encoder FROM_U_CALLBACK_SUBSTITUTE = new Encoder(){
        public CoderResult call(CharsetEncoderICU encoder, Object context,
                CharBuffer source, ByteBuffer target, IntBuffer offsets,
                char[] buffer, int length, int cp, CoderResult cr){
            if(context==null){
                return encoder.cbFromUWriteSub(encoder, source, target, offsets);
            }
            if(SUB_STOP_ON_ILLEGAL.equals(context)){
                return cr.isUnmappable()
                        ? encoder.cbFromUWriteSub(encoder, source, target, offsets)
                        : cr;
            }
            return cr;
        }
    };

    /**
     * Decoder callback that writes the decoder's substitution sequence via
     * cbToUWriteSub. With a null context every error is substituted; with
     * {@link #SUB_STOP_ON_ILLEGAL} only unmappable input is substituted; any
     * other context leaves the error as is.
     */
    public static final Decoder TO_U_CALLBACK_SUBSTITUTE = new Decoder() {
        public CoderResult call(CharsetDecoderICU decoder, Object context,
                ByteBuffer source, CharBuffer target, IntBuffer offsets,
                char[] buffer, int length, CoderResult cr){
            if(context==null){
                return decoder.cbToUWriteSub(decoder, source, target, offsets);
            }
            if(SUB_STOP_ON_ILLEGAL.equals(context)){
                return cr.isUnmappable()
                        ? decoder.cbToUWriteSub(decoder, source, target, offsets)
                        : cr;
            }
            return cr;
        }
    };

    /**
     * Encoder callback that never resolves the error: the incoming result is
     * handed straight back.
     */
    public static final Encoder FROM_U_CALLBACK_STOP = new Encoder() {
        public CoderResult call(CharsetEncoderICU encoder, Object context,
                CharBuffer source, ByteBuffer target, IntBuffer offsets,
                char[] buffer, int length, int cp, CoderResult cr){
            return cr;
        }
    };

    /**
     * Decoder callback that never resolves the error: the incoming result is
     * handed straight back.
     */
    public static final Decoder TO_U_CALLBACK_STOP = new Decoder() {
        public CoderResult call(CharsetDecoderICU decoder, Object context,
                ByteBuffer source, CharBuffer target, IntBuffer offsets,
                char[] buffer, int length, CoderResult cr){
            return cr;
        }
    };
}

View file

@ -0,0 +1,639 @@
/**
*******************************************************************************
* Copyright (C) 2006, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
*******************************************************************************
*/
package com.ibm.icu.charset;
import java.nio.BufferOverflowException;
import java.nio.CharBuffer;
import java.nio.IntBuffer;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.MalformedInputException;
import java.nio.ByteBuffer;
import com.ibm.icu.impl.Assert;
public abstract class CharsetDecoderICU extends CharsetDecoder{
/* converter state word; cleared by implReset() (meaning is presumably defined by each concrete decoder - confirm) */
protected int toUnicodeStatus;
/* input bytes of the sequence being examined; copied into invalidCharBuffer when an error is reported */
protected byte[] toUBytesArray = new byte[128];
protected int toUBytesBegin = 0;
/* number of valid bytes in toUBytesArray; a non-zero value at end of input is reported as malformed */
protected int toULength;
/* output chars that did not fit into the target; decode() flushes them first on the next call */
protected char[] charErrorBufferArray = new char[128];
/* number of chars waiting in charErrorBufferArray */
protected int charErrorBufferLength;
protected int charErrorBufferBegin;
/* the offending input units handed to the error callback */
protected char[] invalidCharBuffer = new char[128];
/* number of valid entries in invalidCharBuffer */
protected int invalidCharLength;
/* store previous UChars/chars to continue partial matches */
/* NOTE(review): never allocated in this class - presumably subclasses assign it before it is replayed; confirm */
protected byte[] preToUArray;
protected int preToUBegin;
protected int preToULength; /* negative: replay */
protected int preToUFirstLength; /* length of first character */
/* opaque option object passed as "context" to the error callback */
protected Object toUContext = null;
/* callbacks selected through implOnUnmappableCharacter()/implOnMalformedInput(); both default to "stop" */
private CharsetCallback.Decoder onUnmappableInput = CharsetCallback.TO_U_CALLBACK_STOP;
private CharsetCallback.Decoder onMalformedInput = CharsetCallback.TO_U_CALLBACK_STOP;
/*
 * Dispatching callback used by the conversion loop: routes an unmappable
 * error to onUnmappableInput, a malformed error to onMalformedInput and
 * anything else to the "stop" callback.
 */
protected CharsetCallback.Decoder toCharErrorBehaviour= new CharsetCallback.Decoder(){
    public CoderResult call(CharsetDecoderICU decoder, Object context,
            ByteBuffer source, CharBuffer target, IntBuffer offsets,
            char[] buffer, int length, CoderResult cr) {
        final CharsetCallback.Decoder handler;
        if(cr.isUnmappable()){
            handler = onUnmappableInput;
        }else if(cr.isMalformed()){
            handler = onMalformedInput;
        }else{
            handler = CharsetCallback.TO_U_CALLBACK_STOP;
        }
        return handler.call(decoder, context, source, target, offsets, buffer, length, cr);
    }
};
/**
 * Creates a decoder for the given ICU charset. The average chars-per-byte
 * passed to the superclass is the reciprocal of the charset's
 * maxCharsPerByte; the maximum is maxCharsPerByte itself.
 * @param cs the charset this decoder belongs to
 */
protected CharsetDecoderICU(CharsetICU cs) {
    super(cs,
          (float) (1.0f / (float) cs.maxCharsPerByte),
          cs.maxCharsPerByte);
}
/**
 * Installs the callback that corresponds to the requested action for
 * malformed (illegal) input sequences.
 * @param newAction action to be taken; anything other than REPLACE or
 *                  IGNORE selects the "stop" callback
 * @draft ICU 3.6
 */
protected final void implOnMalformedInput(CodingErrorAction newAction) {
    this.onMalformedInput = getCallback(newAction);
}
/**
 * Installs the callback that corresponds to the requested action for
 * unmappable input.
 * @param newAction action to be taken; anything other than REPLACE or
 *                  IGNORE selects the "stop" callback
 * @draft ICU 3.6
 */
protected final void implOnUnmappableCharacter(CodingErrorAction newAction) {
    this.onUnmappableInput = getCallback(newAction);
}
/*
 * Maps a java.nio coding error action onto the matching decoder callback:
 * REPLACE -> substitute, IGNORE -> skip, everything else (REPORT included) -> stop.
 */
private static CharsetCallback.Decoder getCallback(CodingErrorAction action){
    if(CodingErrorAction.REPLACE==action){
        return CharsetCallback.TO_U_CALLBACK_SUBSTITUTE;
    }
    if(CodingErrorAction.IGNORE==action){
        return CharsetCallback.TO_U_CALLBACK_SKIP;
    }
    /* REPORT and any unrecognised action both stop the conversion */
    return CharsetCallback.TO_U_CALLBACK_STOP;
}
/**
 * Flush hook of the java.nio decoder contract. This implementation writes
 * nothing to the output and always reports success.
 * @param out the output buffer (not touched)
 * @return always CoderResult.UNDERFLOW
 * @draft ICU 3.6
 */
protected final CoderResult implFlush(CharBuffer out) {
    final CoderResult nothingToFlush = CoderResult.UNDERFLOW;
    return nothingToFlush;
}
/**
 * Resets the to-Unicode state of the converter: the status word, the
 * pending-input length, the output overflow buffer bookkeeping and the
 * partial-match (replay) bookkeeping are all set back to zero.
 * @draft ICU 3.6
 */
protected void implReset() {
    /* partial-match / replay state (preToULength negative would mean: replay) */
    preToUFirstLength = 0;
    preToULength = 0;
    preToUBegin = 0;
    /* output overflow buffer */
    charErrorBufferBegin = 0;
    charErrorBufferLength = 0;
    /* pending input and converter status */
    toULength = 0;
    toUnicodeStatus = 0;
}
/**
 * java.nio entry point: decodes one or more bytes. By default the converter
 * stops and reports when it meets an error in the input; use
 * CharsetDecoder.onMalformedInput() / onUnmappableCharacter() to change that.
 * Conversion state is kept between calls, so multibyte sequences may be
 * split across buffers.
 *
 * Before converting, the input position is advanced past the bytes the
 * converter already holds internally (toUCountPending()); afterwards it is
 * moved back by the number of bytes held at that point.
 * @param in buffer to decode
 * @param out buffer to populate with decoded result
 * @return CoderResult.UNDERFLOW if the input is empty; otherwise the result
 *         of decode(in, out, null, false)
 * @draft ICU 3.6
 */
protected CoderResult decodeLoop(ByteBuffer in,CharBuffer out){
    if(in.remaining()==0){
        return CoderResult.UNDERFLOW;
    }
    /* skip the bytes that are already stored in the converter state */
    final int alreadyHeld = toUCountPending();
    in.position(in.position()+alreadyHeld);
    /* do the conversion */
    final CoderResult result = decode(in, out, null, false);
    setSourcePosition(in);
    return result;
}
/**
* Charset-specific conversion step, implemented by each concrete decoder.
* It is called repeatedly from toUnicodeWithCallback(), which inspects the
* returned result to decide whether to continue, replay stored input or
* invoke the error callback.
* @param in byte input to convert
* @param out char output
* @param offsets optional offsets buffer; null when offsets are not wanted
* @return the result of this conversion step (underflow, overflow or an error)
*/
protected abstract CoderResult decodeLoop(ByteBuffer in, CharBuffer out, IntBuffer offsets);
/**
 * Implements the ICU semantic for the decode operation.
 *
 * Output left over from a previous call (charErrorBufferArray) is written
 * to the target first. If the target fills up while doing so, the unwritten
 * remainder is moved to the front of the overflow buffer and
 * CoderResult.OVERFLOW is returned. Otherwise the conversion proper is
 * delegated to toUnicodeWithCallback().
 * @param source byte input
 * @param target char output
 * @param offsets optional offsets buffer, may be null; -1 is recorded for
 *                every char that comes from the overflow buffer
 * @param flush true if this is the end of the input
 * @return the result of the conversion
 * @throws IllegalArgumentException if source or target is null
 */
protected final CoderResult decode(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {
    /* check parameters */
    if(target==null || source==null) {
        throw new IllegalArgumentException();
    }

    /* flush the target overflow buffer */
    if(charErrorBufferLength>0) {
        final char[] overflow = charErrorBufferArray;
        final int length = charErrorBufferLength;
        int i = 0;
        while(i<length) {
            if(!target.hasRemaining()) {
                /* the overflow buffer contains too much, keep the rest */
                final int kept = length-i;
                System.arraycopy(overflow, i, overflow, 0, kept);
                /* no narrowing cast: a (byte) cast would turn 128 into -128 */
                charErrorBufferLength = kept;
                return CoderResult.OVERFLOW;
            }
            /* copy the overflow contents to the target */
            target.put(overflow[i++]);
            if(offsets!=null) {
                offsets.put(-1); /* no source index available for old output */
            }
        }
        /* the overflow buffer is completely copied to the target */
        charErrorBufferLength=0;
    }

    if(!flush && !source.hasRemaining() && preToULength>=0) {
        /* the overflow buffer is emptied and there is no new input: we are done */
        return CoderResult.UNDERFLOW;
    }

    /*
     * Do not simply return with a buffer overflow error if
     * !flush && the target is full
     * because it is possible that the source will not generate any output.
     * For example, the skip callback may be called;
     * it does not output anything.
     */
    return toUnicodeWithCallback(source, target, offsets, flush);
}
/* maximum number of indexed bytes; also the capacity of the replay buffer allocated in toUnicodeWithCallback() */
private static final int EXT_MAX_BYTES = 0x1f;
/*
 * Adjusts "length" offsets starting at the buffer's current position.
 *
 * delta = sourceIndex - errorInputLength when sourceIndex >= 0, else -1.
 *   delta == 0: nothing is changed (the position is left where it is)
 *   delta  > 0: delta is added to every non-negative offset
 *   delta  < 0: every offset is set to -1
 * In the two latter cases the buffer position ends up at position+length.
 */
private void updateOffsets(IntBuffer offsets,int length, int sourceIndex, int errorInputLength) {
    final int delta;
    if(sourceIndex>=0) {
        /*
         * adjust each offset by adding the previous sourceIndex
         * minus the length of the input sequence that caused an
         * error, if any
         */
        delta=sourceIndex-errorInputLength;
    } else {
        /*
         * set each offset to -1 because this conversion function
         * does not handle offsets
         */
        delta=-1;
    }

    final int limit=offsets.position()+length;
    if(delta==0) {
        /* most common case, nothing to do */
        return;
    }

    if(delta>0) {
        /* add the delta to each offset (but not if the offset is <0) */
        while(offsets.position()<limit) {
            final int at=offsets.position();
            final int offset=offsets.get(at);
            if(offset>=0) {
                offsets.put(offset+delta); /* relative put: advances the position */
            } else {
                /* keep the negative offset but step over it, otherwise this loop never ends */
                offsets.position(at+1);
            }
        }
    } else /* delta<0 */ {
        /*
         * set each offset to -1 because this conversion function
         * does not handle offsets
         * or the error input sequence started in a previous buffer
         */
        while(offsets.position()<limit) {
            offsets.put(-1);
        }
    }
}
/**
 * Conversion driver: repeatedly calls decodeLoop(source, target, offsets),
 * maintains offsets, replays input stored from a previous partial match
 * (preToULength &lt; 0) and invokes the error callback when the conversion
 * step reports an error.
 *
 * @param source byte input
 * @param target char output
 * @param offsets optional offsets buffer, may be null
 * @param flush true if this is the end of the input
 * @return UNDERFLOW when all input was consumed, otherwise the overflow or
 *         error result that the callback did not resolve
 */
protected final CoderResult toUnicodeWithCallback(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush){
    int sourceIndex;
    int errorInputLength;
    boolean converterSawEndOfInput, calledCallback;
    int t=target.position();
    int s=source.position();
    /* variables for m:n conversion */
    ByteBuffer replayArray = ByteBuffer.allocate(EXT_MAX_BYTES);
    int replayArrayIndex = 0;
    ByteBuffer realSource=null;
    boolean realFlush=false;
    int realSourceIndex=0;

    CoderResult cr = CoderResult.UNDERFLOW;

    sourceIndex=0;
    if(preToULength<0) {
        /*
         * Previous m:n conversion stored source units from a partial match
         * and failed to consume all of them.
         * We need to "replay" them from a temporary buffer and convert them first.
         */
        realSource=source;
        realFlush=flush;
        realSourceIndex=sourceIndex;

        replayArray.clear();
        replayArray.put(preToUArray,0, -preToULength);
        source=replayArray;
        source.position(0);
        source.limit(replayArrayIndex-preToULength);
        flush=false;
        sourceIndex=-1;
        preToULength=0;
    }
    /* else: normal mode */

    /*
     * loop for conversion and error handling
     *
     * loop {
     *     convert
     *     loop {
     *         update offsets
     *         handle end of input
     *         handle errors/call callback
     *     }
     * }
     */
    for(;;) {
        if(cr.isUnderflow()) {
            /* convert */
            cr = decodeLoop(source, target, offsets);

            /*
             * set a flag for whether the converter
             * successfully processed the end of the input
             *
             * need not check preToULength==0 because a replay (<0) will cause
             * s<sourceLimit before converterSawEndOfInput is checked
             */
            converterSawEndOfInput= (cr.isUnderflow() && flush && source.remaining()==0 && toULength==0);
        } else {
            /* an error is still pending: handle it before converting again */
            converterSawEndOfInput=false;
        }

        /* no callback called yet for this iteration */
        calledCallback=false;

        /* no sourceIndex adjustment for conversion, only for callback output */
        errorInputLength=0;

        /*
         * loop for offsets and error handling
         *
         * iterates at most 3 times:
         * 1. to clean up after the conversion function
         * 2. after the callback
         * 3. after the callback again if there was truncated input
         */
        for(;;) {
            /* update offsets if we write any */
            if(offsets!=null) {
                int length=(target.position()-t);
                if(length>0) {
                    updateOffsets(offsets, length, sourceIndex, errorInputLength);
                    /*
                     * if a converter handles offsets and updates the offsets
                     * pointer at the end, then the offsets should not change
                     * here;
                     * however, some converters do not handle offsets at all
                     * (sourceIndex<0) or may not update the offsets pointer
                     */
                    //TODO: pArgs->offsets=offsets+=length;
                }

                if(sourceIndex>=0) {
                    sourceIndex+=(source.position()-s);
                }
            }

            if(preToULength<0) {
                /*
                 * switch the source to new replay units (cannot occur while replaying)
                 * after offset handling and before end-of-input and callback handling
                 */
                if(realSource==null) {
                    realSource=source;
                    realFlush=flush;
                    realSourceIndex=sourceIndex;

                    /*
                     * the replay buffer may have been used (and narrowed) earlier
                     * in this call: reopen it before storing the new units ...
                     */
                    replayArray.clear();
                    replayArray.put(preToUArray,0, -preToULength);
                    source=replayArray;
                    /* ... and rewind it, otherwise nothing would be left to read */
                    source.position(0);
                    source.limit(replayArrayIndex-preToULength);
                    flush=false;
                    if((sourceIndex+=preToULength)<0) {
                        sourceIndex=-1;
                    }

                    preToULength=0;
                } else {
                    /* see implementation note before _fromUnicodeWithCallback() */
                    //agljport:todo U_ASSERT(realSource==NULL);
                    Assert.assrt(realSource==null);
                }
            }

            /* update pointers */
            s=source.position();
            t=target.position();

            if(cr.isUnderflow()) {
                if(s<source.limit()) {
                    /*
                     * continue with the conversion loop while there is still input left
                     * (continue converting by breaking out of only the inner loop)
                     */
                    break;
                } else if(realSource!=null) {
                    /* switch back from replaying to the real source and continue */
                    source = realSource;
                    flush=realFlush;
                    sourceIndex=realSourceIndex;
                    realSource=null;
                    break;
                } else if(flush && toULength>0) {
                    /*
                     * the entire input stream is consumed
                     * and there is a partial, truncated input sequence left
                     */

                    /* inject an error and continue with callback handling */
                    cr = CoderResult.malformedForLength(toULength);
                    calledCallback=false; /* new error condition */
                } else {
                    /* input consumed */
                    if(flush) {
                        /*
                         * return to the conversion loop once more if the flush
                         * flag is set and the conversion function has not
                         * successfully processed the end of the input yet
                         *
                         * (continue converting by breaking out of only the inner loop)
                         */
                        if(!converterSawEndOfInput) {
                            break;
                        }

                        /* reset the converter without calling the callback function */
                        implReset();
                    }

                    /* done successfully */
                    return cr;
                }
            }

            /* cr is an overflow or an error from here on */
            if( calledCallback || cr.isOverflow() ||
                (!cr.isMalformed() && !cr.isUnmappable())
            ) {
                /*
                 * the callback did not or cannot resolve the error:
                 * return
                 *
                 * the check for buffer overflow is redundant but it is
                 * a high-runner case and hopefully documents the intent
                 * well
                 *
                 * if we were replaying, then the unread part of the replay
                 * buffer must be copied back into the converter state
                 */
                if(realSource!=null) {
                    Assert.assrt(preToULength==0);

                    int length=source.limit()-source.position();
                    if(length>0) {
                        source.get(preToUArray, preToUBegin, length);
                        preToULength=-length;
                    }

                    source=realSource;
                    flush=realFlush;
                }
                return cr;
            }

            /* copy toUBytes[] to invalidCharBuffer[] */
            errorInputLength=invalidCharLength=toULength;
            if(errorInputLength>0) {
                copy(toUBytesArray, 0, invalidCharBuffer, 0, errorInputLength);
            }

            /* set the converter state to deal with the next character */
            toULength=0;

            /* call the callback function */
            cr = toCharErrorBehaviour.call(this, toUContext, source, target, offsets, invalidCharBuffer, errorInputLength, cr);

            /*
             * loop back to the offset handling
             *
             * this flag will indicate after offset handling
             * that a callback was called;
             * if the callback did not resolve the error, then we return
             */
            calledCallback=true;
        }
    }
}
/**
* Finalizer hook. The body is empty in this implementation: no resources
* are released here.
* @draft ICU 3.6
*/
protected void finalize()throws Throwable{
}
/**
 * Returns the number of input units held in the converter's internal state
 * because more input is needed to complete the conversion. This is useful
 * for mapping ICU converter semantics onto iconv-like ones and is not
 * needed for normal conversion.
 *
 * The partial-match length (preToULength, taken as a magnitude since a
 * negative value marks a replay) wins; otherwise a positive toULength is
 * reported.
 * @return the number of units held in the state, 0 if there are none
 * @draft ICU 3.4
 */
/*public*/ int toUCountPending() {
    if(preToULength != 0){
        return (preToULength > 0) ? preToULength : -preToULength;
    }
    return (toULength > 0) ? toULength : 0;
}
/*
 * Moves the source position back by the number of units the converter
 * still holds internally (toUCountPending()), i.e. input that was read
 * but has not produced output yet.
 */
private final void setSourcePosition(ByteBuffer source){
    final int stillHeld = toUCountPending();
    final int rewound = source.position() - stillHeld;
    source.position(rewound);
}
/*
 * Copies "length" bytes from src (starting at srcOffset) into dst
 * (starting at dstOffset), widening each byte to a char.
 *
 * The loop counts elements rather than comparing a source index with
 * "length", so a non-zero srcOffset copies the full amount.
 * NOTE(review): the plain (char) cast sign-extends bytes >= 0x80 (0x80
 * becomes 0xFF80); confirm whether callers expect "& 0xff" instead.
 */
private void copy(byte[] src, int srcOffset, char[] dst, int dstOffset, int length) {
    for(int i=0; i<length; i++){
        dst[dstOffset+i]=(char)src[srcOffset+i];
    }
}
/**
 * Writes chars to the target and, when offsets are requested, records
 * sourceIndex for each char written. Whatever does not fit is stored at the
 * start of the converter's overflow buffer (charErrorBufferArray), which
 * decode() flushes from index 0 on the next call.
 *
 * Room is checked before each write, so no character is consumed and then
 * lost when the target (or the offsets buffer) is full.
 * @param cnv converter that receives the overflow
 * @param ucharsArray chars to write
 * @param ucharsBegin index of the first char to write
 * @param length number of chars to write
 * @param target char output
 * @param offsets optional offsets buffer, may be null
 * @param sourceIndex offset value recorded for every char written
 * @return UNDERFLOW if everything was written, OVERFLOW if part of the
 *         output had to be stored in the overflow buffer
 */
protected static final CoderResult toUWriteUChars( CharsetDecoderICU cnv,
        char[] ucharsArray, int ucharsBegin, int length,
        CharBuffer target, IntBuffer offsets, int sourceIndex) {
    CoderResult cr = CoderResult.UNDERFLOW;

    /* write UChars (with offsets if requested) while there is room */
    while(length>0 && target.hasRemaining()
            && (offsets==null || offsets.hasRemaining())) {
        target.put(ucharsArray[ucharsBegin++]);
        if(offsets!=null) {
            offsets.put(sourceIndex);
        }
        --length;
    }

    /* write overflow */
    if(length>0) {
        cr = CoderResult.OVERFLOW;
        int stored = 0;
        do {
            cnv.charErrorBufferArray[stored++]=ucharsArray[ucharsBegin++];
        } while(--length>0);
        cnv.charErrorBufferLength = stored;
    }
    return cr;
}
/**
 * Writes the substitution sequence for an input error. Subclasses may
 * override this method if required.
 *
 * When exactly one input unit was invalid and the charset defines a
 * subChar1, the single char U+001A is written; otherwise the decoder's
 * replacement string is written. The number of chars written always matches
 * the sequence actually chosen.
 * @param decoder the decoder that reported the error
 * @param source byte input; its position is recorded as the offset of the output
 * @param target char output
 * @param offsets optional offsets buffer, may be null
 * @return the result of writing the substitution (see toUWriteUChars)
 */
protected CoderResult cbToUWriteSub(CharsetDecoderICU decoder,
        ByteBuffer source, CharBuffer target,
        IntBuffer offsets){
    CharsetICU cs = (CharsetICU) decoder.charset();
    final char[] subArr;
    if (decoder.invalidCharLength==1 && cs.subChar1 != 0x00) {
        subArr = new char[] { 0x1a };
    } else {
        subArr = decoder.replacement().toCharArray();
    }
    return CharsetDecoderICU.toUWriteUChars(decoder, subArr, 0,
            subArr.length, target, offsets, source.position());
}
}

View file

@ -0,0 +1,631 @@
/**
*******************************************************************************
* Copyright (C) 2006, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
*******************************************************************************
*/
package com.ibm.icu.charset;
import java.nio.BufferOverflowException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.IntBuffer;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.MalformedInputException;
import com.ibm.icu.impl.Assert;
import com.ibm.icu.text.UTF16;
public abstract class CharsetEncoderICU extends CharsetEncoder {
/* output bytes that did not fit into the target; encode() flushes them first on the next call */
protected byte[] errorBuffer = new byte[30];
/* number of bytes waiting in errorBuffer */
protected int errorBufferLength = 0;
/** these are for encodeLoopICU */
protected int fromUnicodeStatus;
/* pending code point; must be 0 for the end of input to count as fully processed */
protected int fromUChar32;
protected boolean useSubChar1;
/* store previous UChars/chars to continue partial matches */
protected int preFromUFirstCP; /* >=0: partial match */
/* NOTE(review): never allocated in the visible code - presumably subclasses assign it; confirm */
protected char[] preFromUArray;
protected int preFromUBegin;
protected int preFromULength; /* negative: replay */
protected char[] invalidUCharBuffer = new char[2];
protected int invalidUCharLength;
/* opaque option object, presumably passed as "context" to the error callback - confirm */
protected Object fromUContext;
/* callbacks selected through implOnUnmappableCharacter()/implOnMalformedInput(); both default to "stop" */
private CharsetCallback.Encoder onUnmappableInput = CharsetCallback.FROM_U_CALLBACK_STOP;
private CharsetCallback.Encoder onMalformedInput = CharsetCallback.FROM_U_CALLBACK_STOP;
/*
 * Dispatching callback: routes an unmappable error to onUnmappableInput,
 * a malformed error to onMalformedInput and anything else to the "stop"
 * callback.
 */
protected CharsetCallback.Encoder fromCharErrorBehaviour = new CharsetCallback.Encoder(){
    public CoderResult call(CharsetEncoderICU encoder, Object context,
            CharBuffer source, ByteBuffer target, IntBuffer offsets,
            char[] buffer, int length, int cp, CoderResult cr) {
        final CharsetCallback.Encoder handler;
        if(cr.isUnmappable()){
            handler = onUnmappableInput;
        }else if(cr.isMalformed()){
            handler = onMalformedInput;
        }else{
            handler = CharsetCallback.FROM_U_CALLBACK_STOP;
        }
        return handler.call(encoder, context, source, target, offsets, buffer, length, cp, cr);
    }
};
/**
* Constructs a new encoder for the given charset. The average
* bytes-per-char passed to the superclass is the midpoint of the charset's
* minBytesPerChar and maxBytesPerChar; the maximum is maxBytesPerChar.
* NOTE(review): if both fields are integers the midpoint uses integer
* division - confirm against CharsetICU.
* @param cs charset for which the encoder is created
* @param replacement the substitution bytes
* @draft ICU 3.6
*/
protected CharsetEncoderICU(CharsetICU cs, byte[] replacement) {
super(cs, (cs.minBytesPerChar+cs.maxBytesPerChar)/2, cs.maxBytesPerChar, replacement);
}
/**
 * Installs the callback that corresponds to the requested action for
 * malformed (illegal) input sequences.
 * @param newAction action to be taken; anything other than REPLACE or
 *                  IGNORE selects the "stop" callback
 * @draft ICU 3.6
 */
protected void implOnMalformedInput(CodingErrorAction newAction) {
    this.onMalformedInput = getCallback(newAction);
}
/**
 * Installs the callback that corresponds to the requested action for
 * unmappable input.
 * @param newAction action to be taken; anything other than REPLACE or
 *                  IGNORE selects the "stop" callback
 * @draft ICU 3.6
 */
protected void implOnUnmappableCharacter(CodingErrorAction newAction) {
    this.onUnmappableInput = getCallback(newAction);
}
/*
 * Maps a java.nio coding error action onto the matching encoder callback:
 * REPLACE -> substitute, IGNORE -> skip, everything else (REPORT included) -> stop.
 */
private static CharsetCallback.Encoder getCallback(CodingErrorAction action){
    if(CodingErrorAction.REPLACE==action){
        return CharsetCallback.FROM_U_CALLBACK_SUBSTITUTE;
    }
    if(CodingErrorAction.IGNORE==action){
        return CharsetCallback.FROM_U_CALLBACK_SKIP;
    }
    /* REPORT and any unrecognised action both stop the conversion */
    return CharsetCallback.FROM_U_CALLBACK_STOP;
}
/**
 * Flush hook of the java.nio encoder contract. This implementation writes
 * nothing to the output and always reports success.
 * @param out the output buffer (not touched)
 * @return always CoderResult.UNDERFLOW
 * @draft ICU 3.6
 */
protected CoderResult implFlush(ByteBuffer out) {
    final CoderResult nothingToFlush = CoderResult.UNDERFLOW;
    return nothingToFlush;
}
/**
 * Resets the from-Unicode state of the converter: the partial-match
 * (replay) bookkeeping, the status word, the pending code point and the
 * output overflow length are all set back to zero.
 * @draft ICU 3.6
 */
protected void implReset() {
    /* partial-match / replay state */
    preFromULength = 0;
    preFromUFirstCP = 0;
    preFromUBegin = 0;
    /* converter status and pending code point */
    fromUnicodeStatus = 0;
    fromUChar32 = 0;
    /* output overflow buffer */
    errorBufferLength = 0;
}
/**
 * java.nio entry point: encodes one or more chars. By default the converter
 * stops and reports when it meets an error in the input; use
 * CharsetEncoder.onMalformedInput() / onUnmappableCharacter() to change that.
 *
 * Before converting, the input position is advanced by
 * fromUCountPending(); afterwards setSourcePosition() adjusts it again.
 * @param in buffer to encode
 * @param out buffer to populate with the encoded result
 * @return CoderResult.UNDERFLOW if the input is empty; otherwise the result
 *         of encode(in, out, null, false)
 * @draft ICU 3.6
 */
protected CoderResult encodeLoop(CharBuffer in, ByteBuffer out) {
    if(in.remaining()==0){
        return CoderResult.UNDERFLOW;
    }
    /* skip the units that are already stored in the converter state */
    final int alreadyHeld = fromUCountPending();
    in.position(in.position()+alreadyHeld);
    /* do the conversion */
    final CoderResult result = encode(in, out, null, false);
    setSourcePosition(in);
    return result;
}
/**
* Charset-specific conversion step, implemented by each concrete encoder.
* It is called from fromUnicodeWithCallback(), which inspects the returned
* result to decide how to continue.
* @param source char input to convert
* @param target byte output
* @param offsets optional offsets buffer; null when offsets are not wanted
* @return the result of this conversion step (underflow, overflow or an error)
*/
protected abstract CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets);
/**
 * Implements ICU semantics for encoding the buffer.
 *
 * Output left over from a previous call (errorBuffer) is written to the
 * target first. If the target fills up while doing so, the unwritten
 * remainder is moved to the front of the overflow buffer and
 * CoderResult.OVERFLOW is returned. Otherwise the conversion proper is
 * delegated to fromUnicodeWithCallback().
 * @param source char input
 * @param target byte output
 * @param offsets optional offsets buffer, may be null; -1 is recorded for
 *                every byte that comes from the overflow buffer
 * @param flush true if this is the end of the input
 * @return the result of the conversion
 * @throws IllegalArgumentException if source or target is null
 */
protected final CoderResult encode(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush){
    /* check parameters */
    if(source==null || target==null) {
        throw new IllegalArgumentException();
    }

    /* flush the target overflow buffer */
    if(errorBufferLength>0) {
        final byte[] pending = errorBuffer;
        final int pendingLength = errorBufferLength;
        for(int next=0; next<pendingLength; ) {
            if(!target.hasRemaining()) {
                /* the overflow buffer contains too much, keep the rest */
                final int kept = pendingLength-next;
                System.arraycopy(pending, next, pending, 0, kept);
                errorBufferLength=(byte)kept;
                return CoderResult.OVERFLOW;
            }
            /* copy the overflow contents to the target */
            target.put(pending[next++]);
            if(offsets!=null) {
                offsets.put(-1); /* no source index available for old output */
            }
        }
        /* the overflow buffer is completely copied to the target */
        errorBufferLength=0;
    }

    final boolean noNewInput = !flush && source.remaining()==0 && preFromULength>=0;
    if(noNewInput) {
        /* the overflow buffer is emptied and there is no new input: we are done */
        return CoderResult.UNDERFLOW;
    }

    /*
     * Do not simply return with a buffer overflow error if
     * !flush && the target is full
     * because it is possible that the source will not generate any output.
     * For example, the skip callback may be called;
     * it does not output anything.
     */
    return fromUnicodeWithCallback(source, target, offsets, flush);
}
/* maximum number of indexed UChars; also the capacity of the replay buffer allocated in fromUnicodeWithCallback() */
public static final int EXT_MAX_UCHARS = 19;
/**
 * Drives the conversion: calls {@code encodeLoop} until the input is used up,
 * the target overflows, or an error remains after the error callback
 * ({@code fromCharErrorBehaviour}) has been given one chance to resolve it.
 * <p>
 * If a previous call left unconsumed source units from a partial m:n match
 * ({@code preFromULength < 0}), those units are first "replayed" from a
 * temporary buffer; afterwards conversion continues with the caller's source.
 *
 * @param source  the chars to convert; its position is advanced by {@code encodeLoop}
 * @param target  receives the converted bytes
 * @param offsets optional offsets buffer, may be {@code null}
 * @param flush   {@code true} if the caller has no further input after {@code source}
 * @return the last {@link CoderResult} produced by {@code encodeLoop} or the callback
 */
protected final CoderResult fromUnicodeWithCallback(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush){
    int sBufferIndex;
    int sourceIndex;
    int errorInputLength;
    boolean converterSawEndOfInput, calledCallback;

    /* variables for m:n conversion */
    CharBuffer replayArray = CharBuffer.allocate(EXT_MAX_UCHARS);
    int replayArrayIndex=0;
    CharBuffer realSource;
    boolean realFlush;

    CoderResult cr = CoderResult.UNDERFLOW;

    sourceIndex=0;

    if(preFromULength>=0) {
        /* normal mode */
        realSource=null;
        realFlush=false;
    } else {
        /*
         * Previous m:n conversion stored source units from a partial match
         * and failed to consume all of them.
         * We need to "replay" them from a temporary buffer and convert them first.
         */
        realSource=source;
        realFlush = flush;

        replayArray.clear();
        replayArray.put(preFromUArray,0, -preFromULength);
        /*
         * Switch to the replay buffer BEFORE adjusting position/limit;
         * otherwise the caller's real source buffer would be repositioned
         * and truncated instead of the replay buffer.
         */
        source=replayArray;
        source.position(replayArrayIndex);
        source.limit(replayArrayIndex-preFromULength); //preFromULength is negative, see declaration
        flush=false;

        preFromULength=0;
    }

    /*
     * loop for conversion and error handling
     *
     * loop {
     *   convert
     *   loop {
     *     update offsets
     *     handle end of input
     *     handle errors/call callback
     *   }
     * }
     */
    for(;;) {
        /* convert */
        cr = encodeLoop(source, target, offsets);

        /*
         * set a flag for whether the converter
         * successfully processed the end of the input
         *
         * need not check preFromULength==0 because a replay (<0) will cause
         * remaining input before converterSawEndOfInput is checked
         */
        converterSawEndOfInput= (boolean)(cr.isUnderflow() && flush && source.remaining()==0 && fromUChar32==0);

        /* no callback called yet for this iteration */
        calledCallback=false;

        /* no sourceIndex adjustment for conversion, only for callback output */
        errorInputLength=0;

        /*
         * loop for offsets and error handling
         *
         * iterates at most 3 times:
         * 1. to clean up after the conversion function
         * 2. after the callback
         * 3. after the callback again if there was truncated input
         */
        for(;;) {
            /* update offsets if we write any */
            if(offsets!=null) {
                int length = target.remaining();
                if(length>0) {
                    /*
                     * if a converter handles offsets and updates the offsets
                     * pointer at the end, then offset should not change
                     * here;
                     * however, some converters do not handle offsets at all
                     * (sourceIndex<0) or may not update the offsets pointer
                     */
                    offsets.position(offsets.position()+length);
                }

                if(sourceIndex>=0) {
                    sourceIndex+=(int)(source.position());
                }
            }

            if(preFromULength<0) {
                /*
                 * switch the source to new replay units (cannot occur while replaying)
                 * after offset handling and before end-of-input and callback handling
                 */
                if(realSource==null) {
                    realSource=source;
                    realFlush=flush;

                    /* the replay buffer may have been used earlier in this call: reset it before refilling */
                    replayArray.clear();
                    replayArray.put(preFromUArray,0, -preFromULength);

                    source=replayArray;
                    source.position(replayArrayIndex);
                    source.limit(replayArrayIndex-preFromULength);
                    flush=false;
                    if((sourceIndex+=preFromULength)<0) {
                        sourceIndex=-1;
                    }

                    preFromULength=0;
                } else {
                    /* new replay units must not show up while a replay is in progress */
                    Assert.assrt(realSource==null);
                }
            }

            /* update pointers */
            sBufferIndex=source.position();
            if(cr.isUnderflow()) {
                if(sBufferIndex<source.limit()) {
                    /*
                     * continue with the conversion loop while there is still input left
                     * (continue converting by breaking out of only the inner loop)
                     */
                    break;
                } else if(realSource!=null) {
                    /* switch back from replaying to the real source and continue */
                    source=realSource;
                    flush=realFlush;
                    sourceIndex=source.position();
                    realSource=null;
                    break;
                } else if(flush && fromUChar32!=0) {
                    /*
                     * the entire input stream is consumed
                     * and there is a partial, truncated input sequence left
                     */

                    /* inject an error and continue with callback handling */
                    cr = CoderResult.malformedForLength(1);
                    calledCallback=false; /* new error condition */
                } else {
                    /* input consumed */
                    if(flush) {
                        /*
                         * return to the conversion loop once more if the flush
                         * flag is set and the conversion function has not
                         * successfully processed the end of the input yet
                         *
                         * (continue converting by breaking out of only the inner loop)
                         */
                        if(!converterSawEndOfInput) {
                            break;
                        }

                        /* reset the converter without calling the callback function */
                        implReset();
                    }

                    /* done successfully */
                    return cr;
                }
            }

            /* error handling: cr is no longer an underflow here */
            {
                /*
                 * Only malformed/unmappable results can be handed to the callback.
                 * (The former test "isMalformed() && isUnmappable()" could never be
                 * true; at this point the negated form below yields the same outcome
                 * and states the intent.)
                 */
                if( calledCallback || cr.isOverflow() ||
                    (!cr.isMalformed() && !cr.isUnmappable())
                  ){
                    /*
                     * the callback did not or cannot resolve the error:
                     * set output pointers and return
                     *
                     * the check for buffer overflow is redundant but it is
                     * a high-runner case and hopefully documents the intent
                     * well
                     *
                     * if we were replaying, then the replay buffer must be
                     * copied back into the converter
                     * and the real arguments must be restored
                     */
                    if(realSource!=null) {
                        int length;
                        length=source.remaining();
                        if(length>0) {
                            source.get(preFromUArray, 0, length );
                            preFromULength=(byte)-length;
                        }

                        source=realSource;
                        flush=realFlush;
                    }
                    return cr;
                }
            }

            /* callback handling */
            {
                /* remember the offending code point before the state is cleared */
                int codePoint = fromUChar32;

                /* get and write the code point */
                errorInputLength = UTF16.append(invalidUCharBuffer, 0, fromUChar32);
                invalidUCharLength = errorInputLength;

                /* set the converter state to deal with the next character */
                fromUChar32=0;

                /* call the callback function with the saved code point (not the cleared field) */
                cr = fromCharErrorBehaviour.call(this, fromUContext, source, target, offsets, invalidUCharBuffer, invalidUCharLength, codePoint, cr);
            }

            /*
             * loop back to the offset handling
             *
             * this flag will indicate after offset handling
             * that a callback was called;
             * if the callback did not resolve the error, then we return
             */
            calledCallback=true;
        }
    }
}
/**
* Ascertains if a given Unicode code point (32bit value for handling surrogates)
* can be converted to the target encoding. If the caller wants to test if a
* surrogate pair can be converted to target encoding then the
* responsibility of assembling the int value lies with the caller.
* <p>
* This base implementation does not inspect the code point and returns
* <code>true</code> for every input.
* <p>
* For assembling a code point the caller can use UTF16 class of ICU4J and do something like:
* <pre>
* boolean passed = true;
* int i = 0;
* while(i<mySource.length){
* if(UTF16.isLeadSurrogate(mySource[i]) && i+1<mySource.length && UTF16.isTrailSurrogate(mySource[i+1])){
* int temp = UTF16.charAt(mySource,i,i+2,0);
* if(!((CharsetEncoderICU) myConv).canEncode(temp)){
* passed=false;
* }
* i+=2;
* }else{
* i++;
* }
* }
* </pre>
* or
* <pre>
* String src = new String(mySource);
* int i = 0;
* int codepoint;
* boolean passed = true;
* while(i<src.length()){
* codepoint = UTF16.charAt(src,i);
* i+= (codepoint>0xffff)? 2:1;
* if(!((CharsetEncoderICU) myConv).canEncode(codepoint)){
* passed = false;
* }
* }
* </pre>
*
* @param codepoint Unicode code point as int value
* @return true if a character can be converted
* @draft ICU 3.6
*
*/
public boolean canEncode(int codepoint) {
return true;
}
/**
* Tells whether the given byte sequence may be used as this encoder's replacement.
* This implementation does not inspect the bytes and accepts every replacement.
*
* @param repl the proposed replacement bytes (not examined here)
* @return always true
*/
public boolean isLegalReplacement(byte[] repl){
return true;
}
/**
* Finalizer hook. The body is currently empty: nothing is released here and
* the superclass finalizer is not invoked.
* @exception Throwable declared for compatibility with the finalize contract;
* this implementation itself does not throw
* @draft ICU 3.6
*/
protected void finalize() throws Throwable {
}
/**
 * Copies {@code bytesLength} bytes starting at {@code bytesArray[bytesBegin]} to
 * {@code out}. Bytes that do not fit are stored at the start of
 * {@code cnv.errorBuffer} and {@code cnv.errorBufferLength} is set to their
 * count (0 if everything fit).
 * <p>
 * One offset entry ({@code sourceIndex}) is recorded for every byte that was
 * actually written to {@code out}, including the partial-overflow case.
 *
 * @param cnv         the encoder whose overflow buffer receives bytes that do not fit
 * @param bytesArray  array holding the bytes to write
 * @param bytesBegin  index of the first byte to write
 * @param bytesLength number of bytes to write
 * @param out         target buffer
 * @param offsets     optional offsets buffer, may be {@code null}
 * @param sourceIndex value stored in {@code offsets} for each written byte
 * @return {@link CoderResult#OVERFLOW} if some bytes had to be stored in the
 *         overflow buffer, otherwise {@link CoderResult#UNDERFLOW}
 */
protected static final CoderResult fromUWriteBytes(CharsetEncoderICU cnv,
        byte[] bytesArray, int bytesBegin, int bytesLength,
        ByteBuffer out, IntBuffer offsets, int sourceIndex){
    final int bytesLimit = bytesBegin + bytesLength;
    CoderResult cr = CoderResult.UNDERFLOW;

    /* write bytes while there is room, keeping the offsets in step with the output */
    while(bytesBegin<bytesLimit && out.hasRemaining()) {
        out.put(bytesArray[bytesBegin++]);
        if(offsets!=null) {
            offsets.put(sourceIndex);
        }
    }

    /* write overflow: whatever is left goes into the converter's overflow buffer */
    cnv.errorBufferLength = bytesLimit - bytesBegin;
    if(cnv.errorBufferLength>0) {
        int index = 0;
        while(bytesBegin<bytesLimit) {
            cnv.errorBuffer[index++]=bytesArray[bytesBegin++];
        }
        cr = CoderResult.OVERFLOW;
    }
    return cr;
}
/**
 * Returns the number of chars held in the converter's internal state
 * because more input is needed for completing the conversion. This function is
 * useful for mapping semantics of ICU's converter interface to those of iconv,
 * and this information is not needed for normal conversion.
 * <p>
 * The count is derived from {@code preFromULength}, {@code preFromUFirstCP}
 * and {@code fromUChar32}, checked in that order of precedence.
 *
 * @return The number of chars in the state, 0 if nothing is pending.
 * @draft ICU 3.4
 */
/*public*/ int fromUCountPending(){
    final int buffered = preFromULength;
    if(buffered > 0){
        /* first code point of the partial match plus the units stored after it */
        return UTF16.getCharCount(preFromUFirstCP) + buffered;
    }
    if(buffered < 0){
        /* a negative length marks units waiting to be replayed */
        return -buffered;
    }
    if(fromUChar32 > 0){
        return 1;
    }
    if(preFromUFirstCP > 0){
        return UTF16.getCharCount(preFromUFirstCP);
    }
    return 0;
}
/**
 * Moves the position of {@code source} back by the number of chars that
 * {@link #fromUCountPending()} reports as still held in the converter state.
 *
 * @param source the buffer whose position is rewound
 */
private final void setSourcePosition(CharBuffer source){
    final int current = source.position();
    final int pending = fromUCountPending();
    // input held from the previous invocation is counted as not yet consumed
    source.position(current - pending);
}
/**
 * Write the codepage substitution character.
 * Subclasses to override this method.
 * For stateful converters, it is typically necessary to handle this
 * specifically for the converter in order to properly maintain the state.
 * <p>
 * The charset's single-byte {@code subChar1} is written when it is non-zero and
 * the first invalid char is at most 0xff; otherwise the encoder's replacement
 * bytes are written. The current source position is used as the offset value.
 *
 * @param encoder the encoder whose charset and replacement are consulted
 * @param source  the source buffer; only its position is read
 * @param target  receives the substitution bytes
 * @param offsets optional offsets buffer
 * @return the result of {@code fromUWriteBytes}
 */
protected CoderResult cbFromUWriteSub (CharsetEncoderICU encoder,
        CharBuffer source, ByteBuffer target,
        IntBuffer offsets){
    final CharsetICU charset = (CharsetICU) encoder.charset();
    final byte[] replacement = encoder.replacement();

    final boolean useSubChar1 = charset.subChar1 != 0 && encoder.invalidUCharBuffer[0] <= 0xff;

    final byte[] bytes;
    if (useSubChar1) {
        bytes = new byte[] { charset.subChar1 };
    } else {
        bytes = replacement;
    }
    return CharsetEncoderICU.fromUWriteBytes(encoder, bytes, 0, bytes.length,
            target, offsets, source.position());
}
}

View file

@ -0,0 +1,192 @@
/**
*******************************************************************************
* Copyright (C) 2006, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
*******************************************************************************
*/
package com.ibm.icu.charset;
import java.io.ByteArrayInputStream;
import java.io.InputStreamReader;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.HashMap;
import com.ibm.icu.lang.UCharacter;
/**
 * Common base class of the ICU charset implementations. It stores the ICU and
 * Java canonical names plus a few converter properties, and provides the
 * factory ({@link #getCharset}) that instantiates the implementation class
 * for a given ICU canonical name by reflection.
 */
public abstract class CharsetICU extends Charset{

    protected String icuCanonicalName;
    protected String javaCanonicalName;
    protected int options;

    protected int maxBytesPerChar;
    protected int minBytesPerChar;
    protected float maxCharsPerByte;
    protected byte subChar1 = 0x00;

    protected int mode;
    protected boolean flush;
    protected boolean useFallback;

    /**
     * Stores both names after handing the Java name and aliases to {@link Charset}.
     *
     * @param icuCanonicalName canonical name of the converter in ICU
     * @param canonicalName    canonical name of the charset in Java
     * @param aliases          aliases passed on to the {@link Charset} constructor
     * @throws IllegalCharsetNameException if {@code canonicalName} is empty
     * @draft ICU 3.6
     */
    protected CharsetICU(String icuCanonicalName, String canonicalName, String[] aliases) {
        super(canonicalName,aliases);
        if(canonicalName.length() == 0){
            throw new IllegalCharsetNameException(canonicalName);
        }
        this.javaCanonicalName = canonicalName;
        this.icuCanonicalName  = icuCanonicalName;
    }

    /**
     * Ascertains if a charset is a sub set of this charset.
     * This implementation only recognizes a charset that is equal to this one.
     * @param cs charset to test
     * @return true if the given charset is non-null and equal to this charset
     */
    public boolean contains(Charset cs){
        return cs != null && this.equals(cs);
    }

    /** Maps ICU canonical names of algorithmic converters to implementation class names. */
    private static final HashMap algorithmicCharsets = new HashMap();
    static{
        algorithmicCharsets.put("BOCU-1",                "com.ibm.icu.impl.CharsetBOCU1" );
        algorithmicCharsets.put("CESU-8",                "com.ibm.icu.impl.CharsetCESU8" );
        algorithmicCharsets.put("HZ",                    "com.ibm.icu.impl.CharsetHZ" );
        algorithmicCharsets.put("imapmailboxname",       "com.ibm.icu.impl.CharsetIMAP" );
        algorithmicCharsets.put("ISCII",                 "com.ibm.icu.impl.CharsetISCII" );
        algorithmicCharsets.put("iso2022",               "com.ibm.icu.impl.CharsetISO2022" );
        // NOTE(review): "iso88591" pointing at CharsetBOCU1 looks like a copy/paste slip;
        // left unchanged because the intended class name is not visible here -- confirm.
        algorithmicCharsets.put("iso88591",              "com.ibm.icu.impl.CharsetBOCU1" );
        algorithmicCharsets.put("lmbcs1",                "com.ibm.icu.impl.CharsetLMBCS1" );
        algorithmicCharsets.put("lmbcs11",               "com.ibm.icu.impl.CharsetLMBCS11" );
        algorithmicCharsets.put("lmbcs16",               "com.ibm.icu.impl.CharsetLMBCS16" );
        algorithmicCharsets.put("lmbcs17",               "com.ibm.icu.impl.CharsetLMBCS17" );
        algorithmicCharsets.put("lmbcs18",               "com.ibm.icu.impl.CharsetLMBCS18" );
        algorithmicCharsets.put("lmbcs19",               "com.ibm.icu.impl.CharsetLMBCS19" );
        algorithmicCharsets.put("lmbcs2",                "com.ibm.icu.impl.CharsetLMBCS2" );
        algorithmicCharsets.put("lmbcs3",                "com.ibm.icu.impl.CharsetLMBCS3" );
        algorithmicCharsets.put("lmbcs4",                "com.ibm.icu.impl.CharsetLMBCS4" );
        algorithmicCharsets.put("lmbcs5",                "com.ibm.icu.impl.CharsetLMBCS5" );
        algorithmicCharsets.put("lmbcs6",                "com.ibm.icu.impl.CharsetLMBCS6" );
        algorithmicCharsets.put("lmbcs8",                "com.ibm.icu.impl.CharsetLMBCS8" );
        algorithmicCharsets.put("scsu",                  "com.ibm.icu.impl.CharsetSCSU" );
        algorithmicCharsets.put("usascii",               "com.ibm.icu.impl.CharsetUSASCII" );
        algorithmicCharsets.put("UTF-16",                "com.ibm.icu.impl.CharsetUTF16" );
        algorithmicCharsets.put("UTF-16BE",              "com.ibm.icu.impl.CharsetUTF16" );
        algorithmicCharsets.put("UTF-16LE",              "com.ibm.icu.impl.CharsetUTF16LE" );
        algorithmicCharsets.put("UTF16_OppositeEndian",  "com.ibm.icu.impl.CharsetUTF16LE" );
        algorithmicCharsets.put("UTF16_PlatformEndian",  "com.ibm.icu.impl.CharsetUTF16" );
        algorithmicCharsets.put("UTF-32",                "com.ibm.icu.impl.CharsetUTF32" );
        algorithmicCharsets.put("UTF-32BE",              "com.ibm.icu.impl.CharsetUTF32" );
        algorithmicCharsets.put("UTF-32LE",              "com.ibm.icu.impl.CharsetUTF32LE" );
        // platform/opposite endian follow the same convention as the UTF-16 entries above
        // (platform = big-endian class, opposite = little-endian class)
        algorithmicCharsets.put("UTF32_PlatformEndian",  "com.ibm.icu.impl.CharsetUTF32" );
        algorithmicCharsets.put("UTF32_OppositeEndian",  "com.ibm.icu.impl.CharsetUTF32LE" );
        algorithmicCharsets.put("UTF-7",                 "com.ibm.icu.impl.CharsetUTF7" );
        algorithmicCharsets.put("UTF-8",                 "com.ibm.icu.impl.CharsetUTF8" );
    }

    /**
     * Instantiates the charset implementation for an ICU canonical name.
     * Names not listed in the algorithmic table are loaded through
     * {@code com.ibm.icu.impl.CharsetMBCS}. The implementation class must offer a
     * {@code (String, String, String[])} constructor, which is invoked reflectively.
     *
     * @param icuCanonicalName  ICU canonical converter name, used for the class lookup
     * @param javaCanonicalName Java canonical name passed to the constructor
     * @param aliases           aliases passed to the constructor
     * @return the newly created charset
     * @throws UnsupportedCharsetException if the class cannot be found, lacks the
     *         constructor, cannot be instantiated, or its constructor throws; the
     *         underlying exception is attached as the cause
     */
    /*public*/ static final Charset getCharset(String icuCanonicalName, String javaCanonicalName, String[] aliases){
        String className = (String) algorithmicCharsets.get(icuCanonicalName);
        if(className==null){
            //all the cnv files are loaded as MBCS
            className = "com.ibm.icu.impl.CharsetMBCS";
        }

        Throwable cause = null;
        try{
            Class cs = Class.forName(className);
            Class[] paramTypes = new Class[]{ String.class, String.class,  String[].class};
            final Constructor c = cs.getConstructor(paramTypes);
            Object[] params = new Object[]{ icuCanonicalName, javaCanonicalName, aliases};

            java.security.AccessController.doPrivileged
                    (new java.security.PrivilegedAction() {
                        public Object run() {
                            c.setAccessible(true);
                            return null;
                        }
                    });

            // Run constructor
            try {
                Object obj = c.newInstance(params);
                if(obj instanceof CharsetICU){
                    return (CharsetICU)obj;
                }
            }catch (InvocationTargetException e) {
                UnsupportedCharsetException failure = new UnsupportedCharsetException( icuCanonicalName+": "+"Could not load " + className+ ". Exception:" + e.getTargetException());
                failure.initCause(e.getTargetException());
                throw failure;
            }
        }catch(ClassNotFoundException ex){
            cause = ex;
        }catch(NoSuchMethodException ex){
            cause = ex;
        }catch (IllegalAccessException ex){
            cause = ex;
        }catch (InstantiationException ex){
            cause = ex;
        }

        // every failure ends up here; keep the reason instead of dropping it
        UnsupportedCharsetException failure = new UnsupportedCharsetException( icuCanonicalName+": "+"Could not load " + className);
        if(cause!=null){
            failure.initCause(cause);
        }
        throw failure;
    }

    /** Always use fallbacks from codepage to Unicode */
    protected final boolean isToUUseFallback() {
        return true;
    }

    /** Use fallbacks from Unicode to codepage when useFallback or for private-use code points */
    protected final boolean isFromUUseFallback(int c) {
        return (useFallback) || isPrivateUse(c);
    }

    /**
     * Returns the encoding name reported by an {@link InputStreamReader} that was
     * created without naming a charset.
     * @return the name of that encoding
     */
    public static final String getDefaultCharsetName(){
        String defaultEncoding = new InputStreamReader(new ByteArrayInputStream(new byte[0])).getEncoding();
        return defaultEncoding;
    }

    /**
     * @param c code point to test
     * @return true if the general category of {@code c} is PRIVATE_USE
     */
    /*public*/ static final boolean isPrivateUse(int c) {
        return (UCharacter.getType(c) == UCharacter.PRIVATE_USE);
    }

    /**
     * Returns a charset object for the named charset.
     * This method guarantees that an ICU charset is returned when
     * available.  If the ICU charset provider does not support
     * the specified charset, then try other charset providers
     * including the standard Java charset provider.
     *
     * @param charsetName The name of the requested charset,
     * may be either a canonical name or an alias
     * @return A charset object for the named charset
     * @throws IllegalCharsetNameException If the given charset name
     * is illegal
     * @throws UnsupportedCharsetException If no support for the
     * named charset is available in this instance of the Java
     * virtual machine
     */
    public static Charset forNameICU(String charsetName) throws IllegalCharsetNameException, UnsupportedCharsetException {
        CharsetProviderICU icuProvider = new CharsetProviderICU();
        Charset cs = icuProvider.charsetForName(charsetName);
        if (cs != null) {
            return cs;
        }
        return Charset.forName(charsetName);
    }
}

View file

@ -0,0 +1,260 @@
/**
*******************************************************************************
* Copyright (C) 2006, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
*******************************************************************************
*/
package com.ibm.icu.charset;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;
import java.nio.charset.spi.CharsetProvider;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.NoSuchElementException;
import com.ibm.icu.impl.UConverterAlias;
/**
 * {@link CharsetProvider} that exposes the ICU converters as Java charsets.
 * Names are resolved through {@code UConverterAlias}; the charset objects are
 * created by {@code CharsetICU.getCharset}.
 */
public final class CharsetProviderICU extends CharsetProvider{

    /**
     * Constructs a CharsetProviderICU object
     * @stable ICU 2.4
     */
    public CharsetProviderICU(){
    }

    /**
     * Constructs a charset for the given charset name
     * @param charsetName charset name
     * @return charset object for the given charset name, null if unsupported
     * @stable ICU 2.4
     */
    public final Charset charsetForName(String charsetName){
        try{
            // get the canonical name
            String icuCanonicalName = getICUCanonicalName(charsetName);

            // create the converter object and return it
            if(icuCanonicalName==null || icuCanonicalName.length()==0){
                // this would make the Charset API to throw
                // unsupported encoding exception
                return null;
            }
            return getCharset(icuCanonicalName);
        }catch(UnsupportedCharsetException ex){
            // deliberate: a provider reports "not supported" by returning null
        }catch(IOException ex){
            // deliberate: alias data could not be read, treat the name as unsupported
        }
        return null;
    }

    /**
     * Gets the canonical name of the converter as defined by ICU.
     * The name is looked up with the "MIME", "IANA" and "" standards in turn, then
     * as an alias; failing that, a leading "x-" is stripped from the name.
     * @param enc converter name
     * @return canonical name of the converter, "" if {@code enc} is not recognized,
     *         null if {@code enc} is null
     * @throws UnsupportedCharsetException if the alias data cannot be read
     * @internal ICU 3.4
     */
    public static final String getICUCanonicalName(String enc)
                                throws UnsupportedCharsetException{
        String canonicalName = null;
        String ret = null;
        try{
            if(enc!=null){
                if((canonicalName = UConverterAlias.getCanonicalName(enc, "MIME"))!=null){
                    ret = canonicalName;
                }else if((canonicalName = UConverterAlias.getCanonicalName(enc, "IANA"))!=null){
                    ret = canonicalName;
                }else if((canonicalName = UConverterAlias.getCanonicalName(enc, ""))!=null){
                    ret = canonicalName;
                }else if((canonicalName = UConverterAlias.getAlias(enc, 0))!=null){
                    /* we have some aliases in the form x-blah .. match those first */
                    ret = canonicalName;
                }else if(enc.startsWith("x-")){
                    /* TODO: Match with getJavaCanonicalName method */
                    ret = enc.substring(2);
                }else{
                    /* unsupported encoding */
                    ret = "";
                }
            }
            return ret;
        }catch(IOException ex){
            throw new UnsupportedCharsetException(enc);
        }
    }

    /**
     * Creates the charset for an ICU canonical name, supplying its aliases and
     * its Java canonical name.
     */
    private static final Charset getCharset(String icuCanonicalName) throws IOException{
        String[] aliases = getAliases(icuCanonicalName);
        String canonicalName = getJavaCanonicalName(icuCanonicalName);
        return (CharsetICU.getCharset(icuCanonicalName,canonicalName, aliases));
    }

    /**
     * Gets the canonical name of the converter as defined by Java
     * @param icuCanonicalName converter name
     * @return canonical name of the converter, null if the argument is null or
     *         the alias data cannot be read
     * @internal ICU 3.4
     */
    private static String getJavaCanonicalName(String icuCanonicalName){
        /*
        If a charset listed in the IANA Charset Registry is supported by an implementation
        of the Java platform then its canonical name must be the name listed in the registry.
        Many charsets are given more than one name in the registry, in which case the registry
        identifies one of the names as MIME-preferred. If a charset has more than one registry
        name then its canonical name must be the MIME-preferred name and the other names in
        the registry must be valid aliases. If a supported charset is not listed in the IANA
        registry then its canonical name must begin with one of the strings "X-" or "x-".
        */
        if(icuCanonicalName==null ){
            return null;
        }
        try{
            /* find out the alias with MIME tag */
            String cName = UConverterAlias.getStandardName(icuCanonicalName, "MIME");
            if(cName==null){
                /* find out the alias with IANA tag */
                cName = UConverterAlias.getStandardName(icuCanonicalName, "IANA");
            }
            if(cName==null){
                /*
                    check to see if an alias already exists with x- prefix, if yes then
                    make that the canonical name
                */
                int aliasNum = UConverterAlias.countAliases(icuCanonicalName);
                String name;
                for(int i=0;i<aliasNum;i++){
                    name = UConverterAlias.getAlias(icuCanonicalName, i);
                    if(name!=null && name.startsWith("x-")){
                        cName = name;
                        break;
                    }
                }
                /* last resort just append x- to any of the alias and
                   make it the canonical name */
                if((cName==null || cName.length()==0)){
                    name = UConverterAlias.getStandardName(icuCanonicalName, "UTR22");
                    if(name==null && icuCanonicalName.indexOf(",")!=-1){
                        name = UConverterAlias.getAlias(icuCanonicalName, 1);
                    }
                    /* if there is no UTR22 canonical name .. then just return itself*/
                    if(name==null){
                        name = icuCanonicalName;
                    }
                    cName = "x-"+ name;
                }
            }
            return cName;
        }catch (IOException ex){
            // deliberate: fall through and report "no name"
        }
        return null;
    }

    /**
     * Gets the aliases associated with the converter name.
     * Aliases containing '+' or ',' are left out. The scratch array is sized from
     * the reported alias count, so converters with many aliases are handled too,
     * and a null alias entry is skipped.
     * @param encName converter name
     * @return the aliases, null if {@code encName} is null
     * @internal ICU 2.4
     */
    private static final String[] getAliases(String encName)throws IOException{
        if(encName == null){
            return null;
        }
        int aliasNum = UConverterAlias.countAliases(encName);
        String[] kept = new String[aliasNum > 0 ? aliasNum : 0];
        int j = 0;
        for(int i=0;i<aliasNum;i++){
            String name = UConverterAlias.getAlias(encName,i);
            if(name!=null && name.indexOf('+')==-1 && name.indexOf(',')==-1){
                kept[j++]= name;
            }
        }
        String[] ret = new String[j];
        System.arraycopy(kept, 0, ret, 0, j);
        return ret;
    }

    /**
     * Class that implements the iterator for charsets
     * @stable ICU 2.4
     */
    protected final class CharsetIterator implements Iterator{
        private String[] names;
        private int currentIndex;
        protected CharsetIterator(String[] strs){
            names = strs;
            currentIndex=0;
        }
        public boolean hasNext(){
            return (currentIndex< names.length);
        }
        public Object next(){
            if(currentIndex<names.length){
                return charsetForName(names[currentIndex++]);
            }else{
                throw new NoSuchElementException();
            }
        }
        public void remove(){
            throw new UnsupportedOperationException();
        }
    }

    /**
     * Adds every available converter that can actually be instantiated to
     * {@code map}, keyed by the charset object with its Java canonical name as value.
     */
    private static final void putCharsets(Map map){
        int num = UConverterAlias.countAvailable();
        for(int i=0;i<num;i++) {
            String name = UConverterAlias.getAvailableName(i);
            try {
                Charset cs =  getCharset(name);
                map.put(cs, getJavaCanonicalName(name));
            }catch(UnsupportedCharsetException ex){
                // deliberate: add only charsets that can be created!
            }catch (IOException e) {
                // deliberate: add only charsets that can be created!
            }
        }
    }

    /**
     * Returns an iterator for the available charsets
     * @return Iterator over the charset objects
     */
    public final Iterator charsets(){
        HashMap map = new HashMap();
        putCharsets(map);
        return map.keySet().iterator();
    }

    /**
     * Gets the canonical names of available converters
     * @return Object[] names as an object array
     */
    public static final Object[] getAvailableNames(){
        HashMap map = new HashMap();
        putCharsets(map);
        return map.values().toArray();
    }

    /**
     * Return all names available
     * @return the names reported by {@code UConverterAlias.getAvailableName}
     */
    public static final String[] getAllNames(){
        int num = UConverterAlias.countAvailable();
        String[] names = new String[num];
        for(int i=0;i<num;i++) {
            names[i] = UConverterAlias.getAvailableName(i);
        }
        return names;
    }
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,446 @@
/**
*******************************************************************************
* Copyright (C) 2006, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
*******************************************************************************
*/
package com.ibm.icu.impl;
import java.nio.BufferOverflowException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.IntBuffer;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import com.ibm.icu.charset.CharsetDecoderICU;
import com.ibm.icu.charset.CharsetEncoderICU;
import com.ibm.icu.charset.CharsetICU;
import com.ibm.icu.text.UTF16;
public class CharsetUTF16 extends CharsetICU {
protected byte[] fromUSubstitution = new byte[]{(byte)0xff, (byte)0xfd};
/**
 * Creates the charset and records its size properties:
 * 2 to 4 bytes per char and at most 1 char per byte.
 *
 * @param icuCanonicalName  ICU canonical name, passed to the superclass
 * @param javaCanonicalName Java canonical name, passed to the superclass
 * @param aliases           aliases, passed to the superclass
 */
public CharsetUTF16(String icuCanonicalName, String javaCanonicalName, String[] aliases){
    super(icuCanonicalName, javaCanonicalName, aliases);

    // one code unit takes 2 bytes, a surrogate pair takes 4
    minBytesPerChar = 2;
    maxBytesPerChar = 4;

    maxCharsPerByte = 1;
}
/**
* Decoder for big-endian UTF-16 bytes (each code unit is assembled as
* first byte << 8 | second byte).
*/
class CharsetDecoderUTF16 extends CharsetDecoderICU{
public CharsetDecoderUTF16(CharsetICU cs) {
super(cs);
}
/**
* Converts byte pairs from source into chars in target.
* Incomplete code units or an unpaired lead surrogate at the end of the input
* are kept in toUBytesArray/toULength (and toUnicodeStatus) for the next call;
* a trail surrogate that no longer fits is kept in charErrorBufferArray.
* Bytes are read with absolute gets; the source position is only updated at
* the end of the method.
*
* @param source bytes to decode
* @param target receives the decoded chars
* @param offsets optional offsets buffer, may be null
* @return UNDERFLOW, OVERFLOW, or a malformed-input result
*/
protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets){
CoderResult cr = CoderResult.UNDERFLOW;
if(!source.hasRemaining() && toUnicodeStatus==0) {
/* no input, nothing to do */
return cr;
}
if(!target.hasRemaining()) {
return CoderResult.OVERFLOW;
}
int sourceIndex=0, count=0, length, sourceArrayIndex;
char c=0, trail;
length = source.remaining();
sourceArrayIndex = source.position();
try{
/* complete a partial UChar or pair from the last call */
if(toUnicodeStatus!=0) {
/*
* special case: single byte from a previous buffer,
* where the byte turned out not to belong to a trail surrogate
* and the preceding, unmatched lead surrogate was put into toUBytes[]
* for error handling
*/
toUBytesArray[toUBytesBegin+0]=(byte)toUnicodeStatus;
toULength=1;
toUnicodeStatus=0;
}
if((count=toULength)!=0) {
byte[] pArray=toUBytesArray;
int pArrayIndex = toUBytesBegin;
do {
/* NOTE(review): the write uses index count, the reads below use pArrayIndex+n; these only agree if toUBytesBegin is 0 -- confirm */
pArray[count++]=source.get(sourceArrayIndex++);
++sourceIndex;
--length;
if(count==2) {
c=(char)(((pArray[pArrayIndex+0]&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(pArray[pArrayIndex+1]&UConverterConstants.UNSIGNED_BYTE_MASK));
if(!UTF16.isSurrogate(c)) {
/* output the BMP code point */
target.put(c);
if(offsets!=null) {
offsets.put(-1);
}
count=0;
c=0;
break;
} else if(UTF16.isLeadSurrogate(c)) {
/* continue collecting bytes for the trail surrogate */
c=0; /* avoid unnecessary surrogate handling below */
} else {
/* fall through to error handling for an unmatched trail surrogate */
break;
}
} else if(count==4) {
c=(char)(((pArray[pArrayIndex+0]&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(pArray[pArrayIndex+1]&UConverterConstants.UNSIGNED_BYTE_MASK));
trail=(char)(((pArray[pArrayIndex+2]&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(pArray[pArrayIndex+3]&UConverterConstants.UNSIGNED_BYTE_MASK));
if(UTF16.isTrailSurrogate(trail)) {
/* output the surrogate pair */
target.put(c);
if(target.remaining()>=1) {
target.put(trail);
if(offsets!=null) {
offsets.put(-1);
offsets.put(-1);
}
} else /* targetCapacity==1 */ {
/* no room for the trail: park it in the char overflow buffer; the catch below turns this into OVERFLOW */
charErrorBufferArray[charErrorBufferBegin+0]=trail;
charErrorBufferLength=1;
throw new BufferOverflowException();
}
count=0;
c=0;
break;
} else {
/* unmatched lead surrogate, handle here for consistent toUBytes[] */
/* back out reading the code unit after it */
/* NOTE(review): sourceArrayIndex starts at source.position() and is only incremented, so this difference is never >=2 here; the operands look reversed -- confirm */
if((source.position()-sourceArrayIndex)>=2) {
sourceArrayIndex-=2;
} else {
/*
* if the trail unit's first byte was in a previous buffer, then
* we need to put it into a special place because toUBytes[] will be
* used for the lead unit's bytes
*/
/* NOTE(review): the byte is sign-extended before the OR, so values >= 0x80 set all upper bits -- confirm whether a mask is intended */
toUnicodeStatus=0x100|pArray[pArrayIndex+2];
--sourceArrayIndex;
}
toULength=2;
/* NOTE(review): a buffer index is passed as the malformed-input length here and below -- confirm the intended length */
cr = CoderResult.malformedForLength(sourceArrayIndex);;
}
}
} while(length>0);
toULength=(byte)count;
}
/* copy an even number of bytes for complete UChars */
count=2*target.remaining();
if(count>length) {
count=length&~1;
}
if(c==0 && count>0) {
length-=count;
count>>=1;
//targetCapacity-=count;
if(offsets==null) {
do {
c=(char)(((source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK));
sourceArrayIndex+=2;
if(!UTF16.isSurrogate(c)) {
target.put(c);
} else if(UTF16.isLeadSurrogate(c) && count>=2 &&
UTF16.isTrailSurrogate(trail=(char)(((source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK)))
) {
sourceArrayIndex+=2;
--count;
target.put(c);
target.put(trail);
} else {
break;
}
} while(--count>0);
} else {
/* same loop as above, additionally recording the source index of each output char */
do {
c=(char)(((source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK));
sourceArrayIndex+=2;
if(!UTF16.isSurrogate(c)) {
target.put(c);
offsets.put(sourceIndex);
sourceIndex+=2;
} else if(UTF16.isLeadSurrogate(c) && count>=2 &&
UTF16.isTrailSurrogate(trail=(char)(((source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK)))
) {
sourceArrayIndex+=2;
--count;
target.put(c);
target.put(trail);
offsets.put(sourceIndex);
offsets.put(sourceIndex);
sourceIndex+=4;
} else {
break;
}
} while(--count>0);
}
if(count==0) {
/* done with the loop for complete UChars */
c=0;
} else {
/* keep c for surrogate handling, trail will be set there */
length+=2*(count-1); /* one more byte pair was consumed than count decremented */
}
}
if(c!=0) {
/*
* c is a surrogate, and
* - source or target too short
* - or the surrogate is unmatched
*/
toUBytesArray[toUBytesBegin+0]=(byte)(c>>>8);
toUBytesArray[toUBytesBegin+1]=(byte)c;
toULength=2;
if(UTF16.isLeadSurrogate(c)) {
if(length>=2) {
if(UTF16.isTrailSurrogate(trail=(char)(((source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK)))) {
/* output the surrogate pair, will overflow (see conditions comment above) */
sourceArrayIndex+=2;
length-=2;
target.put(c);
if(offsets!=null) {
offsets.put(sourceIndex);
}
charErrorBufferArray[charErrorBufferBegin+0]=trail;
charErrorBufferLength=1;
toULength=0;
cr = CoderResult.OVERFLOW;
} else {
/* unmatched lead surrogate */
cr = CoderResult.malformedForLength(sourceArrayIndex);
}
} else {
/* see if the trail surrogate is in the next buffer */
}
} else {
/* unmatched trail surrogate */
cr = CoderResult.malformedForLength(sourceArrayIndex);
}
}
/* check for a remaining source byte */
if(length>0) {
if(!target.hasRemaining()) {
cr = CoderResult.OVERFLOW;
} else {
/* it must be length==1 because otherwise the above would have copied more */
toUBytesArray[toULength++]=source.get(sourceArrayIndex++);
}
}
/* publish how far the input was consumed; skipped when a BufferOverflowException was thrown above */
source.position(sourceArrayIndex);
}catch(BufferOverflowException ex){
cr = CoderResult.OVERFLOW;
}
return cr;
}
}
class CharsetEncoderUTF16 extends CharsetEncoderICU{
/**
* Creates the encoder with the enclosing charset's fromUSubstitution bytes as
* replacement and resets it, which marks the BOM as still to be written.
* @param cs the charset this encoder belongs to
*/
public CharsetEncoderUTF16(CharsetICU cs) {
super(cs, fromUSubstitution);
implReset();
}
/** Value of fromUnicodeStatus while the byte order mark has not been written yet. */
private final static int NEED_TO_WRITE_BOM = 1;
/**
* Resets the encoder via the superclass and marks the BOM as pending again,
* so the next encodeLoop call with input writes it first.
*/
protected void implReset() {
super.implReset();
fromUnicodeStatus = NEED_TO_WRITE_BOM;
}
protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets){
CoderResult cr = CoderResult.UNDERFLOW;
if(!source.hasRemaining()) {
/* no input, nothing to do */
return cr;
}
char c;
/* write the BOM if necessary */
if(fromUnicodeStatus==NEED_TO_WRITE_BOM) {
byte bom[]={ (byte)0xfe, (byte)0xff };
cr = fromUWriteBytes(this,bom, 0, bom.length, target, offsets, -1);
if(cr.isError()){
return cr;
}
fromUnicodeStatus=0;
}
if(!target.hasRemaining()) {
return CoderResult.OVERFLOW;
}
int sourceIndex = 0;
char trail = 0;
int length = source.remaining();
try{
/* c!=0 indicates in several places outside the main loops that a surrogate was found */
if((c=(char)fromUChar32)!=0 && UTF16.isTrailSurrogate(trail=source.get(sourceIndex)) && target.remaining()>=4) {
/* the last buffer ended with a lead surrogate, output the surrogate pair */
++sourceIndex;
--length;
target.put((byte)(c>>>8));
target.put((byte)c);
target.put((byte)(trail>>>8));
target.put((byte)trail);
if(offsets!=null && offsets.remaining()>=4) {
offsets.put(-1);
offsets.put(-1);
offsets.put(-1);
offsets.put(-1);
}
sourceIndex=1;
fromUChar32=c=0;
}
byte overflow[/*4*/] = new byte[4];
int sourceArrayIndex = source.position();
if(c==0) {
/* copy an even number of bytes for complete UChars */
int count=2*length;
int targetCapacity = target.limit();
if(count>targetCapacity) {
count=targetCapacity&~1;
}
/* count is even */
targetCapacity-=count;
count>>=1;
length-=count;
if(offsets==null) {
while(count>0) {
c= source.get(sourceArrayIndex++);
if(!UTF16.isSurrogate(c)) {
target.put((byte)(c>>>8));
target.put((byte)c);
} else if(UTF16.isLeadSurrogate(c) && count>=2 && UTF16.isTrailSurrogate(trail=source.get(sourceArrayIndex))) {
++sourceArrayIndex;
--count;
target.put((byte)(c>>>8));
target.put((byte)c);
target.put((byte)(trail>>>8));
target.put((byte)trail);
} else {
break;
}
--count;
}
} else {
while(count>0) {
c=source.get(sourceArrayIndex++);
if(!UTF16.isSurrogate(c)) {
target.put((byte)(c>>>8));
target.put((byte)c);
offsets.put(sourceIndex);
offsets.put(sourceIndex++);
} else if(UTF16.isLeadSurrogate(c) && count>=2 && UTF16.isTrailSurrogate(trail=source.get(sourceArrayIndex))) {
++sourceArrayIndex;
--count;
target.put((byte)(c>>>8));
target.put((byte)c);
target.put((byte)(trail>>>8));
target.put((byte)trail);
offsets.put(sourceIndex);
offsets.put(sourceIndex);
offsets.put(sourceIndex);
offsets.put(sourceIndex);
sourceIndex+=2;
} else {
break;
}
--count;
}
}
if(count==0) {
/* done with the loop for complete UChars */
if(length>0 && targetCapacity>0) {
/*
* there is more input and some target capacity -
* it must be targetCapacity==1 because otherwise
* the above would have copied more;
* prepare for overflow output
*/
if(!UTF16.isSurrogate(c=source.get(sourceArrayIndex++))) {
overflow[0]=(byte)(c>>>8);
overflow[1]=(byte)c;
length=2; /* 2 bytes to output */
c=0;
/* } else { keep c for surrogate handling, length will be set there */
}
} else {
length=0;
c=0;
}
} else {
/* keep c for surrogate handling, length will be set there */
targetCapacity+=2*count;
}
} else {
length=0; /* from here on, length counts the bytes in overflow[] */
}
if(c!=0) {
/*
* c is a surrogate, and
* - source or target too short
* - or the surrogate is unmatched
*/
length=0;
if(UTF16.isLeadSurrogate(c)) {
if(sourceArrayIndex<source.limit()) {
if(UTF16.isTrailSurrogate(trail=source.get(sourceArrayIndex))) {
/* output the surrogate pair, will overflow (see conditions comment above) */
++sourceArrayIndex;
overflow[0]=(byte)(c>>>8);
overflow[1]=(byte)c;
overflow[2]=(byte)(trail>>>8);
overflow[3]=(byte)trail;
length=4; /* 4 bytes to output */
c=0;
} else {
/* unmatched lead surrogate */
//pErrorCode[0]=ErrorCode.U_ILLEGAL_CHAR_FOUND;
cr = CoderResult.malformedForLength(sourceArrayIndex);
}
} else {
/* see if the trail surrogate is in the next buffer */
}
} else {
/* unmatched trail surrogate */
//pErrorCode[0]=ErrorCode.U_ILLEGAL_CHAR_FOUND;
}
fromUChar32=c;
}
source.position(sourceArrayIndex);
if(length>0) {
/* output length bytes with overflow (length>targetCapacity>0) */
fromUWriteBytes(this, overflow, 0, length, target, offsets, sourceIndex);
}
}catch(BufferOverflowException ex){
cr = CoderResult.OVERFLOW;
}
return cr;
}
}
/**
 * Creates a new decoder bound to this charset.
 *
 * @return a fresh CharsetDecoderUTF16 instance
 */
public CharsetDecoder newDecoder() {
    final CharsetDecoder decoder = new CharsetDecoderUTF16(this);
    return decoder;
}
/**
 * Creates a new encoder bound to this charset.
 *
 * @return a fresh CharsetEncoderUTF16 instance
 */
public CharsetEncoder newEncoder() {
    final CharsetEncoder encoder = new CharsetEncoderUTF16(this);
    return encoder;
}
}

View file

@ -0,0 +1,449 @@
/**
*******************************************************************************
* Copyright (C) 2006, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
*******************************************************************************
*/
package com.ibm.icu.impl;
import java.nio.BufferOverflowException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.IntBuffer;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import com.ibm.icu.charset.CharsetDecoderICU;
import com.ibm.icu.charset.CharsetEncoderICU;
import com.ibm.icu.charset.CharsetICU;
import com.ibm.icu.text.UTF16;
/**
 * Little-endian UTF-16 charset: every UTF-16 code unit is two bytes, low byte first.
 * The encoder writes the BOM FF FE ahead of its first output after a reset.
 *
 * @author Niti Hantaweepant
 */
public class CharsetUTF16LE extends CharsetICU {
    /** Substitution bytes handed to the encoder: U+FFFD in little-endian byte order. */
    protected byte[] fromUSubstitution = new byte[]{(byte)0xfd, (byte)0xff};

    public CharsetUTF16LE(String icuCanonicalName, String javaCanonicalName, String[] aliases){
        super(icuCanonicalName, javaCanonicalName, aliases);
        maxBytesPerChar = 4;
        minBytesPerChar = 2;
        maxCharsPerByte = 1;
    }

    /**
     * Decoder for little-endian UTF-16. Bytes of an incomplete code unit or surrogate pair
     * are kept in toUBytesArray/toULength (and one byte in toUnicodeStatus) between calls.
     */
    class CharsetDecoderUTF16LE extends CharsetDecoderICU{
        public CharsetDecoderUTF16LE(CharsetICU cs) {
            super(cs);
        }

        /** Reads the little-endian code unit stored at bytes[index] and bytes[index+1]. */
        private char unitAt(byte[] bytes, int index) {
            return (char)(((bytes[index+1]&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(bytes[index+0]&UConverterConstants.UNSIGNED_BYTE_MASK));
        }

        /** Reads the little-endian code unit at absolute positions index and index+1 of source. */
        private char unitAt(ByteBuffer source, int index) {
            return (char)(((source.get(index+1)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(index+0)&UConverterConstants.UNSIGNED_BYTE_MASK));
        }

        /**
         * Decodes little-endian UTF-16 bytes from source into chars in target.
         *
         * @param source  bytes to decode; its position is advanced past everything consumed
         * @param target  receives the decoded chars
         * @param offsets optional (may be null); receives one source offset per output char
         * @return UNDERFLOW when the input was consumed, OVERFLOW when target is full,
         *         or a malformed-input result for an unpaired surrogate
         */
        protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets){
            CoderResult cr = CoderResult.UNDERFLOW;
            if(!source.hasRemaining() && toUnicodeStatus==0) {
                /* no input, nothing to do */
                return cr;
            }
            if(!target.hasRemaining()) {
                return CoderResult.OVERFLOW;
            }
            int sourceIndex=0, count=0, length, sourceArrayIndex;
            char c=0, trail;
            length = source.remaining();
            sourceArrayIndex = source.position();
            try{
                /* complete a partial UChar or pair from the last call */
                if(toUnicodeStatus!=0) {
                    /*
                     * special case: single byte from a previous buffer,
                     * where the byte turned out not to belong to a trail surrogate
                     * and the preceding, unmatched lead surrogate was put into toUBytes[]
                     * for error handling
                     */
                    toUBytesArray[toUBytesBegin+0]=(byte)toUnicodeStatus;
                    toULength=1;
                    toUnicodeStatus=0;
                }
                if((count=toULength)!=0) {
                    byte[] pArray=toUBytesArray;
                    int pArrayIndex = toUBytesBegin;
                    /* a while loop, so that nothing is read when this call brought no input */
                    while(length>0) {
                        pArray[count++]=source.get(sourceArrayIndex++);
                        ++sourceIndex;
                        --length;
                        if(count==2) {
                            c=unitAt(pArray, pArrayIndex);
                            if(!UTF16.isSurrogate(c)) {
                                /* output the BMP code point */
                                target.put(c);
                                if(offsets!=null) {
                                    offsets.put(-1);
                                }
                                count=0;
                                c=0;
                                break;
                            } else if(UTF16.isLeadSurrogate(c)) {
                                /* continue collecting bytes for the trail surrogate */
                                c=0; /* avoid unnecessary surrogate handling below */
                            } else {
                                /* fall through to error handling for an unmatched trail surrogate */
                                break;
                            }
                        } else if(count==4) {
                            c=unitAt(pArray, pArrayIndex);
                            trail=unitAt(pArray, pArrayIndex+2);
                            if(UTF16.isTrailSurrogate(trail)) {
                                /* output the surrogate pair */
                                target.put(c);
                                if(target.remaining()>=1) {
                                    target.put(trail);
                                    if(offsets!=null) {
                                        offsets.put(-1);
                                        offsets.put(-1);
                                    }
                                } else /* targetCapacity==1 */ {
                                    charErrorBufferArray[charErrorBufferBegin+0]=trail;
                                    charErrorBufferLength=1;
                                    /* report the overflow without throwing, so that toULength and the source position below are still updated */
                                    cr = CoderResult.OVERFLOW;
                                }
                                count=0;
                                c=0;
                                break;
                            } else {
                                /* unmatched lead surrogate, handle here for consistent toUBytes[] */
                                /* back out reading the code unit after it */
                                if((sourceArrayIndex-source.position())>=2) {
                                    sourceArrayIndex-=2;
                                } else {
                                    /*
                                     * if the trail unit's first byte was in a previous buffer, then
                                     * we need to put it into a special place because toUBytes[] will be
                                     * used for the lead unit's bytes
                                     */
                                    toUnicodeStatus=0x100|pArray[pArrayIndex+2];
                                    --sourceArrayIndex;
                                }
                                toULength=2;
                                /* stop right here: reading on would overwrite toULength and run past the collected bytes */
                                source.position(sourceArrayIndex);
                                /* NOTE(review): a buffer index is passed where CoderResult expects a length (and 0 is rejected) - confirm what CharsetDecoderICU expects */
                                return CoderResult.malformedForLength(sourceArrayIndex);
                            }
                        }
                    }
                    toULength=(byte)count;
                }
                /* copy an even number of bytes for complete UChars */
                count=2*target.remaining();
                if(count>length) {
                    count=length&~1;
                }
                if(c==0 && count>0) {
                    length-=count;
                    count>>=1;
                    if(offsets==null) {
                        do {
                            c=unitAt(source, sourceArrayIndex);
                            sourceArrayIndex+=2;
                            if(!UTF16.isSurrogate(c)) {
                                target.put(c);
                            } else if(UTF16.isLeadSurrogate(c) && count>=2 &&
                                      UTF16.isTrailSurrogate(trail=unitAt(source, sourceArrayIndex))
                            ) {
                                sourceArrayIndex+=2;
                                --count;
                                target.put(c);
                                target.put(trail);
                            } else {
                                break;
                            }
                        } while(--count>0);
                    } else {
                        do {
                            c=unitAt(source, sourceArrayIndex);
                            sourceArrayIndex+=2;
                            if(!UTF16.isSurrogate(c)) {
                                target.put(c);
                                offsets.put(sourceIndex);
                                sourceIndex+=2;
                            } else if(UTF16.isLeadSurrogate(c) && count>=2 &&
                                      UTF16.isTrailSurrogate(trail=unitAt(source, sourceArrayIndex))
                            ) {
                                sourceArrayIndex+=2;
                                --count;
                                target.put(c);
                                target.put(trail);
                                offsets.put(sourceIndex);
                                offsets.put(sourceIndex);
                                sourceIndex+=4;
                            } else {
                                break;
                            }
                        } while(--count>0);
                    }
                    if(count==0) {
                        /* done with the loop for complete UChars */
                        c=0;
                    } else {
                        /* keep c for surrogate handling, trail will be set there */
                        length+=2*(count-1); /* one more byte pair was consumed than count decremented */
                    }
                }
                if(c!=0) {
                    /*
                     * c is a surrogate, and
                     * - source or target too short
                     * - or the surrogate is unmatched
                     */
                    toUBytesArray[toUBytesBegin+0]=(byte)c;
                    toUBytesArray[toUBytesBegin+1]=(byte)(c>>>8);
                    toULength=2;
                    if(UTF16.isLeadSurrogate(c)) {
                        if(length>=2) {
                            if(UTF16.isTrailSurrogate(trail=unitAt(source, sourceArrayIndex))) {
                                /* output the surrogate pair, will overflow (see conditions comment above) */
                                sourceArrayIndex+=2;
                                length-=2;
                                target.put(c);
                                if(offsets!=null) {
                                    offsets.put(sourceIndex);
                                }
                                charErrorBufferArray[charErrorBufferBegin+0]=trail;
                                charErrorBufferLength=1;
                                toULength=0;
                                cr = CoderResult.OVERFLOW;
                            } else {
                                /* unmatched lead surrogate */
                                /* NOTE(review): index passed as length, see the note above */
                                cr = CoderResult.malformedForLength(sourceArrayIndex);
                            }
                        } else {
                            /* see if the trail surrogate is in the next buffer */
                        }
                    } else {
                        /* unmatched trail surrogate */
                        cr = CoderResult.malformedForLength(sourceArrayIndex);
                    }
                }
                /* check for a remaining source byte, but only while no error or overflow is pending */
                if(cr.isUnderflow() && length>0) {
                    if(!target.hasRemaining()) {
                        cr = CoderResult.OVERFLOW;
                    } else {
                        /* it must be length==1 because otherwise the above would have copied more */
                        toUBytesArray[toULength++]=source.get(sourceArrayIndex++);
                    }
                }
                source.position(sourceArrayIndex);
            }catch(BufferOverflowException ex){
                /* safety net for a put() beyond a buffer limit; the source position is not written back on this path */
                cr = CoderResult.OVERFLOW;
            }
            return cr;
        }
    }

    /**
     * Encoder for little-endian UTF-16. After every reset the first output is the BOM FF FE.
     * A lead surrogate that ends a source buffer is carried over to the next call in
     * fromUChar32.
     */
    class CharsetEncoderUTF16LE extends CharsetEncoderICU{
        /** Value of fromUnicodeStatus while the BOM still has to be written. */
        private final static int NEED_TO_WRITE_BOM = 1;

        public CharsetEncoderUTF16LE(CharsetICU cs) {
            super(cs, fromUSubstitution);
            implReset();
        }

        /** Resets the encoder and arms the BOM again for the next encodeLoop() call. */
        protected void implReset() {
            super.implReset();
            fromUnicodeStatus = NEED_TO_WRITE_BOM;
        }

        /**
         * Encodes chars from source into little-endian UTF-16 bytes in target.
         *
         * @param source  chars to encode; its position is advanced past everything consumed
         * @param target  receives the encoded bytes
         * @param offsets optional (may be null); receives one source offset per output byte
         * @return UNDERFLOW when the input was consumed, OVERFLOW when target is full,
         *         or a malformed-input result for an unpaired surrogate
         */
        protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets){
            CoderResult cr = CoderResult.UNDERFLOW;
            if(!source.hasRemaining()) {
                /* no input, nothing to do */
                return cr;
            }
            char c;
            /* write the BOM if necessary */
            if(fromUnicodeStatus==NEED_TO_WRITE_BOM) {
                byte bom[]={ (byte)0xff, (byte)0xfe };
                cr = fromUWriteBytes(this,bom, 0, bom.length, target, offsets, -1);
                if(cr.isError()){
                    return cr;
                }
                fromUnicodeStatus=0;
            }
            if(!target.hasRemaining()) {
                return CoderResult.OVERFLOW;
            }
            int sourceIndex = 0;
            char trail = 0;
            int length = source.remaining();
            /* absolute index of the next char to read; written back to source.position() at the end */
            int sourceArrayIndex = source.position();
            try{
                /* c!=0 indicates in several places outside the main loops that a surrogate was found */
                if((c=(char)fromUChar32)!=0 && UTF16.isTrailSurrogate(trail=source.get(sourceArrayIndex)) && target.remaining()>=4) {
                    /* the last buffer ended with a lead surrogate, output the surrogate pair */
                    ++sourceArrayIndex; /* the trail surrogate is consumed here */
                    --length;
                    target.put((byte)c);
                    target.put((byte)(c>>>8));
                    target.put((byte)trail);
                    target.put((byte)(trail>>>8));
                    if(offsets!=null && offsets.remaining()>=4) {
                        offsets.put(-1);
                        offsets.put(-1);
                        offsets.put(-1);
                        offsets.put(-1);
                    }
                    sourceIndex=1;
                    fromUChar32=c=0;
                }
                byte overflow[/*4*/] = new byte[4];
                if(c==0) {
                    /* copy an even number of bytes for complete UChars */
                    int count=2*length;
                    /* free space from the current position, not the absolute limit: the BOM or a pending pair may already be in target */
                    int targetCapacity = target.remaining();
                    if(count>targetCapacity) {
                        count=targetCapacity&~1;
                    }
                    /* count is even */
                    targetCapacity-=count;
                    count>>=1;
                    length-=count;
                    if(offsets==null) {
                        while(count>0) {
                            c= source.get(sourceArrayIndex++);
                            if(!UTF16.isSurrogate(c)) {
                                target.put((byte)c);
                                target.put((byte)(c>>>8));
                            } else if(UTF16.isLeadSurrogate(c) && count>=2 && UTF16.isTrailSurrogate(trail=source.get(sourceArrayIndex))) {
                                ++sourceArrayIndex;
                                --count;
                                target.put((byte)c);
                                target.put((byte)(c>>>8));
                                target.put((byte)trail);
                                target.put((byte)(trail>>>8));
                            } else {
                                break;
                            }
                            --count;
                        }
                    } else {
                        while(count>0) {
                            c=source.get(sourceArrayIndex++);
                            if(!UTF16.isSurrogate(c)) {
                                target.put((byte)c);
                                target.put((byte)(c>>>8));
                                offsets.put(sourceIndex);
                                offsets.put(sourceIndex++);
                            } else if(UTF16.isLeadSurrogate(c) && count>=2 && UTF16.isTrailSurrogate(trail=source.get(sourceArrayIndex))) {
                                ++sourceArrayIndex;
                                --count;
                                target.put((byte)c);
                                target.put((byte)(c>>>8));
                                target.put((byte)trail);
                                target.put((byte)(trail>>>8));
                                offsets.put(sourceIndex);
                                offsets.put(sourceIndex);
                                offsets.put(sourceIndex);
                                offsets.put(sourceIndex);
                                sourceIndex+=2;
                            } else {
                                break;
                            }
                            --count;
                        }
                    }
                    if(count==0) {
                        /* done with the loop for complete UChars */
                        if(length>0 && targetCapacity>0) {
                            /*
                             * there is more input and some target capacity -
                             * it must be targetCapacity==1 because otherwise
                             * the above would have copied more;
                             * prepare for overflow output
                             */
                            if(!UTF16.isSurrogate(c=source.get(sourceArrayIndex++))) {
                                overflow[0]=(byte)c;
                                overflow[1]=(byte)(c>>>8);
                                length=2; /* 2 bytes to output */
                                c=0;
                            /* } else { keep c for surrogate handling, length will be set there */
                            }
                        } else {
                            length=0;
                            c=0;
                        }
                    } else {
                        /* keep c for surrogate handling, length will be set there */
                        targetCapacity+=2*count;
                    }
                } else {
                    length=0; /* from here on, length counts the bytes in overflow[] */
                }
                if(c!=0) {
                    /*
                     * c is a surrogate, and
                     * - source or target too short
                     * - or the surrogate is unmatched
                     */
                    length=0;
                    if(UTF16.isLeadSurrogate(c)) {
                        if(sourceArrayIndex<source.limit()) {
                            if(UTF16.isTrailSurrogate(trail=source.get(sourceArrayIndex))) {
                                /* output the surrogate pair, will overflow (see conditions comment above) */
                                ++sourceArrayIndex;
                                overflow[0]=(byte)c;
                                overflow[1]=(byte)(c>>>8);
                                overflow[2]=(byte)trail;
                                overflow[3]=(byte)(trail>>>8);
                                length=4; /* 4 bytes to output */
                                c=0;
                            } else {
                                /* unmatched lead surrogate */
                                /* NOTE(review): a buffer index is passed where CoderResult expects a length (and 0 is rejected) - confirm what CharsetEncoderICU expects */
                                cr = CoderResult.malformedForLength(sourceArrayIndex);
                            }
                        } else {
                            /* see if the trail surrogate is in the next buffer */
                        }
                    } else {
                        /* unmatched trail surrogate: report it like the unmatched lead instead of carrying it silently into the next call */
                        cr = CoderResult.malformedForLength(sourceArrayIndex);
                    }
                    fromUChar32=c;
                }
                source.position(sourceArrayIndex);
                if(length>0) {
                    /* output length bytes with overflow (length>targetCapacity>0); keep the result so the overflow is reported */
                    cr = fromUWriteBytes(this, overflow, 0, length, target, offsets, sourceIndex);
                }
                if(cr.isUnderflow() && source.hasRemaining() && !target.hasRemaining()) {
                    /* unread input and a full target mean overflow, not underflow */
                    cr = CoderResult.OVERFLOW;
                }
            }catch(BufferOverflowException ex){
                /* safety net for a put() beyond a buffer limit; the source position is not written back on this path */
                cr = CoderResult.OVERFLOW;
            }
            return cr;
        }
    }

    /** Creates a new little-endian UTF-16 decoder bound to this charset. */
    public CharsetDecoder newDecoder() {
        return new CharsetDecoderUTF16LE(this);
    }

    /** Creates a new little-endian UTF-16 encoder bound to this charset. */
    public CharsetEncoder newEncoder() {
        return new CharsetEncoderUTF16LE(this);
    }
}

View file

@ -0,0 +1,318 @@
/**
*******************************************************************************
* Copyright (C) 2006, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
*******************************************************************************
*/
package com.ibm.icu.impl;
import java.nio.BufferOverflowException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.IntBuffer;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import com.ibm.icu.charset.CharsetDecoderICU;
import com.ibm.icu.charset.CharsetEncoderICU;
import com.ibm.icu.charset.CharsetICU;
import com.ibm.icu.text.UTF16;
/**
 * Big-endian UTF-32 charset: every code point is four bytes, most significant byte first.
 * The encoder writes the BOM 00 00 FE FF ahead of its first output after a reset.
 *
 * @author Niti Hantaweepant
 */
public class CharsetUTF32 extends CharsetICU {
    /** Substitution bytes handed to the encoder: U+FFFD in big-endian byte order. */
    protected byte[] fromUSubstitution = new byte[]{(byte)0, (byte)0, (byte)0xff, (byte)0xfd};

    public CharsetUTF32(String icuCanonicalName, String javaCanonicalName, String[] aliases){
        super(icuCanonicalName, javaCanonicalName, aliases);
        maxBytesPerChar = 4;
        minBytesPerChar = 4;
        maxCharsPerByte = 1;
    }

    /**
     * Decoder for big-endian UTF-32. A code point whose four bytes are split across calls is
     * kept in toUnicodeStatus (value + 1) together with the byte count in toULength.
     */
    class CharsetDecoderUTF32 extends CharsetDecoderICU{
        public CharsetDecoderUTF32(CharsetICU cs) {
            super(cs);
        }

        /**
         * Tells whether ch may be decoded: not negative, not above MAXIMUM_UTF, and not a
         * surrogate. The test uses the full 32-bit value; truncating to char first would
         * mistake supplementary code points such as U+1D800 for surrogates.
         */
        private boolean isScalarValue(int ch) {
            return ch >= 0 && ch <= UConverterSharedData.MAXIMUM_UTF && (ch & 0xfffff800) != 0xd800;
        }

        /**
         * Writes ch to target as one char or as a surrogate pair. target must have room for
         * at least one char. When the trail surrogate does not fit, it is stored in the char
         * error buffer.
         *
         * @return false when the trail surrogate had to go to the char error buffer
         */
        private boolean putCodePoint(int ch, CharBuffer target) {
            if (ch <= UConverterSharedData.MAXIMUM_UCS2) {
                /* fits in 16 bits */
                target.put((char) ch);
                return true;
            }
            /* write out the surrogates */
            target.put(UTF16.getLeadSurrogate(ch));
            char trail = UTF16.getTrailSurrogate(ch);
            if (target.hasRemaining()) {
                target.put(trail);
                return true;
            }
            /* Put in overflow buffer (not handled here) */
            charErrorBufferArray[0] = trail;
            charErrorBufferLength = 1;
            return false;
        }

        /**
         * Decodes big-endian UTF-32 bytes from source into chars in target.
         *
         * @param source  bytes to decode; its position is advanced past everything consumed
         * @param target  receives the decoded chars
         * @param offsets not used by this decoder
         * @return UNDERFLOW when the input was consumed, OVERFLOW when target is full,
         *         or a malformed-input result for a value that is not a scalar value
         */
        protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets){
            CoderResult cr = CoderResult.UNDERFLOW;
            int sourceArrayIndex = source.position();
            try{
                /* the first pass resumes a code point whose leading bytes arrived in an earlier call */
                boolean resuming = toUnicodeStatus != 0 && target.hasRemaining();
                while (resuming || (sourceArrayIndex < source.limit() && target.hasRemaining())) {
                    int i, ch;
                    if (resuming) {
                        i = toULength; /* restore # of bytes consumed */
                        ch = (int)(toUnicodeStatus - 1);/*Stores the previously calculated ch from a previous call*/
                        toUnicodeStatus = 0;
                        resuming = false;
                    } else {
                        i = 0;
                        ch = 0;
                    }
                    while (i < 4 && sourceArrayIndex < source.limit()) {
                        ch = (ch << 8) | ((byte)(source.get(sourceArrayIndex)) & UConverterConstants.UNSIGNED_BYTE_MASK);
                        toUBytesArray[i++] = (byte) source.get(sourceArrayIndex++);
                    }
                    if (i < 4) {
                        /* stores a partially calculated target*/
                        /* + 1 to make 0 a valid character */
                        toUnicodeStatus = ch + 1;
                        toULength = (byte) i;
                        break;
                    }
                    if (!isScalarValue(ch)) {
                        toULength = (byte)i;
                        /* NOTE(review): a buffer index is passed where CoderResult expects a length - confirm what CharsetDecoderICU expects */
                        cr = CoderResult.malformedForLength(sourceArrayIndex);
                        break;
                    }
                    if (!putCodePoint(ch, target)) {
                        /* leave through the normal exit so that the four consumed bytes are not decoded again */
                        cr = CoderResult.OVERFLOW;
                        break;
                    }
                }
                if (cr.isUnderflow() && sourceArrayIndex < source.limit() && !target.hasRemaining()) {
                    /* End of target buffer */
                    cr = CoderResult.OVERFLOW;
                }
                source.position(sourceArrayIndex);
            }catch(BufferOverflowException ex){
                /* safety net for a put() beyond the target limit; the source position is not written back on this path */
                cr = CoderResult.OVERFLOW;
            }
            return cr;
        }
    }

    /**
     * Encoder for big-endian UTF-32. After every reset the first output is the BOM
     * 00 00 FE FF. A lead surrogate that ends a source buffer is carried over to the next
     * call in fromUChar32.
     */
    class CharsetEncoderUTF32 extends CharsetEncoderICU{
        /** Value of fromUnicodeStatus while the BOM still has to be written. */
        private final static int NEED_TO_WRITE_BOM = 1;

        public CharsetEncoderUTF32(CharsetICU cs) {
            super(cs, fromUSubstitution);
            implReset();
        }

        /** Resets the encoder and arms the BOM again for the next encodeLoop() call. */
        protected void implReset() {
            super.implReset();
            fromUnicodeStatus = NEED_TO_WRITE_BOM;
        }

        /**
         * Writes the four big-endian bytes of ch; bytes that do not fit into target are
         * appended to errorBuffer.
         *
         * @return true when at least one byte went to errorBuffer
         */
        private boolean writeCodePoint(int ch, ByteBuffer target) {
            /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
            byte[] temp = { 0, (byte) (ch >>> 16 & 0x1F), (byte) (ch >>> 8), (byte) (ch) };
            boolean overflowed = false;
            for (int indexToWrite = 0; indexToWrite <= 3; indexToWrite++) {
                if (target.hasRemaining()) {
                    target.put(temp[indexToWrite]);
                }
                else {
                    errorBuffer[errorBufferLength++] = temp[indexToWrite];
                    overflowed = true;
                }
            }
            return overflowed;
        }

        /**
         * Encodes chars from source into big-endian UTF-32 bytes in target.
         *
         * @param source  chars to encode; its position is advanced past everything consumed
         * @param target  receives the encoded bytes
         * @param offsets passed on to the BOM write only
         * @return UNDERFLOW when the input was consumed, OVERFLOW when target is full,
         *         or a malformed-input result for an unpaired surrogate
         */
        protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets){
            CoderResult cr = CoderResult.UNDERFLOW;
            if(!source.hasRemaining()) {
                /* no input, nothing to do */
                return cr;
            }
            /* write the BOM if necessary */
            if(fromUnicodeStatus==NEED_TO_WRITE_BOM) {
                byte[] bom={ 0, 0, (byte)0xfe, (byte)0xff };
                cr = fromUWriteBytes(this, bom, 0, bom.length, target, offsets, -1);
                if(cr.isError()){
                    return cr;
                }
                fromUnicodeStatus=0;
            }
            int sourceArrayIndex = source.position();
            try{
                /* the first pass completes a lead surrogate left over from the previous call */
                boolean pendingLead = fromUChar32 != 0;
                while (pendingLead || (sourceArrayIndex < source.limit() && target.hasRemaining())) {
                    int ch;
                    boolean needsTrail;
                    if (pendingLead) {
                        ch = fromUChar32;
                        fromUChar32 = 0;
                        pendingLead = false;
                        needsTrail = true;
                    } else {
                        ch = source.get(sourceArrayIndex++);
                        needsTrail = UTF16.isSurrogate((char)ch);
                        if (needsTrail && !UTF16.isLeadSurrogate((char)ch)) {
                            /* unmatched trail code unit */
                            fromUChar32 = ch;
                            /* NOTE(review): a buffer index is passed where CoderResult expects a length (and 0 is rejected) - confirm what CharsetEncoderICU expects */
                            cr = CoderResult.malformedForLength(sourceArrayIndex);
                            break;
                        }
                    }
                    if (needsTrail) {
                        if (sourceArrayIndex >= source.limit()) {
                            /* ran out of source */
                            fromUChar32 = ch;
                            if (flush) {
                                /* no more input will come: the lead surrogate stays unmatched */
                                cr = CoderResult.malformedForLength(sourceArrayIndex);
                            }
                            break;
                        }
                        int ch2 = source.get(sourceArrayIndex);
                        if (!UTF16.isTrailSurrogate((char)ch2)) {
                            /* unmatched lead surrogate: report it and write nothing for it */
                            fromUChar32 = ch;
                            cr = CoderResult.malformedForLength(sourceArrayIndex);
                            break;
                        }
                        ch = ((ch - UConverterSharedData.SURROGATE_HIGH_START) << UConverterSharedData.HALF_SHIFT) + ch2 + UConverterSharedData.SURROGATE_LOW_BASE;
                        sourceArrayIndex++;
                    }
                    if (writeCodePoint(ch, target)) {
                        cr = CoderResult.OVERFLOW;
                    }
                }
                if (cr.isUnderflow() && sourceArrayIndex < source.limit() && !target.hasRemaining()) {
                    cr = CoderResult.OVERFLOW;
                }
                source.position(sourceArrayIndex);
            }catch(BufferOverflowException ex){
                /* safety net for a put() beyond the target limit; the source position is not written back on this path */
                cr = CoderResult.OVERFLOW;
            }
            return cr;
        }
    }

    /** Creates a new big-endian UTF-32 decoder bound to this charset. */
    public CharsetDecoder newDecoder() {
        return new CharsetDecoderUTF32(this);
    }

    /** Creates a new big-endian UTF-32 encoder bound to this charset. */
    public CharsetEncoder newEncoder() {
        return new CharsetEncoderUTF32(this);
    }
}

View file

@ -0,0 +1,318 @@
/**
*******************************************************************************
* Copyright (C) 2006, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
*******************************************************************************
*/
package com.ibm.icu.impl;
import java.nio.BufferOverflowException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.IntBuffer;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import com.ibm.icu.charset.CharsetDecoderICU;
import com.ibm.icu.charset.CharsetEncoderICU;
import com.ibm.icu.charset.CharsetICU;
import com.ibm.icu.text.UTF16;
/**
 * Little-endian UTF-32 charset: every code point is four bytes, least significant byte
 * first. The encoder writes the BOM FF FE 00 00 ahead of its first output after a reset.
 *
 * @author Niti Hantaweepant
 */
public class CharsetUTF32LE extends CharsetICU {
    /** Substitution bytes handed to the encoder: U+FFFD in little-endian byte order. */
    protected byte[] fromUSubstitution = new byte[]{(byte)0xfd, (byte)0xff, (byte)0, (byte)0};

    public CharsetUTF32LE(String icuCanonicalName, String javaCanonicalName, String[] aliases){
        super(icuCanonicalName, javaCanonicalName, aliases);
        maxBytesPerChar = 4;
        minBytesPerChar = 4;
        maxCharsPerByte = 1;
    }

    /**
     * Decoder for little-endian UTF-32. A code point whose four bytes are split across calls
     * is kept in toUnicodeStatus (value + 1) together with the byte count in toULength.
     */
    class CharsetDecoderUTF32LE extends CharsetDecoderICU{
        public CharsetDecoderUTF32LE(CharsetICU cs) {
            super(cs);
        }

        /**
         * Tells whether ch may be decoded: not negative, not above MAXIMUM_UTF, and not a
         * surrogate. The test uses the full 32-bit value; truncating to char first would
         * mistake supplementary code points such as U+1D800 for surrogates.
         */
        private boolean isScalarValue(int ch) {
            return ch >= 0 && ch <= UConverterSharedData.MAXIMUM_UTF && (ch & 0xfffff800) != 0xd800;
        }

        /**
         * Writes ch to target as one char or as a surrogate pair. target must have room for
         * at least one char. When the trail surrogate does not fit, it is stored in the char
         * error buffer.
         *
         * @return false when the trail surrogate had to go to the char error buffer
         */
        private boolean putCodePoint(int ch, CharBuffer target) {
            if (ch <= UConverterSharedData.MAXIMUM_UCS2) {
                /* fits in 16 bits */
                target.put((char) ch);
                return true;
            }
            /* write out the surrogates */
            target.put(UTF16.getLeadSurrogate(ch));
            char trail = UTF16.getTrailSurrogate(ch);
            if (target.hasRemaining()) {
                target.put(trail);
                return true;
            }
            /* Put in overflow buffer (not handled here) */
            charErrorBufferArray[0] = trail;
            charErrorBufferLength = 1;
            return false;
        }

        /**
         * Decodes little-endian UTF-32 bytes from source into chars in target.
         *
         * @param source  bytes to decode; its position is advanced past everything consumed
         * @param target  receives the decoded chars
         * @param offsets not used by this decoder
         * @return UNDERFLOW when the input was consumed, OVERFLOW when target is full,
         *         or a malformed-input result for a value that is not a scalar value
         */
        protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets){
            CoderResult cr = CoderResult.UNDERFLOW;
            int sourceArrayIndex = source.position();
            try{
                /* the first pass resumes a code point whose leading bytes arrived in an earlier call */
                boolean resuming = toUnicodeStatus != 0 && target.hasRemaining();
                while (resuming || (sourceArrayIndex < source.limit() && target.hasRemaining())) {
                    int i, ch;
                    if (resuming) {
                        i = toULength; /* restore # of bytes consumed */
                        ch = (int)(toUnicodeStatus - 1);/*Stores the previously calculated ch from a previous call*/
                        toUnicodeStatus = 0;
                        resuming = false;
                    } else {
                        i = 0;
                        ch = 0;
                    }
                    while (i < 4 && sourceArrayIndex < source.limit()) {
                        /* byte number i supplies bits 8*i .. 8*i+7 */
                        ch |= (source.get(sourceArrayIndex) & UConverterConstants.UNSIGNED_BYTE_MASK) << (i * 8);
                        toUBytesArray[i++] = (byte) source.get(sourceArrayIndex++);
                    }
                    if (i < 4) {
                        /* stores a partially calculated target*/
                        /* + 1 to make 0 a valid character */
                        toUnicodeStatus = ch + 1;
                        toULength = (byte) i;
                        break;
                    }
                    if (!isScalarValue(ch)) {
                        toULength = (byte)i;
                        /* NOTE(review): a buffer index is passed where CoderResult expects a length - confirm what CharsetDecoderICU expects */
                        cr = CoderResult.malformedForLength(sourceArrayIndex);
                        break;
                    }
                    if (!putCodePoint(ch, target)) {
                        /* leave through the normal exit so that the four consumed bytes are not decoded again */
                        cr = CoderResult.OVERFLOW;
                        break;
                    }
                }
                if (cr.isUnderflow() && sourceArrayIndex < source.limit() && !target.hasRemaining()) {
                    /* End of target buffer */
                    cr = CoderResult.OVERFLOW;
                }
                source.position(sourceArrayIndex);
            }catch(BufferOverflowException ex){
                /* safety net for a put() beyond the target limit; the source position is not written back on this path */
                cr = CoderResult.OVERFLOW;
            }
            return cr;
        }
    }

    /**
     * Encoder for little-endian UTF-32. After every reset the first output is the BOM
     * FF FE 00 00. A lead surrogate that ends a source buffer is carried over to the next
     * call in fromUChar32.
     */
    class CharsetEncoderUTF32LE extends CharsetEncoderICU{
        /** Value of fromUnicodeStatus while the BOM still has to be written. */
        private final static int NEED_TO_WRITE_BOM = 1;

        public CharsetEncoderUTF32LE(CharsetICU cs) {
            super(cs, fromUSubstitution);
            implReset();
        }

        /** Resets the encoder and arms the BOM again for the next encodeLoop() call. */
        protected void implReset() {
            super.implReset();
            fromUnicodeStatus = NEED_TO_WRITE_BOM;
        }

        /**
         * Writes the four little-endian bytes of ch; bytes that do not fit into target are
         * appended to errorBuffer.
         *
         * @return true when at least one byte went to errorBuffer
         */
        private boolean writeCodePoint(int ch, ByteBuffer target) {
            /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
            byte[] temp = { (byte) (ch), (byte) (ch >>> 8), (byte) (ch >>> 16 & 0x1F), 0 };
            boolean overflowed = false;
            for (int indexToWrite = 0; indexToWrite <= 3; indexToWrite++) {
                if (target.hasRemaining()) {
                    target.put(temp[indexToWrite]);
                }
                else {
                    errorBuffer[errorBufferLength++] = temp[indexToWrite];
                    overflowed = true;
                }
            }
            return overflowed;
        }

        /**
         * Encodes chars from source into little-endian UTF-32 bytes in target.
         *
         * @param source  chars to encode; its position is advanced past everything consumed
         * @param target  receives the encoded bytes
         * @param offsets passed on to the BOM write only
         * @return UNDERFLOW when the input was consumed, OVERFLOW when target is full,
         *         or a malformed-input result for an unpaired surrogate
         */
        protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets){
            CoderResult cr = CoderResult.UNDERFLOW;
            if(!source.hasRemaining()) {
                /* no input, nothing to do */
                return cr;
            }
            /* write the BOM if necessary */
            if(fromUnicodeStatus==NEED_TO_WRITE_BOM) {
                byte[] bom={ (byte)0xff, (byte)0xfe, 0, 0 };
                cr = fromUWriteBytes(this, bom, 0, bom.length, target, offsets, -1);
                if(cr.isError()){
                    return cr;
                }
                fromUnicodeStatus=0;
            }
            int sourceArrayIndex = source.position();
            try{
                /* the first pass completes a lead surrogate left over from the previous call */
                boolean pendingLead = fromUChar32 != 0;
                while (pendingLead || (sourceArrayIndex < source.limit() && target.hasRemaining())) {
                    int ch;
                    boolean needsTrail;
                    if (pendingLead) {
                        ch = fromUChar32;
                        fromUChar32 = 0;
                        pendingLead = false;
                        needsTrail = true;
                    } else {
                        ch = source.get(sourceArrayIndex++);
                        needsTrail = UTF16.isSurrogate((char)ch);
                        if (needsTrail && !UTF16.isLeadSurrogate((char)ch)) {
                            /* unmatched trail code unit */
                            fromUChar32 = ch;
                            /* NOTE(review): a buffer index is passed where CoderResult expects a length (and 0 is rejected) - confirm what CharsetEncoderICU expects */
                            cr = CoderResult.malformedForLength(sourceArrayIndex);
                            break;
                        }
                    }
                    if (needsTrail) {
                        if (sourceArrayIndex >= source.limit()) {
                            /* ran out of source */
                            fromUChar32 = ch;
                            if (flush) {
                                /* no more input will come: the lead surrogate stays unmatched */
                                cr = CoderResult.malformedForLength(sourceArrayIndex);
                            }
                            break;
                        }
                        int ch2 = source.get(sourceArrayIndex);
                        if (!UTF16.isTrailSurrogate((char)ch2)) {
                            /* unmatched lead surrogate: report it and write nothing for it */
                            fromUChar32 = ch;
                            cr = CoderResult.malformedForLength(sourceArrayIndex);
                            break;
                        }
                        ch = ((ch - UConverterSharedData.SURROGATE_HIGH_START) << UConverterSharedData.HALF_SHIFT) + ch2 + UConverterSharedData.SURROGATE_LOW_BASE;
                        sourceArrayIndex++;
                    }
                    if (writeCodePoint(ch, target)) {
                        cr = CoderResult.OVERFLOW;
                    }
                }
                if (cr.isUnderflow() && sourceArrayIndex < source.limit() && !target.hasRemaining()) {
                    cr = CoderResult.OVERFLOW;
                }
                source.position(sourceArrayIndex);
            }catch(BufferOverflowException ex){
                /* safety net for a put() beyond the target limit; the source position is not written back on this path */
                cr = CoderResult.OVERFLOW;
            }
            return cr;
        }
    }

    /** Creates a new little-endian UTF-32 decoder bound to this charset. */
    public CharsetDecoder newDecoder() {
        return new CharsetDecoderUTF32LE(this);
    }

    /** Creates a new little-endian UTF-32 encoder bound to this charset. */
    public CharsetEncoder newEncoder() {
        return new CharsetEncoderUTF32LE(this);
    }
}

View file

@ -0,0 +1,508 @@
/**
*******************************************************************************
* Copyright (C) 2006, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
*******************************************************************************
*/
package com.ibm.icu.impl;
import java.nio.BufferOverflowException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.IntBuffer;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import com.ibm.icu.charset.CharsetDecoderICU;
import com.ibm.icu.charset.CharsetEncoderICU;
import com.ibm.icu.charset.CharsetICU;
import com.ibm.icu.text.UTF16;
/**
* @author Niti Hantaweepant
*/
public class CharsetUTF8 extends CharsetICU {
/**
 * Substitution bytes handed to the encoder's super constructor: EF BF BD is the
 * UTF-8 encoding of U+FFFD.
 */
protected byte[] fromUSubstitution = new byte[]{(byte)0xef, (byte)0xbf, (byte)0xbd};
/**
 * Creates the UTF-8 charset and records its size limits: between 1 and 4 bytes
 * per char when encoding, and at most 1 char per byte when decoding.
 * @param icuCanonicalName  passed through to the CharsetICU constructor
 * @param javaCanonicalName passed through to the CharsetICU constructor
 * @param aliases           passed through to the CharsetICU constructor
 */
public CharsetUTF8(String icuCanonicalName, String javaCanonicalName, String[] aliases){
super(icuCanonicalName, javaCanonicalName, aliases);
maxBytesPerChar = 4;
minBytesPerChar = 1;
maxCharsPerByte = 1;
}
/* UTF-8 Conversion DATA
* for more information see Unicode Standard 2.0 , Transformation Formats Appendix A-9
*/
/*
 * Indexed by sequence length (0..6). The decoder accumulates "ch = (ch << 6) + byte"
 * over the raw lead and trail bytes and then subtracts the entry for that length to
 * strip the marker bits that were shifted into the value.
 */
private static final long OFFSETS_FROM_UTF8[] = {0,
0x00000000L, 0x00003080L, 0x000E2080L,
0x03C82080L, 0xFA082080L, 0x82082080L};
/*
 * Indexed by the unsigned value of the first byte (0..255): the total length of the
 * sequence that byte introduces. 0 marks bytes that cannot start a sequence
 * (0x80..0xBF trail bytes, 0xFE, 0xFF); the decoder reports those as malformed
 * because the consumed count (1) never equals 0.
 */
private static final byte BYTES_FROM_UTF8[] =
{
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
/*
* Starting with Unicode 3.0.1:
* UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N];
* byte sequences with more than 4 bytes are illegal in UTF-8,
* which is tested with impossible values for them
*/
/* Indexed by sequence length (0..6); entries 5 and 6 hold 0xffffffff so they can never be satisfied. */
private static final long UTF8_MIN_CHAR32[] = { 0L, 0L, 0x80L, 0x800L, 0x10000L, 0xffffffffL, 0xffffffffL };
/**
 * Decoder that converts UTF-8 byte sequences into UTF-16 code units.
 *
 * A sequence that is cut off by the end of the input is carried over to the next
 * call in the inherited fields toUnicodeStatus (value accumulated so far),
 * mode (expected sequence length) and toULength (bytes consumed so far).
 * A trail surrogate that does not fit into the target is parked in
 * charErrorBufferArray and OVERFLOW is reported.
 */
class CharsetDecoderUTF8 extends CharsetDecoderICU{
    public CharsetDecoderUTF8(CharsetICU cs) {
        super(cs);
    }

    /**
     * Decodes bytes from source into target until the input is exhausted, the
     * target is full, or an illegal sequence is found.
     *
     * @param source  bytes to decode; its position is advanced past every consumed byte
     * @param target  receives the decoded UTF-16 code units
     * @param offsets not used by this implementation
     * @return UNDERFLOW when more input is needed, OVERFLOW when the target is full,
     *         or a malformed-input result for an illegal sequence
     */
    protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets){
        CoderResult cr = CoderResult.UNDERFLOW;
        int sourceArrayIndex = source.position();
        // Todo: CESU8 implementation
        // boolean isCESU8 = args.converter.sharedData == _CESU8Data;
        boolean isCESU8 = (UConverterSharedData._CESU8Data != null);
        int ch, ch2 = 0;
        int i, inBytes;
        try{
            donefornow:
            {
                /* Resume a sequence that was split across two calls. */
                if (toUnicodeStatus!=0 && target.hasRemaining())
                {
                    inBytes = mode; /* restore # of bytes to consume */
                    i = toULength; /* restore # of bytes consumed */
                    ch = toUnicodeStatus; /* value accumulated by the previous call */
                    toUnicodeStatus = 0;
                    while (i < inBytes)
                    {
                        if (sourceArrayIndex<source.limit())
                        {
                            toUBytesArray[i] = (byte) (ch2 = source.get(sourceArrayIndex) & UConverterConstants.UNSIGNED_BYTE_MASK);
                            if (!isTrail((byte)ch2))
                            {
                                break; /* i < inBytes */
                            }
                            ch = (ch << 6) + ch2;
                            ++sourceArrayIndex;
                            i++;
                        }
                        else
                        {
                            /* still incomplete: store the partially calculated target again */
                            toUnicodeStatus = ch;
                            mode = inBytes;
                            toULength = (byte) i;
                            break donefornow;
                        }
                    }
                    /* Remove the accumulated high bits */
                    ch -= OFFSETS_FROM_UTF8[inBytes];
                    if (isLegalSequence(ch, i, inBytes, isCESU8))
                    {
                        toULength = 0;
                        emitCodePoint(target, ch);
                    }
                    else
                    {
                        toULength = (byte)i;
                        // NOTE(review): the argument is the source index, not the length of the
                        // malformed sequence, and malformedForLength rejects non-positive values
                        // (possible here when the buffer starts at position 0) - confirm against
                        // what CharsetDecoderICU expects before changing.
                        cr = CoderResult.malformedForLength(sourceArrayIndex);
                        break donefornow;
                    }
                }
                while (sourceArrayIndex < source.limit() && target.hasRemaining())
                {
                    ch = source.get(sourceArrayIndex++) & UConverterConstants.UNSIGNED_BYTE_MASK;
                    if (ch < 0x80) /* Simple case */
                    {
                        target.put((char)ch);
                    }
                    else
                    {
                        /* store the first char */
                        toUBytesArray[0] = (byte)ch;
                        inBytes = BYTES_FROM_UTF8[(int)ch]; /* lookup current sequence length */
                        i = 1;
                        while (i < inBytes)
                        {
                            if (sourceArrayIndex < source.limit())
                            {
                                toUBytesArray[i] = (byte) (ch2 = source.get(sourceArrayIndex) & UConverterConstants.UNSIGNED_BYTE_MASK);
                                if (!isTrail((byte)ch2))
                                {
                                    break; /* i < inBytes */
                                }
                                ch = (ch << 6) + ch2;
                                ++sourceArrayIndex;
                                i++;
                            }
                            else
                            {
                                /* ran out of input: store the partially calculated target */
                                toUnicodeStatus = ch;
                                mode = inBytes;
                                toULength = (byte) i;
                                break donefornow;
                            }
                        }
                        /* Remove the accumulated high bits */
                        ch -= OFFSETS_FROM_UTF8[inBytes];
                        if (isLegalSequence(ch, i, inBytes, isCESU8))
                        {
                            toULength = 0;
                            emitCodePoint(target, ch);
                        }
                        else
                        {
                            toULength = (byte)i;
                            // NOTE(review): see the note on the identical call above.
                            cr = CoderResult.malformedForLength(sourceArrayIndex);
                            break;
                        }
                    }
                }
            }
            if (sourceArrayIndex < source.limit() && !target.hasRemaining())
            {
                /* End of target buffer */
                cr = CoderResult.OVERFLOW;
            }
        }catch(BufferOverflowException ex){
            cr = CoderResult.OVERFLOW;
        }
        /*
         * Record the consumed input on every path. Previously this was skipped when
         * the overflow exception was thrown, so the sequence whose lead surrogate had
         * already been written would be decoded a second time on the next call.
         */
        source.position(sourceArrayIndex);
        return cr;
    }

    /**
     * Checks the rules for a legal UTF-8 sequence (Unicode 3.0.1 and up):
     * the right number of trail bytes, a code point <= U+10ffff, the shortest
     * possible form, at most 4 bytes (for 5 and 6 the minimum is unreachable),
     * and - outside CESU-8 - no encoded surrogate code points. In CESU-8 only
     * sequences of up to 3 bytes are accepted.
     */
    private boolean isLegalSequence(int ch, int consumed, int expected, boolean isCESU8) {
        if (consumed != expected || ch > UConverterSharedData.MAXIMUM_UTF || ch < UTF8_MIN_CHAR32[consumed]) {
            return false;
        }
        if (isCESU8) {
            return consumed <= 3;
        }
        /*
         * Only BMP values can be surrogates. Testing (char)ch alone would also
         * reject supplementary code points such as U+1D800 whose low 16 bits
         * happen to fall in the surrogate range.
         */
        return ch > UConverterSharedData.MAXIMUM_UCS2 || !UTF16.isSurrogate((char)ch);
    }

    /**
     * Writes one code point as one or two UTF-16 code units. When only the lead
     * surrogate fits, the trail surrogate is stored in the char error buffer and
     * BufferOverflowException is thrown to signal the overflow to decodeLoop.
     */
    private void emitCodePoint(CharBuffer target, int ch) {
        if (ch <= UConverterSharedData.MAXIMUM_UCS2)
        {
            /* fits in 16 bits */
            target.put((char)ch);
            return;
        }
        /* write out the surrogates */
        ch -= UConverterSharedData.HALF_BASE;
        target.put((char) ((ch >>> UConverterSharedData.HALF_SHIFT) + UConverterSharedData.SURROGATE_HIGH_START));
        ch = (ch & UConverterSharedData.HALF_MASK) + UConverterSharedData.SURROGATE_LOW_START;
        if (target.hasRemaining())
        {
            target.put((char)ch);
        }
        else
        {
            charErrorBufferArray[charErrorBufferBegin+0]=(char)ch;
            charErrorBufferLength=1;
            throw new BufferOverflowException();
        }
    }
}
/**
 * Encoder that converts UTF-16 code units into UTF-8 bytes.
 *
 * A lead surrogate found at the very end of the input is kept in the inherited
 * field fromUChar32 until the next call supplies the following code unit.
 * Bytes that do not fit into the target are parked in errorBuffer and
 * OVERFLOW is reported.
 */
class CharsetEncoderUTF8 extends CharsetEncoderICU{
    public CharsetEncoderUTF8(CharsetICU cs) {
        super(cs, fromUSubstitution);
        implReset();
    }

    protected void implReset() {
        super.implReset();
    }

    /**
     * Encodes chars from source into target until the input is exhausted, the
     * target is full, or an unpaired surrogate is found.
     *
     * @param source  chars to encode; its position is advanced past every consumed char
     * @param target  receives the UTF-8 bytes
     * @param offsets not used by this implementation
     * @return UNDERFLOW when more input is needed, OVERFLOW when the target is full,
     *         or a malformed-input result for an unpaired surrogate
     */
    protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets){
        CoderResult cr = CoderResult.UNDERFLOW;
        int sourceArrayIndex = source.position();
        // Todo: CESU8 implementation
        // boolean isCESU8 = args.converter.sharedData == _CESU8Data;
        boolean isCESU8 = (UConverterSharedData._CESU8Data != null);
        int ch;
        byte temp[] = new byte[4];
        boolean doloop = true;
        try{
            /* A lead surrogate is pending from the previous call. */
            if (fromUChar32 != 0 && target.hasRemaining())
            {
                ch = fromUChar32;
                fromUChar32 = 0;
                if (sourceArrayIndex < source.limit()) {
                    /* test the following code unit */
                    char trail = source.get(sourceArrayIndex);
                    if(UTF16.isTrailSurrogate(trail)) {
                        ++sourceArrayIndex;
                        ch = UTF16.getCodePoint((char)ch, trail);
                        /* convert this supplementary code point */
                        cr = writeThreeOrFourBytes(target, temp, ch, cr);
                    } else {
                        /* this is an unmatched lead code unit (1st surrogate) */
                        /* callback(illegal) */
                        /*
                         * Nothing is written here: previously the lone surrogate was still
                         * encoded into the target although the error was being reported.
                         */
                        fromUChar32 = ch;
                        // NOTE(review): the argument is the source index, not a length, and
                        // malformedForLength rejects 0 (possible when the buffer starts at
                        // position 0) - confirm against what CharsetEncoderICU expects.
                        cr = CoderResult.malformedForLength(sourceArrayIndex);
                        doloop = false;
                    }
                } else {
                    /* no more input: keep waiting for the trail surrogate, write nothing */
                    fromUChar32 = ch;
                    doloop = false;
                }
            }
            if(doloop) {
                while (sourceArrayIndex < source.limit() && target.hasRemaining())
                {
                    ch = source.get(sourceArrayIndex++);
                    if (ch < 0x80) /* Single byte */
                    {
                        target.put((byte)ch);
                    }
                    else if (ch < 0x800) /* Double byte */
                    {
                        target.put((byte) ((ch >>> 6) | 0xc0));
                        if (target.hasRemaining())
                        {
                            target.put((byte) ((ch & 0x3f) | 0x80));
                        }
                        else
                        {
                            errorBuffer[0] = (byte) ((ch & 0x3f) | 0x80);
                            errorBufferLength = 1;
                            throw new BufferOverflowException();
                        }
                    }
                    else
                    /* Check for surrogates */
                    {
                        if(UTF16.isSurrogate((char)ch) && !isCESU8) {
                            if(UTF16.isLeadSurrogate((char)ch)) {
                                if (sourceArrayIndex < source.limit()) {
                                    /* test the following code unit */
                                    char trail = source.get(sourceArrayIndex);
                                    if(UTF16.isTrailSurrogate(trail)) {
                                        ++sourceArrayIndex;
                                        ch = UTF16.getCodePoint((char)ch, trail);
                                        /* convert this supplementary code point */
                                        /* exit this condition tree */
                                    }
                                    else {
                                        /* this is an unmatched lead code unit (1st surrogate) */
                                        /* callback(illegal) */
                                        fromUChar32 = ch;
                                        cr = CoderResult.malformedForLength(sourceArrayIndex);
                                        break;
                                    }
                                }
                                else {
                                    /* no more input */
                                    fromUChar32 = ch;
                                    break;
                                }
                            }
                            else {
                                /* a trail surrogate without a preceding lead */
                                fromUChar32 = (int)ch;
                                cr = CoderResult.malformedForLength(sourceArrayIndex);
                                break;
                            }
                        }
                        cr = writeThreeOrFourBytes(target, temp, ch, cr);
                    }
                }
            }
            if (sourceArrayIndex < source.limit() && !target.hasRemaining())
            {
                cr = CoderResult.OVERFLOW;
            }
        }catch(BufferOverflowException ex){
            cr = CoderResult.OVERFLOW;
        }
        /*
         * Record the consumed input on every path. Previously this was skipped when
         * the overflow exception was thrown, so a char whose first byte had already
         * been written would be encoded a second time on the next call.
         */
        source.position(sourceArrayIndex);
        return cr;
    }

    /**
     * Writes the 3-byte (below U+10000) or 4-byte form of ch. Bytes that no longer
     * fit into target go to errorBuffer, in which case OVERFLOW is returned;
     * otherwise the passed-in result is returned unchanged.
     *
     * @param temp scratch array of at least 4 bytes, filled last byte first
     */
    private CoderResult writeThreeOrFourBytes(ByteBuffer target, byte[] temp, int ch, CoderResult cr) {
        int indexToWrite;
        if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE)
        {
            indexToWrite = 2;
            temp[2] = (byte) ((ch >>> 12) | 0xe0);
        }
        else
        {
            indexToWrite = 3;
            temp[3] = (byte) ((ch >>> 18) | 0xf0);
            temp[2] = (byte) (((ch >>> 12) & 0x3f) | 0x80);
        }
        temp[1] = (byte) (((ch >>> 6) & 0x3f) | 0x80);
        temp[0] = (byte) ((ch & 0x3f) | 0x80);
        for (; indexToWrite >= 0; indexToWrite--)
        {
            if (target.hasRemaining())
            {
                target.put(temp[indexToWrite]);
            }
            else
            {
                errorBuffer[errorBufferLength++] = temp[indexToWrite];
                cr = CoderResult.OVERFLOW;
            }
        }
        return cr;
    }
}
/* single-code point definitions -------------------------------------------- */
/**
 * Tells whether a UTF-8 code unit stands for a code point on its own,
 * i.e. lies in the US-ASCII range 0..0x7f.
 * @param c 8-bit code unit (byte)
 * @return true if the high bit of c is clear
 * @stable ICU 2.4
 */
public static boolean isSingle(byte c) {
    // a Java byte is signed, so "high bit clear" is the same as "not negative"
    return c >= 0;
}
/**
 * Tells whether a UTF-8 code unit is a lead byte.
 * @param c 8-bit code unit (byte)
 * @return true if (c - 0xc0), masked with UConverterConstants.UNSIGNED_BYTE_MASK,
 *         is below 0x3e
 * @stable ICU 2.4
 */
public static boolean isLead(byte c) {
    final int distanceFromFirstLead = (c - 0xc0) & UConverterConstants.UNSIGNED_BYTE_MASK;
    return distanceFromFirstLead < 0x3e;
}
/**
 * Tells whether a UTF-8 code unit is a trail byte, i.e. has the bit pattern 10xxxxxx.
 * @param c 8-bit code unit (byte)
 * @return true if the two top bits of c are 1 and 0
 * @stable ICU 2.4
 */
public static boolean isTrail(byte c) {
    final int topTwoBits = (c >> 6) & 0x3;
    return topTwoBits == 0x2;
}
/**
 * Returns the number of code units (bytes) the UTF-8 encoding of a code point takes.
 * @param c 32-bit code point
 * @return 1..4, or 0 if c is a surrogate or not a Unicode code point
 * @stable ICU 2.4
 */
public static final int length(int c)
{
    // widen through the mask so that negative ints are treated as large values
    final long uc = c & UConverterConstants.UNSIGNED_INT_MASK;
    if (uc <= 0x7f) {
        return 1;
    }
    if (uc <= 0x7ff) {
        return 2;
    }
    if (uc <= 0xd7ff) {
        return 3;
    }
    if (uc <= 0xdfff || uc > 0x10ffff) {
        return 0; // surrogate, or beyond U+10FFFF
    }
    return uc <= 0xffff ? 3 : 4;
}
/**
 * Creates a decoder for this charset.
 * @return a new CharsetDecoderUTF8 constructed with this charset instance
 */
public CharsetDecoder newDecoder() {
return new CharsetDecoderUTF8(this);
}
/**
 * Creates an encoder for this charset.
 * @return a new CharsetEncoderUTF8 constructed with this charset instance
 */
public CharsetEncoder newEncoder() {
return new CharsetEncoderUTF8(this);
}
}

View file

@ -0,0 +1,16 @@
/**
*******************************************************************************
* Copyright (C) 2006, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
*******************************************************************************
*/
package com.ibm.icu.impl;
/**
 * Checked exception with an optional detail message and no other state.
 */
public class InvalidFormatException extends Exception {

    /** Creates the exception without a detail message. */
    public InvalidFormatException() {
        super();
    }

    /**
     * Creates the exception with a detail message.
     * @param message text later returned by getMessage()
     */
    public InvalidFormatException(String message) {
        super(message);
    }
}

View file

@ -0,0 +1,789 @@
/**
*******************************************************************************
* Copyright (C) 2006, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
*******************************************************************************
*/
package com.ibm.icu.impl;
import java.io.IOException;
import java.io.BufferedInputStream;
import java.io.InputStream;
import java.nio.ByteBuffer;
import com.ibm.icu.charset.CharsetICU;
public final class UConverterAlias {
/** The largest value a 32 bit unsigned integer can hold @draft ICU 3.6 */
public static final long UINT32_MAX = 4294967295L;
/* Bit tested on entries of gUntaggedConvArrayArray: set when the alias maps to more than one converter. */
public static final int AMBIGUOUS_ALIAS_MAP_BIT = 0x8000;
/* Mask applied to entries of gUntaggedConvArrayArray to obtain the converter number. */
public static final int CONVERTER_INDEX_MASK = 0xFFF;
public static final int NUM_RESERVED_TAGS = 2;
/* Subtracted from gTagListSize wherever the number of standards visible to callers is needed. */
public static final int NUM_HIDDEN_TAGS = 1;
/*
 * Tables of the alias data file; all of them are allocated and filled by
 * haveAliasData(). The int entries that name a string are offsets that
 * GET_STRING() resolves against gStringTableArray.
 * The *ArrayIndex fields are not referenced in the code shown here.
 */
static int[] gConverterListArray = null;
static int gConverterListArrayIndex;
static int[] gTagListArray = null;
static int gTagListArrayIndex;
static int[] gAliasListArray = null;
static int gAliasListArrayIndex;
static int[] gUntaggedConvArrayArray = null;
static int gUntaggedConvArrayArrayIndex;
static int[] gTaggedAliasArrayArray = null;
static int gTaggedAliasArrayArrayIndex;
static int[] gTaggedAliasListsArray = null;
static int gTaggedAliasListsArrayIndex;
static byte[] gStringTableArray = null;
static int gStringTableArrayIndex;
/* Table sizes as read from the file's table of contents by haveAliasData(). */
static long gConverterListSize;
static long gTagListSize;
static long gAliasListSize;
static long gUntaggedConvArraySize;
static long gTaggedAliasArraySize;
static long gTaggedAliasListsSize;
/* Size in bytes: haveAliasData() stores twice the value found in the table of contents. */
static long gStringTableSize;
/**
 * Reads the zero-terminated string that starts at byte offset 2 * idx of the
 * string table.
 * NOTE(review): the bytes are converted with the platform default charset;
 * the table presumably holds ASCII names only - confirm.
 * @param idx string offset as stored in the alias tables
 * @return the string found at that offset
 */
static final String GET_STRING(int idx) {
    final int firstByte = 2 * idx;
    final int byteCount = (int) strlen(gStringTableArray, firstByte);
    return new String(gStringTableArray, firstByte, byteCount);
}
/**
 * Counts the bytes of the zero-terminated string that starts at sBegin.
 * If no zero byte follows, the string is taken to run to the end of the array
 * (the previous implementation reported one byte too few in that case, and -1
 * when sBegin was at the end of the array).
 *
 * @param sArray bytes holding one or more zero-terminated strings
 * @param sBegin index of the first byte of the string
 * @return number of bytes before the terminating zero or the end of the array
 */
public static final int strlen(byte[] sArray, int sBegin)
{
    int end = sBegin;
    while (end < sArray.length && sArray[end] != 0) {
        end++;
    }
    return end - sBegin;
}
/*
 * Positions of the entries in the table of contents of the alias data file.
 * haveAliasData() reads the entries at these positions (using literal indexes 0..8).
 */
public static final int tocLengthIndex = 0;
public static final int converterListIndex = 1;
public static final int tagListIndex = 2;
public static final int aliasListIndex = 3;
public static final int untaggedConvArrayIndex = 4;
public static final int taggedAliasArrayIndex = 5;
public static final int taggedAliasListsIndex = 6;
public static final int reservedIndex1 = 7;
public static final int stringTableIndex = 8;
public static final int minTocLength = 8; /*
* min. tocLength in the file,
* does not count the
* tocLengthIndex!
*/
public static final int offsetsCount = minTocLength + 1; /*
* length of the
* swapper's
* temporary
* offsets[]
*/
/* Non-null once haveAliasData() has loaded the tables; used only as a "loaded" marker. */
static ByteBuffer gAliasData = null;
/**
 * Checks that an alias argument is usable for a lookup.
 * @param alias the name supplied by the caller
 * @return false for the empty string, true otherwise
 * @throws IllegalArgumentException if alias is null
 */
private static final boolean isAlias(String alias) {
    if (alias == null) {
        throw new IllegalArgumentException("Alias param is null!");
    }
    return alias.length() != 0;
}
/* Resource path of the alias data file opened by haveAliasData(). */
private static final String CNVALIAS_DATA_FILE_NAME = ICUResourceBundle.ICU_BUNDLE + "/cnvalias.icu";
/**
* Buffer size, in bytes, of the BufferedInputStream used to read the data file
*/
private static final int CNVALIAS_DATA_BUFFER_SIZE = 25000;
/**
 * Loads the converter alias tables from the data file on first use.
 * Later calls return immediately. The method is synchronized, so the tables
 * are loaded by one thread at a time.
 *
 * @return always true; failures are reported through exceptions
 * @throws IOException if the data cannot be read or its table of contents
 *         is shorter than minTocLength
 */
private static final synchronized boolean haveAliasData()
        throws IOException{
    if (gAliasData != null) {
        return true;
    }
    /* load converter alias data from file */
    InputStream i = ICUData.getRequiredStream(CNVALIAS_DATA_FILE_NAME);
    BufferedInputStream b = new BufferedInputStream(i, CNVALIAS_DATA_BUFFER_SIZE);
    try {
        UConverterAliasDataReader reader = new UConverterAliasDataReader(b);
        long[] tableArray = reader.readToc(offsetsCount);
        long tableStart = tableArray[0];
        if (tableStart < minTocLength) {
            throw new IOException("Invalid data format.");
        }
        gConverterListSize = tableArray[1];
        gTagListSize = tableArray[2];
        gAliasListSize = tableArray[3];
        gUntaggedConvArraySize = tableArray[4];
        gTaggedAliasArraySize = tableArray[5];
        gTaggedAliasListsSize = tableArray[6];
        /* the last two sizes are doubled to obtain byte counts */
        long reservedSize1 = tableArray[7] * 2;
        gStringTableSize = tableArray[8] * 2;
        gConverterListArray = new int[(int) gConverterListSize];
        gTagListArray = new int[(int) gTagListSize];
        gAliasListArray = new int[(int) gAliasListSize];
        gUntaggedConvArrayArray = new int[(int) gUntaggedConvArraySize];
        gTaggedAliasArrayArray = new int[(int) gTaggedAliasArraySize];
        gTaggedAliasListsArray = new int[(int) gTaggedAliasListsSize];
        byte[] reservedBytes = new byte[(int) reservedSize1];
        gStringTableArray = new byte[(int) gStringTableSize];
        reader.read(gConverterListArray, gTagListArray,
                gAliasListArray, gUntaggedConvArrayArray,
                gTaggedAliasArrayArray, gTaggedAliasListsArray,
                reservedBytes, gStringTableArray);
    } finally {
        /* the stream was previously left open after loading */
        b.close();
    }
    /* dummy UDataMemory object in absence of memory mapping; marks the data as loaded */
    gAliasData = ByteBuffer.allocate(0);
    return true;
}
// U_CFUNC const char * io_getConverterName(const char *alias, UErrorCode
// *pErrorCode)
/**
 * Looks up the converter name an alias belongs to.
 * @param alias converter name or alias; must not be null
 * @return the name stored in the converter list, or null if the alias is
 *         empty or not found
 * @throws IOException if the alias data cannot be loaded
 */
public static final String io_getConverterName(String alias)
        throws IOException{
    if (!haveAliasData() || !isAlias(alias)) {
        return null;
    }
    final boolean[] ambiguity = new boolean[1];
    final long converterIndex = findConverter(alias, ambiguity);
    if (converterIndex >= gConverterListSize) {
        /* converter not found */
        return null;
    }
    return GET_STRING(gConverterListArray[(int) converterIndex]);
}
/*
* search for an alias return the converter number index for gConverterList
*/
// static U_INLINE uint32_t findConverter(const char *alias, UErrorCode
// *pErrorCode)
/**
 * Binary search for an alias in gAliasListArray, comparing with compareNames().
 * @param alias      name to look for
 * @param isAmbigous one-element out parameter; element 0 is set to true when
 *                   the matching entry carries AMBIGUOUS_ALIAS_MAP_BIT
 * @return the converter number (index into the converter list), or
 *         Long.MAX_VALUE if the alias is not present
 */
private static final long findConverter(String alias, boolean[] isAmbigous) {
    long low = 0;
    long high = gUntaggedConvArraySize;
    long previousProbe = UINT32_MAX;
    while (true) {
        final long probe = (low + high) / 2;
        if (probe == previousProbe) {
            /* the probe did not move, so the alias is not in the list */
            return Long.MAX_VALUE;
        }
        previousProbe = probe;
        final int slot = (int) probe;
        final long order = compareNames(alias, GET_STRING(gAliasListArray[slot]));
        if (order == 0) {
            /*
             * gencnval folds duplicates into one entry, so the alias is unique in
             * the list, but different standards may map it to different converters.
             */
            final int entry = gUntaggedConvArrayArray[slot];
            if ((entry & AMBIGUOUS_ALIAS_MAP_BIT) != 0) {
                isAmbigous[0] = true;
            }
            return entry & CONVERTER_INDEX_MASK;
        }
        if (order < 0) {
            high = probe;
        } else {
            low = probe;
        }
    }
}
/**
 * Appends a normalized copy of a converter name to a buffer: dashes,
 * underscores and spaces are dropped and every other character is converted
 * to lower case. Content already present in the buffer is left untouched.
 *
 * @param dst
 *            The destination buffer the normalized characters are appended to.
 * @param name
 *            The converter name or alias to normalize.
 * @return the destination buffer.
 */
public static final StringBuffer io_stripForCompare(StringBuffer dst, String name) {
    return io_stripASCIIForCompare(dst, name);
}

/*
 * Worker for io_stripForCompare; see compareNames for the matching rules.
 * Earlier versions appended a sentinel character and removed it afterwards,
 * which deleted the last character of a non-empty dst when name was empty.
 */
private static final StringBuffer io_stripASCIIForCompare(StringBuffer dst, String name) {
    final int nameLength = name.length();
    for (int nameIndex = 0; nameIndex < nameLength; nameIndex++) {
        final char c1 = name.charAt(nameIndex);
        if (c1 == 0) {
            /* a NUL ends the name, as it did in the original zero-terminated form */
            break;
        }
        /* Ignore delimiters '-', '_', and ' ' */
        if (c1 == 0x2d || c1 == 0x5f || c1 == 0x20) {
            continue;
        }
        /* lowercase for case-insensitive comparison */
        dst.append(Character.toLowerCase(c1));
    }
    return dst;
}
/**
 * Fuzzy comparison of two converter or alias names. Case is ignored, and so
 * are the characters '-', '_' and ' ' (dash, underscore, space); "UTF-8",
 * "utf_8" and "Utf 8" therefore compare as equal.
 *
 * The ordering this defines is the one the alias list is sorted by, which is
 * what makes the binary search over that list work.
 *
 * @param name1
 *            a converter name or alias
 * @param name2
 *            a converter name or alias
 * @return 0 if the names match, a negative value if name1 sorts before name2,
 *         a positive value if name1 sorts after name2.
 *
 * @see io_stripForCompare
 */
public static int compareNames(String name1, String name2){
    final int length1 = name1.length();
    final int length2 = name2.length();
    int pos1 = 0;
    int pos2 = 0;
    for (;;) {
        // next significant character of name1, or 0 once name1 is used up
        char c1 = 0;
        while (pos1 < length1) {
            final char candidate = name1.charAt(pos1++);
            if (candidate != '-' && candidate != '_' && candidate != ' ') {
                c1 = candidate;
                break;
            }
        }
        // next significant character of name2, or 0 once name2 is used up
        char c2 = 0;
        while (pos2 < length2) {
            final char candidate = name2.charAt(pos2++);
            if (candidate != '-' && candidate != '_' && candidate != ' ') {
                c2 = candidate;
                break;
            }
        }
        // both names ended at the same time: they match
        if (c1 == 0 && c2 == 0) {
            return 0;
        }
        // case-insensitive comparison
        final int difference = Character.toLowerCase(c1) - Character.toLowerCase(c2);
        if (difference != 0) {
            return difference;
        }
    }
}
/**
 * Counts the aliases recorded for the converter an alias belongs to, using
 * the list stored under the last tag of the tag list (the ALL tag).
 * @param alias converter name or alias; must not be null
 * @return the count stored at the head of the alias list, or 0 if the alias
 *         is empty, unknown, or has no list
 * @throws IOException if the alias data cannot be loaded
 */
public static int io_countAliases(String alias)
        throws IOException{
    if (!haveAliasData() || !isAlias(alias)) {
        return 0;
    }
    final boolean[] ambiguity = new boolean[1];
    final long converterIndex = findConverter(alias, ambiguity);
    if (converterIndex >= gConverterListSize) {
        /* converter not found */
        return 0;
    }
    /* tagListNum - 1 is the ALL tag */
    final long allTagRow = (gTagListSize - 1) * gConverterListSize;
    final int listOffset = gTaggedAliasArrayArray[(int) (allTagRow + converterIndex)];
    /* a zero offset shouldn't happen; it would be an internal program error */
    return listOffset != 0 ? gTaggedAliasListsArray[listOffset] : 0;
}
/**
 * Return the number of all aliases (and converter names).
 *
 * @return the size of the alias list
 * @throws IOException if the alias data cannot be loaded
 */
// U_CFUNC uint16_t io_countTotalAliases(UErrorCode *pErrorCode);
public static int io_countTotalAliases() throws IOException{
    return haveAliasData() ? (int) gAliasListSize : 0;
}
// U_CFUNC const char * io_getAlias(const char *alias, uint16_t n,
// UErrorCode *pErrorCode)
/**
 * Returns the n-th alias of the converter an alias belongs to, taken from the
 * list stored under the last tag of the tag list (the ALL tag).
 *
 * @param alias converter name or alias; must not be null
 * @param n     zero-based position in the alias list
 * @return the alias, or null if the alias is empty or unknown, the converter
 *         has no list, or n is outside the list
 * @throws IOException if the alias data cannot be loaded
 */
public static String io_getAlias(String alias, int n) throws IOException{
    if (haveAliasData() && isAlias(alias)) {
        boolean[] isAmbigous = new boolean[1];
        long convNum = findConverter(alias,isAmbigous);
        if (convNum < gConverterListSize) {
            /* tagListNum - 1 is the ALL tag */
            int listOffset = gTaggedAliasArrayArray[(int) ((gTagListSize - 1)
                    * gConverterListSize + convNum)];
            if (listOffset != 0) {
                /*
                 * The element at listOffset is the number of entries that follow.
                 * Without this check an out-of-range n read an entry of a
                 * neighbouring list or ran off the end of the array.
                 */
                int listCount = gTaggedAliasListsArray[listOffset];
                if (0 <= n && n < listCount) {
                    /* +1 to skip listCount */
                    return GET_STRING(gTaggedAliasListsArray[listOffset + 1 + n]);
                }
                /* else index out of bounds */
            }
            /* else this shouldn't happen. internal program error */
        }
        /* else converter not found */
    }
    return null;
}
// U_CFUNC uint16_t io_countStandards(UErrorCode *pErrorCode) {
/**
 * Counts the standards (tags) visible to callers.
 * @return the size of the tag list minus NUM_HIDDEN_TAGS
 * @throws IOException if the alias data cannot be loaded
 */
public static int io_countStandards() throws IOException{
    if (!haveAliasData()) {
        return 0;
    }
    final long visibleTags = gTagListSize - NUM_HIDDEN_TAGS;
    return (int) visibleTags;
}
// U_CAPI const char * U_EXPORT2getStandard(uint16_t n, UErrorCode
// *pErrorCode)
/**
 * Returns the name of the n-th standard (tag).
 *
 * @param n zero-based index, below the value returned by io_countStandards()
 * @return the standard's name, or null if n is out of range; the hidden tag
 *         at the end of the tag list is not reachable through this method
 * @throws IOException if the alias data cannot be loaded
 */
public static String getStandard(int n) throws IOException{
    if (haveAliasData()) {
        /* previously any n was used as an array index without a range check */
        if (0 <= n && n < gTagListSize - NUM_HIDDEN_TAGS) {
            return GET_STRING(gTagListArray[n]);
        }
        /* else index out of bounds */
    }
    return null;
}
// U_CAPI const char * U_EXPORT2 getStandardName(const char *alias, const
// char *standard, UErrorCode *pErrorCode)
/**
 * Returns the name a given standard uses for the converter an alias belongs to.
 *
 * @param alias    converter name or alias; must not be null
 * @param standard name of the standard (tag)
 * @return the first entry of the alias list for that converter and standard,
 *         or null if there is none
 * @throws IOException if the alias data cannot be loaded
 */
public static final String getStandardName(String alias, String standard)throws IOException {
    if (haveAliasData() && isAlias(alias)) {
        long listOffset = findTaggedAliasListsOffset(alias, standard);
        if (0 < listOffset && listOffset < gTaggedAliasListsSize) {
            /* +1 to skip the list count stored at listOffset */
            int firstEntry = gTaggedAliasListsArray[(int) (listOffset + 1)];
            /*
             * Test the list's own first entry. The earlier code tested element 0
             * of the whole table instead, which is unrelated to this list.
             */
            if (firstEntry != 0) {
                return GET_STRING(firstEntry);
            }
        }
    }
    return null;
}
// U_CAPI uint16_t U_EXPORT2 countAliases(const char *alias, UErrorCode
// *pErrorCode)
/**
 * Delegates to io_countAliases(alias) and returns its result.
 */
public static int countAliases(String alias) throws IOException{
return io_countAliases(alias);
}
// U_CAPI const char* U_EXPORT2 getAlias(const char *alias, uint16_t n,
// UErrorCode *pErrorCode)
/**
 * Delegates to io_getAlias(alias, n) and returns its result.
 */
public static String getAlias(String alias, int n) throws IOException{
return io_getAlias(alias, n);
}
// U_CFUNC uint16_t countStandards(void)
/**
 * Delegates to io_countStandards() and returns its result.
 */
public static int countStandards()throws IOException{
return io_countStandards();
}
/*returns a single Name from the list, will return NULL if out of bounds
*/
/**
 * Returns one name from the list of available converters.
 * @param n zero-based index; values outside 0..0xffff are rejected up front
 * @return the converter name, or null if n is out of bounds or the list
 *         could not be loaded
 */
public static String getAvailableName (int n){
    if (n < 0 || n > 0xffff) {
        return null;
    }
    try {
        return bld_getAvailableConverter(n);
    } catch (IOException ex) {
        //throw away exception
        return null;
    }
}
// U_CAPI const char * U_EXPORT2 getCanonicalName(const char *alias, const
// char *standard, UErrorCode *pErrorCode) {
/**
 * Returns the converter name for an alias as known to a given standard.
 * @param alias    converter name or alias; must not be null
 * @param standard name of the standard (tag)
 * @return the name from the converter list, or null if the alias is empty or
 *         no converter is found for that standard
 * @throws IOException if the alias data cannot be loaded
 */
public static String getCanonicalName(String alias, String standard) throws IOException{
    if (!haveAliasData() || !isAlias(alias)) {
        return null;
    }
    final long converterIndex = findTaggedConverterNum(alias, standard);
    if (converterIndex >= gConverterListSize) {
        return null;
    }
    return GET_STRING(gConverterListArray[(int) converterIndex]);
}
/**
 * Counts the available converters.
 * @return the count from bld_countAvailableConverters(), or -1 if loading the
 *         list failed with an IOException
 */
public static int countAvailable (){
    int total;
    try {
        total = bld_countAvailableConverters();
    } catch (IOException ex) {
        //throw away exception
        total = -1;
    }
    return total;
}
// U_CAPI UEnumeration * U_EXPORT2 openStandardNames(const char *convName,
// const char *standard, UErrorCode *pErrorCode)
/**
 * Creates an enumeration over the names a standard defines for a converter.
 * @param convName converter name or alias; must not be null
 * @param standard name of the standard (tag)
 * @return an enumeration whose context starts at the matching alias list, or
 *         null if convName is empty or the converter or tag is not found
 * @throws IOException if the alias data cannot be loaded
 */
public static final UConverterAliasesEnumeration openStandardNames(String convName, String standard)throws IOException {
    if (!haveAliasData() || !isAlias(convName)) {
        return null;
    }
    final long listOffset = findTaggedAliasListsOffset(convName, standard);
    if (listOffset >= gTaggedAliasListsSize) {
        /* converter or tag not found */
        return null;
    }
    /*
     * listOffset == 0 is accepted on purpose: converter name and standard are
     * fine, there is just nothing to enumerate.
     */
    final UConverterAliasesEnumeration.UAliasContext start =
            new UConverterAliasesEnumeration.UAliasContext(listOffset, 0);
    final UConverterAliasesEnumeration names = new UConverterAliasesEnumeration();
    names.setContext(start);
    return names;
}
// static uint32_t getTagNumber(const char *tagname)
/**
 * Finds the position of a tag (standard) in the tag list by exact,
 * case-sensitive string comparison.
 * @param tagName name of the standard
 * @return the tag's index, or UINT32_MAX if the tag list is not loaded or
 *         does not contain the name
 */
private static long getTagNumber(String tagName) {
    if (gTagListArray == null) {
        return UINT32_MAX;
    }
    for (long tag = 0; tag < gTagListSize; tag++) {
        final String candidate = GET_STRING(gTagListArray[(int) tag]);
        if (tagName.equals(candidate)) {
            return tag;
        }
    }
    return UINT32_MAX;
}
// static uint32_t findTaggedAliasListsOffset(const char *alias, const char
// *standard, UErrorCode *pErrorCode)
/**
 * Finds the alias list for an alias within one standard. gTaggedAliasArrayArray
 * is treated as a grid with one row per tag and one column per converter.
 *
 * @param alias    converter name or alias
 * @param standard name of the standard (tag)
 * @return the offset of the list in gTaggedAliasListsArray; 0 if converter and
 *         tag exist but the standard has no name for it; UINT32_MAX if the
 *         converter or the tag is not found
 */
private static long findTaggedAliasListsOffset(String alias, String standard) {
    final long tagNum = getTagNumber(standard);
    final boolean[] ambiguity = new boolean[1];
    /* Make a quick guess. Hopefully they used a TR22 canonical alias. */
    final long convNum = findConverter(alias, ambiguity);
    if (tagNum >= (gTagListSize - NUM_HIDDEN_TAGS) || convNum >= gConverterListSize) {
        /* converter or tag not found */
        return UINT32_MAX;
    }
    final long tagRow = tagNum * gConverterListSize;
    final long direct = gTaggedAliasArrayArray[(int) (tagRow + convNum)];
    if (direct != 0 && gTaggedAliasListsArray[(int) direct + 1] != 0) {
        return direct;
    }
    if (!ambiguity[0]) {
        /* no default name */
        return 0;
    }
    /*
     * The alias is ambiguous: scan every cell of the grid for a list that
     * contains it, and for each hit look at the same converter column in the
     * row of the requested tag.
     */
    for (long cell = 0; cell < gTaggedAliasArraySize; cell++) {
        final long list = gTaggedAliasArrayArray[(int) cell];
        if (list == 0 || !isAliasInList(alias, list)) {
            continue;
        }
        final long column = cell % gConverterListSize;
        final long sameConverter = gTaggedAliasArrayArray[(int) (tagRow + column)];
        if (sameConverter != 0 && gTaggedAliasListsArray[(int) sameConverter + 1] != 0) {
            return sameConverter;
        }
        /* else keep on looking */
    }
    /* The standard doesn't know about the alias */
    return 0;
}
/* Return the canonical name */
// static uint32_t findTaggedConverterNum(const char *alias, const char
// *standard, UErrorCode *pErrorCode)
/**
 * Finds the converter number for an alias within one standard.
 *
 * @param alias    converter name or alias
 * @param standard name of the standard (tag)
 * @return the converter's index in the converter list, or UINT32_MAX if the
 *         converter or tag is not found or the standard does not list the alias
 */
private static long findTaggedConverterNum(String alias, String standard) {
    final long tagNum = getTagNumber(standard);
    final boolean[] ambiguity = new boolean[1];
    /* Make a quick guess. Hopefully they used a TR22 canonical alias. */
    final long convNum = findConverter(alias, ambiguity);
    if (tagNum >= (gTagListSize - NUM_HIDDEN_TAGS) || convNum >= gConverterListSize) {
        /* converter or tag not found */
        return UINT32_MAX;
    }
    final long rowStart = tagNum * gConverterListSize;
    final long direct = gTaggedAliasArrayArray[(int) (rowStart + convNum)];
    if (direct != 0 && isAliasInList(alias, direct)) {
        return convNum;
    }
    if (ambiguity[0]) {
        /*
         * Ambiguous alias: search the row of the requested tag only, one
         * converter column after the other.
         */
        final long rowEnd = rowStart + gConverterListSize;
        for (long cell = rowStart; cell < rowEnd; cell++) {
            final long list = gTaggedAliasArrayArray[(int) cell];
            if (list != 0 && isAliasInList(alias, list)) {
                return cell - rowStart;
            }
        }
        /* The standard doesn't know about the alias */
    }
    /* no canonical name */
    return UINT32_MAX;
}
// static U_INLINE UBool isAliasInList(const char *alias, uint32_t
// listOffset)
/**
 * Tells whether an alias list contains a name that matches under compareNames().
 * @param alias      name to look for
 * @param listOffset offset of the list in gTaggedAliasListsArray; the element
 *                   at that offset is the number of entries that follow
 * @return true if a non-zero entry of the list matches; false otherwise,
 *         including when listOffset is 0
 */
private static boolean isAliasInList(String alias, long listOffset) {
    if (listOffset == 0) {
        return false;
    }
    final long entryCount = gTaggedAliasListsArray[(int) listOffset];
    /* +1 to skip the entry count */
    final long firstEntry = listOffset + 1;
    for (long k = 0; k < entryCount; k++) {
        final int nameOffset = gTaggedAliasListsArray[(int) (k + firstEntry)];
        if (nameOffset != 0 && compareNames(alias, GET_STRING(nameOffset)) == 0) {
            return true;
        }
    }
    return false;
}
// begin bld.c
/* Converter names collected by haveAvailableConverterList(); null until the first successful call. */
static String[] gAvailableConverters = null;
/* Number of entries of gAvailableConverters that are filled in. */
static int gAvailableConverterCount = 0;
static byte[] gDefaultConverterNameBuffer; // [MAX_CONVERTER_NAME_LENGTH +
// 1]; /* +1 for NULL */
static String gDefaultConverterName = null;
/**
 * Makes sure the cached list of converter names
 * ({@code gAvailableConverters} / {@code gAvailableConverterCount}) exists,
 * building it from {@code gConverterListArray} on first use.
 * Ported from: static UBool haveAvailableConverterList(UErrorCode *pErrorCode)
 *
 * @return {@code true} if the list is available, {@code false} if
 *         {@code haveAliasData()} reported that no alias data is present
 * @throws IOException declared by this method; this body never throws it
 *         itself, so it can only come from the helpers it calls
 */
static boolean haveAvailableConverterList() throws IOException{
    if (gAvailableConverters != null) {
        /* Already built by an earlier call. */
        return true;
    }
    if (!haveAliasData()) {
        return false;
    }
    /* We can't have more than "*converterTable" converters to open */
    final String[] names = new String[(int) gConverterListSize];
    int filled = 0;
    for (int i = 0; i < gConverterListSize; i++) {
        //TODO: Fix me (the port left a UConverter.open(converterName) call
        //      commented out here, so every listed name is accepted as is)
        names[filled++] = GET_STRING(gConverterListArray[i]);
    }
    // agljport:todo umtx_lock(NULL);
    if (gAvailableConverters == null) {
        gAvailableConverters = names;
        gAvailableConverterCount = filled;
        /* haveData should have already registered the cleanup function */
    }
    // agljport:todo umtx_unlock(NULL);
    return true;
}
/**
 * Reports how many converter names the cached list holds.
 * Ported from: U_CFUNC uint16_t bld_countAvailableConverters(UErrorCode *pErrorCode)
 *
 * @return {@code gAvailableConverterCount} when
 *         {@code haveAvailableConverterList()} succeeds, otherwise 0
 * @throws IOException propagated from {@code haveAvailableConverterList()}
 */
public static int bld_countAvailableConverters() throws IOException{
    return haveAvailableConverterList() ? gAvailableConverterCount : 0;
}
/**
 * Returns the n-th name of the cached converter list.
 * Ported from: U_CFUNC const char * bld_getAvailableConverter(uint16_t n, UErrorCode *pErrorCode)
 *
 * @param n zero-based index into the list of available converters
 * @return the converter name at index {@code n}, or {@code null} when the
 *         list is unavailable or {@code n} is outside
 *         {@code [0, gAvailableConverterCount)}
 * @throws IOException propagated from {@code haveAvailableConverterList()}
 */
public static String bld_getAvailableConverter(int n) throws IOException{
    if (!haveAvailableConverterList()) {
        return null;
    }
    /*
     * The C original takes an unsigned index. A Java int can be negative, so
     * the lower bound is checked too; a negative index yields null like any
     * other out-of-range index instead of an ArrayIndexOutOfBoundsException.
     */
    if (n >= 0 && n < gAvailableConverterCount) {
        return gAvailableConverters[n];
    }
    return null;
}
/* default converter name --------------------------------------------------- */
/**
 * Returns the default converter name, computing and caching it in
 * {@code gDefaultConverterName} on first use.
 * Ported from: U_CFUNC const char * getDefaultName()
 *
 * The candidate comes from {@code CharsetICU.getDefaultCharsetName()}. If
 * that is null, empty, or too long for {@code gDefaultConverterNameBuffer}
 * (when that buffer exists), "US-ASCII" is used instead.
 *
 * The method is synchronized, so lookup and caching happen under the class
 * lock.
 *
 * @return the cached default converter name; never {@code null}
 */
public static final synchronized String getDefaultName() {
    String name = gDefaultConverterName;
    if (name == null) {
        name = CharsetICU.getDefaultCharsetName();
        /* if the name is there, test it out and get the canonical name with options */
        // TODO: fix me -- the port left "cnv = UConverter.open(name);
        // name = cnv.getName(cnv);" commented out, so the name is used as is.

        /*
         * The buffer is only consulted when it exists: no assignment to it is
         * visible near its declaration, and dereferencing it unconditionally
         * would throw a NullPointerException for every usable name.
         */
        boolean tooLong = gDefaultConverterNameBuffer != null
                && name != null
                && name.length() >= gDefaultConverterNameBuffer.length;
        if (name == null || name.length() == 0 || tooLong) {
            /* Panic time, let's use a fallback. */
            name = "US-ASCII";
        }
        /*
         * Store the result. The previous code assigned in the opposite
         * direction (name = gDefaultConverterName), which threw the computed
         * name away and returned null.
         */
        gDefaultConverterName = name;
    }
    return name;
}
//end bld.c
}

View file

@ -0,0 +1,218 @@
/**
*******************************************************************************
* Copyright (C) 2006, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
*******************************************************************************
*/
package com.ibm.icu.impl;
import java.io.*;
import com.ibm.icu.impl.ICUDebug;
/* Format of cnvalias.icu -----------------------------------------------------
*
* cnvalias.icu is a binary, memory-mappable form of convrtrs.txt.
* This binary form contains several tables. All indexes are to uint16_t
* units, and not to the bytes (uint8_t units). Addressing everything on
* 16-bit boundaries allows us to store more information with small index
* numbers, which are also 16-bit in size. The majority of the table (except
* the string table) are 16-bit numbers.
*
* First there is the size of the Table of Contents (TOC). The TOC
* entries contain the size of each section. In order to find the offset
* you just need to sum up the previous offsets.
* The TOC length and entries are an array of uint32_t values.
* The first section after the TOC starts immediately after the TOC.
*
* 1) This section contains a list of converters. This list contains indexes
* into the string table for the converter name. The index of this list is
* also used by other sections, which are mentioned later on.
* This list is not sorted.
*
* 2) This section contains a list of tags. This list contains indexes
* into the string table for the tag name. The index of this list is
* also used by other sections, which are mentioned later on.
* This list is in priority order of standards.
*
* 3) This section contains a list of sorted unique aliases. This
* list contains indexes into the string table for the alias name. The
* index of this list is also used by other sections, like the 4th section.
* The index for the 3rd and 4th section is used to get the
* alias -> converter name mapping. Section 3 and 4 form a two column table.
*
* 4) This section contains a list of mapped converter names. Consider this
* as a table that maps the 3rd section to the 1st section. This list contains
* indexes into the 1st section. The index of this list is the same index in
* the 3rd section. There is also some extra information in the high bits of
* each converter index in this table. Currently it's only used to say that
* an alias mapped to this converter is ambiguous. See UCNV_CONVERTER_INDEX_MASK
* and UCNV_AMBIGUOUS_ALIAS_MAP_BIT for more information. This section is
* the predigested form of the 5th section so that an alias lookup can be fast.
*
* 5) This section contains a 2D array with indexes to the 6th section. This
* section is the full form of all alias mappings. The column index is the
* index into the converter list (column header). The row index is the index
* to tag list (row header). This 2D array is the top part of a 3D array. The
* third dimension is in the 6th section.
*
* 6) This is a blob of variable-length arrays. Each array starts with a size,
* and is followed by indexes to alias names in the string table. This is
* the third dimension to the section 5. No other section should be referencing
* this section.
*
* 7) Reserved at this time (There is no information). This _usually_ has a
* size of 0. Future versions may add more information here.
*
* 8) This is the string table. All strings are indexed on an even address.
* There are two reasons for this. First many chip architectures locate strings
* faster on even address boundaries. Second, since all indexes are 16-bit
* numbers, this string table can be 128KB in size instead of 64KB when we
* only have strings starting on an even address.
*
*
* Here is the concept of section 5 and 6. It's a 3D cube. Each tag
* has a unique alias among all converters. That same alias can
* be mentioned in other standards on different converters,
* but only one alias per tag can be unique.
*
*
* Converter Names (Usually in TR22 form)
* -------------------------------------------.
* T / /|
* a / / |
* g / / |
* s / / |
* / / |
* ------------------------------------------/ |
* A | | |
* l | | |
* i | | /
* a | | /
* s | | /
* e | | /
* s | |/
* -------------------------------------------
*
*
*
* Here is what it really looks like. It's like swiss cheese.
* There are holes. Some converters aren't recognized by
* a standard, or they are really old converters that the
* standard doesn't recognize anymore.
*
* Converter Names (Usually in TR22 form)
* -------------------------------------------.
* T /##########################################/|
* a / # # /#
* g / # ## ## ### # ### ### ### #/
* s / # ##### #### ## ## #/#
* / ### # # ## # # # ### # # #/##
* ------------------------------------------/# #
* A |### # # ## # # # ### # # #|# #
* l |# # # # # ## # #|# #
* i |# # # # # # #|#
* a |# #|#
* s | #|#
* e
* s
*
*/
/**
 * Reads the sections of a converter alias data stream (data format "CvAl")
 * after its header has been authenticated through {@link ICUBinary}.
 *
 * Expected call order: construct the reader, call {@link #readToc(int)},
 * then call
 * {@link #read(int[], int[], int[], int[], int[], int[], byte[], byte[])}
 * with arrays sized by the caller (presumably from the TOC entries).
 */
final class UConverterAliasDataReader implements ICUBinary.Authenticate {
    private final static boolean debug = ICUDebug.enabled("UConverterAliasDataReader");

    /**
     * <p>Protected constructor.</p>
     * Reads and checks the data header; the stream is left positioned right
     * after the header.
     * @param inputStream converter alias data input stream (format "CvAl")
     * @exception IOException thrown if the data file fails authentication or
     *            the stream cannot be read
     * @draft 2.1
     */
    protected UConverterAliasDataReader(InputStream inputStream)
                                        throws IOException{
        if(debug) System.out.println("Bytes in inputStream " + inputStream.available());

        unicodeVersion = ICUBinary.readHeader(inputStream, DATA_FORMAT_ID, this);

        if(debug) System.out.println("Bytes left in inputStream " +inputStream.available());

        dataInputStream = new DataInputStream(inputStream);

        if(debug) System.out.println("Bytes left in dataInputStream " +dataInputStream.available());
    }

    // protected methods -------------------------------------------------

    /**
     * Reads {@code n} table-of-contents entries.
     * @param n number of 32-bit entries to read
     * @return the entries, each widened to an unsigned value in a long
     * @exception IOException thrown on a read error or premature end of stream
     */
    protected long[] readToc(int n)throws IOException
    {
        long[] toc = new long[n];
        //Read the toc
        for (int i = 0; i < n ; ++i) {
            toc[i] = dataInputStream.readInt() & UNSIGNED_INT_MASK;
        }
        return toc;
    }

    /**
     * Fills the given arrays from the stream, in parameter order. The length
     * of each array determines how much is read for it. The six int arrays
     * receive unsigned 16-bit values; the two byte arrays receive raw bytes.
     *
     * @param convList receives the converter list
     * @param tagList receives the tag list
     * @param aliasList receives the alias list
     * @param untaggedConvArray receives the untagged converter array
     * @param taggedAliasArray receives the tagged alias array
     * @param taggedAliasLists receives the tagged alias lists
     * @param reservedBytes receives the reserved section
     * @param stringTable receives the string table
     * @exception IOException thrown on a read error, or (as EOFException) if
     *            the stream ends before all arrays have been filled
     */
    protected void read(int[] convList, int[] tagList, int[] aliasList, int[]untaggedConvArray, int[] taggedAliasArray, int[] taggedAliasLists, byte[] reservedBytes, byte[] stringTable) throws IOException{
        readUnsignedShorts(convList);
        readUnsignedShorts(tagList);
        readUnsignedShorts(aliasList);
        readUnsignedShorts(untaggedConvArray);
        readUnsignedShorts(taggedAliasArray);
        readUnsignedShorts(taggedAliasLists);

        // InputStream.read(byte[]) may return after a partial read, which
        // would silently leave the tail of these tables zero-filled.
        // readFully() blocks until the array is full or fails with
        // EOFException.
        dataInputStream.readFully(reservedBytes);
        dataInputStream.readFully(stringTable);
    }

    /** Fills {@code dest} completely with unsigned 16-bit values read from the stream. */
    private void readUnsignedShorts(int[] dest) throws IOException {
        for (int i = 0; i < dest.length; ++i) {
            dest[i] = dataInputStream.readUnsignedShort();
        }
    }

    /**
     * @return the data format version this reader understands (the internal
     *         array itself, not a copy)
     */
    public byte[] getDataFormatVersion(){
        return DATA_FORMAT_VERSION;
    }

    /**
     * Accepts a data file when the first byte of its format version equals
     * the first byte of {@code DATA_FORMAT_VERSION}.
     * @param version format version found in the data header
     * @return true if the version is acceptable
     */
    public boolean isDataVersionAcceptable(byte version[])
    {
        return version[0] == DATA_FORMAT_VERSION[0];
    }

    /**
     * @return the value returned by {@code ICUBinary.readHeader} when the
     *         header was read in the constructor
     */
    public byte[] getUnicodeVersion(){
        return unicodeVersion;
    }

    // private data members -------------------------------------------------

    /**
     * ICU data file input stream
     */
    private final DataInputStream dataInputStream;

    private final byte[] unicodeVersion;

    /**
     * File format version that this class understands.
     * No guarantees are made if an older version is used.
     */
    // DATA_FORMAT_ID_ values taken from icu4c isAcceptable (ucnv_io.c)
    private static final byte DATA_FORMAT_ID[] = {(byte)0x43, (byte)0x76, (byte)0x41, (byte)0x6c}; // dataFormat="CvAl"
    private static final byte DATA_FORMAT_VERSION[] = {(byte)0x3};

    //private static final int UNSIGNED_SHORT_MASK = 0xffff;
    private static final long UNSIGNED_INT_MASK = 0xffffffffL;
}

View file

@ -0,0 +1,83 @@
/**
*******************************************************************************
* Copyright (C) 2006, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
*******************************************************************************
*/
package com.ibm.icu.impl;
import java.util.Enumeration;
/**
 * Enumeration over the alias names of one tagged alias list stored in
 * {@code UConverterAlias.gTaggedAliasListsArray}.
 *
 * The list is identified by the {@link UAliasContext} installed with
 * {@link #setContext(UAliasContext)}; a context must be set before any other
 * method is called. A list offset of 0 means "no list": such an enumeration
 * is empty.
 */
public class UConverterAliasesEnumeration implements Enumeration {
    private UAliasContext context;

    /**
     * Sets the alias context (list offset and current position) to enumerate.
     * @param context the context to use from now on
     */
    public void setContext(UAliasContext context){
        this.context = context;
    }

    /**
     * @return the number of entries in the current list, or 0 when the
     *         context's list offset is 0
     */
    public int count() {
        if (context.listOffset == 0) {
            return 0;
        }
        return UConverterAlias.gTaggedAliasListsArray[(int)context.listOffset];
    }

    /**
     * Returns the next alias name and advances the context's position.
     * @return the alias name (a String) at the current position
     * @throws IndexOutOfBoundsException if the list is empty (offset 0) or
     *         all entries have already been returned. This type is kept for
     *         existing callers, although the Enumeration contract names
     *         NoSuchElementException.
     */
    public Object nextElement() {
        if (context.listOffset != 0) {
            int[] lists = UConverterAlias.gTaggedAliasListsArray;
            long listCount = lists[(int)context.listOffset];
            /* +1 skips the slot that holds the count */
            long firstEntry = context.listOffset + 1;
            if (context.listIdx < listCount) {
                String str = UConverterAlias.GET_STRING(lists[(int)(context.listIdx + firstEntry)]);
                context.listIdx++;
                return str;
            }
        }
        /* Either we accessed a zero length list, or we enumerated too far. */
        throw new IndexOutOfBoundsException();
    }

    /**
     * Rewinds the enumeration to the first entry of the current list.
     */
    public void reset() {
        context.listIdx = 0;
    }

    /**
     * Class to store context for alias: the offset of a list and the index of
     * the next entry to return from it.
     */
    public static class UAliasContext{
        private long listOffset;
        private long listIdx;

        /**
         * @param listOffset index of the list's count slot in the tagged
         *                   alias lists array; 0 for "no list"
         * @param listIdx index of the next entry to return
         */
        public UAliasContext(long listOffset, long listIdx){
            this.listOffset = listOffset;
            this.listIdx = listIdx;
        }

        /** @return the list offset */
        public long getListOffset(){
            return listOffset;
        }

        /** @return the index of the next entry to return */
        public long getListIdx(){
            return listIdx;
        }
    }

    /**
     * @return true if {@link #nextElement()} would return an entry. Offset 0
     *         is treated as an empty list here as well, so this method never
     *         returns true right before nextElement() throws.
     */
    public boolean hasMoreElements() {
        if (context.listOffset == 0) {
            return false;
        }
        long listCount = UConverterAlias.gTaggedAliasListsArray[(int)context.listOffset];
        return context.listIdx < listCount;
    }
}

View file

@ -0,0 +1,156 @@
/**
*******************************************************************************
* Copyright (C) 2006, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
*******************************************************************************
*/
package com.ibm.icu.impl;
/**
 * Shared constants for the converter implementation.
 * Every field of an interface is implicitly public, static and final, and
 * every nested class is implicitly public and static, so those modifiers are
 * not repeated on the members below.
 */
public interface UConverterConstants {

    /** Mask for treating a byte as an unsigned value. */
    short UNSIGNED_BYTE_MASK = 0xff;
    /** Mask for treating a short as an unsigned value. */
    int UNSIGNED_SHORT_MASK = 0xffff;
    /** Mask for treating an int as an unsigned value. */
    long UNSIGNED_INT_MASK = 0xffffffffL;

    int U_IS_BIG_ENDIAN = 0;

    /**
     * Useful constant for the maximum size of the whole locale ID
     * (including the terminating NULL).
     * @draft ICU 3.6
     */
    int ULOC_FULLNAME_CAPACITY = 56;

    /**
     * Sentinel for APIs that (take or) return single code points (UChar32).
     * It lies outside the Unicode code point range 0..0x10ffff, so a new API
     * can use it as its "done" or "error" value.
     *
     * ICU APIs designed before ICU 2.4 usually define service-specific "done"
     * values, mostly 0xffff. Those may need to be distinguished from actual
     * U+ffff text contents by calling functions like
     * CharacterIterator::hasNext() or UnicodeString::length().
     * @draft ICU 2.4
     */
    int U_SENTINEL = -1;

    //end utf.h

    //begin ucnv.h

    /**
     * Character that separates converter names from options and options
     * from each other.
     * @draft ICU 3.6
     */
    byte OPTION_SEP_CHAR = ',';

    /** Maximum length of a converter name including the terminating NULL @draft ICU 3.6 */
    int MAX_CONVERTER_NAME_LENGTH = 60;

    /** Maximum length of a converter name including path and terminating NULL @draft ICU 3.6 */
    int MAX_FULL_FILE_NAME_LENGTH = (600+MAX_CONVERTER_NAME_LENGTH);

    /** Shift in for EBCDIC_STATEFUL and iso2022 states @draft ICU 3.6 */
    int SI = 0x0F;

    /** Shift out for EBCDIC_STATEFUL and iso2022 states @draft ICU 3.6 */
    int SO = 0x0E;

    //end ucnv.h

    // begin bld.h

    /** Size of the overflow buffers in UConverter, enough for escaping callbacks. */
    int ERROR_BUFFER_LENGTH = 32;

    /** At most 4 bytes per substitution character (part of .cnv file format! see UConverterStaticData). */
    int MAX_SUBCHAR_LEN = 4;

    /** At most 8 bytes per character in toUBytes[] (UTF-8 uses up to 6). */
    int MAX_CHAR_LEN = 8;

    /* converter options bits */
    int OPTION_VERSION = 0xf;
    int OPTION_SWAP_LFNL = 0x10;
    int OPTION_MAC = 0x20; //agljport:comment added for Mac ISCII encodings

    /* values for the unicodeMask */
    int HAS_SUPPLEMENTARY = 1;
    int HAS_SURROGATES = 2;

    // end bld.h

    // begin cnv.h

    /** Used in fromUnicode DBCS tables as an "unassigned" marker. */
    int missingCharMarker = 0xFFFF;

    /** Choices for what a converter reset applies to. */
    final class UConverterResetChoice {
        public static final int RESET_BOTH = 0;
        public static final int RESET_TO_UNICODE = RESET_BOTH + 1;
        public static final int RESET_FROM_UNICODE = RESET_TO_UNICODE + 1;
    }

    // begin utf16.h

    /**
     * The maximum number of 16-bit code units per Unicode code point
     * (U+0000..U+10ffff); the value is 2.
     * @draft ICU 2.4
     */
    int U16_MAX_LENGTH = 2;

    // end utf16.h

    // begin err.h

    /**
     * FROM_U, TO_U context options for sub callback
     * @draft ICU 3.6
     */
    byte[] SUB_STOP_ON_ILLEGAL = {'i'};

    /**
     * FROM_U, TO_U context options for skip callback
     * @draft ICU 3.6
     */
    byte[] SKIP_STOP_ON_ILLEGAL = {'i'};

    /**
     * The process condition code to be used with the callbacks.
     * Codes which are greater than IRREGULAR should be
     * passed on to any chained callbacks.
     * @draft ICU 3.6
     */
    final class UConverterCallbackReason {
        /**
         * The code point is unassigned.
         * The error code U_INVALID_CHAR_FOUND will be set.
         */
        public static final int UNASSIGNED = 0;

        /**
         * The code point is illegal. For example, \\x81\\x2E is illegal in
         * SJIS because \\x2E is not a valid trail byte for the \\x81 lead
         * byte. Also, starting with Unicode 3.0.1, non-shortest byte
         * sequences in UTF-8 (like \\xC1\\xA1 instead of \\x61 for U+0061)
         * are also illegal, not just irregular.
         * The error code U_ILLEGAL_CHAR_FOUND will be set.
         */
        public static final int ILLEGAL = 1;

        /**
         * The codepoint is not a regular sequence in the encoding. For
         * example, \\xED\\xA0\\x80..\\xED\\xBF\\xBF are irregular UTF-8 byte
         * sequences for single surrogate code points.
         * The error code U_INVALID_CHAR_FOUND will be set.
         */
        public static final int IRREGULAR = 2;

        /**
         * The callback is called with this reason when a 'reset' has
         * occurred. Callback should reset all state.
         */
        public static final int RESET = 3;

        /**
         * Called when the converter is closed. The callback should release
         * any allocated memory.
         */
        public static final int CLOSE = 4;

        /**
         * Called when safeClone() is called on the converter. The pointer
         * available as the 'context' is an alias to the original converter's
         * context pointer. If the context must be owned by the new converter,
         * the callback must clone the data and call setFromUCallback
         * (or setToUCallback) with the correct pointer.
         * @draft ICU 2.2
         */
        public static final int CLONE = 5;
    }

    //end err.h
}

View file

@ -0,0 +1,552 @@
/**
*******************************************************************************
* Copyright (C) 2006, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
*******************************************************************************
*/
package com.ibm.icu.impl;
import com.ibm.icu.impl.ICUDebug;
import java.io.IOException;
import java.io.InputStream;
import java.io.DataInputStream;
import java.nio.ByteBuffer;
/**
* ucnvmbcs.h
*
* ICU conversion (.cnv) data file structure, following the usual UDataInfo
* header.
*
* Format version: 6.2
*
* struct UConverterStaticData -- struct containing the converter name, IBM CCSID,
* min/max bytes per character, etc.
* see ucnv_bld.h
*
* --------------------
*
* The static data is followed by conversionType-specific data structures.
* At the moment, there are only variations of MBCS converters. They all have
* the same toUnicode structures, while the fromUnicode structures for SBCS
* differ from those for other MBCS-style converters.
*
* _MBCSHeader.version 4.2 adds an optional conversion extension data structure.
* If it is present, then an ICU version reading header versions 4.0 or 4.1
* will be able to use the base table and ignore the extension.
*
* The unicodeMask in the static data is part of the base table data structure.
* Especially, the UCNV_HAS_SUPPLEMENTARY flag determines the length of the
* fromUnicode stage 1 array.
* The static data unicodeMask refers only to the base table's properties if
* a base table is included.
* In an extension-only file, the static data unicodeMask is 0.
* The extension data indexes have a separate field with the unicodeMask flags.
*
* MBCS-style data structure following the static data.
* Offsets are counted in bytes from the beginning of the MBCS header structure.
* Details about usage in comments in ucnvmbcs.c.
*
* struct _MBCSHeader (see the definition in this header file below)
* contains 32-bit fields as follows:
* 8 values:
* 0 uint8_t[4] MBCS version in UVersionInfo format (currently 4.2.0.0)
* 1 uint32_t countStates
* 2 uint32_t countToUFallbacks
* 3 uint32_t offsetToUCodeUnits
* 4 uint32_t offsetFromUTable
* 5 uint32_t offsetFromUBytes
* 6 uint32_t flags, bits:
* 31.. 8 offsetExtension -- _MBCSHeader.version 4.2 (ICU 2.8) and higher
* 0 for older versions and if
* there is no extension structure
* 7.. 0 outputType
* 7 uint32_t fromUBytesLength -- _MBCSHeader.version 4.1 (ICU 2.4) and higher
* counts bytes in fromUBytes[]
*
* if(outputType==MBCS_OUTPUT_EXT_ONLY) {
* -- base table name for extension-only table
* char baseTableName[variable]; -- with NUL plus padding for 4-alignment
*
* -- all _MBCSHeader fields except for version and flags are 0
* } else {
* -- normal base table with optional extension
*
* int32_t stateTable[countStates][256];
*
* struct _MBCSToUFallback { (fallbacks are sorted by offset)
* uint32_t offset;
* UChar32 codePoint;
* } toUFallbacks[countToUFallbacks];
*
* uint16_t unicodeCodeUnits[(offsetFromUTable-offsetToUCodeUnits)/2];
* (padded to an even number of units)
*
* -- stage 1 tables
* if(staticData.unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
* -- stage 1 table for all of Unicode
* uint16_t fromUTable[0x440]; (32-bit-aligned)
* } else {
* -- BMP-only tables have a smaller stage 1 table
* uint16_t fromUTable[0x40]; (32-bit-aligned)
* }
*
* -- stage 2 tables
* length determined by top of stage 1 and bottom of stage 3 tables
* if(outputType==MBCS_OUTPUT_1) {
* -- SBCS: pure indexes
* uint16_t stage 2 indexes[?];
* } else {
* -- DBCS, MBCS, EBCDIC_STATEFUL, ...: roundtrip flags and indexes
* uint32_t stage 2 flags and indexes[?];
* }
*
* -- stage 3 tables with byte results
* if(outputType==MBCS_OUTPUT_1) {
* -- SBCS: each 16-bit result contains flags and the result byte, see ucnvmbcs.c
* uint16_t fromUBytes[fromUBytesLength/2];
* } else {
* -- DBCS, MBCS, EBCDIC_STATEFUL, ... 2/3/4 bytes result, see ucnvmbcs.c
* uint8_t fromUBytes[fromUBytesLength]; or
* uint16_t fromUBytes[fromUBytesLength/2]; or
* uint32_t fromUBytes[fromUBytesLength/4];
* }
* }
*
* -- extension table, details see ucnv_ext.h
* int32_t indexes[>=32]; ...
*/
/*
* ucnv_ext.h
*
* See icuhtml/design/conversion/conversion_extensions.html
*
* Conversion extensions serve two purposes:
* 1. They support m:n mappings.
* 2. They support extension-only conversion files that are used together
* with the regular conversion data in base files.
*
* A base file may contain an extension table (explicitly requested or
* implicitly generated for m:n mappings), but its extension table is not
* used when an extension-only file is used.
*
* It is an error if a base file contains any regular (not extension) mapping
* from the same sequence as a mapping in the extension file
* because the base mapping would hide the extension mapping.
*
*
* Data for conversion extensions:
*
* One set of data structures per conversion direction (to/from Unicode).
* The data structures are sorted by input units to allow for binary search.
* Input sequences of more than one unit are handled like contraction tables
* in collation:
* The lookup value of a unit points to another table that is to be searched
* for the next unit, recursively.
*
* For conversion from Unicode, the initial code point is looked up in
* a 3-stage trie for speed,
* with an additional table of unique results to save space.
*
* Long output strings are stored in separate arrays, with length and index
* in the lookup tables.
* Output results also include a flag distinguishing roundtrip from
* (reverse) fallback mappings.
*
* Input Unicode strings must not begin or end with unpaired surrogates
* to avoid problems with matches on parts of surrogate pairs.
*
* Mappings from multiple characters (code points or codepage state
* table sequences) must be searched preferring the longest match.
* For this to work and be efficient, the variable-width table must contain
* all mappings that contain prefixes of the multiple characters.
* If an extension table is built on top of a base table in another file
* and a base table entry is a prefix of a multi-character mapping, then
* this is an error.
*
*
* Implementation note:
*
* Currently, the parser and several checks in the code limit the number
* of UChars or bytes in a mapping to
* UCNV_EXT_MAX_UCHARS and UCNV_EXT_MAX_BYTES, respectively,
* which are output value limits in the data structure.
*
* For input, this is not strictly necessary - it is a hard limit only for the
* buffers in UConverter that are used to store partial matches.
*
* Input sequences could otherwise be arbitrarily long if partial matches
* need not be stored (i.e., if a sequence does not span several buffers with too
* many units before the last buffer), although then results would differ
* depending on whether partial matches exceed the limits or not,
* which depends on the pattern of buffer sizes.
*
*
* Data structure:
*
* int32_t indexes[>=32];
*
* Array of indexes and lengths etc. The length of the array is at least 32.
* The actual length is stored in indexes[0] to be forward compatible.
*
* Each index to another array is the number of bytes from indexes[].
* Each length of an array is the number of array base units in that array.
*
* Some of the structures may not be present, in which case their indexes
* and lengths are 0.
*
* Usage of indexes[i]:
* [0] length of indexes[]
*
* // to Unicode table
* [1] index of toUTable[] (array of uint32_t)
* [2] length of toUTable[]
* [3] index of toUUChars[] (array of UChar)
* [4] length of toUUChars[]
*
* // from Unicode table, not for the initial code point
* [5] index of fromUTableUChars[] (array of UChar)
* [6] index of fromUTableValues[] (array of uint32_t)
* [7] length of fromUTableUChars[] and fromUTableValues[]
* [8] index of fromUBytes[] (array of char)
* [9] length of fromUBytes[]
*
* // from Unicode trie for initial-code point lookup
* [10] index of fromUStage12[] (combined array of uint16_t for stages 1 & 2)
* [11] length of stage 1 portion of fromUStage12[]
* [12] length of fromUStage12[]
* [13] index of fromUStage3[] (array of uint16_t indexes into fromUStage3b[])
* [14] length of fromUStage3[]
* [15] index of fromUStage3b[] (array of uint32_t like fromUTableValues[])
* [16] length of fromUStage3b[]
*
* [17] Bit field containing numbers of bytes:
* 31..24 reserved, 0
* 23..16 maximum input bytes
* 15.. 8 maximum output bytes
* 7.. 0 maximum bytes per UChar
*
* [18] Bit field containing numbers of UChars:
* 31..24 reserved, 0
* 23..16 maximum input UChars
* 15.. 8 maximum output UChars
* 7.. 0 maximum UChars per byte
*
* [19] Bit field containing flags:
* (extension table unicodeMask)
* 1 UCNV_HAS_SURROGATES flag for the extension table
* 0 UCNV_HAS_SUPPLEMENTARY flag for the extension table
*
* [20]..[30] reserved, 0
* [31] number of bytes for the entire extension structure
* [>31] reserved; there are indexes[0] indexes
*
*
* uint32_t toUTable[];
*
* Array of byte/value pairs for lookups for toUnicode conversion.
* The array is partitioned into sections like collation contraction tables.
* Each section contains one word with the number of following words and
* a default value for when the lookup in this section yields no match.
*
* A section is sorted in ascending order of input bytes,
* allowing for fast linear or binary searches.
* The builder may store entries for a contiguous range of byte values
* (compare difference between the first and last one with count),
* which then allows for direct array access.
* The builder should always do this for the initial table section.
*
* Entries may have 0 values, see below.
* No two entries in a section have the same byte values.
*
* Each uint32_t contains an input byte value in bits 31..24 and the
* corresponding lookup value in bits 23..0.
* Interpret the value as follows:
* if(value==0) {
* no match, see below
* } else if(value<0x1f0000) {
* partial match - use value as index to the next toUTable section
* and match the next unit; (value indexes toUTable[value])
* } else {
* if(bit 23 set) {
* roundtrip;
* } else {
* fallback;
* }
* unset value bit 23;
* if(value<=0x2fffff) {
* (value-0x1f0000) is a code point; (BMP: value<=0x1fffff)
* } else {
* bits 17..0 (value&0x3ffff) is an index to
* the result UChars in toUUChars[]; (0 indexes toUUChars[0])
* length of the result=((value>>18)-12); (length=0..19)
* }
* }
*
* The first word in a section contains the number of following words in the
* input byte position (bits 31..24, number=1..0xff).
* The value of the initial word is used when the current byte is not found
* in this section.
* If the value is not 0, then it represents a result as above.
* If the value is 0, then the search has to return a shorter match with an
* earlier default value as the result, or result in "unmappable" even for the
* initial bytes.
* If the value is 0 for the initial toUTable entry, then the initial byte
* does not start any mapping input.
*
*
* UChar toUUChars[];
*
* Contains toUnicode mapping results, stored as sequences of UChars.
* Indexes and lengths stored in the toUTable[].
*
*
* UChar fromUTableUChars[];
* uint32_t fromUTableValues[];
*
* The fromUTable is split into two arrays, but works otherwise much like
* the toUTable. The array is partitioned into sections like collation
* contraction tables and toUTable.
* A row in the table consists of same-index entries in fromUTableUChars[]
* and fromUTableValues[].
*
* Interpret a value as follows:
* if(value==0) {
* no match, see below
* } else if(value<=0xffffff) { (bits 31..24 are 0)
* partial match - use value as index to the next fromUTable section
* and match the next unit; (value indexes fromUTable[value])
* } else {
* if(value==0x80000001) {
* return no mapping, but request for <subchar1>;
* }
* if(bit 31 set) {
* roundtrip;
* } else {
* fallback;
* }
* // bits 30..29 reserved, 0
* length=(value>>24)&0x1f; (bits 28..24)
* if(length==1..3) {
* bits 23..0 contain 1..3 bytes, padded with 00s on the left;
* } else {
* bits 23..0 (value&0xffffff) is an index to
* the result bytes in fromUBytes[]; (0 indexes fromUBytes[0])
* }
* }
*
* The first pair in a section contains the number of following pairs in the
* UChar position (16 bits, number=1..0xffff).
* The value of the initial pair is used when the current UChar is not found
* in this section.
* If the value is not 0, then it represents a result as above.
* If the value is 0, then the search has to return a shorter match with an
* earlier default value as the result, or result in "unmappable" even for the
* initial UChars.
*
* If the from Unicode trie is present, then the from Unicode search tables
* are not used for initial code points.
* In this case, the first entries (index 0) in the tables are not used
* (reserved, set to 0) because a value of 0 is used in trie results
* to indicate no mapping.
*
*
* uint16_t fromUStage12[];
*
* Stages 1 & 2 of a trie that maps an initial code point.
* Indexes in stage 1 are all offset by the length of stage 1 so that the
* same array pointer can be used for both stages.
* If (c>>10)>=(length of stage 1) then c does not start any mapping.
* Same bit distribution as for regular conversion tries.
*
*
* uint16_t fromUStage3[];
* uint32_t fromUStage3b[];
*
* Stage 3 of the trie. The first array simply contains indexes to the second,
* which contains words in the same format as fromUTableValues[].
* Use a stage 3 granularity of 4, which allows for 256k stage 3 entries,
* and 16-bit entries in stage 3 allow for 64k stage 3b entries.
* The stage 3 granularity means that the stage 2 entry needs to be left-shifted.
*
* Two arrays are used because it is expected that more than half of the stage 3
* entries will be zero. The 16-bit index stage 3 array saves space even
* considering storing a total of 6 bytes per non-zero entry in both arrays
* together.
* Using a stage 3 granularity of >1 diminishes the compactability in that stage
* but provides a larger effective addressing space in stage 2.
* All but the final result stage use 16-bit entries to save space.
*
* fromUStage3b[] contains a zero for "no mapping" at its index 0,
* and may contain UCNV_EXT_FROM_U_SUBCHAR1 at index 1 for "<subchar1> SUB mapping"
* (i.e., "no mapping" with preference for <subchar1> rather than <subchar>),
* and all other items are unique non-zero results.
*
* The default value of a fromUTableValues[] section that is referenced
* _directly_ from a fromUStage3b[] item may also be UCNV_EXT_FROM_U_SUBCHAR1,
* but this value must not occur anywhere else in fromUTableValues[]
* because "no mapping" is always a property of a single code point,
* never of multiple.
*
*
* char fromUBytes[];
*
* Contains fromUnicode mapping results, stored as sequences of chars.
* Indexes and lengths stored in the fromUTableValues[].
*/
public final class UConverterDataReader implements ICUBinary.Authenticate {
private final static boolean debug = ICUDebug.enabled("UConverterDataReader");
/*
* public UConverterDataReader(UConverterDataReader r)
{
dataInputStream = new DataInputStream(r.dataInputStream);
unicodeVersion = r.unicodeVersion;
}
*/
/**
 * <p>Protected constructor. Authenticates the ICU binary header of a
 * converter data stream (data format "cnvt") and keeps the rest of the
 * stream for the read methods of this class.</p>
 * @param inputStream ICU converter data file input stream
 * @exception IOException thrown if the header cannot be read or the data
 *            file fails authentication
 * @draft 2.1
 */
protected UConverterDataReader(InputStream inputStream)
        throws IOException
{
    if (debug) {
        System.out.println("Bytes in inputStream " + inputStream.available());
    }

    // Consume the header from the raw stream first; the value returned by
    // readHeader is kept as the Unicode version of the data.
    unicodeVersion = ICUBinary.readHeader(inputStream, DATA_FORMAT_ID, this);
    if (debug) {
        System.out.println("Bytes left in inputStream " +inputStream.available());
    }

    // Everything after the header is read through a DataInputStream.
    dataInputStream = new DataInputStream(inputStream);
    if (debug) {
        System.out.println("Bytes left in dataInputStream " +dataInputStream.available());
    }
}
// protected methods -------------------------------------------------
/**
 * Reads the static converter data from the current stream position into
 * <code>sd</code>. The fields are consumed in this order: structSize (int),
 * name (MAX_CONVERTER_NAME_LENGTH bytes), codepage (int), platform,
 * conversionType, minBytesPerChar, maxBytesPerChar (one byte each),
 * subChar (sd.subChar.length bytes), subCharLen, hasToUnicodeFallback,
 * hasFromUnicodeFallback (one byte each), unicodeMask (one unsigned byte),
 * subChar1 (one byte) and reserved (sd.reserved.length bytes).
 *
 * @param sd destination object; its subChar and reserved arrays must
 *           already be allocated
 * @exception IOException if the stream fails, or (as EOFException) if it
 *            ends before all fields have been read
 */
protected void readStaticData(UConverterStaticData sd) throws IOException
{
    sd.structSize = dataInputStream.readInt();

    // read(byte[]) may legally return fewer bytes than requested (or -1),
    // which would shift every following field. readFully() either fills
    // the whole array or throws EOFException.
    byte[] name = new byte[UConverterConstants.MAX_CONVERTER_NAME_LENGTH];
    dataInputStream.readFully(name);
    // The whole fixed-width field is kept, exactly as a complete read()
    // did before. Decode with an explicit charset so the result does not
    // depend on the platform default encoding.
    sd.name = new String(name, 0, name.length, "US-ASCII");

    sd.codepage = dataInputStream.readInt();
    sd.platform = dataInputStream.readByte();
    sd.conversionType = dataInputStream.readByte();
    sd.minBytesPerChar = dataInputStream.readByte();
    sd.maxBytesPerChar = dataInputStream.readByte();
    dataInputStream.readFully(sd.subChar);
    sd.subCharLen = dataInputStream.readByte();
    sd.hasToUnicodeFallback = dataInputStream.readByte();
    sd.hasFromUnicodeFallback = dataInputStream.readByte();
    // unicodeMask is stored in one byte but kept unsigned in a short
    sd.unicodeMask = (short)dataInputStream.readUnsignedByte();
    sd.subChar1 = dataInputStream.readByte();
    dataInputStream.readFully(sd.reserved);
}
/**
 * Reads the MBCS header from the current stream position into
 * <code>h</code>: h.version.length version bytes followed by seven ints
 * (countStates, countToUFallbacks, offsetToUCodeUnits, offsetFromUTable,
 * offsetFromUBytes, flags, fromUBytesLength).
 *
 * @param h destination header; its version array must already be allocated
 * @exception IOException if the stream fails, or (as EOFException) if it
 *            ends before the header is complete
 */
protected void readMBCSHeader(UConverterSharedData.MBCSHeader h) throws IOException
{
    // readFully() instead of read(): a short read would leave part of the
    // version unread and misalign all of the int fields below.
    dataInputStream.readFully(h.version);
    h.countStates = dataInputStream.readInt();
    h.countToUFallbacks = dataInputStream.readInt();
    h.offsetToUCodeUnits = dataInputStream.readInt();
    h.offsetFromUTable = dataInputStream.readInt();
    h.offsetFromUBytes = dataInputStream.readInt();
    h.flags = dataInputStream.readInt();
    h.fromUBytesLength = dataInputStream.readInt();
}
/**
 * Fills the caller-allocated MBCS tables from the current stream position.
 * The amount of data consumed is determined entirely by the lengths of the
 * arrays passed in, and the tables are read in parameter order.
 *
 * @param stateTableArray        state table rows; one int is read per cell
 * @param toUFallbacksArray      pre-populated fallback entries; two ints
 *                               (offset, codePoint) are read per entry
 * @param unicodeCodeUnitsArray  one 16-bit unit is read per element
 * @param fromUnicodeTableArray  one 16-bit unit is read per element
 * @param fromUnicodeBytesArray  one byte is read per element
 * @exception IOException if the stream fails, or (as EOFException) if it
 *            ends before all tables are filled
 */
protected void readMBCSTable(int[][] stateTableArray, UConverterSharedData.MBCSToUFallback[] toUFallbacksArray, char[] unicodeCodeUnitsArray, char[] fromUnicodeTableArray, byte[] fromUnicodeBytesArray) throws IOException
{
    // toUnicode state table, row by row
    for (int state = 0; state < stateTableArray.length; state++) {
        int[] row = stateTableArray[state];
        for (int cell = 0; cell < row.length; cell++) {
            row[cell] = dataInputStream.readInt();
        }
    }

    // toUnicode fallbacks: (offset, codePoint) pairs
    for (int k = 0; k < toUFallbacksArray.length; k++) {
        UConverterSharedData.MBCSToUFallback fallback = toUFallbacksArray[k];
        fallback.offset = dataInputStream.readInt();
        fallback.codePoint = dataInputStream.readInt();
    }

    // 16-bit tables
    for (int k = 0; k < unicodeCodeUnitsArray.length; k++) {
        unicodeCodeUnitsArray[k] = dataInputStream.readChar();
    }
    for (int k = 0; k < fromUnicodeTableArray.length; k++) {
        fromUnicodeTableArray[k] = dataInputStream.readChar();
    }

    // fromUnicode result bytes; the array is filled completely or
    // EOFException is thrown, just like the byte-at-a-time form
    dataInputStream.readFully(fromUnicodeBytesArray);
}
/**
 * Reads a zero-terminated name from the current stream position, one byte
 * per character. The terminating zero byte is consumed but is not part of
 * the returned string.
 *
 * @return the characters read before the terminator (possibly empty)
 * @exception IOException if the stream fails, or (as EOFException) if it
 *            ends before a zero byte is found
 */
protected String readBaseTableName() throws IOException
{
    StringBuffer buffer = new StringBuffer();
    for (;;) {
        char next = (char)dataInputStream.readByte();
        if (next == 0) {
            break;
        }
        buffer.append(next);
    }
    return buffer.toString();
}
//protected int[] readExtIndexes(int skip) throws IOException
/**
 * Skips <code>skip</code> bytes and then reads the complete extension
 * structure into a newly allocated buffer.
 *
 * <p>The first int read is the number of index words <code>n</code>
 * (stored as index 0). The value at index 31 is used as the total size in
 * bytes of the structure, index words included. The returned buffer holds
 * the n index words followed by the remaining bytes; its position is left
 * just past the index words.</p>
 *
 * @param skip number of bytes to skip before the indexes
 * @return buffer containing the whole extension structure
 * @exception IOException if the bytes cannot be skipped, if the index
 *            count or total size is inconsistent, or if the stream ends
 *            early (EOFException)
 */
protected ByteBuffer readExtIndexes(int skip) throws IOException
{
    // skipBytes() may skip fewer bytes than requested; reading on from the
    // wrong position would silently produce garbage.
    int skipped = dataInputStream.skipBytes(skip);
    if (skipped < skip) {
        throw new IOException("could not skip " + skip + " bytes before the extension indexes");
    }

    final int sizeIndex = 31; // index word that holds the total byte size
    int n = dataInputStream.readInt();
    if (n <= sizeIndex) {
        // without this check indexes[31] below fails with an unchecked exception
        throw new IOException("invalid extension index count: " + n);
    }

    int[] indexes = new int[n];
    indexes[0] = n;
    for (int i = 1; i < n; ++i) {
        indexes[i] = dataInputStream.readInt();
    }

    int totalSize = indexes[sizeIndex];
    if (totalSize < (long)n * 4) {
        // the buffer must at least have room for the index words themselves
        throw new IOException("invalid extension data size: " + totalSize);
    }

    ByteBuffer b = ByteBuffer.allocate(totalSize);
    for (int i = 0; i < n; ++i) {
        b.putInt(indexes[i]);
    }
    // readFully(): a plain read() may return before the buffer is full
    dataInputStream.readFully(b.array(), b.position(), b.remaining());
    return b;
}
/**
 * Reads exactly <code>n</code> bytes from the current stream position.
 *
 * @param n number of bytes to read
 * @return a new array of length n filled from the stream
 * @exception IOException if the stream fails, or (as EOFException) if it
 *            ends before n bytes have been read
 */
protected byte[] readExtTables(int n) throws IOException
{
    byte[] tables = new byte[n];
    // readFully(): read() may return a partially filled array without error
    dataInputStream.readFully(tables);
    return tables;
}
/**
 * Returns the data format version that this reader understands.
 *
 * @return a copy of the format version bytes; changing the returned array
 *         does not affect the version used by isDataVersionAcceptable
 */
public byte[] getDataFormatVersion(){
    // Hand out a copy: the constant is a shared static array.
    return (byte[])DATA_FORMAT_VERSION.clone();
}
/**
 * Checks whether a data file version can be read by this class. Only the
 * first version byte is compared with the supported format version.
 *
 * @param version version bytes taken from the data file header
 * @return true if the first byte equals the supported format version
 */
public boolean isDataVersionAcceptable(byte version[])
{
    final byte candidateMajor = version[0];
    final byte supportedMajor = DATA_FORMAT_VERSION[0];
    return candidateMajor == supportedMajor;
}
/**
 * Returns the version bytes obtained from the data file header when this
 * reader was constructed.
 *
 * @return the stored version array (not a copy)
 */
public byte[] getUnicodeVersion()
{
    return this.unicodeVersion;
}
// private data members -------------------------------------------------
/**
* ICU data file input stream
*/
private DataInputStream dataInputStream;
private byte[] unicodeVersion;
/**
* File format version that this class understands.
* No guarantees are made if an older version is used
* see store.c of gennorm for more information and values
*/
// DATA_FORMAT_ID_ values taken from icu4c isCnvAcceptable (ucnv_bld.c)
private static final byte DATA_FORMAT_ID[] = {(byte)0x63, (byte)0x6e, (byte)0x76, (byte)0x74}; // dataFormat="cnvt"
private static final byte DATA_FORMAT_VERSION[] = {(byte)0x6};
}

View file

@ -0,0 +1,545 @@
/**
*******************************************************************************
* Copyright (C) 2006, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
*******************************************************************************
*/
package com.ibm.icu.impl;
import java.nio.ByteBuffer;
/*
* Defines the UConverterSharedData struct,
* the immutable, shared part of UConverter.
*/
public class UConverterSharedData {
//uint32_t structSize; /* Size of this structure */
public int structSize; /* Size of this structure */
//uint32_t referenceCounter; /* used to count number of clients, 0xffffffff for static SharedData */
public int referenceCounter; /* used to count number of clients, 0xffffffff for static SharedData */
public static final int MAX_VERSION_LENGTH=4;
//agljport:todo const void *dataMemory; /* from udata_openChoice() - for cleanup */
//agljport:todo void *table; /* Unused. This used to be a UConverterTable - Pointer to conversion data - see mbcs below */
//const UConverterStaticData *staticData; /* pointer to the static (non changing) data. */
public UConverterStaticData staticData; /* pointer to the static (non changing) data. */
//UBool sharedDataCached; /* TRUE: shared data is in cache, don't destroy on close() if 0 ref. FALSE: shared data isn't in the cache, do attempt to clean it up if the ref is 0 */
public boolean sharedDataCached; /* TRUE: shared data is in cache, don't destroy on close() if 0 ref. FALSE: shared data isn't in the cache, do attempt to clean it up if the ref is 0 */
/*UBool staticDataOwned; TRUE if static data owned by shared data & should be freed with it, NEVER true for udata() loaded statics. This ignored variable was removed to make space for sharedDataCached. */
//const UConverterImpl *impl; /* vtable-style struct of mostly function pointers */
//public UConverterImpl impl; /* vtable-style struct of mostly function pointers */
/*initial values of some members of the mutable part of object */
//uint32_t toUnicodeStatus;
public long toUnicodeStatus;
/*
* Shared data structures currently come in two flavors:
* - readonly for built-in algorithmic converters
* - allocated for MBCS, with a pointer to an allocated UConverterTable
* which always has a UConverterMBCSTable
*
* To eliminate one allocation, I am making the UConverterMBCSTable
* a member of the shared data. It is the last member so that static
* definitions of UConverterSharedData work as before.
* The table field above also remains to avoid updating all static
* definitions, but is now unused.
*
* markus 2003-nov-07
*/
public UConverterMBCSTable mbcs;
/**
 * Creates shared data with a fresh, empty MBCS table. All other fields
 * keep their Java default values.
 */
public UConverterSharedData()
{
    this.mbcs = new UConverterMBCSTable();
}
/**
 * Creates shared data with a fresh, empty MBCS table and the given values
 * for the remaining fields.
 *
 * @param structSize_       value for structSize
 * @param referenceCounter_ value for referenceCounter
 * @param staticData_       static data object; stored by reference
 * @param sharedDataCached_ value for sharedDataCached
 * @param toUnicodeStatus_  value for toUnicodeStatus
 */
public UConverterSharedData(int structSize_, int referenceCounter_, UConverterStaticData staticData_, boolean sharedDataCached_,/* UConverterImpl impl_,*/ long toUnicodeStatus_)
{
    // the no-argument constructor allocates the MBCS table
    this();
    this.structSize = structSize_;
    this.referenceCounter = referenceCounter_;
    this.staticData = staticData_;
    this.sharedDataCached = sharedDataCached_;
    //this.impl = impl_;
    this.toUnicodeStatus = toUnicodeStatus_;
}
/**
* UConverterImpl contains all the data and functions for a converter type.
* Its function pointers work much like a C++ vtable.
* Many converter types need to define only a subset of the functions;
* when a function pointer is NULL, then a default action will be performed.
*
* Every converter type must implement toUnicode, fromUnicode, and getNextUChar,
* otherwise the converter may crash.
* Every converter type that has variable-length codepage sequences should
* also implement toUnicodeWithOffsets and fromUnicodeWithOffsets for
* correct offset handling.
* All other functions may or may not be implemented - it depends only on
* whether the converter type needs them.
*
* When open() fails, then close() will be called, if present.
*/
//public class UConverterImpl {
//UConverterType type;
//UConverterToUnicode toUnicode;
/* protected void doToUnicode(UConverterToUnicodeArgs args, int[] pErrorCode)
{
}
public final void toUnicode(UConverterToUnicodeArgs args, int[] pErrorCode)
{
doToUnicode(args, pErrorCode);
}
//UConverterFromUnicode fromUnicode;
protected void doFromUnicode(UConverterFromUnicodeArgs args, int[] pErrorCode)
{
}
public final void fromUnicode(UConverterFromUnicodeArgs args, int[] pErrorCode)
{
doFromUnicode(args, pErrorCode);
}
protected int doGetNextUChar(UConverterToUnicodeArgs args, int[] pErrorCode)
{
return 0;
}
//UConverterGetNextUChar getNextUChar;
public final int getNextUChar(UConverterToUnicodeArgs args, int[] pErrorCode)
{
return doGetNextUChar(args, pErrorCode);
}
//public interface UConverterImplLoadable extends UConverterImpl
protected void doLoad(UConverterLoadArgs pArgs, short[] raw, int[] pErrorCode)
{
}
*/
// Unload hook. The implementation in this class is empty.
protected void doUnload()
{
    // nothing to do
}
/*
//public interface UConverterImplOpenable extends UConverterImpl
protected void doOpen(UConverter cnv, String name, String locale, long options, int[] pErrorCode)
{
}
//UConverterOpen open;
public final void open(UConverter cnv, String name, String locale, long options, int[] pErrorCode)
{
doOpen(cnv, name, locale, options, pErrorCode);
}
protected void doClose(UConverter cnv)
{
}
//UConverterClose close;
public final void close(UConverter cnv)
{
doClose(cnv);
}
protected void doReset(UConverter cnv, int choice)
{
}
//typedef void (*UConverterReset) (UConverter *cnv, UConverterResetChoice choice);
//UConverterReset reset;
public final void reset(UConverter cnv, int choice)
{
doReset(cnv, choice);
}
//public interface UConverterImplVariableLength extends UConverterImpl
protected void doToUnicodeWithOffsets(UConverterToUnicodeArgs args, int[] pErrorCode)
{
}
//UConverterToUnicode toUnicodeWithOffsets;
public final void toUnicodeWithOffsets(UConverterToUnicodeArgs args, int[] pErrorCode)
{
doToUnicodeWithOffsets(args, pErrorCode);
}
protected void doFromUnicodeWithOffsets(UConverterFromUnicodeArgs args, int[] pErrorCode)
{
}
//UConverterFromUnicode fromUnicodeWithOffsets;
public final void fromUnicodeWithOffsets(UConverterFromUnicodeArgs args, int[] pErrorCode)
{
doFromUnicodeWithOffsets(args, pErrorCode);
}
//public interface UConverterImplMisc extends UConverterImpl
protected void doGetStarters(UConverter converter, boolean starters[], int[] pErrorCode)
{
}
//UConverterGetStarters getStarters;
public final void getStarters(UConverter converter, boolean starters[], int[] pErrorCode)
{
doGetStarters(converter, starters, pErrorCode);
}
protected String doGetName(UConverter cnv)
{
return "";
}
//UConverterGetName getName;
public final String getName(UConverter cnv)
{
return doGetName(cnv);
}
protected void doWriteSub(UConverterFromUnicodeArgs pArgs, long offsetIndex, int[] pErrorCode)
{
}
//UConverterWriteSub writeSub;
public final void writeSub(UConverterFromUnicodeArgs pArgs, long offsetIndex, int[] pErrorCode)
{
doWriteSub(pArgs, offsetIndex, pErrorCode);
}
protected UConverter doSafeClone(UConverter cnv, byte[] stackBuffer, int[] pBufferSize, int[] status)
{
return new UConverter();
}
//UConverterSafeClone safeClone;
public final UConverter safeClone(UConverter cnv, byte[] stackBuffer, int[] pBufferSize, int[] status)
{
return doSafeClone(cnv, stackBuffer, pBufferSize, status);
}
protected void doGetUnicodeSet(UConverter cnv, UnicodeSet /*USetAdder* / sa, int /*UConverterUnicodeSet* / which, int[] pErrorCode)
{
}
//UConverterGetUnicodeSet getUnicodeSet;
//public final void getUnicodeSet(UConverter cnv, UnicodeSet /*USetAdder* / sa, int /*UConverterUnicodeSet* / which, int[] pErrorCode)
//{
// doGetUnicodeSet(cnv, sa, which, pErrorCode);
//}
//}
static final String DATA_TYPE = "cnv";
private static final int CNV_DATA_BUFFER_SIZE = 25000;
public static final int sizeofUConverterSharedData = 100;
//static UDataMemoryIsAcceptable isCnvAcceptable;
/**
* Load a non-algorithmic converter.
* If pkg==NULL, then this function must be called inside umtx_lock(&cnvCacheMutex).
// UConverterSharedData * load(UConverterLoadArgs *pArgs, UErrorCode *err)
public static final UConverterSharedData load(UConverterLoadArgs pArgs, int[] err)
{
UConverterSharedData mySharedConverterData = null;
if(err == null || ErrorCode.isFailure(err[0])) {
return null;
}
if(pArgs.pkg != null && pArgs.pkg.length() != 0) {
application-provided converters are not currently cached
return UConverterSharedData.createConverterFromFile(pArgs, err);
}
//agljport:fix mySharedConverterData = getSharedConverterData(pArgs.name);
if (mySharedConverterData == null)
{
Not cached, we need to stream it in from file
mySharedConverterData = UConverterSharedData.createConverterFromFile(pArgs, err);
if (ErrorCode.isFailure(err[0]) || (mySharedConverterData == null))
{
return null;
}
else
{
share it with other library clients
//agljport:fix shareConverterData(mySharedConverterData);
}
}
else
{
The data for this converter was already in the cache.
Update the reference counter on the shared data: one more client
mySharedConverterData.referenceCounter++;
}
return mySharedConverterData;
}
Takes an alias name gets an actual converter file name
*goes to disk and opens it.
*allocates the memory and returns a new UConverter object
//static UConverterSharedData *createConverterFromFile(UConverterLoadArgs *pArgs, UErrorCode * err)
public static final UConverterSharedData createConverterFromFile(UConverterLoadArgs pArgs, int[] err)
{
UDataMemory data = null;
UConverterSharedData sharedData = null;
//agljport:todo UTRACE_ENTRY_OC(UTRACE_LOAD);
if (err == null || ErrorCode.isFailure(err[0])) {
//agljport:todo UTRACE_EXIT_STATUS(*err);
return null;
}
//agljport:todo UTRACE_DATA2(UTRACE_OPEN_CLOSE, "load converter %s from package %s", pArgs->name, pArgs->pkg);
//agljport:fix data = udata_openChoice(pArgs.pkgArray, DATA_TYPE.getBytes(), pArgs.name, isCnvAcceptable, null, err);
if(ErrorCode.isFailure(err[0]))
{
//agljport:todo UTRACE_EXIT_STATUS(*err);
return null;
}
sharedData = data_unFlattenClone(pArgs, data, err);
if(ErrorCode.isFailure(err[0]))
{
//agljport:fix udata_close(data);
//agljport:todo UTRACE_EXIT_STATUS(*err);
return null;
}
* TODO Store pkg in a field in the shared data so that delta-only converters
* can load base converters from the same package.
* If the pkg name is longer than the field, then either do not load the converter
* in the first place, or just set the pkg field to "".
return sharedData;
}
*/
UConverterDataReader dataReader = null;
/**
 * Looks up the shared data of an algorithmic converter by name.
 *
 * <p>The name is first normalized with
 * <code>UConverterAlias.io_stripForCompare</code> and then searched for in
 * <code>cnvNameType</code> with a binary search, so that table has to be
 * sorted by name. On a match the entry's type is used as an index into
 * <code>converterData</code>.</p>
 *
 * @param realName converter name to look up
 * @return the matching shared data, or null if the name is not found or
 *         the name table has not been set up or is empty
 */
// static const UConverterSharedData * getAlgorithmicTypeFromName(const char *realName)
public static final UConverterSharedData getAlgorithmicTypeFromName(String realName)
{
    StringBuffer strippedName = new StringBuffer(UConverterConstants.MAX_CONVERTER_NAME_LENGTH);

    /* Lower case and remove ignoreable characters. */
    UConverterAlias.io_stripForCompare(strippedName, realName);

    /* An unset or empty table cannot contain the name; without this guard
     * the search below would fail with an unchecked exception. */
    if (cnvNameType == null || cnvNameType.length == 0) {
        return null;
    }

    /* Build the search key once instead of once per iteration. */
    String key = strippedName.toString();

    /* do a binary search for the alias */
    int start = 0;
    int limit = cnvNameType.length;
    int lastMid = -1; /* no probe made yet */

    for (;;) {
        int mid = (start + limit) >>> 1;
        if (lastMid == mid) { /* Have we moved? */
            break; /* We haven't moved, and it wasn't found. */
        }
        lastMid = mid;

        int result = key.compareTo(cnvNameType[mid].name);
        if (result < 0) {
            limit = mid;
        } else if (result > 0) {
            start = mid;
        } else {
            return converterData[cnvNameType[mid].type];
        }
    }
    return null;
}
/**
 * One toUnicode fallback entry. Fallbacks to Unicode are kept outside the
 * normal state table and code point structures, in a vector of items of
 * this type that is sorted by offset.
 */
public final class MBCSToUFallback {
    /* sort key of the entry */
    int offset;
    /* value stored together with the offset */
    int codePoint;
}
/**
 * The MBCS part of the UConverterTable union (a runtime data structure).
 * It holds all the per-converter data and refers to the loaded mapping
 * tables.
 */
public final class UConverterMBCSTable {
    /* ---- toUnicode ---- */
    short countStates;
    byte dbcsOnlyState;
    boolean stateTableOwned;
    int countToUFallbacks;

    int[][] stateTable;             /* [countStates][256] */
    int[][] swapLFNLStateTable;     /* [countStates][256], for swaplfnl */
    char[] unicodeCodeUnits;        /* [countUnicodeResults] */
    MBCSToUFallback[] toUFallbacks; /* [countToUFallbacks] */

    /* ---- fromUnicode ---- */
    char[] fromUnicodeTable;
    byte[] fromUnicodeBytes;
    byte[] swapLFNLFromUnicodeBytes; /* for swaplfnl */
    int fromUBytesLength;
    short outputType;
    short unicodeMask;

    /* converter name for swaplfnl */
    String swapLFNLName;

    /* ---- extension data ---- */
    UConverterSharedData baseSharedData;
    //int[] extIndexes;
    ByteBuffer extIndexes; // create int[] view etc. as needed

    /** Creates an empty table; every field keeps its Java default value. */
    UConverterMBCSTable()
    {
    }

    /**
     * Creates a shallow copy of <code>t</code>: primitive fields are copied
     * by value, while arrays, the swaplfnl name, the base shared data and
     * the extension buffer are shared with <code>t</code>, not cloned.
     */
    UConverterMBCSTable(UConverterMBCSTable t)
    {
        /* toUnicode */
        this.countStates = t.countStates;
        this.dbcsOnlyState = t.dbcsOnlyState;
        this.stateTableOwned = t.stateTableOwned;
        this.countToUFallbacks = t.countToUFallbacks;
        this.stateTable = t.stateTable;
        this.swapLFNLStateTable = t.swapLFNLStateTable;
        this.unicodeCodeUnits = t.unicodeCodeUnits;
        this.toUFallbacks = t.toUFallbacks;

        /* fromUnicode */
        this.fromUnicodeTable = t.fromUnicodeTable;
        this.fromUnicodeBytes = t.fromUnicodeBytes;
        this.swapLFNLFromUnicodeBytes = t.swapLFNLFromUnicodeBytes;
        this.fromUBytesLength = t.fromUBytesLength;
        this.outputType = t.outputType;
        this.unicodeMask = t.unicodeMask;

        /* swaplfnl name and extension data */
        this.swapLFNLName = t.swapLFNLName;
        this.baseSharedData = t.baseSharedData;
        this.extIndexes = t.extIndexes;
    }
}
/**
 * MBCS data header: a version array followed by the counts, offsets,
 * flags and length values of the MBCS data.
 */
public final class MBCSHeader {
    byte[] version; /* [MAX_VERSION_LENGTH] (U_MAX_VERSION_LENGTH in C) */
    int countStates;
    int countToUFallbacks;
    int offsetToUCodeUnits;
    int offsetFromUTable;
    int offsetFromUBytes;
    int flags;
    int fromUBytesLength;

    /** Creates a header with a zero-filled version array of MAX_VERSION_LENGTH bytes. */
    public MBCSHeader()
    {
        this.version = new byte[MAX_VERSION_LENGTH];
    }
}
/**
 * Integer constants naming the basic types of converters. The values are
 * consecutive, starting at 0 for SBCS; UNSUPPORTED_CONVERTER is -1.
 * @draft ICU 3.6
 */
public static final class UConverterType {
    public static final int UNSUPPORTED_CONVERTER = -1;

    public static final int SBCS = 0;
    public static final int DBCS = 1;
    public static final int MBCS = 2;
    public static final int LATIN_1 = 3;
    public static final int UTF8 = 4;
    public static final int UTF16_BigEndian = 5;
    public static final int UTF16_LittleEndian = 6;
    public static final int UTF32_BigEndian = 7;
    public static final int UTF32_LittleEndian = 8;
    public static final int EBCDIC_STATEFUL = 9;
    public static final int ISO_2022 = 10;

    /* LMBCS variants occupy the range LMBCS_1..LMBCS_LAST */
    public static final int LMBCS_1 = 11;
    public static final int LMBCS_2 = 12;
    public static final int LMBCS_3 = 13;
    public static final int LMBCS_4 = 14;
    public static final int LMBCS_5 = 15;
    public static final int LMBCS_6 = 16;
    public static final int LMBCS_8 = 17;
    public static final int LMBCS_11 = 18;
    public static final int LMBCS_16 = 19;
    public static final int LMBCS_17 = 20;
    public static final int LMBCS_18 = 21;
    public static final int LMBCS_19 = 22;
    public static final int LMBCS_LAST = LMBCS_19; //22

    public static final int HZ = 23;
    public static final int SCSU = 24;
    public static final int ISCII = 25;
    public static final int US_ASCII = 26;
    public static final int UTF7 = 27;
    public static final int BOCU1 = 28;
    public static final int UTF16 = 29;
    public static final int UTF32 = 30;
    public static final int CESU8 = 31;
    public static final int IMAP_MAILBOX = 32;
    public static final int MAC_ARABIC = 33;
    public static final int MAC_HEBREW = 34;

    /* Number of converter types for which we have conversion routines. */
    public static final int NUMBER_OF_SUPPORTED_CONVERTER_TYPES = MAC_HEBREW + 1; //35
}
/**
 * Integer constants naming the platform that a converter ID refers to.
 * The use of platform/CCSID is not recommended. See openCCSID().
 * @draft ICU 3.6
 */
public static final class UConverterPlatform {
    /* no known platform */
    public static final int UNKNOWN = -1;
    /* IBM platform */
    public static final int IBM = 0;
}
static UConverterSharedData _MBCSData = null, /*_Latin1Data = null,*/ /*_UTF8Data = null,*/ /*_UTF16BEData = null,*/ /*_UTF16LEData = null,*/ /*_UTF32BEData = null,*/ /*_UTF32LEData = null,*/ /*_ISO2022Data = null,*/ _LMBCSData1 = null,_LMBCSData2 = null, _LMBCSData3 = null, _LMBCSData4 = null, _LMBCSData5 = null, _LMBCSData6 = null, _LMBCSData8 = null,_LMBCSData11 = null,_LMBCSData16 = null,_LMBCSData17 = null,_LMBCSData18 = null,_LMBCSData19 = null, _HZData = null, _SCSUData = null, /*_ISCIIData = null,*/ /*_ASCIIData = null,*/ _UTF7Data = null, _Bocu1Data = null, /*_UTF16Data = null, _UTF32Data = null,*/ _CESU8Data = null, _IMAPData = null;
static UConverterSharedData[] converterData;
/* Pairs a converter name with an integer converter type. */
static class cnvNameTypeClass {
    String name;
    int type;

    /* Stores both values unchanged. */
    cnvNameTypeClass(String name_, int type_)
    {
        this.name = name_;
        this.type = type_;
    }
}
static cnvNameTypeClass cnvNameType[];
static final String DATA_TYPE = "cnv";
static final int CNV_DATA_BUFFER_SIZE = 25000;
static final int SIZE_OF_UCONVERTER_SHARED_DATA = 100;
static final int MAXIMUM_UCS2 = 0x0000FFFF;
static final int MAXIMUM_UTF = 0x0010FFFF;
static final int MAXIMUM_UCS4 = 0x7FFFFFFF;
static final int HALF_SHIFT = 10;
static final int HALF_BASE = 0x0010000;
static final int HALF_MASK = 0x3FF;
static final int SURROGATE_HIGH_START = 0xD800;
static final int SURROGATE_HIGH_END = 0xDBFF;
static final int SURROGATE_LOW_START = 0xDC00;
static final int SURROGATE_LOW_END = 0xDFFF;
/* -SURROGATE_LOW_START + HALF_BASE */
static final int SURROGATE_LOW_BASE = 9216;
}

View file

@ -0,0 +1,61 @@
/**
*******************************************************************************
* Copyright (C) 2006, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
*******************************************************************************
*/
package com.ibm.icu.impl;
/*
 * Static (non-changing) description of a converter. The "+offset: size"
 * notes give each field's position and width in the 100-byte structure.
 */
public final class UConverterStaticData { /* +offset: size */
    public int structSize;              /* +0: 4 Size of this structure */
    public String name;                 /* +4: 60 internal name of the converter- invariant chars */
    public int codepage;                /* +64: 4 codepage # (now IBM-$codepage) */
    public byte platform;               /* +68: 1 platform of the converter (only IBM now) */
    public byte conversionType;         /* +69: 1 conversion type */
    public byte minBytesPerChar;        /* +70: 1 Minimum # bytes per char in this codepage */
    public byte maxBytesPerChar;        /* +71: 1 Maximum # bytes output per UChar in this codepage */
    public byte[] subChar;              /* +72: 4 [note: 4 and 8 byte boundary]; length UCNV_MAX_SUBCHAR_LEN */
    public byte subCharLen;             /* +76: 1 */
    public byte hasToUnicodeFallback;   /* +77: 1 boolean flag stored as a byte */
    public byte hasFromUnicodeFallback; /* +78: 1 */
    public short unicodeMask;           /* +79: 1 bit 0: has supplementary bit 1: has single surrogates */
    public byte subChar1;               /* +80: 1 single-byte substitution character for IBM MBCS (0 if none) */
    public byte[] reserved;             /* +81: 19 to round out the structure */
    /* total size: 100 */

    public static final int sizeofUConverterStaticData = 100;

    /**
     * Creates an instance whose subChar and reserved arrays are allocated
     * and zero-filled; all other fields keep their Java default values.
     */
    public UConverterStaticData()
    {
        this.subChar = new byte[UConverterConstants.MAX_SUBCHAR_LEN];
        this.reserved = new byte[19];
    }

    /**
     * Creates an instance from explicit field values. The two array
     * arguments are copied into newly allocated arrays of fixed length
     * (MAX_SUBCHAR_LEN and 19); input beyond that length is dropped and
     * shorter input leaves the rest zero.
     */
    public UConverterStaticData(int structSize_, String name_, int codepage_, byte platform_, byte conversionType_, byte minBytesPerChar_, byte maxBytesPerChar_, byte[] subChar_, byte subCharLen_, byte hasToUnicodeFallback_, byte hasFromUnicodeFallback_, short unicodeMask_, byte subChar1_, byte[] reserved_)
    {
        this.structSize = structSize_;
        this.name = name_;
        this.codepage = codepage_;
        this.platform = platform_;
        this.conversionType = conversionType_;
        this.minBytesPerChar = minBytesPerChar_;
        this.maxBytesPerChar = maxBytesPerChar_;

        this.subChar = new byte[UConverterConstants.MAX_SUBCHAR_LEN];
        int subCharCount = Math.min(this.subChar.length, subChar_.length);
        System.arraycopy(subChar_, 0, this.subChar, 0, subCharCount);

        this.subCharLen = subCharLen_;
        this.hasToUnicodeFallback = hasToUnicodeFallback_;
        this.hasFromUnicodeFallback = hasFromUnicodeFallback_;
        this.unicodeMask = unicodeMask_;
        this.subChar1 = subChar1_;

        this.reserved = new byte[19];
        int reservedCount = Math.min(this.reserved.length, reserved_.length);
        System.arraycopy(reserved_, 0, this.reserved, 0, reservedCount);
    }
}