mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 22:15:31 +00:00
ICU-5018 charset conversion support
X-SVN-Rev: 20172
This commit is contained in:
parent
d2841a5885
commit
e33252c102
21 changed files with 9915 additions and 2 deletions
|
@ -177,7 +177,7 @@
|
|||
|
||||
<!-- core does not build richedit or tests -->
|
||||
<target name="core" depends="init,coreData,icudata" description="build core classes and data">
|
||||
<javac includes="com/ibm/icu/util/**/*.java,com/ibm/icu/text/**/*.java,com/ibm/icu/math/**/*.java,com/ibm/icu/impl/**/*.java,com/ibm/icu/lang/*.java"
|
||||
<javac includes="com/ibm/icu/util/**/*.java,com/ibm/icu/text/**/*.java,com/ibm/icu/math/**/*.java,com/ibm/icu/impl/**/*.java,com/ibm/icu/lang/*.java,com/ibm/icu/charset/**/*.java"
|
||||
excludes="**/CVS/**/*"
|
||||
srcdir="${src.dir}"
|
||||
destdir="${build.dir}"
|
||||
|
@ -431,9 +431,12 @@
|
|||
<target name="jarRelease" depends="jar,jarSrc,jarDocs"/>
|
||||
|
||||
<target name="jar" depends="core,indices" description="build full 'icu4j.jar' jar file">
|
||||
<copy todir="${build.dir}/META-INF">
|
||||
<fileset dir="${src.dir}/META-INF" includes="**/*"/>
|
||||
</copy>
|
||||
<jar jarfile="${jar.file}"
|
||||
compress="true"
|
||||
includes="com/ibm/icu/util/**/*,com/ibm/icu/text/**/*,com/ibm/icu/math/**/*,com/ibm/icu/impl/**/*,com/ibm/icu/lang/**/*"
|
||||
includes="com/ibm/icu/util/**/*,com/ibm/icu/text/**/*,com/ibm/icu/math/**/*,com/ibm/icu/impl/**/*,com/ibm/icu/lang/**/*,META-INF/services/*"
|
||||
basedir="${build.dir}"
|
||||
manifest="${icu4j.manifest}"/>
|
||||
</target>
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
# Copyright (C) 2006, International Business Machines Corporation and others. All Rights Reserved.
|
||||
# icu4j converters
|
||||
com.ibm.icu.charset.CharsetProviderICU
|
158
icu4j/src/com/ibm/icu/charset/CharsetCallback.java
Normal file
158
icu4j/src/com/ibm/icu/charset/CharsetCallback.java
Normal file
|
@ -0,0 +1,158 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2006, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.icu.charset;
|
||||
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.CharBuffer;
|
||||
import java.nio.IntBuffer;
|
||||
import java.nio.charset.CoderResult;
|
||||
|
||||
|
||||
/*public*/ class CharsetCallback {
|
||||
/**
|
||||
* FROM_U, TO_U context options for sub callback
|
||||
* @draft ICU 3.6
|
||||
*/
|
||||
/*public*/ static final String SUB_STOP_ON_ILLEGAL = "i";
|
||||
|
||||
/**
|
||||
* FROM_U, TO_U context options for skip callback
|
||||
* @draft ICU 3.6
|
||||
*/
|
||||
/*public*/ static final String SKIP_STOP_ON_ILLEGAL = "i";
|
||||
|
||||
/**
|
||||
* FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to ICU (%UXXXX)
|
||||
* @draft ICU 3.6
|
||||
*/
|
||||
/*public*/ static final String ESCAPE_ICU = null;
|
||||
/**
|
||||
* FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to JAVA (\\uXXXX)
|
||||
* @draft ICU 3.6
|
||||
*/
|
||||
/*public*/ static final String ESCAPE_JAVA = "J";
|
||||
/**
|
||||
* FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to C (\\uXXXX \\UXXXXXXXX)
|
||||
* TO_U_CALLBACK_ESCAPE option to escape the character value accoding to C (\\xXXXX)
|
||||
* @draft ICU 3.6
|
||||
*/
|
||||
/*public*/ static final String ESCAPE_C = "C";
|
||||
/**
|
||||
* FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to XML Decimal escape \htmlonly(&#DDDD;)\endhtmlonly
|
||||
* TO_U_CALLBACK_ESCAPE context option to escape the character value accoding to XML Decimal escape \htmlonly(&#DDDD;)\endhtmlonly
|
||||
* @draft ICU 3.6
|
||||
*/
|
||||
/*public*/ static final String ESCAPE_XML_DEC = "D";
|
||||
/**
|
||||
* FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to XML Hex escape \htmlonly(&#xXXXX;)\endhtmlonly
|
||||
* TO_U_CALLBACK_ESCAPE context option to escape the character value accoding to XML Hex escape \htmlonly(&#xXXXX;)\endhtmlonly
|
||||
* @draft ICU 3.6
|
||||
*/
|
||||
/*public*/ static final String ESCAPE_XML_HEX = "X";
|
||||
/**
|
||||
* FROM_U_CALLBACK_ESCAPE context option to escape teh code unit according to Unicode (U+XXXXX)
|
||||
* @draft ICU 3.6
|
||||
*/
|
||||
/*public*/ static final String ESCAPE_UNICODE = "U";
|
||||
|
||||
public interface Decoder {
|
||||
public CoderResult call(CharsetDecoderICU decoder, Object context,
|
||||
ByteBuffer source, CharBuffer target, IntBuffer offsets,
|
||||
char[] buffer, int length, CoderResult cr);
|
||||
}
|
||||
|
||||
public interface Encoder {
|
||||
public CoderResult call(CharsetEncoderICU encoder, Object context,
|
||||
CharBuffer source, ByteBuffer target, IntBuffer offsets,
|
||||
char[] buffer, int length, int cp, CoderResult cr);
|
||||
}
|
||||
public static final Encoder FROM_U_CALLBACK_SKIP = new Encoder() {
|
||||
public CoderResult call(CharsetEncoderICU encoder, Object context,
|
||||
CharBuffer source, ByteBuffer target, IntBuffer offsets,
|
||||
char[] buffer, int length, int cp, CoderResult cr){
|
||||
if(context==null){
|
||||
return CoderResult.UNDERFLOW;
|
||||
}else if(((String)context).equals(SUB_STOP_ON_ILLEGAL)){
|
||||
if(!cr.isUnmappable()){
|
||||
return cr;
|
||||
}else{
|
||||
return CoderResult.UNDERFLOW;
|
||||
}
|
||||
}
|
||||
return cr;
|
||||
}
|
||||
};
|
||||
public static final Decoder TO_U_CALLBACK_SKIP = new Decoder() {
|
||||
public CoderResult call(CharsetDecoderICU decoder, Object context,
|
||||
ByteBuffer source, CharBuffer target, IntBuffer offsets,
|
||||
char[] buffer, int length, CoderResult cr){
|
||||
if(context==null){
|
||||
return CoderResult.UNDERFLOW;
|
||||
}else if(((String)context).equals(SUB_STOP_ON_ILLEGAL)){
|
||||
if(!cr.isUnmappable()){
|
||||
return cr;
|
||||
}else{
|
||||
return CoderResult.UNDERFLOW;
|
||||
}
|
||||
}
|
||||
return cr;
|
||||
}
|
||||
};
|
||||
|
||||
public static final Encoder FROM_U_CALLBACK_SUBSTITUTE = new Encoder(){
|
||||
public CoderResult call(CharsetEncoderICU encoder, Object context,
|
||||
CharBuffer source, ByteBuffer target, IntBuffer offsets,
|
||||
char[] buffer, int length, int cp, CoderResult cr){
|
||||
if(context==null){
|
||||
return encoder.cbFromUWriteSub(encoder, source, target, offsets);
|
||||
}else if(((String)context).equals(SUB_STOP_ON_ILLEGAL)){
|
||||
if(!cr.isUnmappable()){
|
||||
return cr;
|
||||
}else{
|
||||
return encoder.cbFromUWriteSub(encoder, source, target, offsets);
|
||||
}
|
||||
}
|
||||
return cr;
|
||||
}
|
||||
};
|
||||
|
||||
public static final Decoder TO_U_CALLBACK_SUBSTITUTE = new Decoder() {
|
||||
public CoderResult call(CharsetDecoderICU decoder, Object context,
|
||||
ByteBuffer source, CharBuffer target, IntBuffer offsets,
|
||||
char[] buffer, int length, CoderResult cr){
|
||||
|
||||
if(context==null){
|
||||
return decoder.cbToUWriteSub(decoder, source, target, offsets);
|
||||
}else if(((String)context).equals(SUB_STOP_ON_ILLEGAL)){
|
||||
if(!cr.isUnmappable()){
|
||||
return cr;
|
||||
}else{
|
||||
return decoder.cbToUWriteSub(decoder, source, target, offsets);
|
||||
}
|
||||
}
|
||||
return cr;
|
||||
}
|
||||
};
|
||||
|
||||
public static final Encoder FROM_U_CALLBACK_STOP = new Encoder() {
|
||||
public CoderResult call(CharsetEncoderICU encoder, Object context,
|
||||
CharBuffer source, ByteBuffer target, IntBuffer offsets,
|
||||
char[] buffer, int length, int cp, CoderResult cr){
|
||||
return cr;
|
||||
}
|
||||
};
|
||||
public static final Decoder TO_U_CALLBACK_STOP = new Decoder() {
|
||||
public CoderResult call(CharsetDecoderICU decoder, Object context,
|
||||
ByteBuffer source, CharBuffer target, IntBuffer offsets,
|
||||
char[] buffer, int length, CoderResult cr){
|
||||
return cr;
|
||||
}
|
||||
};
|
||||
}
|
639
icu4j/src/com/ibm/icu/charset/CharsetDecoderICU.java
Normal file
639
icu4j/src/com/ibm/icu/charset/CharsetDecoderICU.java
Normal file
|
@ -0,0 +1,639 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2006, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.icu.charset;
|
||||
|
||||
import java.nio.BufferOverflowException;
|
||||
import java.nio.CharBuffer;
|
||||
import java.nio.IntBuffer;
|
||||
import java.nio.charset.CharsetDecoder;
|
||||
import java.nio.charset.CoderResult;
|
||||
import java.nio.charset.CodingErrorAction;
|
||||
import java.nio.charset.MalformedInputException;
|
||||
import java.nio.ByteBuffer;
|
||||
|
||||
import com.ibm.icu.impl.Assert;
|
||||
|
||||
public abstract class CharsetDecoderICU extends CharsetDecoder{
|
||||
|
||||
protected int toUnicodeStatus;
|
||||
protected byte[] toUBytesArray = new byte[128];
|
||||
protected int toUBytesBegin = 0;
|
||||
protected int toULength;
|
||||
protected char[] charErrorBufferArray = new char[128];
|
||||
protected int charErrorBufferLength;
|
||||
protected int charErrorBufferBegin;
|
||||
protected char[] invalidCharBuffer = new char[128];
|
||||
protected int invalidCharLength;
|
||||
|
||||
/* store previous UChars/chars to continue partial matches */
|
||||
protected byte[] preToUArray;
|
||||
protected int preToUBegin;
|
||||
protected int preToULength; /* negative: replay */
|
||||
protected int preToUFirstLength; /* length of first character */
|
||||
|
||||
protected Object toUContext = null;
|
||||
private CharsetCallback.Decoder onUnmappableInput = CharsetCallback.TO_U_CALLBACK_STOP;
|
||||
private CharsetCallback.Decoder onMalformedInput = CharsetCallback.TO_U_CALLBACK_STOP;
|
||||
protected CharsetCallback.Decoder toCharErrorBehaviour= new CharsetCallback.Decoder(){
|
||||
public CoderResult call(CharsetDecoderICU decoder, Object context,
|
||||
ByteBuffer source, CharBuffer target, IntBuffer offsets,
|
||||
char[] buffer, int length, CoderResult cr) {
|
||||
if(cr.isUnmappable()){
|
||||
return onUnmappableInput.call(decoder, context,
|
||||
source, target, offsets,
|
||||
buffer, length, cr);
|
||||
}else if(cr.isMalformed()){
|
||||
return onMalformedInput.call(decoder, context,
|
||||
source, target, offsets,
|
||||
buffer, length, cr);
|
||||
}
|
||||
return CharsetCallback.TO_U_CALLBACK_STOP.call(decoder, context,
|
||||
source, target, offsets,
|
||||
buffer, length, cr);
|
||||
}
|
||||
};
|
||||
|
||||
/**
 * Creates a decoder for the given ICU charset. The superclass receives the
 * reciprocal of cs.maxCharsPerByte as the average chars-per-byte value and
 * cs.maxCharsPerByte as the maximum.
 * @param cs the charset this decoder belongs to
 */
protected CharsetDecoderICU(CharsetICU cs) {
    super(cs, 1.0f / (float) cs.maxCharsPerByte, cs.maxCharsPerByte);
}
|
||||
|
||||
|
||||
/**
|
||||
* Sets the action to be taken if an illegal sequence is encountered
|
||||
* @param newAction action to be taken
|
||||
* @exception IllegalArgumentException
|
||||
* @draft ICU 3.6
|
||||
*/
|
||||
protected final void implOnMalformedInput(CodingErrorAction newAction) {
|
||||
onMalformedInput = getCallback(newAction);
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the action to be taken if an illegal sequence is encountered
|
||||
* @param newAction action to be taken
|
||||
* @exception IllegalArgumentException
|
||||
* @draft ICU 3.6
|
||||
*/
|
||||
protected final void implOnUnmappableCharacter(CodingErrorAction newAction) {
|
||||
onUnmappableInput = getCallback(newAction);
|
||||
}
|
||||
private static CharsetCallback.Decoder getCallback(CodingErrorAction action){
|
||||
if(action==CodingErrorAction.REPLACE){
|
||||
return CharsetCallback.TO_U_CALLBACK_SUBSTITUTE;
|
||||
}else if(action==CodingErrorAction.IGNORE){
|
||||
return CharsetCallback.TO_U_CALLBACK_SKIP;
|
||||
}else if(action==CodingErrorAction.REPORT){
|
||||
return CharsetCallback.TO_U_CALLBACK_STOP;
|
||||
}
|
||||
return CharsetCallback.TO_U_CALLBACK_STOP;
|
||||
}
|
||||
/**
|
||||
* Flushes any characters saved in the converter's internal buffer and
|
||||
* resets the converter.
|
||||
* @param out action to be taken
|
||||
* @return result of flushing action and completes the decoding all input.
|
||||
* Returns CoderResult.UNDERFLOW if the action succeeds.
|
||||
* @draft ICU 3.6
|
||||
*/
|
||||
protected final CoderResult implFlush(CharBuffer out) {
|
||||
return CoderResult.UNDERFLOW;
|
||||
}
|
||||
|
||||
/**
|
||||
* Resets the to Unicode mode of converter
|
||||
* @draft ICU 3.6
|
||||
*/
|
||||
protected void implReset() {
|
||||
toUnicodeStatus = 0 ;
|
||||
toULength = 0;
|
||||
charErrorBufferLength = 0;
|
||||
charErrorBufferBegin = 0;
|
||||
|
||||
/* store previous UChars/chars to continue partial matches */
|
||||
preToUBegin = 0;
|
||||
preToULength = 0; /* negative: replay */
|
||||
preToUFirstLength = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Decodes one or more bytes. The default behaviour of the converter
|
||||
* is stop and report if an error in input stream is encountered.
|
||||
* To set different behaviour use @see CharsetDecoder.onMalformedInput()
|
||||
* This method allows a buffer by buffer conversion of a data stream.
|
||||
* The state of the conversion is saved between calls to convert.
|
||||
* Among other things, this means multibyte input sequences can be
|
||||
* split between calls. If a call to convert results in an Error, the
|
||||
* conversion may be continued by calling convert again with suitably
|
||||
* modified parameters.All conversions should be finished with a call to
|
||||
* the flush method.
|
||||
* @param in buffer to decode
|
||||
* @param out buffer to populate with decoded result
|
||||
* @return result of decoding action. Returns CoderResult.UNDERFLOW if the decoding
|
||||
* action succeeds or more input is needed for completing the decoding action.
|
||||
* @draft ICU 3.6
|
||||
*/
|
||||
protected CoderResult decodeLoop(ByteBuffer in,CharBuffer out){
|
||||
if(!in.hasRemaining()){
|
||||
return CoderResult.UNDERFLOW;
|
||||
}
|
||||
in.position(in.position()+toUCountPending());
|
||||
/* do the conversion */
|
||||
CoderResult ret = decode(in, out, null, false);
|
||||
|
||||
setSourcePosition(in);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* Performs the charset-specific conversion step; toUnicodeWithCallback() calls it for as long as the previous result was an underflow.
|
||||
* @param in source bytes to convert
|
||||
* @param offsets offsets buffer handed through from decode(); null when the caller wants no offsets
|
||||
* @param out buffer that receives the converted chars
|
||||
|
||||
* @return the result of this conversion step; anything other than underflow is handled by the caller's overflow/callback logic
|
||||
*/
|
||||
protected abstract CoderResult decodeLoop(ByteBuffer in, CharBuffer out, IntBuffer offsets);
|
||||
|
||||
/**
|
||||
* Implements the ICU semantic for decode operation
|
||||
* @param source
|
||||
* @param target
|
||||
* @param offsets
|
||||
* @param flush
|
||||
* @return
|
||||
* @throws MalformedInputException
|
||||
*/
|
||||
protected final CoderResult decode(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {
|
||||
|
||||
/* check parameters */
|
||||
if(target==null || source==null) {
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
/*
|
||||
* Make sure that the buffer sizes do not exceed the number range for
|
||||
* int32_t because some functions use the size (in units or bytes)
|
||||
* rather than comparing pointers, and because offsets are int32_t values.
|
||||
*
|
||||
* size_t is guaranteed to be unsigned and large enough for the job.
|
||||
*
|
||||
* Return with an error instead of adjusting the limits because we would
|
||||
* not be able to maintain the semantics that either the source must be
|
||||
* consumed or the target filled (unless an error occurs).
|
||||
* An adjustment would be sourceLimit=t+0x7fffffff; for example.
|
||||
*/
|
||||
/*agljport:fix
|
||||
if(
|
||||
((size_t)(sourceLimit-s)>(size_t)0x7fffffff && sourceLimit>s) ||
|
||||
((size_t)(targetLimit-t)>(size_t)0x3fffffff && targetLimit>t)
|
||||
) {
|
||||
*err=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
*/
|
||||
|
||||
/* flush the target overflow buffer */
|
||||
if(charErrorBufferLength>0) {
|
||||
char[] overflow = null;
|
||||
int i, length;
|
||||
|
||||
overflow=charErrorBufferArray;
|
||||
length=charErrorBufferLength;
|
||||
i=0;
|
||||
do {
|
||||
if(target.remaining()<0) {
|
||||
/* the overflow buffer contains too much, keep the rest */
|
||||
int j=0;
|
||||
|
||||
do {
|
||||
overflow[j++]=overflow[i++];
|
||||
} while(i<length);
|
||||
|
||||
charErrorBufferLength=(byte)j;
|
||||
return CoderResult.OVERFLOW;
|
||||
}
|
||||
|
||||
/* copy the overflow contents to the target */
|
||||
target.put(overflow[i++]);
|
||||
if(offsets!=null) {
|
||||
offsets.put(-1); /* no source index available for old output */
|
||||
}
|
||||
} while(i<length);
|
||||
|
||||
/* the overflow buffer is completely copied to the target */
|
||||
charErrorBufferLength=0;
|
||||
}
|
||||
|
||||
if(!flush && source.remaining()==0 && preToULength>=0) {
|
||||
/* the overflow buffer is emptied and there is no new input: we are done */
|
||||
return CoderResult.UNDERFLOW;
|
||||
}
|
||||
|
||||
/*
|
||||
* Do not simply return with a buffer overflow error if
|
||||
* !flush && t==targetLimit
|
||||
* because it is possible that the source will not generate any output.
|
||||
* For example, the skip callback may be called;
|
||||
* it does not output anything.
|
||||
*/
|
||||
|
||||
return toUnicodeWithCallback(source, target, offsets, flush);
|
||||
}
|
||||
|
||||
/* maximum number of indexed bytes */
|
||||
private static final int EXT_MAX_BYTES = 0x1f;
|
||||
private void updateOffsets(IntBuffer offsets,int length, int sourceIndex, int errorInputLength) {
|
||||
int limit;
|
||||
int delta, offset;
|
||||
|
||||
if(sourceIndex>=0) {
|
||||
/*
|
||||
* adjust each offset by adding the previous sourceIndex
|
||||
* minus the length of the input sequence that caused an
|
||||
* error, if any
|
||||
*/
|
||||
delta=sourceIndex-errorInputLength;
|
||||
} else {
|
||||
/*
|
||||
* set each offset to -1 because this conversion function
|
||||
* does not handle offsets
|
||||
*/
|
||||
delta=-1;
|
||||
}
|
||||
limit=offsets.position()+length;
|
||||
if(delta==0) {
|
||||
/* most common case, nothing to do */
|
||||
} else if(delta>0) {
|
||||
/* add the delta to each offset (but not if the offset is <0) */
|
||||
while(offsets.position()<limit) {
|
||||
offset=offsets.get(offsets.position());
|
||||
if(offset>=0) {
|
||||
offsets.put(offset+delta);
|
||||
}
|
||||
//FIXME: ++offsets;
|
||||
}
|
||||
} else /* delta<0 */ {
|
||||
/*
|
||||
* set each offset to -1 because this conversion function
|
||||
* does not handle offsets
|
||||
* or the error input sequence started in a previous buffer
|
||||
*/
|
||||
while(offsets.position()<limit) {
|
||||
offsets.put(-1);
|
||||
}
|
||||
}
|
||||
}
|
||||
protected final CoderResult toUnicodeWithCallback(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush){
|
||||
|
||||
int sourceIndex;
|
||||
int errorInputLength;
|
||||
boolean converterSawEndOfInput, calledCallback;
|
||||
int t=target.position();
|
||||
int s=source.position();
|
||||
/* variables for m:n conversion */
|
||||
ByteBuffer replayArray = ByteBuffer.allocate(EXT_MAX_BYTES);
|
||||
int replayArrayIndex = 0;
|
||||
|
||||
ByteBuffer realSource=null;
|
||||
boolean realFlush=false;
|
||||
int realSourceIndex=0;
|
||||
|
||||
|
||||
CoderResult cr = CoderResult.UNDERFLOW;
|
||||
|
||||
/* get the converter implementation function */
|
||||
sourceIndex=0;
|
||||
|
||||
if(preToULength>=0) {
|
||||
/* normal mode */
|
||||
} else {
|
||||
/*
|
||||
* Previous m:n conversion stored source units from a partial match
|
||||
* and failed to consume all of them.
|
||||
* We need to "replay" them from a temporary buffer and convert them first.
|
||||
*/
|
||||
realSource=source;
|
||||
realFlush=flush;
|
||||
realSourceIndex=sourceIndex;
|
||||
//UConverterUtility.uprv_memcpy(replayArray, replayBegin, preToUArray, preToUBegin, -preToULength);
|
||||
replayArray.put(preToUArray,0, -preToULength);
|
||||
source=replayArray;
|
||||
source.position(0);
|
||||
source.limit(replayArrayIndex-preToULength);
|
||||
flush=false;
|
||||
sourceIndex=-1;
|
||||
preToULength=0;
|
||||
}
|
||||
|
||||
/*
|
||||
* loop for conversion and error handling
|
||||
*
|
||||
* loop {
|
||||
* convert
|
||||
* loop {
|
||||
* update offsets
|
||||
* handle end of input
|
||||
* handle errors/call callback
|
||||
* }
|
||||
* }
|
||||
*/
|
||||
for(;;) {
|
||||
if(cr.isUnderflow()) {
|
||||
/* convert */
|
||||
cr = decodeLoop(source, target, offsets);
|
||||
|
||||
/*
|
||||
* set a flag for whether the converter
|
||||
* successfully processed the end of the input
|
||||
*
|
||||
* need not check cnv->preToULength==0 because a replay (<0) will cause
|
||||
* s<sourceLimit before converterSawEndOfInput is checked
|
||||
*/
|
||||
converterSawEndOfInput= (cr.isUnderflow() && flush && source.remaining()==0 && toULength==0);
|
||||
} else {
|
||||
/* handle error from getNextUChar() */
|
||||
converterSawEndOfInput=false;
|
||||
}
|
||||
|
||||
/* no callback called yet for this iteration */
|
||||
calledCallback=false;
|
||||
|
||||
/* no sourceIndex adjustment for conversion, only for callback output */
|
||||
errorInputLength=0;
|
||||
|
||||
/*
|
||||
* loop for offsets and error handling
|
||||
*
|
||||
* iterates at most 3 times:
|
||||
* 1. to clean up after the conversion function
|
||||
* 2. after the callback
|
||||
* 3. after the callback again if there was truncated input
|
||||
*/
|
||||
for(;;) {
|
||||
/* update offsets if we write any */
|
||||
if(offsets!=null) {
|
||||
|
||||
int length=(target.position()-t);
|
||||
if(length>0) {
|
||||
updateOffsets(offsets, length, sourceIndex, errorInputLength);
|
||||
|
||||
|
||||
/*
|
||||
* if a converter handles offsets and updates the offsets
|
||||
* pointer at the end, then pArgs->offset should not change
|
||||
* here;
|
||||
* however, some converters do not handle offsets at all
|
||||
* (sourceIndex<0) or may not update the offsets pointer
|
||||
*/
|
||||
//TODO: pArgs->offsets=offsets+=length;
|
||||
}
|
||||
|
||||
if(sourceIndex>=0) {
|
||||
sourceIndex+=(source.position()-s);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if(preToULength<0) {
|
||||
/*
|
||||
* switch the source to new replay units (cannot occur while replaying)
|
||||
* after offset handling and before end-of-input and callback handling
|
||||
*/
|
||||
if(realSource==null)
|
||||
{
|
||||
realSource=source;
|
||||
realFlush=flush;
|
||||
realSourceIndex=sourceIndex;
|
||||
|
||||
//UConverterUtility.uprv_memcpy(replayArray, replayBegin, preToUArray, preToUBegin, -preToULength);
|
||||
replayArray.put(preToUArray,0, -preToULength);
|
||||
|
||||
source=replayArray;
|
||||
source.limit(replayArrayIndex-preToULength);
|
||||
flush=false;
|
||||
if((sourceIndex+=preToULength)<0) {
|
||||
sourceIndex=-1;
|
||||
}
|
||||
|
||||
preToULength=0;
|
||||
} else {
|
||||
/* see implementation note before _fromUnicodeWithCallback() */
|
||||
//agljport:todo U_ASSERT(realSource==NULL);
|
||||
Assert.assrt(realSource==null);
|
||||
}
|
||||
}
|
||||
|
||||
/* update pointers */
|
||||
s=source.position();
|
||||
t=target.position();
|
||||
|
||||
if(cr.isUnderflow()) {
|
||||
if(s<source.limit())
|
||||
{
|
||||
/*
|
||||
* continue with the conversion loop while there is still input left
|
||||
* (continue converting by breaking out of only the inner loop)
|
||||
*/
|
||||
break;
|
||||
} else if(realSource!=null) {
|
||||
/* switch back from replaying to the real source and continue */
|
||||
source = realSource;
|
||||
flush=realFlush;
|
||||
sourceIndex=realSourceIndex;
|
||||
realSource=null;
|
||||
break;
|
||||
} else if(flush && toULength>0) {
|
||||
/*
|
||||
* the entire input stream is consumed
|
||||
* and there is a partial, truncated input sequence left
|
||||
*/
|
||||
|
||||
/* inject an error and continue with callback handling */
|
||||
cr = CoderResult.malformedForLength(toULength);
|
||||
calledCallback=false; /* new error condition */
|
||||
} else {
|
||||
/* input consumed */
|
||||
if(flush) {
|
||||
/*
|
||||
* return to the conversion loop once more if the flush
|
||||
* flag is set and the conversion function has not
|
||||
* successfully processed the end of the input yet
|
||||
*
|
||||
* (continue converting by breaking out of only the inner loop)
|
||||
*/
|
||||
if(!converterSawEndOfInput) {
|
||||
break;
|
||||
}
|
||||
|
||||
/* reset the converter without calling the callback function */
|
||||
implReset();
|
||||
}
|
||||
|
||||
/* done successfully */
|
||||
return cr;
|
||||
}
|
||||
}
|
||||
|
||||
/* U_FAILURE(*err) */
|
||||
{
|
||||
|
||||
if( calledCallback || cr.isOverflow() ||
|
||||
(cr.isMalformed() && cr.isUnmappable())
|
||||
) {
|
||||
/*
|
||||
* the callback did not or cannot resolve the error:
|
||||
* set output pointers and return
|
||||
*
|
||||
* the check for buffer overflow is redundant but it is
|
||||
* a high-runner case and hopefully documents the intent
|
||||
* well
|
||||
*
|
||||
* if we were replaying, then the replay buffer must be
|
||||
* copied back into the UConverter
|
||||
* and the real arguments must be restored
|
||||
*/
|
||||
if(realSource!=null) {
|
||||
int length;
|
||||
Assert.assrt(preToULength==0);
|
||||
length=(int)(source.limit()-source.position());
|
||||
if(length>0) {
|
||||
//UConverterUtility.uprv_memcpy(preToUArray, preToUBegin, pArgs.sourceArray, pArgs.sourceBegin, length);
|
||||
source.get(preToUArray, preToUBegin, length);
|
||||
preToULength=(byte)-length;
|
||||
}
|
||||
|
||||
source=realSource;
|
||||
flush=realFlush;
|
||||
}
|
||||
return cr;
|
||||
}
|
||||
}
|
||||
|
||||
/* copy toUBytes[] to invalidCharBuffer[] */
|
||||
errorInputLength=invalidCharLength=toULength;
|
||||
if(errorInputLength>0) {
|
||||
copy(toUBytesArray, 0, invalidCharBuffer, 0, errorInputLength);
|
||||
}
|
||||
|
||||
/* set the converter state to deal with the next character */
|
||||
toULength=0;
|
||||
|
||||
/* call the callback function */
|
||||
cr = toCharErrorBehaviour.call(this, toUContext, source, target, offsets, invalidCharBuffer, errorInputLength, cr);
|
||||
/*
|
||||
* loop back to the offset handling
|
||||
*
|
||||
* this flag will indicate after offset handling
|
||||
* that a callback was called;
|
||||
* if the callback did not resolve the error, then we return
|
||||
*/
|
||||
calledCallback=true;
|
||||
}
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Releases the system resources by cleanly closing ICU converter opened
|
||||
* @draft ICU 3.6
|
||||
*/
|
||||
protected void finalize()throws Throwable{
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of chars held in the converter's internal state
|
||||
* because more input is needed for completing the conversion. This function is
|
||||
* useful for mapping semantics of ICU's converter interface to those of iconv,
|
||||
* and this information is not needed for normal conversion.
|
||||
* @param cnv The converter in which the input is held as internal state
|
||||
* @param status ICU error code in/out parameter.
|
||||
* Must fulfill U_SUCCESS before the function call.
|
||||
* @return The number of chars in the state. -1 if an error is encountered.
|
||||
* @draft ICU 3.4
|
||||
*/
|
||||
/*public*/ int toUCountPending() {
|
||||
if(preToULength > 0){
|
||||
return preToULength ;
|
||||
}else if(preToULength < 0){
|
||||
return -preToULength;
|
||||
}else if(toULength > 0){
|
||||
return toULength;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
private final void setSourcePosition(ByteBuffer source){
|
||||
// ok was there input held in the previous invocation of decodeLoop
|
||||
// that resulted in output in this invocation?
|
||||
source.position(source.position() - toUCountPending());
|
||||
|
||||
}
|
||||
private void copy(byte[] src, int srcOffset, char[] dst, int dstOffset, int length) {
|
||||
for(int i=srcOffset; i<length; i++){
|
||||
dst[dstOffset++]=(char)src[srcOffset++];
|
||||
}
|
||||
}
|
||||
protected static final CoderResult toUWriteUChars( CharsetDecoderICU cnv,
|
||||
char[] ucharsArray, int ucharsBegin, int length,
|
||||
CharBuffer target, IntBuffer offsets, int sourceIndex) {
|
||||
|
||||
CoderResult cr = CoderResult.UNDERFLOW;
|
||||
|
||||
/* write UChars */
|
||||
if(offsets==null) {
|
||||
try{
|
||||
while(length>0) {
|
||||
target.put(ucharsArray[ucharsBegin++]);
|
||||
--length;
|
||||
}
|
||||
}catch(BufferOverflowException ex){
|
||||
cr = CoderResult.OVERFLOW;
|
||||
}
|
||||
} else {
|
||||
/* output with offsets */
|
||||
try{
|
||||
while(length>0) {
|
||||
target.put(ucharsArray[ucharsBegin++]);
|
||||
offsets.put(sourceIndex);
|
||||
--length;
|
||||
}
|
||||
}catch(BufferOverflowException ex){
|
||||
cr = CoderResult.OVERFLOW;
|
||||
}
|
||||
}
|
||||
/* write overflow */
|
||||
if(length>0) {
|
||||
cnv.charErrorBufferLength= length;
|
||||
do {
|
||||
cnv.charErrorBufferArray[cnv.charErrorBufferBegin++]=ucharsArray[ucharsBegin++];
|
||||
} while(--length>0);
|
||||
}
|
||||
return cr;
|
||||
}
|
||||
/**
|
||||
* Sub classes to override this method if required
|
||||
* @param decoder
|
||||
* @param source
|
||||
* @param target
|
||||
* @param offsets
|
||||
* @return
|
||||
*/
|
||||
protected CoderResult cbToUWriteSub(CharsetDecoderICU decoder,
|
||||
ByteBuffer source, CharBuffer target,
|
||||
IntBuffer offsets){
|
||||
String sub = decoder.replacement();
|
||||
CharsetICU cs = (CharsetICU) decoder.charset();
|
||||
if (decoder.invalidCharLength==1 && cs.subChar1 != 0x00) {
|
||||
char[] subArr = new char[] { 0x1a };
|
||||
return CharsetDecoderICU.toUWriteUChars(decoder, subArr, 0, sub
|
||||
.length(), target, offsets, source.position());
|
||||
} else {
|
||||
return CharsetDecoderICU.toUWriteUChars(decoder, sub.toCharArray(),
|
||||
0, sub.length(), target, offsets, source.position());
|
||||
|
||||
}
|
||||
}
|
||||
}
|
631
icu4j/src/com/ibm/icu/charset/CharsetEncoderICU.java
Normal file
631
icu4j/src/com/ibm/icu/charset/CharsetEncoderICU.java
Normal file
|
@ -0,0 +1,631 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2006, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.icu.charset;
|
||||
|
||||
import java.nio.BufferOverflowException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.CharBuffer;
|
||||
import java.nio.IntBuffer;
|
||||
import java.nio.charset.CharsetEncoder;
|
||||
import java.nio.charset.CoderResult;
|
||||
import java.nio.charset.CodingErrorAction;
|
||||
import java.nio.charset.MalformedInputException;
|
||||
|
||||
import com.ibm.icu.impl.Assert;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
|
||||
|
||||
public abstract class CharsetEncoderICU extends CharsetEncoder {
|
||||
|
||||
protected byte[] errorBuffer = new byte[30];
|
||||
protected int errorBufferLength = 0;
|
||||
|
||||
/** these are for encodeLoopICU */
|
||||
protected int fromUnicodeStatus;
|
||||
protected int fromUChar32;
|
||||
protected boolean useSubChar1;
|
||||
|
||||
/* store previous UChars/chars to continue partial matches */
|
||||
protected int preFromUFirstCP; /* >=0: partial match */
|
||||
protected char[] preFromUArray;
|
||||
protected int preFromUBegin;
|
||||
protected int preFromULength; /* negative: replay */
|
||||
|
||||
protected char[] invalidUCharBuffer = new char[2];
|
||||
protected int invalidUCharLength;
|
||||
protected Object fromUContext;
|
||||
private CharsetCallback.Encoder onUnmappableInput = CharsetCallback.FROM_U_CALLBACK_STOP;
|
||||
private CharsetCallback.Encoder onMalformedInput = CharsetCallback.FROM_U_CALLBACK_STOP;
|
||||
protected CharsetCallback.Encoder fromCharErrorBehaviour = new CharsetCallback.Encoder(){
|
||||
public CoderResult call(CharsetEncoderICU encoder, Object context,
|
||||
CharBuffer source, ByteBuffer target, IntBuffer offsets,
|
||||
char[] buffer, int length, int cp, CoderResult cr) {
|
||||
if(cr.isUnmappable()){
|
||||
return onUnmappableInput.call(encoder, context,
|
||||
source, target, offsets,
|
||||
buffer, length, cp, cr);
|
||||
}else if(cr.isMalformed()){
|
||||
return onMalformedInput.call(encoder, context,
|
||||
source, target, offsets,
|
||||
buffer, length, cp, cr);
|
||||
}
|
||||
return CharsetCallback.FROM_U_CALLBACK_STOP.call(encoder, context,
|
||||
source, target, offsets,
|
||||
buffer, length, cp, cr);
|
||||
|
||||
}
|
||||
};
|
||||
|
||||
/**
 * Constructs a new encoder for the given charset.
 * @param cs the charset for which the encoder is created
 * @param replacement the substitution bytes
 * @draft ICU 3.6
 */
protected CharsetEncoderICU(CharsetICU cs, byte[] replacement) {
    // The average is a float parameter of CharsetEncoder: compute it in
    // floating point so it is not truncated (integer division yields 0,
    // which CharsetEncoder rejects, when min+max is 1).
    super(cs, (cs.minBytesPerChar+cs.maxBytesPerChar)/2.0f, cs.maxBytesPerChar, replacement);
}
|
||||
|
||||
/**
|
||||
* Sets the action to be taken if an illegal sequence is encountered
|
||||
* @param newAction action to be taken
|
||||
* @exception IllegalArgumentException
|
||||
* @draft ICU 3.6
|
||||
*/
|
||||
protected void implOnMalformedInput(CodingErrorAction newAction) {
|
||||
onMalformedInput = getCallback(newAction);
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the action to be taken if an unmappable character is encountered
|
||||
* @param newAction action to be taken
|
||||
* @exception IllegalArgumentException
|
||||
* @draft ICU 3.6
|
||||
*/
|
||||
protected void implOnUnmappableCharacter(CodingErrorAction newAction) {
|
||||
onUnmappableInput = getCallback(newAction);
|
||||
}
|
||||
|
||||
private static CharsetCallback.Encoder getCallback(CodingErrorAction action){
|
||||
if(action==CodingErrorAction.REPLACE){
|
||||
return CharsetCallback.FROM_U_CALLBACK_SUBSTITUTE;
|
||||
}else if(action==CodingErrorAction.IGNORE){
|
||||
return CharsetCallback.FROM_U_CALLBACK_SKIP;
|
||||
}else if(action==CodingErrorAction.REPORT){
|
||||
return CharsetCallback.FROM_U_CALLBACK_STOP;
|
||||
}
|
||||
return CharsetCallback.FROM_U_CALLBACK_STOP;
|
||||
}
|
||||
|
||||
/**
|
||||
* Flushes any characters saved in the converter's internal buffer and
|
||||
* resets the converter.
|
||||
* @param out the output byte buffer
|
||||
* @return result of the flush action performed after all input has been encoded.
|
||||
* Returns CoderResult.UNDERFLOW if the action succeeds.
|
||||
* @draft ICU 3.6
|
||||
*/
|
||||
protected CoderResult implFlush(ByteBuffer out) {
|
||||
return CoderResult.UNDERFLOW;
|
||||
}
|
||||
|
||||
/**
|
||||
* Resets the from Unicode mode of converter
|
||||
* @draft ICU 3.6
|
||||
*/
|
||||
protected void implReset() {
|
||||
errorBufferLength=0;
|
||||
fromUChar32=0;
|
||||
fromUnicodeStatus = 0;
|
||||
preFromUBegin = 0;
|
||||
preFromUFirstCP = 0;
|
||||
preFromULength = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Encodes one or more chars. The default behaviour of the
|
||||
* converter is stop and report if an error in input stream is encountered.
|
||||
* To set different behaviour use @see CharsetEncoder.onMalformedInput()
|
||||
* @param in buffer to decode
|
||||
* @param out buffer to populate with decoded result
|
||||
* @return result of decoding action. Returns CoderResult.UNDERFLOW if the decoding
|
||||
* action succeeds or more input is needed for completing the decoding action.
|
||||
* @draft ICU 3.6
|
||||
*/
|
||||
protected CoderResult encodeLoop(CharBuffer in, ByteBuffer out) {
|
||||
if(!in.hasRemaining()){
|
||||
return CoderResult.UNDERFLOW;
|
||||
}
|
||||
in.position(in.position()+fromUCountPending());
|
||||
/* do the conversion */
|
||||
CoderResult ret = encode(in, out, null, false);
|
||||
setSourcePosition(in);
|
||||
return ret;
|
||||
}
|
||||
/**
|
||||
* Implements ICU semantics of buffer management
|
||||
* @param source
|
||||
* @param target
|
||||
* @param offsets
|
||||
* @return
|
||||
* @throws MalformedInputException
|
||||
*/
|
||||
protected abstract CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets);
|
||||
|
||||
/**
|
||||
* Implements ICU semantics for encoding the buffer
|
||||
* @param in
|
||||
* @param out
|
||||
* @return
|
||||
*/
|
||||
protected final CoderResult encode(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush){
|
||||
|
||||
|
||||
/* check parameters */
|
||||
if(target==null || source==null) {
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
|
||||
/*
|
||||
* Make sure that the buffer sizes do not exceed the number range for
|
||||
* int32_t because some functions use the size (in units or bytes)
|
||||
* rather than comparing pointers, and because offsets are int32_t values.
|
||||
*
|
||||
* size_t is guaranteed to be unsigned and large enough for the job.
|
||||
*
|
||||
* Return with an error instead of adjusting the limits because we would
|
||||
* not be able to maintain the semantics that either the source must be
|
||||
* consumed or the target filled (unless an error occurs).
|
||||
* An adjustment would be targetLimit=t+0x7fffffff; for example.
|
||||
*/
|
||||
//Ram: not required
|
||||
//if( ((long)(sourceLimit-sArrayIndex)>(long)0x3fffffff && sourceLimit>sArrayIndex) || ((long)(targetLimit-tArrayIndex)>(long)0x7fffffff && targetLimit>tArrayIndex)) {
|
||||
// err[0]=ErrorCode.U_ILLEGAL_ARGUMENT_ERROR;
|
||||
// return;
|
||||
//}
|
||||
|
||||
/* flush the target overflow buffer */
|
||||
if(errorBufferLength>0) {
|
||||
byte[] overflowArray;
|
||||
int i, length;
|
||||
|
||||
overflowArray=errorBuffer;
|
||||
length=errorBufferLength;
|
||||
i=0;
|
||||
do {
|
||||
if(target.remaining()==0) {
|
||||
/* the overflow buffer contains too much, keep the rest */
|
||||
int j=0;
|
||||
|
||||
do {
|
||||
overflowArray[j++]=overflowArray[i++];
|
||||
} while(i<length);
|
||||
|
||||
errorBufferLength=(byte)j;
|
||||
return CoderResult.OVERFLOW;
|
||||
}
|
||||
|
||||
/* copy the overflow contents to the target */
|
||||
target.put(overflowArray[i++]);
|
||||
if(offsets!=null) {
|
||||
offsets.put(-1); /* no source index available for old output */
|
||||
}
|
||||
} while(i<length);
|
||||
|
||||
/* the overflow buffer is completely copied to the target */
|
||||
errorBufferLength=0;
|
||||
}
|
||||
|
||||
if(!flush && source.remaining()==0 && preFromULength>=0) {
|
||||
/* the overflow buffer is emptied and there is no new input: we are done */
|
||||
return CoderResult.UNDERFLOW;
|
||||
}
|
||||
|
||||
/*
|
||||
* Do not simply return with a buffer overflow error if
|
||||
* !flush && t==targetLimit
|
||||
* because it is possible that the source will not generate any output.
|
||||
* For example, the skip callback may be called;
|
||||
* it does not output anything.
|
||||
*/
|
||||
|
||||
return fromUnicodeWithCallback(source, target, offsets, flush);
|
||||
|
||||
}
|
||||
/* maximum number of indexed UChars */
|
||||
public static final int EXT_MAX_UCHARS = 19;
|
||||
|
||||
protected final CoderResult fromUnicodeWithCallback(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush){
|
||||
int sBufferIndex;
|
||||
int sourceIndex;
|
||||
int errorInputLength;
|
||||
boolean converterSawEndOfInput, calledCallback;
|
||||
|
||||
|
||||
/* variables for m:n conversion */
|
||||
CharBuffer replayArray = CharBuffer.allocate(EXT_MAX_UCHARS);
|
||||
int replayArrayIndex=0;
|
||||
CharBuffer realSource;
|
||||
boolean realFlush;
|
||||
|
||||
CoderResult cr = CoderResult.UNDERFLOW;
|
||||
|
||||
/* get the converter implementation function */
|
||||
sourceIndex=0;
|
||||
|
||||
if(preFromULength>=0) {
|
||||
/* normal mode */
|
||||
realSource=null;
|
||||
realFlush=false;
|
||||
} else {
|
||||
/*
|
||||
* Previous m:n conversion stored source units from a partial match
|
||||
* and failed to consume all of them.
|
||||
* We need to "replay" them from a temporary buffer and convert them first.
|
||||
*/
|
||||
realSource=source;
|
||||
realFlush = flush;
|
||||
|
||||
//UConverterUtility.uprv_memcpy(replayArray, replayArrayIndex, preFromUArray, 0, -preFromULength*UMachine.U_SIZEOF_UCHAR);
|
||||
replayArray.put(preFromUArray,0, -preFromULength);
|
||||
source.position(replayArrayIndex);
|
||||
source.limit(replayArrayIndex-preFromULength); //preFromULength is negative, see declaration
|
||||
source=replayArray;
|
||||
flush=false;
|
||||
|
||||
preFromULength=0;
|
||||
}
|
||||
|
||||
/*
|
||||
* loop for conversion and error handling
|
||||
*
|
||||
* loop {
|
||||
* convert
|
||||
* loop {
|
||||
* update offsets
|
||||
* handle end of input
|
||||
* handle errors/call callback
|
||||
* }
|
||||
* }
|
||||
*/
|
||||
for(;;) {
|
||||
/* convert */
|
||||
cr = encodeLoop(source, target, offsets);
|
||||
/*
|
||||
* set a flag for whether the converter
|
||||
* successfully processed the end of the input
|
||||
*
|
||||
* need not check cnv.preFromULength==0 because a replay (<0) will cause
|
||||
* s<sourceLimit before converterSawEndOfInput is checked
|
||||
*/
|
||||
converterSawEndOfInput= (boolean)(cr.isUnderflow() && flush && source.remaining()==0 && fromUChar32==0);
|
||||
|
||||
/* no callback called yet for this iteration */
|
||||
calledCallback=false;
|
||||
|
||||
/* no sourceIndex adjustment for conversion, only for callback output */
|
||||
errorInputLength=0;
|
||||
|
||||
/*
|
||||
* loop for offsets and error handling
|
||||
*
|
||||
* iterates at most 3 times:
|
||||
* 1. to clean up after the conversion function
|
||||
* 2. after the callback
|
||||
* 3. after the callback again if there was truncated input
|
||||
*/
|
||||
for(;;) {
|
||||
/* update offsets if we write any */
|
||||
if(offsets!=null) {
|
||||
int length = target.remaining();
|
||||
if(length>0) {
|
||||
|
||||
/*
|
||||
* if a converter handles offsets and updates the offsets
|
||||
* pointer at the end, then offset should not change
|
||||
* here;
|
||||
* however, some converters do not handle offsets at all
|
||||
* (sourceIndex<0) or may not update the offsets pointer
|
||||
*/
|
||||
offsets.position(offsets.position()+length);
|
||||
}
|
||||
|
||||
if(sourceIndex>=0) {
|
||||
sourceIndex+=(int)(source.position());
|
||||
}
|
||||
}
|
||||
|
||||
if(preFromULength<0) {
|
||||
/*
|
||||
* switch the source to new replay units (cannot occur while replaying)
|
||||
* after offset handling and before end-of-input and callback handling
|
||||
*/
|
||||
if(realSource==null) {
|
||||
realSource=source;
|
||||
realFlush=flush;
|
||||
|
||||
//UConverterUtility.uprv_memcpy(replayArray, replayArrayIndex, preFromUArray, 0, -preFromULength*UMachine.U_SIZEOF_UCHAR);
|
||||
replayArray.put(preFromUArray,0, -preFromULength);
|
||||
|
||||
source=replayArray;
|
||||
source.position(replayArrayIndex);
|
||||
source.limit(replayArrayIndex-preFromULength);
|
||||
flush=false;
|
||||
if((sourceIndex+=preFromULength)<0) {
|
||||
sourceIndex=-1;
|
||||
}
|
||||
|
||||
preFromULength=0;
|
||||
} else {
|
||||
/* see implementation note before _fromUnicodeWithCallback() */
|
||||
//agljport:todo U_ASSERT(realSource==NULL);
|
||||
Assert.assrt(realSource==null);
|
||||
}
|
||||
}
|
||||
|
||||
/* update pointers */
|
||||
sBufferIndex=source.position();
|
||||
if(cr.isUnderflow()) {
|
||||
if(sBufferIndex<source.limit()) {
|
||||
/*
|
||||
* continue with the conversion loop while there is still input left
|
||||
* (continue converting by breaking out of only the inner loop)
|
||||
*/
|
||||
break;
|
||||
} else if(realSource!=null) {
|
||||
/* switch back from replaying to the real source and continue */
|
||||
source=realSource;
|
||||
flush=realFlush;
|
||||
sourceIndex=source.position();
|
||||
realSource=null;
|
||||
break;
|
||||
} else if(flush && fromUChar32!=0) {
|
||||
/*
|
||||
* the entire input stream is consumed
|
||||
* and there is a partial, truncated input sequence left
|
||||
*/
|
||||
|
||||
/* inject an error and continue with callback handling */
|
||||
//err[0]=ErrorCode.U_TRUNCATED_CHAR_FOUND;
|
||||
cr = CoderResult.malformedForLength(1);
|
||||
calledCallback=false; /* new error condition */
|
||||
} else {
|
||||
/* input consumed */
|
||||
if(flush) {
|
||||
/*
|
||||
* return to the conversion loop once more if the flush
|
||||
* flag is set and the conversion function has not
|
||||
* successfully processed the end of the input yet
|
||||
*
|
||||
* (continue converting by breaking out of only the inner loop)
|
||||
*/
|
||||
if(!converterSawEndOfInput) {
|
||||
break;
|
||||
}
|
||||
|
||||
/* reset the converter without calling the callback function */
|
||||
implReset();
|
||||
}
|
||||
|
||||
/* done successfully */
|
||||
return cr;
|
||||
}
|
||||
}
|
||||
|
||||
/*U_FAILURE(*err) */
|
||||
{
|
||||
|
||||
if( calledCallback || cr.isOverflow() ||
|
||||
(cr.isMalformed() && cr.isUnmappable())
|
||||
){
|
||||
/*
|
||||
* the callback did not or cannot resolve the error:
|
||||
* set output pointers and return
|
||||
*
|
||||
* the check for buffer overflow is redundant but it is
|
||||
* a high-runner case and hopefully documents the intent
|
||||
* well
|
||||
*
|
||||
* if we were replaying, then the replay buffer must be
|
||||
* copied back into the UConverter
|
||||
* and the real arguments must be restored
|
||||
*/
|
||||
if(realSource!=null) {
|
||||
int length;
|
||||
|
||||
//agljport:todo U_ASSERT(cnv.preFromULength==0);
|
||||
|
||||
length=source.remaining();
|
||||
if(length>0) {
|
||||
//UConverterUtility.uprv_memcpy(preFromUArray, 0, sourceArray, pArgs.sourceBegin, length*UMachine.U_SIZEOF_UCHAR);
|
||||
source.get(preFromUArray, 0, length );
|
||||
preFromULength=(byte)-length;
|
||||
}
|
||||
source=realSource;
|
||||
flush=realFlush;
|
||||
}
|
||||
return cr;
|
||||
}
|
||||
}
|
||||
|
||||
/* callback handling */
|
||||
{
|
||||
/* get and write the code point */
|
||||
errorInputLength = UTF16.append(invalidUCharBuffer, 0, fromUChar32);
|
||||
invalidUCharLength = errorInputLength;
|
||||
|
||||
/* set the converter state to deal with the next character */
|
||||
fromUChar32=0;
|
||||
|
||||
/* call the callback function */
|
||||
cr = fromCharErrorBehaviour.call(this, fromUContext, source, target, offsets, invalidUCharBuffer, invalidUCharLength, fromUChar32, cr);
|
||||
}
|
||||
|
||||
/*
|
||||
* loop back to the offset handling
|
||||
*
|
||||
* this flag will indicate after offset handling
|
||||
* that a callback was called;
|
||||
* if the callback did not resolve the error, then we return
|
||||
*/
|
||||
calledCallback=true;
|
||||
}
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Ascertains if a given Unicode code point (32bit value for handling surrogates)
|
||||
* can be converted to the target encoding. If the caller wants to test if a
|
||||
* surrogate pair can be converted to target encoding then the
|
||||
* responsibility of assembling the int value lies with the caller.
|
||||
* For assembling a code point the caller can use UTF16 class of ICU4J and do something like:
|
||||
* <pre>
|
||||
* while(i<mySource.length){
|
||||
* if(UTF16.isLeadSurrogate(mySource[i])&& i+1< mySource.length){
|
||||
* if(UTF16.isTrailSurrogate(mySource[i+1])){
|
||||
* int temp = UTF16.charAt(mySource,i,i+1,0);
|
||||
* if(!((CharsetEncoderICU) myConv).canEncode(temp)){
|
||||
* passed=false;
|
||||
* }
|
||||
* i++;
|
||||
* i++;
|
||||
* }
|
||||
* }
|
||||
* }
|
||||
* </pre>
|
||||
* or
|
||||
* <pre>
|
||||
* String src = new String(mySource);
|
||||
* int i=0,codepoint;
|
||||
* boolean passed = true;
|
||||
* while(i<src.length()){
|
||||
* codepoint = UTF16.charAt(src,i);
|
||||
* i+= (codepoint>0xffff)? 2:1;
|
||||
* if(!((CharsetEncoderICU) myConv).canEncode(codepoint)){
|
||||
* passed = false;
|
||||
* }
|
||||
* }
|
||||
* </pre>
|
||||
*
|
||||
* @param codepoint Unicode code point as int value
|
||||
* @return true if a character can be converted
|
||||
* @draft ICU 3.6
|
||||
*
|
||||
*/
|
||||
public boolean canEncode(int codepoint) {
|
||||
return true;
|
||||
}
|
||||
|
||||
public boolean isLegalReplacement(byte[] repl){
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Finalizer hook; the body is empty, so this implementation releases nothing.
|
||||
* @exception Throwable exception thrown by super class' finalize method
|
||||
* @draft ICU 3.6
|
||||
*/
|
||||
protected void finalize() throws Throwable {
|
||||
}
|
||||
|
||||
protected static final CoderResult fromUWriteBytes(CharsetEncoderICU cnv,
|
||||
byte[] bytesArray, int bytesBegin, int bytesLength,
|
||||
ByteBuffer out, IntBuffer offsets, int sourceIndex){
|
||||
|
||||
//write bytes
|
||||
int obl = bytesLength;
|
||||
CoderResult cr = CoderResult.UNDERFLOW;
|
||||
int bytesLimit = bytesBegin + bytesLength;
|
||||
try{
|
||||
for (;bytesBegin< bytesLimit;){
|
||||
out.put(bytesArray[bytesBegin]);
|
||||
bytesBegin++;
|
||||
}
|
||||
// success
|
||||
bytesLength=0;
|
||||
}catch( BufferOverflowException ex){
|
||||
cr = CoderResult.OVERFLOW;
|
||||
}
|
||||
|
||||
|
||||
if(offsets!=null) {
|
||||
while(obl>bytesLength) {
|
||||
offsets.put(sourceIndex);
|
||||
--obl;
|
||||
}
|
||||
}
|
||||
//write overflow
|
||||
cnv.errorBufferLength = bytesLimit - bytesBegin;
|
||||
if(cnv.errorBufferLength >0) {
|
||||
if(cnv!=null) {
|
||||
int index = 0;
|
||||
while(bytesBegin<bytesLimit) {
|
||||
cnv.errorBuffer[index++]=bytesArray[bytesBegin++];
|
||||
}
|
||||
}
|
||||
cr = CoderResult.OVERFLOW;
|
||||
}
|
||||
return cr;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of chars held in the converter's internal state
|
||||
* because more input is needed for completing the conversion. This function is
|
||||
* useful for mapping semantics of ICU's converter interface to those of iconv,
|
||||
* and this information is not needed for normal conversion.
|
||||
* @param cnv The converter in which the input is held as internal state
|
||||
* @param status ICU error code in/out parameter.
|
||||
* Must fulfill U_SUCCESS before the function call.
|
||||
* @return The number of chars in the state. -1 if an error is encountered.
|
||||
* @draft ICU 3.4
|
||||
*/
|
||||
/*public*/ int fromUCountPending(){
|
||||
if(preFromULength > 0){
|
||||
return UTF16.getCharCount(preFromUFirstCP)+preFromULength ;
|
||||
}else if(preFromULength < 0){
|
||||
return -preFromULength ;
|
||||
}else if(fromUChar32 > 0){
|
||||
return 1;
|
||||
}else if(preFromUFirstCP >0){
|
||||
return UTF16.getCharCount(preFromUFirstCP);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
/**
|
||||
*
|
||||
* @param source
|
||||
*/
|
||||
private final void setSourcePosition(CharBuffer source){
|
||||
|
||||
// ok was there input held in the previous invocation of decodeLoop
|
||||
// that resulted in output in this invocation?
|
||||
source.position(source.position() - fromUCountPending());
|
||||
}
|
||||
/**
|
||||
* Write the codepage substitution character.
|
||||
* Subclasses to override this method.
|
||||
* For stateful converters, it is typically necessary to handle this
|
||||
* specificially for the converter in order to properly maintain the state.
|
||||
*/
|
||||
protected CoderResult cbFromUWriteSub (CharsetEncoderICU encoder,
|
||||
CharBuffer source, ByteBuffer target,
|
||||
IntBuffer offsets){
|
||||
CharsetICU cs = (CharsetICU) encoder.charset();
|
||||
byte[] sub = encoder.replacement();
|
||||
if (cs.subChar1 != 0 && encoder.invalidUCharBuffer[0] <= 0xff) {
|
||||
return CharsetEncoderICU.fromUWriteBytes(encoder,
|
||||
new byte[] { cs.subChar1 }, 0, 1, target, offsets, source
|
||||
.position());
|
||||
} else {
|
||||
return CharsetEncoderICU.fromUWriteBytes(encoder, sub, 0,
|
||||
sub.length, target, offsets, source.position());
|
||||
}
|
||||
}
|
||||
}
|
192
icu4j/src/com/ibm/icu/charset/CharsetICU.java
Normal file
192
icu4j/src/com/ibm/icu/charset/CharsetICU.java
Normal file
|
@ -0,0 +1,192 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2006, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.icu.charset;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.lang.reflect.Constructor;
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.IllegalCharsetNameException;
|
||||
import java.nio.charset.UnsupportedCharsetException;
|
||||
import java.util.HashMap;
|
||||
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
|
||||
|
||||
|
||||
public abstract class CharsetICU extends Charset{
|
||||
|
||||
protected String icuCanonicalName;
|
||||
protected String javaCanonicalName;
|
||||
protected int options;
|
||||
|
||||
protected int maxBytesPerChar;
|
||||
protected int minBytesPerChar;
|
||||
protected float maxCharsPerByte;
|
||||
protected byte subChar1 = 0x00;
|
||||
|
||||
protected int mode;
|
||||
protected boolean flush;
|
||||
protected boolean useFallback;
|
||||
|
||||
/**
|
||||
*
|
||||
* @param icuCanonicalName
|
||||
* @param canonicalName
|
||||
* @param aliases
|
||||
* @draft ICU 3.6
|
||||
*/
|
||||
protected CharsetICU(String icuCanonicalName, String canonicalName, String[] aliases) {
|
||||
super(canonicalName,aliases);
|
||||
if(canonicalName.length() == 0){
|
||||
throw new IllegalCharsetNameException(canonicalName);
|
||||
}
|
||||
this.javaCanonicalName = canonicalName;
|
||||
this.icuCanonicalName = icuCanonicalName;
|
||||
}
|
||||
|
||||
/**
|
||||
* Ascertains if a charset is a sub set of this charset
|
||||
* @param cs charset to test
|
||||
* @return true if the given charset is a subset of this charset
|
||||
*/
|
||||
public boolean contains(Charset cs){
|
||||
if (null == cs) {
|
||||
return false;
|
||||
} else if (this.equals(cs)) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
/* Maps an ICU canonical converter name to the name of the class implementing it. */
private static final HashMap algorithmicCharsets = new HashMap();
static{
    final String implPackage = "com.ibm.icu.impl.";
    /* { ICU canonical name, simple name of the implementation class } */
    final String[][] table = {
        { "BOCU-1",               "CharsetBOCU1"    },
        { "CESU-8",               "CharsetCESU8"    },
        { "HZ",                   "CharsetHZ"       },
        { "imapmailboxname",      "CharsetIMAP"     },
        { "ISCII",                "CharsetISCII"    },
        { "iso2022",              "CharsetISO2022"  },
        // NOTE(review): maps to the BOCU-1 class - looks like a copy/paste slip; confirm.
        { "iso88591",             "CharsetBOCU1"    },
        { "lmbcs1",               "CharsetLMBCS1"   },
        { "lmbcs11",              "CharsetLMBCS11"  },
        { "lmbcs16",              "CharsetLMBCS16"  },
        { "lmbcs17",              "CharsetLMBCS17"  },
        { "lmbcs18",              "CharsetLMBCS18"  },
        { "lmbcs19",              "CharsetLMBCS19"  },
        { "lmbcs2",               "CharsetLMBCS2"   },
        { "lmbcs3",               "CharsetLMBCS3"   },
        { "lmbcs4",               "CharsetLMBCS4"   },
        { "lmbcs5",               "CharsetLMBCS5"   },
        { "lmbcs6",               "CharsetLMBCS6"   },
        { "lmbcs8",               "CharsetLMBCS8"   },
        { "scsu",                 "CharsetSCSU"     },
        { "usascii",              "CharsetUSASCII"  },
        { "UTF-16",               "CharsetUTF16"    },
        { "UTF-16BE",             "CharsetUTF16"    },
        { "UTF-16LE",             "CharsetUTF16LE"  },
        { "UTF16_OppositeEndian", "CharsetUTF16LE"  },
        { "UTF16_PlatformEndian", "CharsetUTF16"    },
        { "UTF-32",               "CharsetUTF32"    },
        { "UTF-32BE",             "CharsetUTF32"    },
        { "UTF-32LE",             "CharsetUTF32LE"  },
        // NOTE(review): platform/opposite endian are assigned the other way
        // round than for UTF-16 above - verify which pair is intended.
        { "UTF32_PlatformEndian", "CharsetUTF32LE"  },
        { "UTF32_OppositeEndian", "CharsetUTF32"    },
        { "UTF-7",                "CharsetUTF7"     },
        { "UTF-8",                "CharsetUTF8"     },
    };
    for(int i=0; i<table.length; i++){
        algorithmicCharsets.put(table[i][0], implPackage + table[i][1]);
    }
}
|
||||
|
||||
/*public*/ static final Charset getCharset(String icuCanonicalName, String javaCanonicalName, String[] aliases){
|
||||
String className = (String) algorithmicCharsets.get(icuCanonicalName);
|
||||
if(className==null){
|
||||
//all the cnv files are loaded as MBCS
|
||||
className = "com.ibm.icu.impl.CharsetMBCS";
|
||||
}
|
||||
try{
|
||||
CharsetICU conv = null;
|
||||
Class cs = Class.forName(className);
|
||||
Class[] paramTypes = new Class[]{ String.class, String.class, String[].class};
|
||||
final Constructor c = cs.getConstructor(paramTypes);
|
||||
Object[] params = new Object[]{ icuCanonicalName, javaCanonicalName, aliases};
|
||||
|
||||
java.security.AccessController.doPrivileged
|
||||
(new java.security.PrivilegedAction() {
|
||||
public Object run() {
|
||||
c.setAccessible(true);
|
||||
return null;
|
||||
}
|
||||
});
|
||||
|
||||
// Run constructor
|
||||
try {
|
||||
Object obj = c.newInstance(params);
|
||||
if(obj!=null && obj instanceof CharsetICU){
|
||||
conv = (CharsetICU)obj;
|
||||
return conv;
|
||||
}
|
||||
}catch (InvocationTargetException e) {
|
||||
throw new UnsupportedCharsetException( icuCanonicalName+": "+"Could not load " + className+ ". Exception:" + e.getTargetException());
|
||||
}
|
||||
}catch(ClassNotFoundException ex){
|
||||
}catch(NoSuchMethodException ex){
|
||||
}catch (IllegalAccessException ex){
|
||||
}catch (InstantiationException ex){
|
||||
}
|
||||
throw new UnsupportedCharsetException( icuCanonicalName+": "+"Could not load " + className);
|
||||
}
|
||||
|
||||
/** Always use fallbacks from codepage to Unicode */
|
||||
protected final boolean isToUUseFallback() {
|
||||
return true;
|
||||
}
|
||||
|
||||
/** Use fallbacks from Unicode to codepage when useFallback or for private-use code points */
|
||||
protected final boolean isFromUUseFallback(int c) {
|
||||
return (useFallback) || isPrivateUse(c);
|
||||
}
|
||||
|
||||
/**
|
||||
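* Returns the name of the encoding used by an InputStreamReader that is
* created without an explicit charset, i.e. the platform default encoding.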
|
||||
*/
|
||||
public static final String getDefaultCharsetName(){
|
||||
String defaultEncoding = new InputStreamReader(new ByteArrayInputStream(new byte[0])).getEncoding();
|
||||
return defaultEncoding;
|
||||
}
|
||||
|
||||
/*public*/ static final boolean isPrivateUse(int c) {
|
||||
return (UCharacter.getType(c) == UCharacter.PRIVATE_USE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a charset object for the named charset.
|
||||
* This method gurantee that ICU charset is returned when
|
||||
* available. If the ICU charset provider does not support
|
||||
* the specified charset, then try other charset providers
|
||||
* including the standard Java charset provider.
|
||||
*
|
||||
* @param charsetName The name of the requested charset,
|
||||
* may be either a canonical name or an alias
|
||||
* @return A charset object for the named charset
|
||||
* @throws IllegalCharsetNameException If the given charset name
|
||||
* is illegal
|
||||
* @throws UnsupportedCharsetException If no support for the
|
||||
* named charset is available in this instance of th Java
|
||||
* virtual machine
|
||||
*/
|
||||
public static Charset forNameICU(String charsetName) throws IllegalCharsetNameException, UnsupportedCharsetException {
|
||||
CharsetProviderICU icuProvider = new CharsetProviderICU();
|
||||
Charset cs = icuProvider.charsetForName(charsetName);
|
||||
if (cs != null) {
|
||||
return cs;
|
||||
}
|
||||
return Charset.forName(charsetName);
|
||||
}
|
||||
}
|
||||
|
260
icu4j/src/com/ibm/icu/charset/CharsetProviderICU.java
Normal file
260
icu4j/src/com/ibm/icu/charset/CharsetProviderICU.java
Normal file
|
@ -0,0 +1,260 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2006, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.icu.charset;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.UnsupportedCharsetException;
|
||||
import java.nio.charset.spi.CharsetProvider;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
import com.ibm.icu.impl.UConverterAlias;
|
||||
|
||||
public final class CharsetProviderICU extends CharsetProvider{
|
||||
|
||||
/**
|
||||
* Constructs a CharsetProviderICU object
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public CharsetProviderICU(){
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a charset for the given charset name
|
||||
* @param charsetName charset name
|
||||
* @return charset objet for the given charset name, null if unsupported
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public final Charset charsetForName(String charsetName){
|
||||
try{
|
||||
// get the canonical name
|
||||
String icuCanonicalName = getICUCanonicalName(charsetName);
|
||||
|
||||
// create the converter object and return it
|
||||
if(icuCanonicalName==null || icuCanonicalName.length()==0){
|
||||
// this would make the Charset API to throw
|
||||
// unsupported encoding exception
|
||||
return null;
|
||||
}
|
||||
return getCharset(icuCanonicalName);
|
||||
}catch(UnsupportedCharsetException ex){
|
||||
}catch(IOException ex){
|
||||
}
|
||||
return null;
|
||||
}
|
||||
/**
|
||||
* Gets the canonical name of the converter as defined by Java
|
||||
* @param enc converter name
|
||||
* @return canonical name of the converter
|
||||
* @internal ICU 3.4
|
||||
*/
|
||||
public static final String getICUCanonicalName(String enc)
|
||||
throws UnsupportedCharsetException{
|
||||
String canonicalName = null;
|
||||
String ret = null;
|
||||
try{
|
||||
if(enc!=null){
|
||||
if((canonicalName = UConverterAlias.getCanonicalName(enc, "MIME"))!=null){
|
||||
ret = canonicalName;
|
||||
}else if((canonicalName = UConverterAlias.getCanonicalName(enc, "IANA"))!=null){
|
||||
ret = canonicalName;
|
||||
}else if((canonicalName = UConverterAlias.getCanonicalName(enc, ""))!=null){
|
||||
ret = canonicalName;
|
||||
}else if((canonicalName = UConverterAlias.getAlias(enc, 0))!=null){
|
||||
/* we have some aliases in the form x-blah .. match those first */
|
||||
ret = canonicalName;
|
||||
}else if(enc.indexOf("x-")==0){
|
||||
/* TODO: Match with getJavaCanonicalName method */
|
||||
/*
|
||||
char temp[ UCNV_MAX_CONVERTER_NAME_LENGTH] = {0};
|
||||
strcpy(temp, encName+2);
|
||||
*/
|
||||
ret = enc.substring(2);
|
||||
}else{
|
||||
/* unsupported encoding */
|
||||
ret = "";
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}catch(IOException ex){
|
||||
throw new UnsupportedCharsetException(enc);
|
||||
}
|
||||
}
|
||||
private static final Charset getCharset(String icuCanonicalName) throws IOException{
|
||||
String[] aliases = (String[])getAliases(icuCanonicalName);
|
||||
String canonicalName = getJavaCanonicalName(icuCanonicalName);
|
||||
return (CharsetICU.getCharset(icuCanonicalName,canonicalName, aliases));
|
||||
}
|
||||
/**
|
||||
* Gets the canonical name of the converter as defined by Java
|
||||
* @param icuCanonicalName converter name
|
||||
* @return canonical name of the converter
|
||||
* @internal ICU 3.4
|
||||
*/
|
||||
|
||||
private static String getJavaCanonicalName(String icuCanonicalName){
|
||||
/*
|
||||
If a charset listed in the IANA Charset Registry is supported by an implementation
|
||||
of the Java platform then its canonical name must be the name listed in the registry.
|
||||
Many charsets are given more than one name in the registry, in which case the registry
|
||||
identifies one of the names as MIME-preferred. If a charset has more than one registry
|
||||
name then its canonical name must be the MIME-preferred name and the other names in
|
||||
the registry must be valid aliases. If a supported charset is not listed in the IANA
|
||||
registry then its canonical name must begin with one of the strings "X-" or "x-".
|
||||
*/
|
||||
if(icuCanonicalName==null ){
|
||||
return null;
|
||||
}
|
||||
try{
|
||||
String cName = null;
|
||||
/* find out the alias with MIME tag */
|
||||
if((cName=UConverterAlias.getStandardName(icuCanonicalName, "MIME"))!=null){
|
||||
/* find out the alias with IANA tag */
|
||||
}else if((cName=UConverterAlias.getStandardName(icuCanonicalName, "IANA"))!=null){
|
||||
}else {
|
||||
/*
|
||||
check to see if an alias already exists with x- prefix, if yes then
|
||||
make that the canonical name
|
||||
*/
|
||||
int aliasNum = UConverterAlias.countAliases(icuCanonicalName);
|
||||
String name;
|
||||
for(int i=0;i<aliasNum;i++){
|
||||
name = UConverterAlias.getAlias(icuCanonicalName, i);
|
||||
if(name!=null && name.indexOf("x-")==0){
|
||||
cName = name;
|
||||
break;
|
||||
}
|
||||
}
|
||||
/* last resort just append x- to any of the alias and
|
||||
make it the canonical name */
|
||||
if((cName==null || cName.length()==0)){
|
||||
name = UConverterAlias.getStandardName(icuCanonicalName, "UTR22");
|
||||
if(name==null && icuCanonicalName.indexOf(",")!=-1){
|
||||
name = UConverterAlias.getAlias(icuCanonicalName, 1);
|
||||
}
|
||||
/* if there is no UTR22 canonical name .. then just return itself*/
|
||||
if(name==null){
|
||||
name = icuCanonicalName;
|
||||
}
|
||||
cName = "x-"+ name;
|
||||
}
|
||||
}
|
||||
return cName;
|
||||
}catch (IOException ex){
|
||||
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the aliases associated with the converter name
|
||||
* @param encName converter name
|
||||
* @return converter names as elements in an object array
|
||||
* @internal ICU 2.4
|
||||
*/
|
||||
private static final String[] getAliases(String encName)throws IOException{
|
||||
String[] ret = null;
|
||||
int aliasNum = 0;
|
||||
int i=0;
|
||||
int j=0;
|
||||
String aliasArray[/*50*/] = new String[50];
|
||||
|
||||
if(encName != null){
|
||||
aliasNum = UConverterAlias.countAliases(encName);
|
||||
for(i=0,j=0;i<aliasNum;i++){
|
||||
String name = UConverterAlias.getAlias(encName,i);
|
||||
if(name.indexOf('+')==-1 && name.indexOf(',')==-1){
|
||||
aliasArray[j++]= name;
|
||||
}
|
||||
}
|
||||
ret = new String[j];
|
||||
for(;--j>=0;) {
|
||||
ret[j] = aliasArray[j];
|
||||
}
|
||||
|
||||
}
|
||||
return (ret);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Class that implements the iterator for charsets
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
protected final class CharsetIterator implements Iterator{
|
||||
private String[] names;
|
||||
private int currentIndex;
|
||||
protected CharsetIterator(String[] strs){
|
||||
names = strs;
|
||||
currentIndex=0;
|
||||
}
|
||||
public boolean hasNext(){
|
||||
return (currentIndex< names.length);
|
||||
}
|
||||
public Object next(){
|
||||
if(currentIndex<names.length){
|
||||
return charsetForName(names[currentIndex++]);
|
||||
}else{
|
||||
throw new NoSuchElementException();
|
||||
}
|
||||
}
|
||||
public void remove(){
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
}
|
||||
|
||||
private static final void putCharsets(Map map){
|
||||
int num = UConverterAlias.countAvailable();
|
||||
for(int i=0;i<num;i++) {
|
||||
String name = UConverterAlias.getAvailableName(i);
|
||||
try {
|
||||
Charset cs = getCharset(name);
|
||||
map.put(cs, getJavaCanonicalName(name));
|
||||
}catch(UnsupportedCharsetException ex){
|
||||
}catch (IOException e) {
|
||||
}
|
||||
// add only charsets that can be created!
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an iterator for the available charsets
|
||||
* @return Iterator over the available charset objects
|
||||
*/
|
||||
public final Iterator charsets(){
|
||||
HashMap map = new HashMap();
|
||||
putCharsets(map);
|
||||
return map.keySet().iterator();
|
||||
}
|
||||
/**
|
||||
* Gets the canonical names of available converters
|
||||
* @return Object[] names as an object array
|
||||
*/
|
||||
public static final Object[] getAvailableNames(){
|
||||
HashMap map = new HashMap();
|
||||
putCharsets(map);
|
||||
return map.values().toArray();
|
||||
}
|
||||
/**
|
||||
* Return all names available
|
||||
* @return the names reported by UConverterAlias.getAvailableName, one per available converter
|
||||
*/
|
||||
public static final String[] getAllNames(){
|
||||
int num = UConverterAlias.countAvailable();
|
||||
String[] names = new String[num];
|
||||
for(int i=0;i<num;i++) {
|
||||
names[i] = UConverterAlias.getAvailableName(i);
|
||||
}
|
||||
return names;
|
||||
}
|
||||
}
|
3568
icu4j/src/com/ibm/icu/impl/CharsetMBCS.java
Normal file
3568
icu4j/src/com/ibm/icu/impl/CharsetMBCS.java
Normal file
File diff suppressed because it is too large
Load diff
446
icu4j/src/com/ibm/icu/impl/CharsetUTF16.java
Normal file
446
icu4j/src/com/ibm/icu/impl/CharsetUTF16.java
Normal file
|
@ -0,0 +1,446 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2006, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.impl;
|
||||
|
||||
import java.nio.BufferOverflowException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.CharBuffer;
|
||||
import java.nio.IntBuffer;
|
||||
import java.nio.charset.CharsetDecoder;
|
||||
import java.nio.charset.CharsetEncoder;
|
||||
import java.nio.charset.CoderResult;
|
||||
|
||||
import com.ibm.icu.charset.CharsetDecoderICU;
|
||||
import com.ibm.icu.charset.CharsetEncoderICU;
|
||||
import com.ibm.icu.charset.CharsetICU;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
|
||||
public class CharsetUTF16 extends CharsetICU {
|
||||
/**
 * Substitution bytes handed to the encoder's superclass constructor;
 * 0xFF 0xFD is U+FFFD in big-endian byte order.
 */
protected byte[] fromUSubstitution = new byte[]{(byte)0xff, (byte)0xfd};

/**
 * Creates the charset and records its size limits: 2 to 4 bytes per
 * character and at most one character per byte.
 * @param icuCanonicalName ICU canonical converter name
 * @param javaCanonicalName Java canonical charset name
 * @param aliases alias names of the charset
 */
public CharsetUTF16(String icuCanonicalName, String javaCanonicalName, String[] aliases){
    super(icuCanonicalName, javaCanonicalName, aliases);
    minBytesPerChar = 2;
    maxBytesPerChar = 4;
    maxCharsPerByte = 1;
}
|
||||
class CharsetDecoderUTF16 extends CharsetDecoderICU{
|
||||
|
||||
public CharsetDecoderUTF16(CharsetICU cs) {
|
||||
super(cs);
|
||||
}
|
||||
|
||||
protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets){
|
||||
CoderResult cr = CoderResult.UNDERFLOW;
|
||||
if(!source.hasRemaining() && toUnicodeStatus==0) {
|
||||
/* no input, nothing to do */
|
||||
return cr;
|
||||
}
|
||||
if(!target.hasRemaining()) {
|
||||
return CoderResult.OVERFLOW;
|
||||
}
|
||||
|
||||
int sourceIndex=0, count=0, length, sourceArrayIndex;
|
||||
char c=0, trail;
|
||||
length = source.remaining();
|
||||
sourceArrayIndex = source.position();
|
||||
try{
|
||||
/* complete a partial UChar or pair from the last call */
|
||||
if(toUnicodeStatus!=0) {
|
||||
/*
|
||||
* special case: single byte from a previous buffer,
|
||||
* where the byte turned out not to belong to a trail surrogate
|
||||
* and the preceding, unmatched lead surrogate was put into toUBytes[]
|
||||
* for error handling
|
||||
*/
|
||||
toUBytesArray[toUBytesBegin+0]=(byte)toUnicodeStatus;
|
||||
toULength=1;
|
||||
toUnicodeStatus=0;
|
||||
}
|
||||
if((count=toULength)!=0) {
|
||||
byte[] pArray=toUBytesArray;
|
||||
int pArrayIndex = toUBytesBegin;
|
||||
do {
|
||||
pArray[count++]=source.get(sourceArrayIndex++);
|
||||
++sourceIndex;
|
||||
--length;
|
||||
if(count==2) {
|
||||
c=(char)(((pArray[pArrayIndex+0]&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(pArray[pArrayIndex+1]&UConverterConstants.UNSIGNED_BYTE_MASK));
|
||||
if(!UTF16.isSurrogate(c)) {
|
||||
/* output the BMP code point */
|
||||
target.put(c);
|
||||
if(offsets!=null) {
|
||||
offsets.put(-1);
|
||||
}
|
||||
count=0;
|
||||
c=0;
|
||||
break;
|
||||
} else if(UTF16.isLeadSurrogate(c)) {
|
||||
/* continue collecting bytes for the trail surrogate */
|
||||
c=0; /* avoid unnecessary surrogate handling below */
|
||||
} else {
|
||||
/* fall through to error handling for an unmatched trail surrogate */
|
||||
break;
|
||||
}
|
||||
} else if(count==4) {
|
||||
c=(char)(((pArray[pArrayIndex+0]&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(pArray[pArrayIndex+1]&UConverterConstants.UNSIGNED_BYTE_MASK));
|
||||
trail=(char)(((pArray[pArrayIndex+2]&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(pArray[pArrayIndex+3]&UConverterConstants.UNSIGNED_BYTE_MASK));
|
||||
if(UTF16.isTrailSurrogate(trail)) {
|
||||
/* output the surrogate pair */
|
||||
target.put(c);
|
||||
if(target.remaining()>=1) {
|
||||
target.put(trail);
|
||||
if(offsets!=null) {
|
||||
offsets.put(-1);
|
||||
offsets.put(-1);
|
||||
}
|
||||
} else /* targetCapacity==1 */ {
|
||||
charErrorBufferArray[charErrorBufferBegin+0]=trail;
|
||||
charErrorBufferLength=1;
|
||||
throw new BufferOverflowException();
|
||||
}
|
||||
count=0;
|
||||
c=0;
|
||||
break;
|
||||
} else {
|
||||
/* unmatched lead surrogate, handle here for consistent toUBytes[] */
|
||||
|
||||
/* back out reading the code unit after it */
|
||||
if((source.position()-sourceArrayIndex)>=2) {
|
||||
sourceArrayIndex-=2;
|
||||
} else {
|
||||
/*
|
||||
* if the trail unit's first byte was in a previous buffer, then
|
||||
* we need to put it into a special place because toUBytes[] will be
|
||||
* used for the lead unit's bytes
|
||||
*/
|
||||
toUnicodeStatus=0x100|pArray[pArrayIndex+2];
|
||||
--sourceArrayIndex;
|
||||
}
|
||||
toULength=2;
|
||||
cr = CoderResult.malformedForLength(sourceArrayIndex);;
|
||||
}
|
||||
}
|
||||
} while(length>0);
|
||||
toULength=(byte)count;
|
||||
}
|
||||
|
||||
/* copy an even number of bytes for complete UChars */
|
||||
count=2*target.remaining();
|
||||
if(count>length) {
|
||||
count=length&~1;
|
||||
}
|
||||
if(c==0 && count>0) {
|
||||
length-=count;
|
||||
count>>=1;
|
||||
//targetCapacity-=count;
|
||||
if(offsets==null) {
|
||||
do {
|
||||
c=(char)(((source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK));
|
||||
sourceArrayIndex+=2;
|
||||
if(!UTF16.isSurrogate(c)) {
|
||||
target.put(c);
|
||||
} else if(UTF16.isLeadSurrogate(c) && count>=2 &&
|
||||
UTF16.isTrailSurrogate(trail=(char)(((source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK)))
|
||||
) {
|
||||
sourceArrayIndex+=2;
|
||||
--count;
|
||||
target.put(c);
|
||||
target.put(trail);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
} while(--count>0);
|
||||
} else {
|
||||
do {
|
||||
c=(char)(((source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK));
|
||||
sourceArrayIndex+=2;
|
||||
if(!UTF16.isSurrogate(c)) {
|
||||
target.put(c);
|
||||
offsets.put(sourceIndex);
|
||||
sourceIndex+=2;
|
||||
} else if(UTF16.isLeadSurrogate(c) && count>=2 &&
|
||||
UTF16.isTrailSurrogate(trail=(char)(((source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK)))
|
||||
) {
|
||||
sourceArrayIndex+=2;
|
||||
--count;
|
||||
target.put(c);
|
||||
target.put(trail);
|
||||
offsets.put(sourceIndex);
|
||||
offsets.put(sourceIndex);
|
||||
sourceIndex+=4;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
} while(--count>0);
|
||||
}
|
||||
|
||||
if(count==0) {
|
||||
/* done with the loop for complete UChars */
|
||||
c=0;
|
||||
} else {
|
||||
/* keep c for surrogate handling, trail will be set there */
|
||||
length+=2*(count-1); /* one more byte pair was consumed than count decremented */
|
||||
}
|
||||
}
|
||||
|
||||
if(c!=0) {
|
||||
/*
|
||||
* c is a surrogate, and
|
||||
* - source or target too short
|
||||
* - or the surrogate is unmatched
|
||||
*/
|
||||
toUBytesArray[toUBytesBegin+0]=(byte)(c>>>8);
|
||||
toUBytesArray[toUBytesBegin+1]=(byte)c;
|
||||
toULength=2;
|
||||
|
||||
if(UTF16.isLeadSurrogate(c)) {
|
||||
if(length>=2) {
|
||||
if(UTF16.isTrailSurrogate(trail=(char)(((source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK)))) {
|
||||
/* output the surrogate pair, will overflow (see conditions comment above) */
|
||||
sourceArrayIndex+=2;
|
||||
length-=2;
|
||||
target.put(c);
|
||||
if(offsets!=null) {
|
||||
offsets.put(sourceIndex);
|
||||
}
|
||||
charErrorBufferArray[charErrorBufferBegin+0]=trail;
|
||||
charErrorBufferLength=1;
|
||||
toULength=0;
|
||||
cr = CoderResult.OVERFLOW;
|
||||
} else {
|
||||
/* unmatched lead surrogate */
|
||||
cr = CoderResult.malformedForLength(sourceArrayIndex);
|
||||
}
|
||||
} else {
|
||||
/* see if the trail surrogate is in the next buffer */
|
||||
}
|
||||
} else {
|
||||
/* unmatched trail surrogate */
|
||||
cr = CoderResult.malformedForLength(sourceArrayIndex);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* check for a remaining source byte */
|
||||
if(length>0) {
|
||||
if(!target.hasRemaining()) {
|
||||
cr = CoderResult.OVERFLOW;
|
||||
} else {
|
||||
/* it must be length==1 because otherwise the above would have copied more */
|
||||
toUBytesArray[toULength++]=source.get(sourceArrayIndex++);
|
||||
}
|
||||
}
|
||||
|
||||
source.position(sourceArrayIndex);
|
||||
}catch(BufferOverflowException ex){
|
||||
cr = CoderResult.OVERFLOW;
|
||||
}
|
||||
return cr;
|
||||
}
|
||||
|
||||
}
|
||||
class CharsetEncoderUTF16 extends CharsetEncoderICU{
|
||||
|
||||
public CharsetEncoderUTF16(CharsetICU cs) {
|
||||
super(cs, fromUSubstitution);
|
||||
implReset();
|
||||
}
|
||||
|
||||
private final static int NEED_TO_WRITE_BOM = 1;
|
||||
|
||||
protected void implReset() {
|
||||
super.implReset();
|
||||
fromUnicodeStatus = NEED_TO_WRITE_BOM;
|
||||
}
|
||||
|
||||
protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets){
|
||||
CoderResult cr = CoderResult.UNDERFLOW;
|
||||
if(!source.hasRemaining()) {
|
||||
/* no input, nothing to do */
|
||||
return cr;
|
||||
}
|
||||
char c;
|
||||
/* write the BOM if necessary */
|
||||
if(fromUnicodeStatus==NEED_TO_WRITE_BOM) {
|
||||
byte bom[]={ (byte)0xfe, (byte)0xff };
|
||||
cr = fromUWriteBytes(this,bom, 0, bom.length, target, offsets, -1);
|
||||
if(cr.isError()){
|
||||
return cr;
|
||||
}
|
||||
fromUnicodeStatus=0;
|
||||
}
|
||||
|
||||
if(!target.hasRemaining()) {
|
||||
return CoderResult.OVERFLOW;
|
||||
}
|
||||
|
||||
int sourceIndex = 0;
|
||||
char trail = 0;
|
||||
int length = source.remaining();
|
||||
|
||||
try{
|
||||
/* c!=0 indicates in several places outside the main loops that a surrogate was found */
|
||||
|
||||
if((c=(char)fromUChar32)!=0 && UTF16.isTrailSurrogate(trail=source.get(sourceIndex)) && target.remaining()>=4) {
|
||||
/* the last buffer ended with a lead surrogate, output the surrogate pair */
|
||||
++sourceIndex;
|
||||
--length;
|
||||
target.put((byte)(c>>>8));
|
||||
target.put((byte)c);
|
||||
target.put((byte)(trail>>>8));
|
||||
target.put((byte)trail);
|
||||
if(offsets!=null && offsets.remaining()>=4) {
|
||||
offsets.put(-1);
|
||||
offsets.put(-1);
|
||||
offsets.put(-1);
|
||||
offsets.put(-1);
|
||||
}
|
||||
sourceIndex=1;
|
||||
fromUChar32=c=0;
|
||||
}
|
||||
byte overflow[/*4*/] = new byte[4];
|
||||
int sourceArrayIndex = source.position();
|
||||
|
||||
if(c==0) {
|
||||
/* copy an even number of bytes for complete UChars */
|
||||
int count=2*length;
|
||||
int targetCapacity = target.limit();
|
||||
if(count>targetCapacity) {
|
||||
count=targetCapacity&~1;
|
||||
}
|
||||
/* count is even */
|
||||
targetCapacity-=count;
|
||||
count>>=1;
|
||||
length-=count;
|
||||
|
||||
if(offsets==null) {
|
||||
while(count>0) {
|
||||
c= source.get(sourceArrayIndex++);
|
||||
if(!UTF16.isSurrogate(c)) {
|
||||
target.put((byte)(c>>>8));
|
||||
target.put((byte)c);
|
||||
|
||||
} else if(UTF16.isLeadSurrogate(c) && count>=2 && UTF16.isTrailSurrogate(trail=source.get(sourceArrayIndex))) {
|
||||
++sourceArrayIndex;
|
||||
--count;
|
||||
target.put((byte)(c>>>8));
|
||||
target.put((byte)c);
|
||||
target.put((byte)(trail>>>8));
|
||||
target.put((byte)trail);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
--count;
|
||||
}
|
||||
} else {
|
||||
while(count>0) {
|
||||
c=source.get(sourceArrayIndex++);
|
||||
if(!UTF16.isSurrogate(c)) {
|
||||
target.put((byte)(c>>>8));
|
||||
target.put((byte)c);
|
||||
offsets.put(sourceIndex);
|
||||
offsets.put(sourceIndex++);
|
||||
} else if(UTF16.isLeadSurrogate(c) && count>=2 && UTF16.isTrailSurrogate(trail=source.get(sourceArrayIndex))) {
|
||||
++sourceArrayIndex;
|
||||
--count;
|
||||
target.put((byte)(c>>>8));
|
||||
target.put((byte)c);
|
||||
target.put((byte)(trail>>>8));
|
||||
target.put((byte)trail);
|
||||
offsets.put(sourceIndex);
|
||||
offsets.put(sourceIndex);
|
||||
offsets.put(sourceIndex);
|
||||
offsets.put(sourceIndex);
|
||||
sourceIndex+=2;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
--count;
|
||||
}
|
||||
}
|
||||
|
||||
if(count==0) {
|
||||
/* done with the loop for complete UChars */
|
||||
if(length>0 && targetCapacity>0) {
|
||||
/*
|
||||
* there is more input and some target capacity -
|
||||
* it must be targetCapacity==1 because otherwise
|
||||
* the above would have copied more;
|
||||
* prepare for overflow output
|
||||
*/
|
||||
if(!UTF16.isSurrogate(c=source.get(sourceArrayIndex++))) {
|
||||
overflow[0]=(byte)(c>>>8);
|
||||
overflow[1]=(byte)c;
|
||||
length=2; /* 2 bytes to output */
|
||||
c=0;
|
||||
/* } else { keep c for surrogate handling, length will be set there */
|
||||
}
|
||||
} else {
|
||||
length=0;
|
||||
c=0;
|
||||
}
|
||||
} else {
|
||||
/* keep c for surrogate handling, length will be set there */
|
||||
targetCapacity+=2*count;
|
||||
}
|
||||
} else {
|
||||
length=0; /* from here on, length counts the bytes in overflow[] */
|
||||
}
|
||||
|
||||
if(c!=0) {
|
||||
/*
|
||||
* c is a surrogate, and
|
||||
* - source or target too short
|
||||
* - or the surrogate is unmatched
|
||||
*/
|
||||
length=0;
|
||||
if(UTF16.isLeadSurrogate(c)) {
|
||||
if(sourceArrayIndex<source.limit()) {
|
||||
if(UTF16.isTrailSurrogate(trail=source.get(sourceArrayIndex))) {
|
||||
/* output the surrogate pair, will overflow (see conditions comment above) */
|
||||
++sourceArrayIndex;
|
||||
overflow[0]=(byte)(c>>>8);
|
||||
overflow[1]=(byte)c;
|
||||
overflow[2]=(byte)(trail>>>8);
|
||||
overflow[3]=(byte)trail;
|
||||
length=4; /* 4 bytes to output */
|
||||
c=0;
|
||||
} else {
|
||||
/* unmatched lead surrogate */
|
||||
//pErrorCode[0]=ErrorCode.U_ILLEGAL_CHAR_FOUND;
|
||||
cr = CoderResult.malformedForLength(sourceArrayIndex);
|
||||
}
|
||||
} else {
|
||||
/* see if the trail surrogate is in the next buffer */
|
||||
}
|
||||
} else {
|
||||
/* unmatched trail surrogate */
|
||||
//pErrorCode[0]=ErrorCode.U_ILLEGAL_CHAR_FOUND;
|
||||
}
|
||||
fromUChar32=c;
|
||||
}
|
||||
source.position(sourceArrayIndex);
|
||||
if(length>0) {
|
||||
/* output length bytes with overflow (length>targetCapacity>0) */
|
||||
fromUWriteBytes(this, overflow, 0, length, target, offsets, sourceIndex);
|
||||
}
|
||||
}catch(BufferOverflowException ex){
|
||||
cr = CoderResult.OVERFLOW;
|
||||
}
|
||||
return cr;
|
||||
}
|
||||
}
|
||||
public CharsetDecoder newDecoder() {
|
||||
return new CharsetDecoderUTF16(this);
|
||||
}
|
||||
|
||||
public CharsetEncoder newEncoder() {
|
||||
return new CharsetEncoderUTF16(this);
|
||||
}
|
||||
}
|
449
icu4j/src/com/ibm/icu/impl/CharsetUTF16LE.java
Normal file
449
icu4j/src/com/ibm/icu/impl/CharsetUTF16LE.java
Normal file
|
@ -0,0 +1,449 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2006, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.impl;
|
||||
|
||||
import java.nio.BufferOverflowException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.CharBuffer;
|
||||
import java.nio.IntBuffer;
|
||||
import java.nio.charset.CharsetDecoder;
|
||||
import java.nio.charset.CharsetEncoder;
|
||||
import java.nio.charset.CoderResult;
|
||||
|
||||
import com.ibm.icu.charset.CharsetDecoderICU;
|
||||
import com.ibm.icu.charset.CharsetEncoderICU;
|
||||
import com.ibm.icu.charset.CharsetICU;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
|
||||
/**
|
||||
* @author Niti Hantaweepant
|
||||
*/
|
||||
public class CharsetUTF16LE extends CharsetICU {
|
||||
/**
 * Substitution bytes handed to the encoder's superclass constructor;
 * 0xFD 0xFF is U+FFFD in little-endian byte order.
 */
protected byte[] fromUSubstitution = new byte[]{(byte)0xfd, (byte)0xff};

/**
 * Creates the charset and records its size limits: 2 to 4 bytes per
 * character and at most one character per byte.
 * @param icuCanonicalName ICU canonical converter name
 * @param javaCanonicalName Java canonical charset name
 * @param aliases alias names of the charset
 */
public CharsetUTF16LE(String icuCanonicalName, String javaCanonicalName, String[] aliases){
    super(icuCanonicalName, javaCanonicalName, aliases);
    minBytesPerChar = 2;
    maxBytesPerChar = 4;
    maxCharsPerByte = 1;
}
|
||||
class CharsetDecoderUTF16LE extends CharsetDecoderICU{
|
||||
|
||||
public CharsetDecoderUTF16LE(CharsetICU cs) {
|
||||
super(cs);
|
||||
}
|
||||
|
||||
protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets){
|
||||
CoderResult cr = CoderResult.UNDERFLOW;
|
||||
if(!source.hasRemaining() && toUnicodeStatus==0) {
|
||||
/* no input, nothing to do */
|
||||
return cr;
|
||||
}
|
||||
if(!target.hasRemaining()) {
|
||||
return CoderResult.OVERFLOW;
|
||||
}
|
||||
|
||||
int sourceIndex=0, count=0, length, sourceArrayIndex;
|
||||
char c=0, trail;
|
||||
length = source.remaining();
|
||||
sourceArrayIndex = source.position();
|
||||
try{
|
||||
/* complete a partial UChar or pair from the last call */
|
||||
if(toUnicodeStatus!=0) {
|
||||
/*
|
||||
* special case: single byte from a previous buffer,
|
||||
* where the byte turned out not to belong to a trail surrogate
|
||||
* and the preceding, unmatched lead surrogate was put into toUBytes[]
|
||||
* for error handling
|
||||
*/
|
||||
toUBytesArray[toUBytesBegin+0]=(byte)toUnicodeStatus;
|
||||
toULength=1;
|
||||
toUnicodeStatus=0;
|
||||
}
|
||||
if((count=toULength)!=0) {
|
||||
byte[] pArray=toUBytesArray;
|
||||
int pArrayIndex = toUBytesBegin;
|
||||
do {
|
||||
pArray[count++]=source.get(sourceArrayIndex++);
|
||||
++sourceIndex;
|
||||
--length;
|
||||
if(count==2) {
|
||||
c=(char)(((pArray[pArrayIndex+1]&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(pArray[pArrayIndex+0]&UConverterConstants.UNSIGNED_BYTE_MASK));
|
||||
if(!UTF16.isSurrogate(c)) {
|
||||
/* output the BMP code point */
|
||||
target.put(c);
|
||||
if(offsets!=null) {
|
||||
offsets.put(-1);
|
||||
}
|
||||
count=0;
|
||||
c=0;
|
||||
break;
|
||||
} else if(UTF16.isLeadSurrogate(c)) {
|
||||
/* continue collecting bytes for the trail surrogate */
|
||||
c=0; /* avoid unnecessary surrogate handling below */
|
||||
} else {
|
||||
/* fall through to error handling for an unmatched trail surrogate */
|
||||
break;
|
||||
}
|
||||
} else if(count==4) {
|
||||
c=(char)(((pArray[pArrayIndex+1]&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(pArray[pArrayIndex+0]&UConverterConstants.UNSIGNED_BYTE_MASK));
|
||||
trail=(char)(((pArray[pArrayIndex+3]&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(pArray[pArrayIndex+2]&UConverterConstants.UNSIGNED_BYTE_MASK));
|
||||
if(UTF16.isTrailSurrogate(trail)) {
|
||||
/* output the surrogate pair */
|
||||
target.put(c);
|
||||
if(target.remaining()>=1) {
|
||||
target.put(trail);
|
||||
if(offsets!=null) {
|
||||
offsets.put(-1);
|
||||
offsets.put(-1);
|
||||
}
|
||||
} else /* targetCapacity==1 */ {
|
||||
charErrorBufferArray[charErrorBufferBegin+0]=trail;
|
||||
charErrorBufferLength=1;
|
||||
throw new BufferOverflowException();
|
||||
}
|
||||
count=0;
|
||||
c=0;
|
||||
break;
|
||||
} else {
|
||||
/* unmatched lead surrogate, handle here for consistent toUBytes[] */
|
||||
|
||||
/* back out reading the code unit after it */
|
||||
if((source.position()-sourceArrayIndex)>=2) {
|
||||
sourceArrayIndex-=2;
|
||||
} else {
|
||||
/*
|
||||
* if the trail unit's first byte was in a previous buffer, then
|
||||
* we need to put it into a special place because toUBytes[] will be
|
||||
* used for the lead unit's bytes
|
||||
*/
|
||||
toUnicodeStatus=0x100|pArray[pArrayIndex+2];
|
||||
--sourceArrayIndex;
|
||||
}
|
||||
toULength=2;
|
||||
cr = CoderResult.malformedForLength(sourceArrayIndex);;
|
||||
}
|
||||
}
|
||||
} while(length>0);
|
||||
toULength=(byte)count;
|
||||
}
|
||||
|
||||
/* copy an even number of bytes for complete UChars */
|
||||
count=2*target.remaining();
|
||||
if(count>length) {
|
||||
count=length&~1;
|
||||
}
|
||||
if(c==0 && count>0) {
|
||||
length-=count;
|
||||
count>>=1;
|
||||
//targetCapacity-=count;
|
||||
if(offsets==null) {
|
||||
do {
|
||||
c=(char)(((source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK));
|
||||
sourceArrayIndex+=2;
|
||||
if(!UTF16.isSurrogate(c)) {
|
||||
target.put(c);
|
||||
} else if(UTF16.isLeadSurrogate(c) && count>=2 &&
|
||||
UTF16.isTrailSurrogate(trail=(char)(((source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK)))
|
||||
) {
|
||||
sourceArrayIndex+=2;
|
||||
--count;
|
||||
target.put(c);
|
||||
target.put(trail);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
} while(--count>0);
|
||||
} else {
|
||||
do {
|
||||
c=(char)(((source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK));
|
||||
sourceArrayIndex+=2;
|
||||
if(!UTF16.isSurrogate(c)) {
|
||||
target.put(c);
|
||||
offsets.put(sourceIndex);
|
||||
sourceIndex+=2;
|
||||
} else if(UTF16.isLeadSurrogate(c) && count>=2 &&
|
||||
UTF16.isTrailSurrogate(trail=(char)(((source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK)))
|
||||
) {
|
||||
sourceArrayIndex+=2;
|
||||
--count;
|
||||
target.put(c);
|
||||
target.put(trail);
|
||||
offsets.put(sourceIndex);
|
||||
offsets.put(sourceIndex);
|
||||
sourceIndex+=4;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
} while(--count>0);
|
||||
}
|
||||
|
||||
if(count==0) {
|
||||
/* done with the loop for complete UChars */
|
||||
c=0;
|
||||
} else {
|
||||
/* keep c for surrogate handling, trail will be set there */
|
||||
length+=2*(count-1); /* one more byte pair was consumed than count decremented */
|
||||
}
|
||||
}
|
||||
|
||||
if(c!=0) {
|
||||
/*
|
||||
* c is a surrogate, and
|
||||
* - source or target too short
|
||||
* - or the surrogate is unmatched
|
||||
*/
|
||||
toUBytesArray[toUBytesBegin+0]=(byte)c;
|
||||
toUBytesArray[toUBytesBegin+1]=(byte)(c>>>8);
|
||||
toULength=2;
|
||||
|
||||
if(UTF16.isLeadSurrogate(c)) {
|
||||
if(length>=2) {
|
||||
if(UTF16.isTrailSurrogate(trail=(char)(((source.get(sourceArrayIndex+1)&UConverterConstants.UNSIGNED_BYTE_MASK)<<8)|(source.get(sourceArrayIndex+0)&UConverterConstants.UNSIGNED_BYTE_MASK)))) {
|
||||
/* output the surrogate pair, will overflow (see conditions comment above) */
|
||||
sourceArrayIndex+=2;
|
||||
length-=2;
|
||||
target.put(c);
|
||||
if(offsets!=null) {
|
||||
offsets.put(sourceIndex);
|
||||
}
|
||||
charErrorBufferArray[charErrorBufferBegin+0]=trail;
|
||||
charErrorBufferLength=1;
|
||||
toULength=0;
|
||||
cr = CoderResult.OVERFLOW;
|
||||
} else {
|
||||
/* unmatched lead surrogate */
|
||||
cr = CoderResult.malformedForLength(sourceArrayIndex);
|
||||
}
|
||||
} else {
|
||||
/* see if the trail surrogate is in the next buffer */
|
||||
}
|
||||
} else {
|
||||
/* unmatched trail surrogate */
|
||||
cr = CoderResult.malformedForLength(sourceArrayIndex);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* check for a remaining source byte */
|
||||
if(length>0) {
|
||||
if(!target.hasRemaining()) {
|
||||
cr = CoderResult.OVERFLOW;
|
||||
} else {
|
||||
/* it must be length==1 because otherwise the above would have copied more */
|
||||
toUBytesArray[toULength++]=source.get(sourceArrayIndex++);
|
||||
}
|
||||
}
|
||||
|
||||
source.position(sourceArrayIndex);
|
||||
}catch(BufferOverflowException ex){
|
||||
cr = CoderResult.OVERFLOW;
|
||||
}
|
||||
return cr;
|
||||
}
|
||||
|
||||
}
|
||||
class CharsetEncoderUTF16LE extends CharsetEncoderICU{
|
||||
|
||||
public CharsetEncoderUTF16LE(CharsetICU cs) {
|
||||
super(cs, fromUSubstitution);
|
||||
implReset();
|
||||
}
|
||||
|
||||
private final static int NEED_TO_WRITE_BOM = 1;
|
||||
|
||||
protected void implReset() {
|
||||
super.implReset();
|
||||
fromUnicodeStatus = NEED_TO_WRITE_BOM;
|
||||
}
|
||||
|
||||
protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets){
|
||||
CoderResult cr = CoderResult.UNDERFLOW;
|
||||
if(!source.hasRemaining()) {
|
||||
/* no input, nothing to do */
|
||||
return cr;
|
||||
}
|
||||
char c;
|
||||
/* write the BOM if necessary */
|
||||
if(fromUnicodeStatus==NEED_TO_WRITE_BOM) {
|
||||
byte bom[]={ (byte)0xff, (byte)0xfe };
|
||||
cr = fromUWriteBytes(this,bom, 0, bom.length, target, offsets, -1);
|
||||
if(cr.isError()){
|
||||
return cr;
|
||||
}
|
||||
fromUnicodeStatus=0;
|
||||
}
|
||||
|
||||
if(!target.hasRemaining()) {
|
||||
return CoderResult.OVERFLOW;
|
||||
}
|
||||
|
||||
int sourceIndex = 0;
|
||||
char trail = 0;
|
||||
int length = source.remaining();
|
||||
|
||||
try{
|
||||
/* c!=0 indicates in several places outside the main loops that a surrogate was found */
|
||||
|
||||
if((c=(char)fromUChar32)!=0 && UTF16.isTrailSurrogate(trail=source.get(sourceIndex)) && target.remaining()>=4) {
|
||||
/* the last buffer ended with a lead surrogate, output the surrogate pair */
|
||||
++sourceIndex;
|
||||
--length;
|
||||
target.put((byte)c);
|
||||
target.put((byte)(c>>>8));
|
||||
target.put((byte)trail);
|
||||
target.put((byte)(trail>>>8));
|
||||
if(offsets!=null && offsets.remaining()>=4) {
|
||||
offsets.put(-1);
|
||||
offsets.put(-1);
|
||||
offsets.put(-1);
|
||||
offsets.put(-1);
|
||||
}
|
||||
sourceIndex=1;
|
||||
fromUChar32=c=0;
|
||||
}
|
||||
byte overflow[/*4*/] = new byte[4];
|
||||
int sourceArrayIndex = source.position();
|
||||
|
||||
if(c==0) {
|
||||
/* copy an even number of bytes for complete UChars */
|
||||
int count=2*length;
|
||||
int targetCapacity = target.limit();
|
||||
if(count>targetCapacity) {
|
||||
count=targetCapacity&~1;
|
||||
}
|
||||
/* count is even */
|
||||
targetCapacity-=count;
|
||||
count>>=1;
|
||||
length-=count;
|
||||
|
||||
if(offsets==null) {
|
||||
while(count>0) {
|
||||
c= source.get(sourceArrayIndex++);
|
||||
if(!UTF16.isSurrogate(c)) {
|
||||
target.put((byte)c);
|
||||
target.put((byte)(c>>>8));
|
||||
|
||||
} else if(UTF16.isLeadSurrogate(c) && count>=2 && UTF16.isTrailSurrogate(trail=source.get(sourceArrayIndex))) {
|
||||
++sourceArrayIndex;
|
||||
--count;
|
||||
target.put((byte)c);
|
||||
target.put((byte)(c>>>8));
|
||||
target.put((byte)trail);
|
||||
target.put((byte)(trail>>>8));
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
--count;
|
||||
}
|
||||
} else {
|
||||
while(count>0) {
|
||||
c=source.get(sourceArrayIndex++);
|
||||
if(!UTF16.isSurrogate(c)) {
|
||||
target.put((byte)c);
|
||||
target.put((byte)(c>>>8));
|
||||
offsets.put(sourceIndex);
|
||||
offsets.put(sourceIndex++);
|
||||
} else if(UTF16.isLeadSurrogate(c) && count>=2 && UTF16.isTrailSurrogate(trail=source.get(sourceArrayIndex))) {
|
||||
++sourceArrayIndex;
|
||||
--count;
|
||||
target.put((byte)c);
|
||||
target.put((byte)(c>>>8));
|
||||
target.put((byte)trail);
|
||||
target.put((byte)(trail>>>8));
|
||||
offsets.put(sourceIndex);
|
||||
offsets.put(sourceIndex);
|
||||
offsets.put(sourceIndex);
|
||||
offsets.put(sourceIndex);
|
||||
sourceIndex+=2;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
--count;
|
||||
}
|
||||
}
|
||||
|
||||
if(count==0) {
|
||||
/* done with the loop for complete UChars */
|
||||
if(length>0 && targetCapacity>0) {
|
||||
/*
|
||||
* there is more input and some target capacity -
|
||||
* it must be targetCapacity==1 because otherwise
|
||||
* the above would have copied more;
|
||||
* prepare for overflow output
|
||||
*/
|
||||
if(!UTF16.isSurrogate(c=source.get(sourceArrayIndex++))) {
|
||||
overflow[0]=(byte)c;
|
||||
overflow[1]=(byte)(c>>>8);
|
||||
length=2; /* 2 bytes to output */
|
||||
c=0;
|
||||
/* } else { keep c for surrogate handling, length will be set there */
|
||||
}
|
||||
} else {
|
||||
length=0;
|
||||
c=0;
|
||||
}
|
||||
} else {
|
||||
/* keep c for surrogate handling, length will be set there */
|
||||
targetCapacity+=2*count;
|
||||
}
|
||||
} else {
|
||||
length=0; /* from here on, length counts the bytes in overflow[] */
|
||||
}
|
||||
|
||||
if(c!=0) {
|
||||
/*
|
||||
* c is a surrogate, and
|
||||
* - source or target too short
|
||||
* - or the surrogate is unmatched
|
||||
*/
|
||||
length=0;
|
||||
if(UTF16.isLeadSurrogate(c)) {
|
||||
if(sourceArrayIndex<source.limit()) {
|
||||
if(UTF16.isTrailSurrogate(trail=source.get(sourceArrayIndex))) {
|
||||
/* output the surrogate pair, will overflow (see conditions comment above) */
|
||||
++sourceArrayIndex;
|
||||
overflow[0]=(byte)c;
|
||||
overflow[1]=(byte)(c>>>8);
|
||||
overflow[2]=(byte)trail;
|
||||
overflow[3]=(byte)(trail>>>8);
|
||||
length=4; /* 4 bytes to output */
|
||||
c=0;
|
||||
} else {
|
||||
/* unmatched lead surrogate */
|
||||
//pErrorCode[0]=ErrorCode.U_ILLEGAL_CHAR_FOUND;
|
||||
cr = CoderResult.malformedForLength(sourceArrayIndex);
|
||||
}
|
||||
} else {
|
||||
/* see if the trail surrogate is in the next buffer */
|
||||
}
|
||||
} else {
|
||||
/* unmatched trail surrogate */
|
||||
//pErrorCode[0]=ErrorCode.U_ILLEGAL_CHAR_FOUND;
|
||||
}
|
||||
fromUChar32=c;
|
||||
}
|
||||
source.position(sourceArrayIndex);
|
||||
if(length>0) {
|
||||
/* output length bytes with overflow (length>targetCapacity>0) */
|
||||
fromUWriteBytes(this, overflow, 0, length, target, offsets, sourceIndex);
|
||||
}
|
||||
}catch(BufferOverflowException ex){
|
||||
cr = CoderResult.OVERFLOW;
|
||||
}
|
||||
return cr;
|
||||
}
|
||||
}
|
||||
public CharsetDecoder newDecoder() {
|
||||
return new CharsetDecoderUTF16LE(this);
|
||||
}
|
||||
|
||||
public CharsetEncoder newEncoder() {
|
||||
return new CharsetEncoderUTF16LE(this);
|
||||
}
|
||||
}
|
318
icu4j/src/com/ibm/icu/impl/CharsetUTF32.java
Normal file
318
icu4j/src/com/ibm/icu/impl/CharsetUTF32.java
Normal file
|
@ -0,0 +1,318 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2006, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.impl;
|
||||
|
||||
import java.nio.BufferOverflowException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.CharBuffer;
|
||||
import java.nio.IntBuffer;
|
||||
import java.nio.charset.CharsetDecoder;
|
||||
import java.nio.charset.CharsetEncoder;
|
||||
import java.nio.charset.CoderResult;
|
||||
|
||||
import com.ibm.icu.charset.CharsetDecoderICU;
|
||||
import com.ibm.icu.charset.CharsetEncoderICU;
|
||||
import com.ibm.icu.charset.CharsetICU;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
/**
|
||||
* @author Niti Hantaweepant
|
||||
*/
|
||||
public class CharsetUTF32 extends CharsetICU {
|
||||
/** Substitution bytes handed to the encoders of this charset: 00 00 FF FD. */
protected byte[] fromUSubstitution = { (byte)0, (byte)0, (byte)0xff, (byte)0xfd };

/**
 * Creates the UTF-32 charset and declares its byte/char ratios.
 *
 * @param icuCanonicalName  ICU name, forwarded to CharsetICU
 * @param javaCanonicalName Java name, forwarded to CharsetICU
 * @param aliases           alias names, forwarded to CharsetICU
 */
public CharsetUTF32(String icuCanonicalName, String javaCanonicalName, String[] aliases) {
    super(icuCanonicalName, javaCanonicalName, aliases);
    /* every code point occupies exactly four bytes */
    minBytesPerChar = 4;
    maxBytesPerChar = 4;
    maxCharsPerByte = 1;
}
|
||||
class CharsetDecoderUTF32 extends CharsetDecoderICU{
|
||||
|
||||
public CharsetDecoderUTF32(CharsetICU cs) {
|
||||
super(cs);
|
||||
}
|
||||
|
||||
protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets){
|
||||
CoderResult cr = CoderResult.UNDERFLOW;
|
||||
|
||||
int sourceArrayIndex = source.position();
|
||||
int ch, i;
|
||||
|
||||
try{
|
||||
donefornow:
|
||||
{
|
||||
/* UTF-8 returns here for only non-offset, this needs to change.*/
|
||||
if (toUnicodeStatus != 0 && target.hasRemaining()) {
|
||||
i = toULength; /* restore # of bytes consumed */
|
||||
|
||||
ch = (int)(toUnicodeStatus - 1);/*Stores the previously calculated ch from a previous call*/
|
||||
toUnicodeStatus = 0;
|
||||
|
||||
while (i < 4) {
|
||||
if (sourceArrayIndex < source.limit()) {
|
||||
ch = (ch << 8) | ((byte)(source.get(sourceArrayIndex)) & UConverterConstants.UNSIGNED_BYTE_MASK);
|
||||
toUBytesArray[i++] = (byte) source.get(sourceArrayIndex++);
|
||||
}
|
||||
else {
|
||||
/* stores a partially calculated target*/
|
||||
/* + 1 to make 0 a valid character */
|
||||
toUnicodeStatus = ch + 1;
|
||||
toULength = (byte) i;
|
||||
break donefornow;
|
||||
}
|
||||
}
|
||||
|
||||
if (ch <= UConverterSharedData.MAXIMUM_UTF && !UTF16.isSurrogate((char)ch)) {
|
||||
/* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
|
||||
if (ch <= UConverterSharedData.MAXIMUM_UCS2)
|
||||
{
|
||||
/* fits in 16 bits */
|
||||
target.put((char)ch);
|
||||
}
|
||||
else {
|
||||
/* write out the surrogates */
|
||||
target.put(UTF16.getLeadSurrogate(ch));
|
||||
ch = UTF16.getTrailSurrogate(ch);
|
||||
if (target.hasRemaining()) {
|
||||
target.put((char)ch);
|
||||
}
|
||||
else {
|
||||
/* Put in overflow buffer (not handled here) */
|
||||
charErrorBufferArray[0] = (char) ch;
|
||||
charErrorBufferLength = 1;
|
||||
throw new BufferOverflowException();
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
toULength = (byte)i;
|
||||
cr = CoderResult.malformedForLength(sourceArrayIndex);
|
||||
break donefornow;
|
||||
}
|
||||
}
|
||||
|
||||
while (sourceArrayIndex < source.limit() && target.hasRemaining()) {
|
||||
i = 0;
|
||||
ch = 0;
|
||||
|
||||
while (i < 4) {
|
||||
if (sourceArrayIndex < source.limit()) {
|
||||
ch = (ch << 8) | ((byte)(source.get(sourceArrayIndex)) & UConverterConstants.UNSIGNED_BYTE_MASK);
|
||||
toUBytesArray[i++] = (byte) source.get(sourceArrayIndex++);
|
||||
}
|
||||
else {
|
||||
/* stores a partially calculated target*/
|
||||
/* + 1 to make 0 a valid character */
|
||||
toUnicodeStatus = ch + 1;
|
||||
toULength = (byte) i;
|
||||
break donefornow;
|
||||
}
|
||||
}
|
||||
|
||||
if (ch <= UConverterSharedData.MAXIMUM_UTF && !UTF16.isSurrogate((char)ch)) {
|
||||
/* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
|
||||
if (ch <= UConverterSharedData.MAXIMUM_UCS2)
|
||||
{
|
||||
/* fits in 16 bits */
|
||||
target.put((char) ch);
|
||||
}
|
||||
else {
|
||||
/* write out the surrogates */
|
||||
target.put(UTF16.getLeadSurrogate(ch));
|
||||
ch = UTF16.getTrailSurrogate(ch);
|
||||
if (target.hasRemaining()) {
|
||||
target.put((char)ch);
|
||||
}
|
||||
else {
|
||||
/* Put in overflow buffer (not handled here) */
|
||||
charErrorBufferArray[0] = (char) ch;
|
||||
charErrorBufferLength = 1;
|
||||
throw new BufferOverflowException();
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
toULength = (byte)i;
|
||||
cr = CoderResult.malformedForLength(sourceArrayIndex);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (sourceArrayIndex < source.limit() && !target.hasRemaining()) {
|
||||
/* End of target buffer */
|
||||
cr = CoderResult.OVERFLOW;
|
||||
}
|
||||
|
||||
source.position(sourceArrayIndex);
|
||||
}catch(BufferOverflowException ex){
|
||||
cr = CoderResult.OVERFLOW;
|
||||
}
|
||||
return cr;
|
||||
}
|
||||
}
|
||||
|
||||
class CharsetEncoderUTF32 extends CharsetEncoderICU{
|
||||
|
||||
public CharsetEncoderUTF32(CharsetICU cs) {
|
||||
super(cs, fromUSubstitution);
|
||||
implReset();
|
||||
}
|
||||
|
||||
private final static int NEED_TO_WRITE_BOM = 1;
|
||||
|
||||
protected void implReset() {
|
||||
super.implReset();
|
||||
fromUnicodeStatus = NEED_TO_WRITE_BOM;
|
||||
}
|
||||
|
||||
protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets){
|
||||
CoderResult cr = CoderResult.UNDERFLOW;
|
||||
if(!source.hasRemaining()) {
|
||||
/* no input, nothing to do */
|
||||
return cr;
|
||||
}
|
||||
|
||||
/* write the BOM if necessary */
|
||||
if(fromUnicodeStatus==NEED_TO_WRITE_BOM) {
|
||||
byte[] bom={ 0, 0, (byte)0xfe, (byte)0xff };
|
||||
cr = fromUWriteBytes(this, bom, 0, bom.length, target, offsets, -1);
|
||||
if(cr.isError()){
|
||||
return cr;
|
||||
}
|
||||
fromUnicodeStatus=0;
|
||||
}
|
||||
|
||||
int ch, ch2;
|
||||
int indexToWrite;
|
||||
byte temp[] = new byte[4];
|
||||
temp[0] = 0;
|
||||
int sourceArrayIndex = source.position();
|
||||
|
||||
try{
|
||||
boolean doloop = true;
|
||||
if (fromUChar32 != 0) {
|
||||
ch = fromUChar32;
|
||||
fromUChar32 = 0;
|
||||
//lowsurogate:
|
||||
if (sourceArrayIndex < source.limit()) {
|
||||
ch2 = source.get(sourceArrayIndex);
|
||||
if (UTF16.isTrailSurrogate((char)ch2)) {
|
||||
ch = ((ch - UConverterSharedData.SURROGATE_HIGH_START) << UConverterSharedData.HALF_SHIFT) + ch2 + UConverterSharedData.SURROGATE_LOW_BASE;
|
||||
sourceArrayIndex++;
|
||||
}
|
||||
else {
|
||||
/* this is an unmatched trail code unit (2nd surrogate) */
|
||||
/* callback(illegal) */
|
||||
fromUChar32 = ch;
|
||||
cr = CoderResult.malformedForLength(sourceArrayIndex);
|
||||
doloop = false;
|
||||
}
|
||||
}
|
||||
else {
|
||||
/* ran out of source */
|
||||
fromUChar32 = ch;
|
||||
if (flush) {
|
||||
/* this is an unmatched trail code unit (2nd surrogate) */
|
||||
/* callback(illegal) */
|
||||
cr = CoderResult.malformedForLength(sourceArrayIndex);
|
||||
}
|
||||
doloop = false;
|
||||
}
|
||||
|
||||
/* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
|
||||
temp[1] = (byte) (ch >>> 16 & 0x1F);
|
||||
temp[2] = (byte) (ch >>> 8); /* unsigned cast implicitly does (ch & FF) */
|
||||
temp[3] = (byte) (ch); /* unsigned cast implicitly does (ch & FF) */
|
||||
|
||||
for (indexToWrite = 0; indexToWrite <= 3; indexToWrite++) {
|
||||
if (target.hasRemaining()) {
|
||||
target.put(temp[indexToWrite]);
|
||||
}
|
||||
else {
|
||||
errorBuffer[errorBufferLength++] = temp[indexToWrite];
|
||||
cr = CoderResult.OVERFLOW;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(doloop) {
|
||||
while (sourceArrayIndex < source.limit() && target.hasRemaining()) {
|
||||
ch = source.get(sourceArrayIndex++);
|
||||
|
||||
if (UTF16.isSurrogate((char)ch)) {
|
||||
if (UTF16.isLeadSurrogate((char)ch)) {
|
||||
//lowsurogate:
|
||||
if (sourceArrayIndex < source.limit()) {
|
||||
ch2 = source.get(sourceArrayIndex);
|
||||
if (UTF16.isTrailSurrogate((char)ch2)) {
|
||||
ch = ((ch - UConverterSharedData.SURROGATE_HIGH_START) << UConverterSharedData.HALF_SHIFT) + ch2 + UConverterSharedData.SURROGATE_LOW_BASE;
|
||||
sourceArrayIndex++;
|
||||
}
|
||||
else {
|
||||
/* this is an unmatched trail code unit (2nd surrogate) */
|
||||
/* callback(illegal) */
|
||||
fromUChar32 = ch;
|
||||
cr = CoderResult.OVERFLOW;
|
||||
break;
|
||||
}
|
||||
}
|
||||
else {
|
||||
/* ran out of source */
|
||||
fromUChar32 = ch;
|
||||
if (flush) {
|
||||
/* this is an unmatched trail code unit (2nd surrogate) */
|
||||
/* callback(illegal) */
|
||||
cr = CoderResult.malformedForLength(sourceArrayIndex);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
else {
|
||||
fromUChar32 = ch;
|
||||
cr = CoderResult.malformedForLength(sourceArrayIndex);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
|
||||
temp[1] = (byte) (ch >>> 16 & 0x1F);
|
||||
temp[2] = (byte) (ch >>> 8); /* unsigned cast implicitly does (ch & FF) */
|
||||
temp[3] = (byte) (ch); /* unsigned cast implicitly does (ch & FF) */
|
||||
|
||||
for (indexToWrite = 0; indexToWrite <= 3; indexToWrite++) {
|
||||
if (target.hasRemaining()) {
|
||||
target.put(temp[indexToWrite]);
|
||||
}
|
||||
else {
|
||||
errorBuffer[errorBufferLength++] = temp[indexToWrite];
|
||||
cr = CoderResult.OVERFLOW;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (sourceArrayIndex < source.limit() && !target.hasRemaining()) {
|
||||
cr = CoderResult.OVERFLOW;
|
||||
}
|
||||
source.position(sourceArrayIndex);
|
||||
|
||||
}catch(BufferOverflowException ex){
|
||||
cr = CoderResult.OVERFLOW;
|
||||
}
|
||||
return cr;
|
||||
}
|
||||
}
|
||||
public CharsetDecoder newDecoder() {
|
||||
return new CharsetDecoderUTF32(this);
|
||||
}
|
||||
|
||||
public CharsetEncoder newEncoder() {
|
||||
return new CharsetEncoderUTF32(this);
|
||||
}
|
||||
}
|
318
icu4j/src/com/ibm/icu/impl/CharsetUTF32LE.java
Normal file
318
icu4j/src/com/ibm/icu/impl/CharsetUTF32LE.java
Normal file
|
@ -0,0 +1,318 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2006, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.impl;
|
||||
|
||||
import java.nio.BufferOverflowException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.CharBuffer;
|
||||
import java.nio.IntBuffer;
|
||||
import java.nio.charset.CharsetDecoder;
|
||||
import java.nio.charset.CharsetEncoder;
|
||||
import java.nio.charset.CoderResult;
|
||||
|
||||
import com.ibm.icu.charset.CharsetDecoderICU;
|
||||
import com.ibm.icu.charset.CharsetEncoderICU;
|
||||
import com.ibm.icu.charset.CharsetICU;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
/**
|
||||
* @author Niti Hantaweepant
|
||||
*/
|
||||
public class CharsetUTF32LE extends CharsetICU {
|
||||
/** Substitution bytes handed to the encoders of this charset: FD FF 00 00. */
protected byte[] fromUSubstitution = { (byte)0xfd, (byte)0xff, (byte)0, (byte)0 };

/**
 * Creates the UTF-32LE charset and declares its byte/char ratios.
 *
 * @param icuCanonicalName  ICU name, forwarded to CharsetICU
 * @param javaCanonicalName Java name, forwarded to CharsetICU
 * @param aliases           alias names, forwarded to CharsetICU
 */
public CharsetUTF32LE(String icuCanonicalName, String javaCanonicalName, String[] aliases) {
    super(icuCanonicalName, javaCanonicalName, aliases);
    /* every code point occupies exactly four bytes */
    minBytesPerChar = 4;
    maxBytesPerChar = 4;
    maxCharsPerByte = 1;
}
|
||||
class CharsetDecoderUTF32LE extends CharsetDecoderICU{
|
||||
|
||||
public CharsetDecoderUTF32LE(CharsetICU cs) {
|
||||
super(cs);
|
||||
}
|
||||
|
||||
protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets){
|
||||
CoderResult cr = CoderResult.UNDERFLOW;
|
||||
|
||||
int sourceArrayIndex = source.position();
|
||||
int ch, i;
|
||||
|
||||
try{
|
||||
donefornow:
|
||||
{
|
||||
/* UTF-8 returns here for only non-offset, this needs to change.*/
|
||||
if (toUnicodeStatus != 0 && target.hasRemaining()) {
|
||||
i = toULength; /* restore # of bytes consumed */
|
||||
|
||||
ch = (int)(toUnicodeStatus - 1);/*Stores the previously calculated ch from a previous call*/
|
||||
toUnicodeStatus = 0;
|
||||
|
||||
while (i < 4) {
|
||||
if (sourceArrayIndex < source.limit()) {
|
||||
ch |= (source.get(sourceArrayIndex) & UConverterConstants.UNSIGNED_BYTE_MASK) << (i * 8);
|
||||
toUBytesArray[i++] = (byte) source.get(sourceArrayIndex++);
|
||||
}
|
||||
else {
|
||||
/* stores a partially calculated target*/
|
||||
/* + 1 to make 0 a valid character */
|
||||
toUnicodeStatus = ch + 1;
|
||||
toULength = (byte) i;
|
||||
break donefornow;
|
||||
}
|
||||
}
|
||||
|
||||
if (ch <= UConverterSharedData.MAXIMUM_UTF && !UTF16.isSurrogate((char)ch)) {
|
||||
/* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
|
||||
if (ch <= UConverterSharedData.MAXIMUM_UCS2)
|
||||
{
|
||||
/* fits in 16 bits */
|
||||
target.put((char)ch);
|
||||
}
|
||||
else {
|
||||
/* write out the surrogates */
|
||||
target.put(UTF16.getLeadSurrogate(ch));
|
||||
ch = UTF16.getTrailSurrogate(ch);
|
||||
if (target.hasRemaining()) {
|
||||
target.put((char)ch);
|
||||
}
|
||||
else {
|
||||
/* Put in overflow buffer (not handled here) */
|
||||
charErrorBufferArray[0] = (char) ch;
|
||||
charErrorBufferLength = 1;
|
||||
throw new BufferOverflowException();
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
toULength = (byte)i;
|
||||
cr = CoderResult.malformedForLength(sourceArrayIndex);
|
||||
break donefornow;
|
||||
}
|
||||
}
|
||||
|
||||
while (sourceArrayIndex < source.limit() && target.hasRemaining()) {
|
||||
i = 0;
|
||||
ch = 0;
|
||||
|
||||
while (i < 4) {
|
||||
if (sourceArrayIndex < source.limit()) {
|
||||
ch |= (source.get(sourceArrayIndex) & UConverterConstants.UNSIGNED_BYTE_MASK) << (i * 8);
|
||||
toUBytesArray[i++] = (byte) source.get(sourceArrayIndex++);
|
||||
}
|
||||
else {
|
||||
/* stores a partially calculated target*/
|
||||
/* + 1 to make 0 a valid character */
|
||||
toUnicodeStatus = ch + 1;
|
||||
toULength = (byte) i;
|
||||
break donefornow;
|
||||
}
|
||||
}
|
||||
|
||||
if (ch <= UConverterSharedData.MAXIMUM_UTF && !UTF16.isSurrogate((char)ch)) {
|
||||
/* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
|
||||
if (ch <= UConverterSharedData.MAXIMUM_UCS2)
|
||||
{
|
||||
/* fits in 16 bits */
|
||||
target.put((char) ch);
|
||||
}
|
||||
else {
|
||||
/* write out the surrogates */
|
||||
target.put(UTF16.getLeadSurrogate(ch));
|
||||
ch = UTF16.getTrailSurrogate(ch);
|
||||
if (target.hasRemaining()) {
|
||||
target.put((char)ch);
|
||||
}
|
||||
else {
|
||||
/* Put in overflow buffer (not handled here) */
|
||||
charErrorBufferArray[0] = (char) ch;
|
||||
charErrorBufferLength = 1;
|
||||
throw new BufferOverflowException();
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
toULength = (byte)i;
|
||||
cr = CoderResult.malformedForLength(sourceArrayIndex);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (sourceArrayIndex < source.limit() && !target.hasRemaining()) {
|
||||
/* End of target buffer */
|
||||
cr = CoderResult.OVERFLOW;
|
||||
}
|
||||
|
||||
source.position(sourceArrayIndex);
|
||||
}catch(BufferOverflowException ex){
|
||||
cr = CoderResult.OVERFLOW;
|
||||
}
|
||||
return cr;
|
||||
}
|
||||
}
|
||||
|
||||
class CharsetEncoderUTF32LE extends CharsetEncoderICU{
|
||||
|
||||
public CharsetEncoderUTF32LE(CharsetICU cs) {
|
||||
super(cs, fromUSubstitution);
|
||||
implReset();
|
||||
}
|
||||
|
||||
private final static int NEED_TO_WRITE_BOM = 1;
|
||||
|
||||
protected void implReset() {
|
||||
super.implReset();
|
||||
fromUnicodeStatus = NEED_TO_WRITE_BOM;
|
||||
}
|
||||
|
||||
protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets){
|
||||
CoderResult cr = CoderResult.UNDERFLOW;
|
||||
if(!source.hasRemaining()) {
|
||||
/* no input, nothing to do */
|
||||
return cr;
|
||||
}
|
||||
|
||||
/* write the BOM if necessary */
|
||||
if(fromUnicodeStatus==NEED_TO_WRITE_BOM) {
|
||||
byte[] bom={ (byte)0xff, (byte)0xfe, 0, 0 };
|
||||
cr = fromUWriteBytes(this, bom, 0, bom.length, target, offsets, -1);
|
||||
if(cr.isError()){
|
||||
return cr;
|
||||
}
|
||||
fromUnicodeStatus=0;
|
||||
}
|
||||
|
||||
int ch, ch2;
|
||||
int indexToWrite;
|
||||
byte temp[] = new byte[4];
|
||||
temp[3] = 0;
|
||||
int sourceArrayIndex = source.position();
|
||||
|
||||
try{
|
||||
boolean doloop = true;
|
||||
if (fromUChar32 != 0) {
|
||||
ch = fromUChar32;
|
||||
fromUChar32 = 0;
|
||||
//lowsurogate:
|
||||
if (sourceArrayIndex < source.limit()) {
|
||||
ch2 = source.get(sourceArrayIndex);
|
||||
if (UTF16.isTrailSurrogate((char)ch2)) {
|
||||
ch = ((ch - UConverterSharedData.SURROGATE_HIGH_START) << UConverterSharedData.HALF_SHIFT) + ch2 + UConverterSharedData.SURROGATE_LOW_BASE;
|
||||
sourceArrayIndex++;
|
||||
}
|
||||
else {
|
||||
/* this is an unmatched trail code unit (2nd surrogate) */
|
||||
/* callback(illegal) */
|
||||
fromUChar32 = ch;
|
||||
cr = CoderResult.malformedForLength(sourceArrayIndex);
|
||||
doloop = false;
|
||||
}
|
||||
}
|
||||
else {
|
||||
/* ran out of source */
|
||||
fromUChar32 = ch;
|
||||
if (flush) {
|
||||
/* this is an unmatched trail code unit (2nd surrogate) */
|
||||
/* callback(illegal) */
|
||||
cr = CoderResult.malformedForLength(sourceArrayIndex);
|
||||
}
|
||||
doloop = false;
|
||||
}
|
||||
|
||||
/* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
|
||||
temp[2] = (byte) (ch >>> 16 & 0x1F);
|
||||
temp[1] = (byte) (ch >>> 8); /* unsigned cast implicitly does (ch & FF) */
|
||||
temp[0] = (byte) (ch); /* unsigned cast implicitly does (ch & FF) */
|
||||
|
||||
for (indexToWrite = 0; indexToWrite <= 3; indexToWrite++) {
|
||||
if (target.hasRemaining()) {
|
||||
target.put(temp[indexToWrite]);
|
||||
}
|
||||
else {
|
||||
errorBuffer[errorBufferLength++] = temp[indexToWrite];
|
||||
cr = CoderResult.OVERFLOW;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(doloop) {
|
||||
while (sourceArrayIndex < source.limit() && target.hasRemaining()) {
|
||||
ch = source.get(sourceArrayIndex++);
|
||||
|
||||
if (UTF16.isSurrogate((char)ch)) {
|
||||
if (UTF16.isLeadSurrogate((char)ch)) {
|
||||
//lowsurogate:
|
||||
if (sourceArrayIndex < source.limit()) {
|
||||
ch2 = source.get(sourceArrayIndex);
|
||||
if (UTF16.isTrailSurrogate((char)ch2)) {
|
||||
ch = ((ch - UConverterSharedData.SURROGATE_HIGH_START) << UConverterSharedData.HALF_SHIFT) + ch2 + UConverterSharedData.SURROGATE_LOW_BASE;
|
||||
sourceArrayIndex++;
|
||||
}
|
||||
else {
|
||||
/* this is an unmatched trail code unit (2nd surrogate) */
|
||||
/* callback(illegal) */
|
||||
fromUChar32 = ch;
|
||||
cr = CoderResult.OVERFLOW;
|
||||
break;
|
||||
}
|
||||
}
|
||||
else {
|
||||
/* ran out of source */
|
||||
fromUChar32 = ch;
|
||||
if (flush) {
|
||||
/* this is an unmatched trail code unit (2nd surrogate) */
|
||||
/* callback(illegal) */
|
||||
cr = CoderResult.malformedForLength(sourceArrayIndex);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
else {
|
||||
fromUChar32 = ch;
|
||||
cr = CoderResult.malformedForLength(sourceArrayIndex);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
|
||||
temp[2] = (byte) (ch >>> 16 & 0x1F);
|
||||
temp[1] = (byte) (ch >>> 8); /* unsigned cast implicitly does (ch & FF) */
|
||||
temp[0] = (byte) (ch); /* unsigned cast implicitly does (ch & FF) */
|
||||
|
||||
for (indexToWrite = 0; indexToWrite <= 3; indexToWrite++) {
|
||||
if (target.hasRemaining()) {
|
||||
target.put(temp[indexToWrite]);
|
||||
}
|
||||
else {
|
||||
errorBuffer[errorBufferLength++] = temp[indexToWrite];
|
||||
cr = CoderResult.OVERFLOW;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (sourceArrayIndex < source.limit() && !target.hasRemaining()) {
|
||||
cr = CoderResult.OVERFLOW;
|
||||
}
|
||||
source.position(sourceArrayIndex);
|
||||
|
||||
}catch(BufferOverflowException ex){
|
||||
cr = CoderResult.OVERFLOW;
|
||||
}
|
||||
return cr;
|
||||
}
|
||||
}
|
||||
public CharsetDecoder newDecoder() {
|
||||
return new CharsetDecoderUTF32LE(this);
|
||||
}
|
||||
|
||||
public CharsetEncoder newEncoder() {
|
||||
return new CharsetEncoderUTF32LE(this);
|
||||
}
|
||||
}
|
508
icu4j/src/com/ibm/icu/impl/CharsetUTF8.java
Normal file
508
icu4j/src/com/ibm/icu/impl/CharsetUTF8.java
Normal file
|
@ -0,0 +1,508 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2006, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.icu.impl;
|
||||
|
||||
import java.nio.BufferOverflowException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.CharBuffer;
|
||||
import java.nio.IntBuffer;
|
||||
import java.nio.charset.CharsetDecoder;
|
||||
import java.nio.charset.CharsetEncoder;
|
||||
import java.nio.charset.CoderResult;
|
||||
|
||||
import com.ibm.icu.charset.CharsetDecoderICU;
|
||||
import com.ibm.icu.charset.CharsetEncoderICU;
|
||||
import com.ibm.icu.charset.CharsetICU;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
/**
|
||||
* @author Niti Hantaweepant
|
||||
*/
|
||||
public class CharsetUTF8 extends CharsetICU {
|
||||
/** Substitution bytes handed to the encoders of this charset: EF BF BD. */
protected byte[] fromUSubstitution = { (byte)0xef, (byte)0xbf, (byte)0xbd };

/**
 * Creates the UTF-8 charset and declares its byte/char ratios.
 *
 * @param icuCanonicalName  ICU name, forwarded to CharsetICU
 * @param javaCanonicalName Java name, forwarded to CharsetICU
 * @param aliases           alias names, forwarded to CharsetICU
 */
public CharsetUTF8(String icuCanonicalName, String javaCanonicalName, String[] aliases) {
    super(icuCanonicalName, javaCanonicalName, aliases);
    /* declared range: between one and four bytes per char */
    minBytesPerChar = 1;
    maxBytesPerChar = 4;
    maxCharsPerByte = 1;
}
|
||||
|
||||
/* UTF-8 Conversion DATA
|
||||
* for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9
|
||||
*/
|
||||
private static final long OFFSETS_FROM_UTF8[] = {0,
|
||||
0x00000000L, 0x00003080L, 0x000E2080L,
|
||||
0x03C82080L, 0xFA082080L, 0x82082080L};
|
||||
|
||||
private static final byte BYTES_FROM_UTF8[] =
|
||||
{
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
||||
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
|
||||
};
|
||||
|
||||
/*
|
||||
* Starting with Unicode 3.0.1:
|
||||
* UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N];
|
||||
* byte sequences with more than 4 bytes are illegal in UTF-8,
|
||||
* which is tested with impossible values for them
|
||||
*/
|
||||
private static final long UTF8_MIN_CHAR32[] = { 0L, 0L, 0x80L, 0x800L, 0x10000L, 0xffffffffL, 0xffffffffL };
|
||||
|
||||
class CharsetDecoderUTF8 extends CharsetDecoderICU{
|
||||
|
||||
public CharsetDecoderUTF8(CharsetICU cs) {
|
||||
super(cs);
|
||||
}
|
||||
|
||||
protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets){
|
||||
CoderResult cr = CoderResult.UNDERFLOW;
|
||||
|
||||
int sourceArrayIndex = source.position();
|
||||
|
||||
// Todo: CESU8 implementation
|
||||
// boolean isCESU8 = args.converter.sharedData == _CESU8Data;
|
||||
boolean isCESU8 = (UConverterSharedData._CESU8Data != null);
|
||||
int ch, ch2 = 0;
|
||||
int i, inBytes;
|
||||
|
||||
try{
|
||||
|
||||
donefornow:
|
||||
{
|
||||
if (toUnicodeStatus!=0 && target.hasRemaining())
|
||||
{
|
||||
inBytes = mode; /* restore # of bytes to consume */
|
||||
i = toULength; /* restore # of bytes consumed */
|
||||
|
||||
ch = toUnicodeStatus; /*Stores the previously calculated ch from a previous call*/
|
||||
toUnicodeStatus = 0;
|
||||
|
||||
while (i < inBytes)
|
||||
{
|
||||
if (sourceArrayIndex<source.limit())
|
||||
{
|
||||
toUBytesArray[i] = (byte) (ch2 = source.get(sourceArrayIndex) & UConverterConstants.UNSIGNED_BYTE_MASK);
|
||||
if (!isTrail((byte)ch2))
|
||||
{
|
||||
break; /* i < inBytes */
|
||||
}
|
||||
ch = (ch << 6) + ch2;
|
||||
++sourceArrayIndex;
|
||||
i++;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* stores a partially calculated target*/
|
||||
toUnicodeStatus = ch;
|
||||
mode = inBytes;
|
||||
toULength = (byte) i;
|
||||
break donefornow;
|
||||
}
|
||||
}
|
||||
|
||||
/* Remove the accumulated high bits */
|
||||
ch -= OFFSETS_FROM_UTF8[inBytes];
|
||||
|
||||
/*
|
||||
* Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
|
||||
* - use only trail bytes after a lead byte (checked above)
|
||||
* - use the right number of trail bytes for a given lead byte
|
||||
* - encode a code point <= U+10ffff
|
||||
* - use the fewest possible number of bytes for their code points
|
||||
* - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
|
||||
*
|
||||
* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
|
||||
* There are no irregular sequences any more.
|
||||
* In CESU-8, only surrogates, not supplementary code points, are encoded directly.
|
||||
*/
|
||||
if (i == inBytes && ch <= UConverterSharedData.MAXIMUM_UTF && ch >= UTF8_MIN_CHAR32[i] && (isCESU8 ? i <= 3 : !UTF16.isSurrogate((char)ch)))
|
||||
{
|
||||
/* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
|
||||
toULength = 0;
|
||||
if (ch <= UConverterSharedData.MAXIMUM_UCS2)
|
||||
{
|
||||
/* fits in 16 bits */
|
||||
target.put((char)ch);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* write out the surrogates */
|
||||
ch -= UConverterSharedData.HALF_BASE;
|
||||
target.put((char) ((ch >> UConverterSharedData.HALF_SHIFT) + UConverterSharedData.SURROGATE_HIGH_START));
|
||||
ch = (ch & UConverterSharedData.HALF_MASK) + UConverterSharedData.SURROGATE_LOW_START;
|
||||
if(target.hasRemaining()) {
|
||||
target.put((char)ch);
|
||||
|
||||
} else /* targetCapacity==1 */ {
|
||||
charErrorBufferArray[charErrorBufferBegin+0]=(char)ch;
|
||||
charErrorBufferLength=1;
|
||||
throw new BufferOverflowException();
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
toULength = (byte)i;
|
||||
cr = CoderResult.malformedForLength(sourceArrayIndex);
|
||||
break donefornow;
|
||||
}
|
||||
}
|
||||
|
||||
while (sourceArrayIndex < source.limit() && target.hasRemaining())
|
||||
{
|
||||
ch = source.get(sourceArrayIndex++) & UConverterConstants.UNSIGNED_BYTE_MASK;
|
||||
if (ch < 0x80) /* Simple case */
|
||||
{
|
||||
target.put((char)ch);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* store the first char */
|
||||
toUBytesArray[0] = (byte)ch;
|
||||
inBytes = BYTES_FROM_UTF8[(int)ch]; /* lookup current sequence length */
|
||||
i = 1;
|
||||
|
||||
while (i < inBytes)
|
||||
{
|
||||
if (sourceArrayIndex < source.limit())
|
||||
{
|
||||
toUBytesArray[i] = (byte) (ch2 = source.get(sourceArrayIndex) & UConverterConstants.UNSIGNED_BYTE_MASK);
|
||||
if (!isTrail((byte)ch2))
|
||||
{
|
||||
break; /* i < inBytes */
|
||||
}
|
||||
ch = (ch << 6) + ch2;
|
||||
++sourceArrayIndex;
|
||||
i++;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* stores a partially calculated target*/
|
||||
toUnicodeStatus = ch;
|
||||
mode = inBytes;
|
||||
toULength = (byte) i;
|
||||
break donefornow;
|
||||
}
|
||||
}
|
||||
|
||||
/* Remove the accumulated high bits */
|
||||
ch -= OFFSETS_FROM_UTF8[inBytes];
|
||||
|
||||
/*
|
||||
* Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
|
||||
* - use only trail bytes after a lead byte (checked above)
|
||||
* - use the right number of trail bytes for a given lead byte
|
||||
* - encode a code point <= U+10ffff
|
||||
* - use the fewest possible number of bytes for their code points
|
||||
* - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
|
||||
*
|
||||
* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
|
||||
* There are no irregular sequences any more.
|
||||
* In CESU-8, only surrogates, not supplementary code points, are encoded directly.
|
||||
*/
|
||||
if (i == inBytes && ch <= UConverterSharedData.MAXIMUM_UTF && ch >= UTF8_MIN_CHAR32[i] && (isCESU8 ? i <= 3 : !UTF16.isSurrogate((char)ch)))
|
||||
{
|
||||
/* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
|
||||
toULength = 0;
|
||||
if (ch <= UConverterSharedData.MAXIMUM_UCS2)
|
||||
{
|
||||
/* fits in 16 bits */
|
||||
target.put((char) ch);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* write out the surrogates */
|
||||
ch -= UConverterSharedData.HALF_BASE;
|
||||
target.put((char) ((ch >>> UConverterSharedData.HALF_SHIFT) + UConverterSharedData.SURROGATE_HIGH_START));
|
||||
ch = (ch & UConverterSharedData.HALF_MASK) + UConverterSharedData.SURROGATE_LOW_START;
|
||||
if (target.hasRemaining())
|
||||
{
|
||||
target.put((char)ch);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Put in overflow buffer (not handled here) */
|
||||
charErrorBufferArray[charErrorBufferBegin+0]=(char)ch;
|
||||
charErrorBufferLength=1;
|
||||
throw new BufferOverflowException();
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
toULength = (byte)i;
|
||||
cr = CoderResult.malformedForLength(sourceArrayIndex);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (sourceArrayIndex < source.limit() && !target.hasRemaining())
|
||||
{
|
||||
/* End of target buffer */
|
||||
cr = CoderResult.OVERFLOW;
|
||||
}
|
||||
|
||||
source.position(sourceArrayIndex);
|
||||
|
||||
}catch(BufferOverflowException ex){
|
||||
cr = CoderResult.OVERFLOW;
|
||||
}
|
||||
return cr;
|
||||
}
|
||||
|
||||
}
|
||||
class CharsetEncoderUTF8 extends CharsetEncoderICU{
|
||||
|
||||
public CharsetEncoderUTF8(CharsetICU cs) {
|
||||
super(cs, fromUSubstitution);
|
||||
implReset();
|
||||
}
|
||||
|
||||
protected void implReset() {
|
||||
super.implReset();
|
||||
}
|
||||
|
||||
protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets){
|
||||
CoderResult cr = CoderResult.UNDERFLOW;
|
||||
|
||||
int sourceArrayIndex = source.position();
|
||||
|
||||
// Todo: CESU8 implementation
|
||||
// boolean isCESU8 = args.converter.sharedData == _CESU8Data;
|
||||
boolean isCESU8 = (UConverterSharedData._CESU8Data != null);
|
||||
|
||||
int ch;
|
||||
short indexToWrite;
|
||||
byte temp[] = new byte[4];
|
||||
boolean doloop = true;
|
||||
|
||||
try{
|
||||
|
||||
if (fromUChar32 != 0 && target.hasRemaining())
|
||||
{
|
||||
ch = fromUChar32;
|
||||
fromUChar32 = 0;
|
||||
|
||||
if (sourceArrayIndex < source.limit()) {
|
||||
/* test the following code unit */
|
||||
char trail = source.get(sourceArrayIndex);
|
||||
if(UTF16.isTrailSurrogate(trail)) {
|
||||
++sourceArrayIndex;
|
||||
ch = UTF16.getCodePoint((char)ch, trail);
|
||||
/* convert this supplementary code point */
|
||||
/* exit this condition tree */
|
||||
} else {
|
||||
/* this is an unmatched lead code unit (1st surrogate) */
|
||||
/* callback(illegal) */
|
||||
fromUChar32 = (int)ch;
|
||||
cr = CoderResult.malformedForLength(sourceArrayIndex);
|
||||
doloop = false;
|
||||
}
|
||||
} else {
|
||||
/* no more input */
|
||||
fromUChar32 = (int)ch;
|
||||
doloop = false;
|
||||
}
|
||||
|
||||
if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE)
|
||||
{
|
||||
indexToWrite = 2;
|
||||
temp[2] = (byte) ((ch >>> 12) | 0xe0);
|
||||
}
|
||||
else
|
||||
{
|
||||
indexToWrite = 3;
|
||||
temp[3] = (byte) ((ch >>> 18) | 0xf0);
|
||||
temp[2] = (byte) (((ch >>> 12) & 0x3f) | 0x80);
|
||||
}
|
||||
temp[1] = (byte) (((ch >>> 6) & 0x3f) | 0x80);
|
||||
temp[0] = (byte) ((ch & 0x3f) | 0x80);
|
||||
|
||||
for (; indexToWrite >= 0; indexToWrite--)
|
||||
{
|
||||
if (target.hasRemaining())
|
||||
{
|
||||
target.put(temp[indexToWrite]);
|
||||
}
|
||||
else
|
||||
{
|
||||
errorBuffer[errorBufferLength++] = temp[indexToWrite];
|
||||
cr = CoderResult.OVERFLOW;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(doloop) {
|
||||
while (sourceArrayIndex < source.limit() && target.hasRemaining())
|
||||
{
|
||||
ch = source.get(sourceArrayIndex++);
|
||||
|
||||
if (ch < 0x80) /* Single byte */
|
||||
{
|
||||
target.put((byte)ch);
|
||||
}
|
||||
else if (ch < 0x800) /* Double byte */
|
||||
{
|
||||
target.put((byte) ((ch >>> 6) | 0xc0));
|
||||
if (target.hasRemaining())
|
||||
{
|
||||
target.put((byte) ((ch & 0x3f) | 0x80));
|
||||
}
|
||||
else
|
||||
{
|
||||
errorBuffer[0] = (byte) ((ch & 0x3f) | 0x80);
|
||||
errorBufferLength = 1;
|
||||
throw new BufferOverflowException();
|
||||
}
|
||||
}
|
||||
else
|
||||
/* Check for surrogates */
|
||||
{
|
||||
if(UTF16.isSurrogate((char)ch) && !isCESU8) {
|
||||
if(UTF16.isLeadSurrogate((char)ch)) {
|
||||
|
||||
if (sourceArrayIndex < source.limit()) {
|
||||
/* test the following code unit */
|
||||
char trail = source.get(sourceArrayIndex);
|
||||
if(UTF16.isTrailSurrogate(trail)) {
|
||||
++sourceArrayIndex;
|
||||
ch = UTF16.getCodePoint((char)ch, trail);
|
||||
//ch2 = 0;
|
||||
/* convert this supplementary code point */
|
||||
/* exit this condition tree */
|
||||
}
|
||||
else {
|
||||
/* this is an unmatched lead code unit (1st surrogate) */
|
||||
/* callback(illegal) */
|
||||
fromUChar32 = ch;
|
||||
cr = CoderResult.malformedForLength(sourceArrayIndex);;
|
||||
break;
|
||||
}
|
||||
}
|
||||
else {
|
||||
/* no more input */
|
||||
fromUChar32 = ch;
|
||||
break;
|
||||
}
|
||||
}
|
||||
else {
|
||||
fromUChar32 = (int)ch;
|
||||
cr = CoderResult.malformedForLength(sourceArrayIndex);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE)
|
||||
{
|
||||
indexToWrite = 2;
|
||||
temp[2] = (byte) ((ch >>> 12) | 0xe0);
|
||||
}
|
||||
else
|
||||
{
|
||||
indexToWrite = 3;
|
||||
temp[3] = (byte) ((ch >>> 18) | 0xf0);
|
||||
temp[2] = (byte) (((ch >>> 12) & 0x3f) | 0x80);
|
||||
}
|
||||
temp[1] = (byte) (((ch >>> 6) & 0x3f) | 0x80);
|
||||
temp[0] = (byte) ((ch & 0x3f) | 0x80);
|
||||
|
||||
for (; indexToWrite >= 0; indexToWrite--)
|
||||
{
|
||||
if (target.hasRemaining())
|
||||
{
|
||||
target.put(temp[indexToWrite]);
|
||||
}
|
||||
else
|
||||
{
|
||||
errorBuffer[errorBufferLength++] = temp[indexToWrite];
|
||||
cr = CoderResult.OVERFLOW;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (sourceArrayIndex < source.limit() && !target.hasRemaining())
|
||||
{
|
||||
cr = CoderResult.OVERFLOW;
|
||||
}
|
||||
|
||||
source.position(sourceArrayIndex);
|
||||
|
||||
}catch(BufferOverflowException ex){
|
||||
cr = CoderResult.OVERFLOW;
|
||||
}
|
||||
return cr;
|
||||
}
|
||||
}
|
||||
|
||||
/* single-code point definitions -------------------------------------------- */
|
||||
|
||||
/**
|
||||
* Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)?
|
||||
* @param c 8-bit code unit (byte)
|
||||
* @return TRUE or FALSE
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public static boolean isSingle(byte c) {return (((c)&0x80)==0);}
|
||||
|
||||
/**
|
||||
* Is this code unit (byte) a UTF-8 lead byte?
|
||||
* @param c 8-bit code unit (byte)
|
||||
* @return TRUE or FALSE
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public static boolean isLead(byte c) {return ((((c)-0xc0) & UConverterConstants.UNSIGNED_BYTE_MASK)<0x3e);}
|
||||
|
||||
/**
|
||||
* Is this code unit (byte) a UTF-8 trail byte?
|
||||
* @param c 8-bit code unit (byte)
|
||||
* @return TRUE or FALSE
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public static boolean isTrail(byte c) {return (((c)&0xc0)==0x80);}
|
||||
|
||||
/**
|
||||
* How many code units (bytes) are used for the UTF-8 encoding
|
||||
* of this Unicode code point?
|
||||
* @param c 32-bit code point
|
||||
* @return 1..4, or 0 if c is a surrogate or not a Unicode code point
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public static final int length(int c)
|
||||
{
|
||||
long uc = c & UConverterConstants.UNSIGNED_INT_MASK;
|
||||
return
|
||||
(uc<=0x7f ? 1 :
|
||||
(uc<=0x7ff ? 2 :
|
||||
(uc<=0xd7ff ? 3 :
|
||||
(uc<=0xdfff || uc>0x10ffff ? 0 :
|
||||
(uc<=0xffff ? 3 : 4)
|
||||
)
|
||||
)
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
public CharsetDecoder newDecoder() {
|
||||
return new CharsetDecoderUTF8(this);
|
||||
}
|
||||
|
||||
public CharsetEncoder newEncoder() {
|
||||
return new CharsetEncoderUTF8(this);
|
||||
}
|
||||
}
|
16
icu4j/src/com/ibm/icu/impl/InvalidFormatException.java
Normal file
16
icu4j/src/com/ibm/icu/impl/InvalidFormatException.java
Normal file
|
@ -0,0 +1,16 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2006, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.impl;
|
||||
|
||||
/**
 * Checked exception signalling that data being read does not have the
 * expected format.
 */
public class InvalidFormatException extends Exception {

    /** Creates an exception without a detail message. */
    public InvalidFormatException() {
    }

    /**
     * Creates an exception with a detail message.
     *
     * @param message description of what was wrong with the data
     */
    public InvalidFormatException(String message) {
        super(message);
    }

    /**
     * Creates an exception that wraps the underlying failure, so the original
     * stack trace is not lost when a lower-level error is translated.
     *
     * @param message description of what was wrong with the data
     * @param cause   the exception that triggered this one; may be null
     */
    public InvalidFormatException(String message, Throwable cause) {
        super(message, cause);
    }

    /**
     * Creates an exception that wraps the underlying failure.
     *
     * @param cause the exception that triggered this one; may be null
     */
    public InvalidFormatException(Throwable cause) {
        super(cause);
    }
}
|
789
icu4j/src/com/ibm/icu/impl/UConverterAlias.java
Normal file
789
icu4j/src/com/ibm/icu/impl/UConverterAlias.java
Normal file
|
@ -0,0 +1,789 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2006, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.impl;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.InputStream;
|
||||
import java.nio.ByteBuffer;
|
||||
|
||||
import com.ibm.icu.charset.CharsetICU;
|
||||
|
||||
public final class UConverterAlias {
|
||||
/** The largest value a 32 bit unsigned integer can hold @draft ICU 3.6 */
|
||||
public static final long UINT32_MAX = 4294967295L;
|
||||
|
||||
public static final int AMBIGUOUS_ALIAS_MAP_BIT = 0x8000;
|
||||
|
||||
public static final int CONVERTER_INDEX_MASK = 0xFFF;
|
||||
|
||||
public static final int NUM_RESERVED_TAGS = 2;
|
||||
|
||||
public static final int NUM_HIDDEN_TAGS = 1;
|
||||
|
||||
static int[] gConverterListArray = null;
|
||||
|
||||
static int gConverterListArrayIndex;
|
||||
|
||||
static int[] gTagListArray = null;
|
||||
|
||||
static int gTagListArrayIndex;
|
||||
|
||||
static int[] gAliasListArray = null;
|
||||
|
||||
static int gAliasListArrayIndex;
|
||||
|
||||
static int[] gUntaggedConvArrayArray = null;
|
||||
|
||||
static int gUntaggedConvArrayArrayIndex;
|
||||
|
||||
static int[] gTaggedAliasArrayArray = null;
|
||||
|
||||
static int gTaggedAliasArrayArrayIndex;
|
||||
|
||||
static int[] gTaggedAliasListsArray = null;
|
||||
|
||||
static int gTaggedAliasListsArrayIndex;
|
||||
|
||||
static byte[] gStringTableArray = null;
|
||||
|
||||
static int gStringTableArrayIndex;
|
||||
|
||||
static long gConverterListSize;
|
||||
|
||||
static long gTagListSize;
|
||||
|
||||
static long gAliasListSize;
|
||||
|
||||
static long gUntaggedConvArraySize;
|
||||
|
||||
static long gTaggedAliasArraySize;
|
||||
|
||||
static long gTaggedAliasListsSize;
|
||||
|
||||
static long gStringTableSize;
|
||||
|
||||
static final String GET_STRING(int idx) {
|
||||
return new String(gStringTableArray, 2 * idx, (int) strlen(gStringTableArray, 2 * idx));
|
||||
}
|
||||
|
||||
public static final int strlen(byte[] sArray, int sBegin)
|
||||
{
|
||||
int i = sBegin;
|
||||
while(i < sArray.length && sArray[i++] != 0) {}
|
||||
return i - sBegin - 1;
|
||||
}
|
||||
|
||||
public static final int tocLengthIndex = 0;
|
||||
|
||||
public static final int converterListIndex = 1;
|
||||
|
||||
public static final int tagListIndex = 2;
|
||||
|
||||
public static final int aliasListIndex = 3;
|
||||
|
||||
public static final int untaggedConvArrayIndex = 4;
|
||||
|
||||
public static final int taggedAliasArrayIndex = 5;
|
||||
|
||||
public static final int taggedAliasListsIndex = 6;
|
||||
|
||||
public static final int reservedIndex1 = 7;
|
||||
|
||||
public static final int stringTableIndex = 8;
|
||||
|
||||
public static final int minTocLength = 8; /*
|
||||
* min. tocLength in the file,
|
||||
* does not count the
|
||||
* tocLengthIndex!
|
||||
*/
|
||||
|
||||
public static final int offsetsCount = minTocLength + 1; /*
|
||||
* length of the
|
||||
* swapper's
|
||||
* temporary
|
||||
* offsets[]
|
||||
*/
|
||||
|
||||
static ByteBuffer gAliasData = null;
|
||||
|
||||
private static final boolean isAlias(String alias) {
|
||||
if (alias == null) {
|
||||
throw new IllegalArgumentException("Alias param is null!");
|
||||
} else if (alias.length() == 0) {
|
||||
return false;
|
||||
} else {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
private static final String CNVALIAS_DATA_FILE_NAME = ICUResourceBundle.ICU_BUNDLE + "/cnvalias.icu";
|
||||
|
||||
/**
|
||||
* Default buffer size of datafile
|
||||
*/
|
||||
private static final int CNVALIAS_DATA_BUFFER_SIZE = 25000;
|
||||
|
||||
private static final synchronized boolean haveAliasData()
|
||||
throws IOException{
|
||||
boolean needInit;
|
||||
|
||||
// agljport:todo umtx_lock(NULL);
|
||||
needInit = gAliasData == null;
|
||||
|
||||
/* load converter alias data from file if necessary */
|
||||
if (needInit) {
|
||||
ByteBuffer data = null;
|
||||
long[] tableArray = null;
|
||||
long tableStart;
|
||||
long reservedSize1;
|
||||
byte[] reservedBytes = null;
|
||||
|
||||
// agljport:fix data = udata_openChoice(NULL, DATA_TYPE, DATA_NAME,
|
||||
// isAcceptable, NULL, pErrorCode);
|
||||
// data = udata_openChoice(null, DATA_TYPE, DATA_NAME, 0,
|
||||
// isAcceptable, null, pErrorCode);
|
||||
InputStream i = ICUData.getRequiredStream(CNVALIAS_DATA_FILE_NAME);
|
||||
BufferedInputStream b = new BufferedInputStream(i, CNVALIAS_DATA_BUFFER_SIZE);
|
||||
UConverterAliasDataReader reader = new UConverterAliasDataReader(b);
|
||||
tableArray = reader.readToc(offsetsCount);
|
||||
|
||||
tableStart = tableArray[0];
|
||||
if (tableStart < minTocLength) {
|
||||
throw new IOException("Invalid data format.");
|
||||
}
|
||||
gConverterListSize = tableArray[1];
|
||||
gTagListSize = tableArray[2];
|
||||
gAliasListSize = tableArray[3];
|
||||
gUntaggedConvArraySize = tableArray[4];
|
||||
gTaggedAliasArraySize = tableArray[5];
|
||||
gTaggedAliasListsSize = tableArray[6];
|
||||
reservedSize1 = tableArray[7] * 2;
|
||||
gStringTableSize = tableArray[8] * 2;
|
||||
|
||||
gConverterListArray = new int[(int) gConverterListSize];
|
||||
gTagListArray = new int[(int) gTagListSize];
|
||||
gAliasListArray = new int[(int) gAliasListSize];
|
||||
gUntaggedConvArrayArray = new int[(int) gUntaggedConvArraySize];
|
||||
gTaggedAliasArrayArray = new int[(int) gTaggedAliasArraySize];
|
||||
gTaggedAliasListsArray = new int[(int) gTaggedAliasListsSize];
|
||||
reservedBytes = new byte[(int) reservedSize1];
|
||||
gStringTableArray = new byte[(int) gStringTableSize];
|
||||
|
||||
reader.read(gConverterListArray, gTagListArray,
|
||||
gAliasListArray, gUntaggedConvArrayArray,
|
||||
gTaggedAliasArrayArray, gTaggedAliasListsArray,
|
||||
reservedBytes, gStringTableArray);
|
||||
data = ByteBuffer.allocate(0); // dummy UDataMemory object in absence
|
||||
// of memory mapping
|
||||
|
||||
// agljport:todo umtx_lock(NULL);
|
||||
if (gAliasData == null) {
|
||||
gAliasData = data;
|
||||
data = null;
|
||||
|
||||
// agljport:fix ucln_common_registerCleanup(UCLN_COMMON_IO,
|
||||
// io_cleanup);
|
||||
}
|
||||
// agljport:todo umtx_unlock(NULL);
|
||||
|
||||
/* if a different thread set it first, then close the extra data */
|
||||
if (data != null) {
|
||||
// agljport:fix udata_close(data); /* NULL if it was set
|
||||
// correctly */
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// U_CFUNC const char * io_getConverterName(const char *alias, UErrorCode
|
||||
// *pErrorCode)
|
||||
public static final String io_getConverterName(String alias)
|
||||
throws IOException{
|
||||
if (haveAliasData() && isAlias(alias)) {
|
||||
boolean[] isAmbigous = new boolean[1];
|
||||
long convNum = findConverter(alias, isAmbigous);
|
||||
if (convNum < gConverterListSize) {
|
||||
return GET_STRING(gConverterListArray[(int) convNum]);
|
||||
}
|
||||
/* else converter not found */
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/*
|
||||
* search for an alias return the converter number index for gConverterList
|
||||
*/
|
||||
// static U_INLINE uint32_t findConverter(const char *alias, UErrorCode
|
||||
// *pErrorCode)
|
||||
private static final long findConverter(String alias, boolean[] isAmbigous) {
|
||||
long mid, start, limit;
|
||||
long lastMid;
|
||||
long result;
|
||||
|
||||
/* do a binary search for the alias */
|
||||
start = 0;
|
||||
limit = gUntaggedConvArraySize;
|
||||
mid = limit;
|
||||
lastMid = UINT32_MAX;
|
||||
|
||||
for (;;) {
|
||||
mid = (start + limit) / 2;
|
||||
if (lastMid == mid) { /* Have we moved? */
|
||||
break; /* We haven't moved, and it wasn't found. */
|
||||
}
|
||||
lastMid = mid;
|
||||
result = compareNames(alias, GET_STRING(gAliasListArray[(int) mid]));
|
||||
|
||||
if (result < 0) {
|
||||
limit = mid;
|
||||
} else if (result > 0) {
|
||||
start = mid;
|
||||
} else {
|
||||
/*
|
||||
* Since the gencnval tool folds duplicates into one entry, this
|
||||
* alias in gAliasList is unique, but different standards may
|
||||
* map an alias to different converters.
|
||||
*/
|
||||
if ((gUntaggedConvArrayArray[(int) mid] & AMBIGUOUS_ALIAS_MAP_BIT) != 0) {
|
||||
isAmbigous[0]=true;
|
||||
}
|
||||
return gUntaggedConvArrayArray[(int) mid] & CONVERTER_INDEX_MASK;
|
||||
}
|
||||
}
|
||||
// public static final long UINT32_MAX = 4294967295L;
|
||||
return Long.MAX_VALUE;
|
||||
}
|
||||
|
||||
/**
|
||||
* \var io_stripForCompare Remove the underscores, dashes and spaces from
|
||||
* the name, and convert the name to lower case.
|
||||
*
|
||||
* @param dst
|
||||
* The destination buffer, which is <= the buffer of name.
|
||||
* @param dst
|
||||
* The destination buffer, which is <= the buffer of name.
|
||||
* @return the destination buffer.
|
||||
*/
|
||||
public static final StringBuffer io_stripForCompare(StringBuffer dst, String name) {
|
||||
return io_stripASCIIForCompare(dst, name);
|
||||
}
|
||||
|
||||
/* @see compareNames */
|
||||
private static final StringBuffer io_stripASCIIForCompare(StringBuffer dst, String name) {
|
||||
name = name.concat("\000");
|
||||
int nameIndex = 0;
|
||||
char c1 = name.charAt(0);
|
||||
int dstItr = 0;
|
||||
|
||||
while (c1 != 0) {
|
||||
/* Ignore delimiters '-', '_', and ' ' */
|
||||
while ((c1 = name.charAt(nameIndex)) == 0x2d || c1 == 0x5f
|
||||
|| c1 == 0x20) {
|
||||
++nameIndex;
|
||||
}
|
||||
|
||||
/* lowercase for case-insensitive comparison */
|
||||
dst.append(Character.toLowerCase(c1));
|
||||
++dstItr;
|
||||
++nameIndex;
|
||||
}
|
||||
if (dst.length() > 0)
|
||||
dst.deleteCharAt(dst.length() - 1);
|
||||
return dst;
|
||||
}
|
||||
|
||||
/**
|
||||
* Do a fuzzy compare of a two converter/alias names. The comparison is
|
||||
* case-insensitive. It also ignores the characters '-', '_', and ' ' (dash,
|
||||
* underscore, and space). Thus the strings "UTF-8", "utf_8", and "Utf 8"
|
||||
* are exactly equivalent.
|
||||
*
|
||||
* This is a symmetrical (commutative) operation; order of arguments is
|
||||
* insignificant. This is an important property for sorting the list (when
|
||||
* the list is preprocessed into binary form) and for performing binary
|
||||
* searches on it at run time.
|
||||
*
|
||||
* @param name1
|
||||
* a converter name or alias, zero-terminated
|
||||
* @param name2
|
||||
* a converter name or alias, zero-terminated
|
||||
* @return 0 if the names match, or a negative value if the name1 lexically
|
||||
* precedes name2, or a positive value if the name1 lexically
|
||||
* follows name2.
|
||||
*
|
||||
* @see io_stripForCompare
|
||||
*/
|
||||
public static int compareNames(String name1, String name2){
|
||||
int result = 0;
|
||||
int i1 = 0;
|
||||
int i2 = 0;
|
||||
while (true) {
|
||||
char ch1 = 0;
|
||||
char ch2 = 0;
|
||||
// Ignore delimiters '-', '_', and ASCII White_Space
|
||||
if (i1 < name1.length()) {
|
||||
ch1 = name1.charAt(i1 ++);
|
||||
}
|
||||
while (ch1 == '-' || ch1 == '_' || ch1 == ' ' ) {
|
||||
if (i1 < name1.length()) {
|
||||
ch1 = name1.charAt(i1 ++);
|
||||
}
|
||||
else {
|
||||
ch1 = 0;
|
||||
}
|
||||
}
|
||||
if (i2 < name2.length()) {
|
||||
ch2 = name2.charAt(i2 ++);
|
||||
}
|
||||
while (ch2 == '-' || ch2 == '_' || ch2 == ' ' ) {
|
||||
if (i2 < name2.length()) {
|
||||
ch2 = name2.charAt(i2 ++);
|
||||
}
|
||||
else {
|
||||
ch2 = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// If we reach the ends of both strings then they match
|
||||
if (ch1 == 0 && ch2 == 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Case-insensitive comparison
|
||||
if (ch1 != ch2) {
|
||||
result = Character.toLowerCase(ch1)- Character.toLowerCase(ch2);
|
||||
if (result != 0) {
|
||||
return result;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static int io_countAliases(String alias)
|
||||
throws IOException{
|
||||
if (haveAliasData() && isAlias(alias)) {
|
||||
boolean[] isAmbigous = new boolean[1];
|
||||
long convNum = findConverter(alias, isAmbigous);
|
||||
if (convNum < gConverterListSize) {
|
||||
/* tagListNum - 1 is the ALL tag */
|
||||
int listOffset = gTaggedAliasArrayArray[(int) ((gTagListSize - 1)
|
||||
* gConverterListSize + convNum)];
|
||||
|
||||
if (listOffset != 0) {
|
||||
return gTaggedAliasListsArray[listOffset];
|
||||
}
|
||||
/* else this shouldn't happen. internal program error */
|
||||
}
|
||||
/* else converter not found */
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the number of all aliases (and converter names).
|
||||
*
|
||||
* @param pErrorCode
|
||||
* The error code
|
||||
* @return the number of all aliases
|
||||
*/
|
||||
// U_CFUNC uint16_t io_countTotalAliases(UErrorCode *pErrorCode);
|
||||
public static int io_countTotalAliases() throws IOException{
|
||||
if (haveAliasData()) {
|
||||
return (int) gAliasListSize;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// U_CFUNC const char * io_getAlias(const char *alias, uint16_t n,
|
||||
// UErrorCode *pErrorCode)
|
||||
public static String io_getAlias(String alias, int n) throws IOException{
|
||||
if (haveAliasData() && isAlias(alias)) {
|
||||
boolean[] isAmbigous = new boolean[1];
|
||||
long convNum = findConverter(alias,isAmbigous);
|
||||
if (convNum < gConverterListSize) {
|
||||
/* tagListNum - 1 is the ALL tag */
|
||||
int listOffset = gTaggedAliasArrayArray[(int) ((gTagListSize - 1)
|
||||
* gConverterListSize + convNum)];
|
||||
|
||||
if (listOffset != 0) {
|
||||
//long listCount = gTaggedAliasListsArray[listOffset];
|
||||
/* +1 to skip listCount */
|
||||
int[] currListArray = gTaggedAliasListsArray;
|
||||
int currListArrayIndex = listOffset + 1;
|
||||
|
||||
return GET_STRING(currListArray[currListArrayIndex + n]);
|
||||
|
||||
}
|
||||
/* else this shouldn't happen. internal program error */
|
||||
}
|
||||
/* else converter not found */
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
// U_CFUNC uint16_t io_countStandards(UErrorCode *pErrorCode) {
|
||||
public static int io_countStandards() throws IOException{
|
||||
if (haveAliasData()) {
|
||||
return (int) (gTagListSize - NUM_HIDDEN_TAGS);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// U_CAPI const char * U_EXPORT2getStandard(uint16_t n, UErrorCode
|
||||
// *pErrorCode)
|
||||
public static String getStandard(int n) throws IOException{
|
||||
if (haveAliasData()) {
|
||||
return GET_STRING(gTagListArray[n]);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
// U_CAPI const char * U_EXPORT2 getStandardName(const char *alias, const
|
||||
// char *standard, UErrorCode *pErrorCode)
|
||||
public static final String getStandardName(String alias, String standard)throws IOException {
|
||||
if (haveAliasData() && isAlias(alias)) {
|
||||
long listOffset = findTaggedAliasListsOffset(alias, standard);
|
||||
|
||||
if (0 < listOffset && listOffset < gTaggedAliasListsSize) {
|
||||
int[] currListArray = gTaggedAliasListsArray;
|
||||
long currListArrayIndex = listOffset + 1;
|
||||
if (currListArray[0] != 0) {
|
||||
return GET_STRING(currListArray[(int) currListArrayIndex]);
|
||||
}
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
// U_CAPI uint16_t U_EXPORT2 countAliases(const char *alias, UErrorCode
|
||||
// *pErrorCode)
|
||||
public static int countAliases(String alias) throws IOException{
|
||||
return io_countAliases(alias);
|
||||
}
|
||||
|
||||
// U_CAPI const char* U_EXPORT2 getAlias(const char *alias, uint16_t n,
|
||||
// UErrorCode *pErrorCode)
|
||||
public static String getAlias(String alias, int n) throws IOException{
|
||||
return io_getAlias(alias, n);
|
||||
}
|
||||
|
||||
// U_CFUNC uint16_t countStandards(void)
|
||||
public static int countStandards()throws IOException{
|
||||
return io_countStandards();
|
||||
}
|
||||
|
||||
/*returns a single Name from the list, will return NULL if out of bounds
|
||||
*/
|
||||
public static String getAvailableName (int n){
|
||||
try{
|
||||
if (0 <= n && n <= 0xffff) {
|
||||
String name = bld_getAvailableConverter(n);
|
||||
return name;
|
||||
}
|
||||
}catch(IOException ex){
|
||||
//throw away exception
|
||||
}
|
||||
return null;
|
||||
}
|
||||
// U_CAPI const char * U_EXPORT2 getCanonicalName(const char *alias, const
|
||||
// char *standard, UErrorCode *pErrorCode) {
|
||||
public static String getCanonicalName(String alias, String standard) throws IOException{
|
||||
if (haveAliasData() && isAlias(alias)) {
|
||||
long convNum = findTaggedConverterNum(alias, standard);
|
||||
|
||||
if (convNum < gConverterListSize) {
|
||||
return GET_STRING(gConverterListArray[(int) convNum]);
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
public static int countAvailable (){
|
||||
try{
|
||||
return bld_countAvailableConverters();
|
||||
}catch(IOException ex){
|
||||
//throw away exception
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
// U_CAPI UEnumeration * U_EXPORT2 openStandardNames(const char *convName,
|
||||
// const char *standard, UErrorCode *pErrorCode)
|
||||
public static final UConverterAliasesEnumeration openStandardNames(String convName, String standard)throws IOException {
|
||||
UConverterAliasesEnumeration aliasEnum = null;
|
||||
if (haveAliasData() && isAlias(convName)) {
|
||||
long listOffset = findTaggedAliasListsOffset(convName, standard);
|
||||
|
||||
/*
|
||||
* When listOffset == 0, we want to acknowledge that the converter
|
||||
* name and standard are okay, but there is nothing to enumerate.
|
||||
*/
|
||||
if (listOffset < gTaggedAliasListsSize) {
|
||||
|
||||
UConverterAliasesEnumeration.UAliasContext context = new UConverterAliasesEnumeration.UAliasContext(listOffset, 0);
|
||||
aliasEnum = new UConverterAliasesEnumeration();
|
||||
aliasEnum.setContext(context);
|
||||
}
|
||||
/* else converter or tag not found */
|
||||
}
|
||||
return aliasEnum;
|
||||
}
|
||||
|
||||
// static uint32_t getTagNumber(const char *tagname)
|
||||
private static long getTagNumber(String tagName) {
|
||||
if (gTagListArray != null) {
|
||||
long tagNum;
|
||||
for (tagNum = 0; tagNum < gTagListSize; tagNum++) {
|
||||
if (tagName.equals(GET_STRING(gTagListArray[(int) tagNum]))) {
|
||||
return tagNum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return UINT32_MAX;
|
||||
}
|
||||
|
||||
// static uint32_t findTaggedAliasListsOffset(const char *alias, const char
|
||||
// *standard, UErrorCode *pErrorCode)
|
||||
private static long findTaggedAliasListsOffset(String alias, String standard) {
|
||||
long idx;
|
||||
long listOffset;
|
||||
long convNum;
|
||||
long tagNum = getTagNumber(standard);
|
||||
boolean[] isAmbigous = new boolean[1];
|
||||
/* Make a quick guess. Hopefully they used a TR22 canonical alias. */
|
||||
convNum = findConverter(alias, isAmbigous);
|
||||
|
||||
if (tagNum < (gTagListSize - NUM_HIDDEN_TAGS)
|
||||
&& convNum < gConverterListSize) {
|
||||
listOffset = gTaggedAliasArrayArray[(int) (tagNum
|
||||
* gConverterListSize + convNum)];
|
||||
if (listOffset != 0
|
||||
&& gTaggedAliasListsArray[(int) listOffset + 1] != 0) {
|
||||
return listOffset;
|
||||
}
|
||||
if (isAmbigous[0]==true) {
|
||||
/*
|
||||
* Uh Oh! They used an ambiguous alias. We have to search the
|
||||
* whole swiss cheese starting at the highest standard affinity.
|
||||
* This may take a while.
|
||||
*/
|
||||
|
||||
for (idx = 0; idx < gTaggedAliasArraySize; idx++) {
|
||||
listOffset = gTaggedAliasArrayArray[(int) idx];
|
||||
if (listOffset != 0 && isAliasInList(alias, listOffset)) {
|
||||
long currTagNum = idx / gConverterListSize;
|
||||
long currConvNum = (idx - currTagNum
|
||||
* gConverterListSize);
|
||||
long tempListOffset = gTaggedAliasArrayArray[(int) (tagNum
|
||||
* gConverterListSize + currConvNum)];
|
||||
if (tempListOffset != 0
|
||||
&& gTaggedAliasListsArray[(int) tempListOffset + 1] != 0) {
|
||||
return tempListOffset;
|
||||
}
|
||||
/*
|
||||
* else keep on looking We could speed this up by
|
||||
* starting on the next row because an alias is unique
|
||||
* per row, right now. This would change if alias
|
||||
* versioning appears.
|
||||
*/
|
||||
}
|
||||
}
|
||||
/* The standard doesn't know about the alias */
|
||||
}
|
||||
/* else no default name */
|
||||
return 0;
|
||||
}
|
||||
/* else converter or tag not found */
|
||||
|
||||
return UINT32_MAX;
|
||||
}
|
||||
|
||||
/* Return the canonical name */
|
||||
// static uint32_t findTaggedConverterNum(const char *alias, const char
|
||||
// *standard, UErrorCode *pErrorCode)
|
||||
private static long findTaggedConverterNum(String alias, String standard) {
|
||||
long idx;
|
||||
long listOffset;
|
||||
long convNum;
|
||||
long tagNum = getTagNumber(standard);
|
||||
boolean[] isAmbigous = new boolean[1];
|
||||
|
||||
/* Make a quick guess. Hopefully they used a TR22 canonical alias. */
|
||||
convNum = findConverter(alias, isAmbigous);
|
||||
|
||||
if (tagNum < (gTagListSize - NUM_HIDDEN_TAGS)
|
||||
&& convNum < gConverterListSize) {
|
||||
listOffset = gTaggedAliasArrayArray[(int) (tagNum
|
||||
* gConverterListSize + convNum)];
|
||||
if (listOffset != 0 && isAliasInList(alias, listOffset)) {
|
||||
return convNum;
|
||||
}
|
||||
if (isAmbigous[0] == true) {
|
||||
/*
|
||||
* Uh Oh! They used an ambiguous alias. We have to search one
|
||||
* slice of the swiss cheese. We search only in the requested
|
||||
* tag, not the whole thing. This may take a while.
|
||||
*/
|
||||
long convStart = (tagNum) * gConverterListSize;
|
||||
long convLimit = (tagNum + 1) * gConverterListSize;
|
||||
for (idx = convStart; idx < convLimit; idx++) {
|
||||
listOffset = gTaggedAliasArrayArray[(int) idx];
|
||||
if (listOffset != 0 && isAliasInList(alias, listOffset)) {
|
||||
return idx - convStart;
|
||||
}
|
||||
}
|
||||
/* The standard doesn't know about the alias */
|
||||
}
|
||||
/* else no canonical name */
|
||||
}
|
||||
/* else converter or tag not found */
|
||||
|
||||
return UINT32_MAX;
|
||||
}
|
||||
|
||||
// static U_INLINE UBool isAliasInList(const char *alias, uint32_t
|
||||
// listOffset)
|
||||
private static boolean isAliasInList(String alias, long listOffset) {
|
||||
if (listOffset != 0) {
|
||||
long currAlias;
|
||||
long listCount = gTaggedAliasListsArray[(int) listOffset];
|
||||
/* +1 to skip listCount */
|
||||
int[] currList = gTaggedAliasListsArray;
|
||||
long currListArrayIndex = listOffset + 1;
|
||||
for (currAlias = 0; currAlias < listCount; currAlias++) {
|
||||
if (currList[(int) (currAlias + currListArrayIndex)] != 0
|
||||
&& compareNames(
|
||||
alias,
|
||||
GET_STRING(currList[(int) (currAlias + currListArrayIndex)])) == 0) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// begin bld.c
|
||||
static String[] gAvailableConverters = null;
|
||||
|
||||
static int gAvailableConverterCount = 0;
|
||||
|
||||
static byte[] gDefaultConverterNameBuffer; // [MAX_CONVERTER_NAME_LENGTH +
|
||||
// 1]; /* +1 for NULL */
|
||||
|
||||
static String gDefaultConverterName = null;
|
||||
|
||||
// static UBool haveAvailableConverterList(UErrorCode *pErrorCode)
|
||||
static boolean haveAvailableConverterList() throws IOException{
|
||||
if (gAvailableConverters == null) {
|
||||
int idx;
|
||||
int localConverterCount;
|
||||
String converterName;
|
||||
String[] localConverterList;
|
||||
|
||||
if (!haveAliasData()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/* We can't have more than "*converterTable" converters to open */
|
||||
localConverterList = new String[(int) gConverterListSize];
|
||||
|
||||
localConverterCount = 0;
|
||||
|
||||
for (idx = 0; idx < gConverterListSize; idx++) {
|
||||
converterName = GET_STRING(gConverterListArray[idx]);
|
||||
//UConverter cnv = UConverter.open(converterName);
|
||||
//TODO: Fix me
|
||||
localConverterList[localConverterCount++] = converterName;
|
||||
|
||||
}
|
||||
|
||||
// agljport:todo umtx_lock(NULL);
|
||||
if (gAvailableConverters == null) {
|
||||
gAvailableConverters = localConverterList;
|
||||
gAvailableConverterCount = localConverterCount;
|
||||
/* haveData should have already registered the cleanup function */
|
||||
} else {
|
||||
// agljport:todo free((char **)localConverterList);
|
||||
}
|
||||
// agljport:todo umtx_unlock(NULL);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// U_CFUNC uint16_t bld_countAvailableConverters(UErrorCode *pErrorCode)
|
||||
public static int bld_countAvailableConverters() throws IOException{
|
||||
if (haveAvailableConverterList()) {
|
||||
return gAvailableConverterCount;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// U_CFUNC const char * bld_getAvailableConverter(uint16_t n, UErrorCode
|
||||
// *pErrorCode)
|
||||
public static String bld_getAvailableConverter(int n) throws IOException{
|
||||
if (haveAvailableConverterList()) {
|
||||
if (n < gAvailableConverterCount) {
|
||||
return gAvailableConverters[n];
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/* default converter name --------------------------------------------------- */
|
||||
|
||||
/*
|
||||
* In order to be really thread-safe, the get function would have to take
|
||||
* a buffer parameter and copy the current string inside a mutex block.
|
||||
* This implementation only tries to be really thread-safe while
|
||||
* setting the name.
|
||||
* It assumes that setting a pointer is atomic.
|
||||
*/
|
||||
|
||||
// U_CFUNC const char * getDefaultName()
|
||||
public static final synchronized String getDefaultName() {
|
||||
/* local variable to be thread-safe */
|
||||
String name;
|
||||
|
||||
//agljport:todo umtx_lock(null);
|
||||
name = gDefaultConverterName;
|
||||
//agljport:todo umtx_unlock(null);
|
||||
|
||||
if (name == null) {
|
||||
//UConverter cnv = null;
|
||||
long length = 0;
|
||||
|
||||
name = CharsetICU.getDefaultCharsetName();
|
||||
|
||||
/* if the name is there, test it out and get the canonical name with options */
|
||||
if (name != null) {
|
||||
// cnv = UConverter.open(name);
|
||||
// name = cnv.getName(cnv);
|
||||
// TODO: fix me
|
||||
}
|
||||
|
||||
if (name == null || name.length() == 0 ||/* cnv == null ||*/
|
||||
length >= gDefaultConverterNameBuffer.length) {
|
||||
/* Panic time, let's use a fallback. */
|
||||
name = new String("US-ASCII");
|
||||
}
|
||||
|
||||
//length=(int32_t)(strlen(name));
|
||||
|
||||
/* Copy the name before we close the converter. */
|
||||
name = gDefaultConverterName;
|
||||
}
|
||||
|
||||
return name;
|
||||
}
|
||||
|
||||
//end bld.c
|
||||
}
|
218
icu4j/src/com/ibm/icu/impl/UConverterAliasDataReader.java
Normal file
218
icu4j/src/com/ibm/icu/impl/UConverterAliasDataReader.java
Normal file
|
@ -0,0 +1,218 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2006, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.icu.impl;
|
||||
import java.io.*;
|
||||
import com.ibm.icu.impl.ICUDebug;
|
||||
|
||||
/* Format of cnvalias.icu -----------------------------------------------------
|
||||
*
|
||||
* cnvalias.icu is a binary, memory-mappable form of convrtrs.txt.
|
||||
* This binary form contains several tables. All indexes are to uint16_t
|
||||
* units, and not to the bytes (uint8_t units). Addressing everything on
|
||||
* 16-bit boundaries allows us to store more information with small index
|
||||
* numbers, which are also 16-bit in size. The majority of the table (except
|
||||
* the string table) are 16-bit numbers.
|
||||
*
|
||||
* First there is the size of the Table of Contents (TOC). The TOC
|
||||
* entries contain the size of each section. In order to find the offset
|
||||
* you just need to sum up the previous offsets.
|
||||
* The TOC length and entries are an array of uint32_t values.
|
||||
* The first section after the TOC starts immediately after the TOC.
|
||||
*
|
||||
* 1) This section contains a list of converters. This list contains indexes
|
||||
* into the string table for the converter name. The index of this list is
|
||||
* also used by other sections, which are mentioned later on.
|
||||
* This list is not sorted.
|
||||
*
|
||||
* 2) This section contains a list of tags. This list contains indexes
|
||||
* into the string table for the tag name. The index of this list is
|
||||
* also used by other sections, which are mentioned later on.
|
||||
* This list is in priority order of standards.
|
||||
*
|
||||
* 3) This section contains a list of sorted unique aliases. This
|
||||
* list contains indexes into the string table for the alias name. The
|
||||
* index of this list is also used by other sections, like the 4th section.
|
||||
* The index for the 3rd and 4th section is used to get the
|
||||
* alias -> converter name mapping. Section 3 and 4 form a two column table.
|
||||
*
|
||||
* 4) This section contains a list of mapped converter names. Consider this
|
||||
* as a table that maps the 3rd section to the 1st section. This list contains
|
||||
* indexes into the 1st section. The index of this list is the same index in
|
||||
* the 3rd section. There is also some extra information in the high bits of
|
||||
* each converter index in this table. Currently it's only used to say that
|
||||
* an alias mapped to this converter is ambiguous. See UCNV_CONVERTER_INDEX_MASK
|
||||
* and UCNV_AMBIGUOUS_ALIAS_MAP_BIT for more information. This section is
|
||||
* the predigested form of the 5th section so that an alias lookup can be fast.
|
||||
*
|
||||
* 5) This section contains a 2D array with indexes to the 6th section. This
|
||||
* section is the full form of all alias mappings. The column index is the
|
||||
* index into the converter list (column header). The row index is the index
|
||||
* to tag list (row header). This 2D array is the top part of a 3D array. The
|
||||
* third dimension is in the 6th section.
|
||||
*
|
||||
* 6) This is a blob of variable-length arrays. Each array starts with a size,
|
||||
* and is followed by indexes to alias names in the string table. This is
|
||||
* the third dimension to the section 5. No other section should be referencing
|
||||
* this section.
|
||||
*
|
||||
* 7) Reserved at this time (There is no information). This _usually_ has a
|
||||
* size of 0. Future versions may add more information here.
|
||||
*
|
||||
* 8) This is the string table. All strings are indexed on an even address.
|
||||
* There are two reasons for this. First many chip architectures locate strings
|
||||
* faster on even address boundaries. Second, since all indexes are 16-bit
|
||||
* numbers, this string table can be 128KB in size instead of 64KB when we
|
||||
* only have strings starting on an even address.
|
||||
*
|
||||
*
|
||||
* Here is the concept of section 5 and 6. It's a 3D cube. Each tag
|
||||
* has a unique alias among all converters. That same alias can
|
||||
* be mentioned in other standards on different converters,
|
||||
* but only one alias per tag can be unique.
|
||||
*
|
||||
*
|
||||
* Converter Names (Usually in TR22 form)
|
||||
* -------------------------------------------.
|
||||
* T / /|
|
||||
* a / / |
|
||||
* g / / |
|
||||
* s / / |
|
||||
* / / |
|
||||
* ------------------------------------------/ |
|
||||
* A | | |
|
||||
* l | | |
|
||||
* i | | /
|
||||
* a | | /
|
||||
* s | | /
|
||||
* e | | /
|
||||
* s | |/
|
||||
* -------------------------------------------
|
||||
*
|
||||
*
|
||||
*
|
||||
* Here is what it really looks like. It's like swiss cheese.
|
||||
* There are holes. Some converters aren't recognized by
|
||||
* a standard, or they are really old converters that the
|
||||
* standard doesn't recognize anymore.
|
||||
*
|
||||
* Converter Names (Usually in TR22 form)
|
||||
* -------------------------------------------.
|
||||
* T /##########################################/|
|
||||
* a / # # /#
|
||||
* g / # ## ## ### # ### ### ### #/
|
||||
* s / # ##### #### ## ## #/#
|
||||
* / ### # # ## # # # ### # # #/##
|
||||
* ------------------------------------------/# #
|
||||
* A |### # # ## # # # ### # # #|# #
|
||||
* l |# # # # # ## # #|# #
|
||||
* i |# # # # # # #|#
|
||||
* a |# #|#
|
||||
* s | #|#
|
||||
* e
|
||||
* s
|
||||
*
|
||||
*/
|
||||
|
||||
final class UConverterAliasDataReader implements ICUBinary.Authenticate {
|
||||
private final static boolean debug = ICUDebug.enabled("UConverterAliasDataReader");
|
||||
|
||||
/**
|
||||
* <p>Protected constructor.</p>
|
||||
* @param inputStream ICU uprop.dat file input stream
|
||||
* @exception IOException throw if data file fails authentication
|
||||
* @draft 2.1
|
||||
*/
|
||||
protected UConverterAliasDataReader(InputStream inputStream)
|
||||
throws IOException{
|
||||
if(debug) System.out.println("Bytes in inputStream " + inputStream.available());
|
||||
|
||||
unicodeVersion = ICUBinary.readHeader(inputStream, DATA_FORMAT_ID, this);
|
||||
|
||||
if(debug) System.out.println("Bytes left in inputStream " +inputStream.available());
|
||||
|
||||
dataInputStream = new DataInputStream(inputStream);
|
||||
|
||||
if(debug) System.out.println("Bytes left in dataInputStream " +dataInputStream.available());
|
||||
}
|
||||
|
||||
// protected methods -------------------------------------------------
|
||||
|
||||
protected long[] readToc(int n)throws IOException
|
||||
{
|
||||
long[] toc = new long[n];
|
||||
//Read the toc
|
||||
for (int i = 0; i < n ; ++i) {
|
||||
toc[i] = dataInputStream.readInt() & UNSIGNED_INT_MASK;
|
||||
}
|
||||
return toc;
|
||||
}
|
||||
|
||||
protected void read(int[] convList, int[] tagList, int[] aliasList, int[]untaggedConvArray, int[] taggedAliasArray, int[] taggedAliasLists, byte[] reservedBytes, byte[] stringTable) throws IOException{
|
||||
int i;
|
||||
//int listnum = 1;
|
||||
//long listsize;
|
||||
|
||||
for(i = 0; i < convList.length; ++i)
|
||||
convList[i] = dataInputStream.readUnsignedShort();
|
||||
|
||||
for(i = 0; i < tagList.length; ++i)
|
||||
tagList[i] = dataInputStream.readUnsignedShort();
|
||||
|
||||
for(i = 0; i < aliasList.length; ++i)
|
||||
aliasList[i] = dataInputStream.readUnsignedShort();
|
||||
|
||||
for(i = 0; i < untaggedConvArray.length; ++i)
|
||||
untaggedConvArray[i] = dataInputStream.readUnsignedShort();
|
||||
|
||||
for(i = 0; i < taggedAliasArray.length; ++i)
|
||||
taggedAliasArray[i] = dataInputStream.readUnsignedShort();
|
||||
|
||||
for(i = 0; i < taggedAliasLists.length; ++i)
|
||||
taggedAliasLists[i] = dataInputStream.readUnsignedShort();
|
||||
|
||||
dataInputStream.read(reservedBytes);
|
||||
dataInputStream.read(stringTable);
|
||||
}
|
||||
|
||||
public byte[] getDataFormatVersion(){
|
||||
return DATA_FORMAT_VERSION;
|
||||
}
|
||||
|
||||
public boolean isDataVersionAcceptable(byte version[])
|
||||
{
|
||||
return version[0] == DATA_FORMAT_VERSION[0];
|
||||
}
|
||||
|
||||
public byte[] getUnicodeVersion(){
|
||||
return unicodeVersion;
|
||||
}
|
||||
// private data members -------------------------------------------------
|
||||
|
||||
|
||||
/**
|
||||
* ICU data file input stream
|
||||
*/
|
||||
private DataInputStream dataInputStream;
|
||||
|
||||
private byte[] unicodeVersion;
|
||||
|
||||
/**
|
||||
* File format version that this class understands.
|
||||
* No guarantees are made if a older version is used
|
||||
* see store.c of gennorm for more information and values
|
||||
*/
|
||||
// DATA_FORMAT_ID_ values taken from icu4c isAcceptable (ucnv_io.c)
|
||||
private static final byte DATA_FORMAT_ID[] = {(byte)0x43, (byte)0x76, (byte)0x41, (byte)0x6c}; // dataFormat="CvAl"
|
||||
private static final byte DATA_FORMAT_VERSION[] = {(byte)0x3};
|
||||
|
||||
//private static final int UNSIGNED_SHORT_MASK = 0xffff;
|
||||
private static final long UNSIGNED_INT_MASK = 0xffffffffL;
|
||||
|
||||
}
|
83
icu4j/src/com/ibm/icu/impl/UConverterAliasesEnumeration.java
Normal file
83
icu4j/src/com/ibm/icu/impl/UConverterAliasesEnumeration.java
Normal file
|
@ -0,0 +1,83 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2006, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.impl;
|
||||
|
||||
import java.util.Enumeration;
|
||||
|
||||
|
||||
/**
|
||||
* Enumeration for Converter Aliases
|
||||
*/
|
||||
|
||||
public class UConverterAliasesEnumeration implements Enumeration {
|
||||
|
||||
private UAliasContext context;
|
||||
|
||||
/* Set alias context
|
||||
*/
|
||||
public void setContext(UAliasContext context){
|
||||
this.context = context;
|
||||
}
|
||||
|
||||
public int count() {
|
||||
int value = 0;
|
||||
|
||||
if (context.listOffset!=0) {
|
||||
value = UConverterAlias.gTaggedAliasListsArray[(int)context.listOffset];
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
public Object nextElement() {
|
||||
|
||||
if (context.listOffset!=0) {
|
||||
long listCount = UConverterAlias.gTaggedAliasListsArray[(int)context.listOffset];
|
||||
int[] currListArray = UConverterAlias.gTaggedAliasListsArray;
|
||||
long currListArrayIndex = context.getListOffset() + 1;
|
||||
|
||||
if (context.getListIdx() < listCount) {
|
||||
String str = UConverterAlias.GET_STRING(currListArray[(int)(context.listIdx+currListArrayIndex)]);
|
||||
context.listIdx++;
|
||||
return str;
|
||||
}
|
||||
}
|
||||
/* Either we accessed a zero length list, or we enumerated too far. */
|
||||
throw new IndexOutOfBoundsException();
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
context.listIdx = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Class to store context for alias
|
||||
*/
|
||||
public static class UAliasContext{
|
||||
private long listOffset;
|
||||
private long listIdx;
|
||||
|
||||
public UAliasContext(long listOffset, long listIdx){
|
||||
this.listOffset = listOffset;
|
||||
this.listIdx = listIdx;
|
||||
}
|
||||
|
||||
public long getListOffset(){
|
||||
return listOffset;
|
||||
}
|
||||
|
||||
public long getListIdx(){
|
||||
return listIdx;
|
||||
}
|
||||
}
|
||||
|
||||
public boolean hasMoreElements() {
|
||||
long listCount = UConverterAlias.gTaggedAliasListsArray[(int)context.listOffset];
|
||||
return (context.getListIdx() < listCount);
|
||||
}
|
||||
}
|
156
icu4j/src/com/ibm/icu/impl/UConverterConstants.java
Normal file
156
icu4j/src/com/ibm/icu/impl/UConverterConstants.java
Normal file
|
@ -0,0 +1,156 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2006, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.impl;
|
||||
|
||||
/**
 * Shared numeric and byte constants for the converter implementation,
 * grouped by the ICU4C header each group was ported from.
 * Interface fields are implicitly {@code public static final} and member
 * classes implicitly {@code public static}, so those modifiers are omitted.
 */
public interface UConverterConstants {

    /** Mask that keeps the low 8 bits of a value. */
    short UNSIGNED_BYTE_MASK = 0xff;
    /** Mask that keeps the low 16 bits of a value. */
    int UNSIGNED_SHORT_MASK = 0xffff;
    /** Mask that keeps the low 32 bits of a value. */
    long UNSIGNED_INT_MASK = 0xffffffffL;

    int U_IS_BIG_ENDIAN = 0;

    /**
     * Maximum size of a whole locale ID, counting the terminating NULL.
     * @draft ICU 3.6
     */
    int ULOC_FULLNAME_CAPACITY = 56;

    /**
     * Sentinel for APIs that take or return single code points (UChar32).
     * The value lies outside the Unicode code point range 0..0x10ffff, so a
     * new API can use it to signal "done" or "error".
     *
     * ICU APIs designed before ICU 2.4 usually define service-specific
     * "done" values, mostly 0xffff; those may need to be told apart from
     * real U+ffff text by calling functions such as
     * CharacterIterator::hasNext() or UnicodeString::length().
     * @draft ICU 2.4
     */
    int U_SENTINEL = -1;

    //end utf.h

    //begin ucnv.h

    /**
     * Separator between a converter name and its options, and between
     * individual options.
     * @draft ICU 3.6
     */
    byte OPTION_SEP_CHAR = ',';

    /** Longest converter name, counting the terminating NULL. @draft ICU 3.6 */
    int MAX_CONVERTER_NAME_LENGTH = 60;

    /** Longest converter name with path, counting the terminating NULL. @draft ICU 3.6 */
    int MAX_FULL_FILE_NAME_LENGTH = (600 + MAX_CONVERTER_NAME_LENGTH);

    /** Shift-in byte for EBCDIC_STATEFUL and iso2022 states. @draft ICU 3.6 */
    int SI = 0x0F;

    /** Shift-out byte for EBCDIC_STATEFUL and iso2022 states. @draft ICU 3.6 */
    int SO = 0x0E;

    //end ucnv.h

    // begin bld.h

    /** Size of the overflow buffers in UConverter; enough for escaping callbacks. */
    //#define ERROR_BUFFER_LENGTH 32
    int ERROR_BUFFER_LENGTH = 32;

    /**
     * Upper bound of 4 bytes per substitution character
     * (part of the .cnv file format, see UConverterStaticData).
     */
    int MAX_SUBCHAR_LEN = 4;

    /** Upper bound of 8 bytes per character in toUBytes[] (UTF-8 uses up to 6). */
    int MAX_CHAR_LEN = 8;

    /* converter options bits */
    int OPTION_VERSION = 0xf;
    int OPTION_SWAP_LFNL = 0x10;
    int OPTION_MAC = 0x20; //agljport:comment added for Mac ISCII encodings

    /* values for the unicodeMask */
    int HAS_SUPPLEMENTARY = 1;
    int HAS_SURROGATES = 2;

    // end bld.h

    // begin cnv.h

    /** "Unassigned" marker used in fromUnicode DBCS tables. */
    int missingCharMarker = 0xFFFF;

    /** Selects which direction(s) of a converter a reset applies to. */
    final class UConverterResetChoice {
        public static final int RESET_BOTH = 0;
        public static final int RESET_TO_UNICODE = RESET_BOTH + 1;
        public static final int RESET_FROM_UNICODE = RESET_TO_UNICODE + 1;
    }

    // begin utf16.h

    /**
     * Largest number of 16-bit code units needed for one Unicode code point
     * (U+0000..U+10ffff), which is 2.
     * @draft ICU 2.4
     */
    int U16_MAX_LENGTH = 2;

    // end utf16.h

    // begin err.h

    /**
     * FROM_U, TO_U context options for sub callback
     * @draft ICU 3.6
     */
    byte[] SUB_STOP_ON_ILLEGAL = {'i'};

    /**
     * FROM_U, TO_U context options for skip callback
     * @draft ICU 3.6
     */
    byte[] SKIP_STOP_ON_ILLEGAL = {'i'};

    /**
     * Process condition codes handed to the callbacks.
     * Codes greater than IRREGULAR should be passed on to any chained
     * callbacks.
     * @draft ICU 3.6
     */
    final class UConverterCallbackReason {
        /**
         * The code point is unassigned.
         * The error code U_INVALID_CHAR_FOUND will be set.
         */
        public static final int UNASSIGNED = 0;

        /**
         * The code point is illegal. For example, the bytes 0x81 0x2E are
         * illegal in SJIS because 0x2E is not a valid trail byte for the
         * 0x81 lead byte.
         * Also, starting with Unicode 3.0.1, non-shortest byte sequences in
         * UTF-8 (like 0xC1 0xA1 instead of 0x61 for U+0061) are illegal as
         * well, not just irregular.
         * The error code U_ILLEGAL_CHAR_FOUND will be set.
         */
        public static final int ILLEGAL = 1;

        /**
         * The code point is not a regular sequence in the encoding. For
         * example, 0xED 0xA0 0x80 .. 0xED 0xBF 0xBF are irregular UTF-8
         * byte sequences for single surrogate code points.
         * The error code U_INVALID_CHAR_FOUND will be set.
         */
        public static final int IRREGULAR = 2;

        /**
         * Passed to the callback when a 'reset' has occurred.
         * The callback should reset all state.
         */
        public static final int RESET = 3;

        /**
         * Passed to the callback when the converter is closed.
         * The callback should release any allocated memory.
         */
        public static final int CLOSE = 4;

        /**
         * Passed to the callback when safeClone() is called on the
         * converter. The pointer available as the 'context' is an alias to
         * the original converter's context pointer. If the context must be
         * owned by the new converter, the callback must clone the data and
         * call setFromUCallback (or setToUCallback) with the correct
         * pointer.
         * @draft ICU 2.2
         */
        public static final int CLONE = 5;
    }
    //end err.h
}
|
552
icu4j/src/com/ibm/icu/impl/UConverterDataReader.java
Normal file
552
icu4j/src/com/ibm/icu/impl/UConverterDataReader.java
Normal file
|
@ -0,0 +1,552 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2006, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.icu.impl;
|
||||
|
||||
import com.ibm.icu.impl.ICUDebug;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.DataInputStream;
|
||||
import java.nio.ByteBuffer;
|
||||
|
||||
/**
|
||||
* ucnvmbcs.h
|
||||
*
|
||||
* ICU conversion (.cnv) data file structure, following the usual UDataInfo
|
||||
* header.
|
||||
*
|
||||
* Format version: 6.2
|
||||
*
|
||||
* struct UConverterStaticData -- struct containing the converter name, IBM CCSID,
|
||||
* min/max bytes per character, etc.
|
||||
* see ucnv_bld.h
|
||||
*
|
||||
* --------------------
|
||||
*
|
||||
* The static data is followed by conversionType-specific data structures.
|
||||
* At the moment, there are only variations of MBCS converters. They all have
|
||||
* the same toUnicode structures, while the fromUnicode structures for SBCS
|
||||
* differ from those for other MBCS-style converters.
|
||||
*
|
||||
* _MBCSHeader.version 4.2 adds an optional conversion extension data structure.
|
||||
* If it is present, then an ICU version reading header versions 4.0 or 4.1
|
||||
* will be able to use the base table and ignore the extension.
|
||||
*
|
||||
* The unicodeMask in the static data is part of the base table data structure.
|
||||
* Especially, the UCNV_HAS_SUPPLEMENTARY flag determines the length of the
|
||||
* fromUnicode stage 1 array.
|
||||
* The static data unicodeMask refers only to the base table's properties if
|
||||
* a base table is included.
|
||||
* In an extension-only file, the static data unicodeMask is 0.
|
||||
* The extension data indexes have a separate field with the unicodeMask flags.
|
||||
*
|
||||
* MBCS-style data structure following the static data.
|
||||
* Offsets are counted in bytes from the beginning of the MBCS header structure.
|
||||
* Details about usage in comments in ucnvmbcs.c.
|
||||
*
|
||||
* struct _MBCSHeader (see the definition in this header file below)
|
||||
* contains 32-bit fields as follows:
|
||||
* 8 values:
|
||||
* 0 uint8_t[4] MBCS version in UVersionInfo format (currently 4.2.0.0)
|
||||
* 1 uint32_t countStates
|
||||
* 2 uint32_t countToUFallbacks
|
||||
* 3 uint32_t offsetToUCodeUnits
|
||||
* 4 uint32_t offsetFromUTable
|
||||
* 5 uint32_t offsetFromUBytes
|
||||
* 6 uint32_t flags, bits:
|
||||
* 31.. 8 offsetExtension -- _MBCSHeader.version 4.2 (ICU 2.8) and higher
|
||||
* 0 for older versions and if
|
||||
* there is no extension structure
|
||||
* 7.. 0 outputType
|
||||
* 7 uint32_t fromUBytesLength -- _MBCSHeader.version 4.1 (ICU 2.4) and higher
|
||||
* counts bytes in fromUBytes[]
|
||||
*
|
||||
* if(outputType==MBCS_OUTPUT_EXT_ONLY) {
|
||||
* -- base table name for extension-only table
|
||||
* char baseTableName[variable]; -- with NUL plus padding for 4-alignment
|
||||
*
|
||||
* -- all _MBCSHeader fields except for version and flags are 0
|
||||
* } else {
|
||||
* -- normal base table with optional extension
|
||||
*
|
||||
* int32_t stateTable[countStates][256];
|
||||
*
|
||||
* struct _MBCSToUFallback { (fallbacks are sorted by offset)
|
||||
* uint32_t offset;
|
||||
* UChar32 codePoint;
|
||||
* } toUFallbacks[countToUFallbacks];
|
||||
*
|
||||
* uint16_t unicodeCodeUnits[(offsetFromUTable-offsetToUCodeUnits)/2];
|
||||
* (padded to an even number of units)
|
||||
*
|
||||
* -- stage 1 tables
|
||||
* if(staticData.unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
|
||||
* -- stage 1 table for all of Unicode
|
||||
* uint16_t fromUTable[0x440]; (32-bit-aligned)
|
||||
* } else {
|
||||
* -- BMP-only tables have a smaller stage 1 table
|
||||
* uint16_t fromUTable[0x40]; (32-bit-aligned)
|
||||
* }
|
||||
*
|
||||
* -- stage 2 tables
|
||||
* length determined by top of stage 1 and bottom of stage 3 tables
|
||||
* if(outputType==MBCS_OUTPUT_1) {
|
||||
* -- SBCS: pure indexes
|
||||
* uint16_t stage 2 indexes[?];
|
||||
* } else {
|
||||
* -- DBCS, MBCS, EBCDIC_STATEFUL, ...: roundtrip flags and indexes
|
||||
* uint32_t stage 2 flags and indexes[?];
|
||||
* }
|
||||
*
|
||||
* -- stage 3 tables with byte results
|
||||
* if(outputType==MBCS_OUTPUT_1) {
|
||||
* -- SBCS: each 16-bit result contains flags and the result byte, see ucnvmbcs.c
|
||||
* uint16_t fromUBytes[fromUBytesLength/2];
|
||||
* } else {
|
||||
* -- DBCS, MBCS, EBCDIC_STATEFUL, ... 2/3/4 bytes result, see ucnvmbcs.c
|
||||
* uint8_t fromUBytes[fromUBytesLength]; or
|
||||
* uint16_t fromUBytes[fromUBytesLength/2]; or
|
||||
* uint32_t fromUBytes[fromUBytesLength/4];
|
||||
* }
|
||||
* }
|
||||
*
|
||||
* -- extension table, details see ucnv_ext.h
|
||||
* int32_t indexes[>=32]; ...
|
||||
*/
|
||||
/*
|
||||
* ucnv_ext.h
|
||||
*
|
||||
* See icuhtml/design/conversion/conversion_extensions.html
|
||||
*
|
||||
* Conversion extensions serve two purposes:
|
||||
* 1. They support m:n mappings.
|
||||
* 2. They support extension-only conversion files that are used together
|
||||
* with the regular conversion data in base files.
|
||||
*
|
||||
* A base file may contain an extension table (explicitly requested or
|
||||
* implicitly generated for m:n mappings), but its extension table is not
|
||||
* used when an extension-only file is used.
|
||||
*
|
||||
* It is an error if a base file contains any regular (not extension) mapping
|
||||
* from the same sequence as a mapping in the extension file
|
||||
* because the base mapping would hide the extension mapping.
|
||||
*
|
||||
*
|
||||
* Data for conversion extensions:
|
||||
*
|
||||
* One set of data structures per conversion direction (to/from Unicode).
|
||||
* The data structures are sorted by input units to allow for binary search.
|
||||
* Input sequences of more than one unit are handled like contraction tables
|
||||
* in collation:
|
||||
* The lookup value of a unit points to another table that is to be searched
|
||||
* for the next unit, recursively.
|
||||
*
|
||||
* For conversion from Unicode, the initial code point is looked up in
|
||||
* a 3-stage trie for speed,
|
||||
* with an additional table of unique results to save space.
|
||||
*
|
||||
* Long output strings are stored in separate arrays, with length and index
|
||||
* in the lookup tables.
|
||||
* Output results also include a flag distinguishing roundtrip from
|
||||
* (reverse) fallback mappings.
|
||||
*
|
||||
* Input Unicode strings must not begin or end with unpaired surrogates
|
||||
* to avoid problems with matches on parts of surrogate pairs.
|
||||
*
|
||||
* Mappings from multiple characters (code points or codepage state
|
||||
* table sequences) must be searched preferring the longest match.
|
||||
* For this to work and be efficient, the variable-width table must contain
|
||||
* all mappings that contain prefixes of the multiple characters.
|
||||
* If an extension table is built on top of a base table in another file
|
||||
* and a base table entry is a prefix of a multi-character mapping, then
|
||||
* this is an error.
|
||||
*
|
||||
*
|
||||
* Implementation note:
|
||||
*
|
||||
* Currently, the parser and several checks in the code limit the number
|
||||
* of UChars or bytes in a mapping to
|
||||
* UCNV_EXT_MAX_UCHARS and UCNV_EXT_MAX_BYTES, respectively,
|
||||
* which are output value limits in the data structure.
|
||||
*
|
||||
* For input, this is not strictly necessary - it is a hard limit only for the
|
||||
* buffers in UConverter that are used to store partial matches.
|
||||
*
|
||||
* Input sequences could otherwise be arbitrarily long if partial matches
|
||||
* need not be stored (i.e., if a sequence does not span several buffers with too
|
||||
* many units before the last buffer), although then results would differ
|
||||
* depending on whether partial matches exceed the limits or not,
|
||||
* which depends on the pattern of buffer sizes.
|
||||
*
|
||||
*
|
||||
* Data structure:
|
||||
*
|
||||
* int32_t indexes[>=32];
|
||||
*
|
||||
* Array of indexes and lengths etc. The length of the array is at least 32.
|
||||
* The actual length is stored in indexes[0] to be forward compatible.
|
||||
*
|
||||
* Each index to another array is the number of bytes from indexes[].
|
||||
* Each length of an array is the number of array base units in that array.
|
||||
*
|
||||
* Some of the structures may not be present, in which case their indexes
|
||||
* and lengths are 0.
|
||||
*
|
||||
* Usage of indexes[i]:
|
||||
* [0] length of indexes[]
|
||||
*
|
||||
* // to Unicode table
|
||||
* [1] index of toUTable[] (array of uint32_t)
|
||||
* [2] length of toUTable[]
|
||||
* [3] index of toUUChars[] (array of UChar)
|
||||
* [4] length of toUUChars[]
|
||||
*
|
||||
* // from Unicode table, not for the initial code point
|
||||
* [5] index of fromUTableUChars[] (array of UChar)
|
||||
* [6] index of fromUTableValues[] (array of uint32_t)
|
||||
* [7] length of fromUTableUChars[] and fromUTableValues[]
|
||||
* [8] index of fromUBytes[] (array of char)
|
||||
* [9] length of fromUBytes[]
|
||||
*
|
||||
* // from Unicode trie for initial-code point lookup
|
||||
* [10] index of fromUStage12[] (combined array of uint16_t for stages 1 & 2)
|
||||
* [11] length of stage 1 portion of fromUStage12[]
|
||||
* [12] length of fromUStage12[]
|
||||
* [13] index of fromUStage3[] (array of uint16_t indexes into fromUStage3b[])
|
||||
* [14] length of fromUStage3[]
|
||||
* [15] index of fromUStage3b[] (array of uint32_t like fromUTableValues[])
|
||||
* [16] length of fromUStage3b[]
|
||||
*
|
||||
* [17] Bit field containing numbers of bytes:
|
||||
* 31..24 reserved, 0
|
||||
* 23..16 maximum input bytes
|
||||
* 15.. 8 maximum output bytes
|
||||
* 7.. 0 maximum bytes per UChar
|
||||
*
|
||||
* [18] Bit field containing numbers of UChars:
|
||||
* 31..24 reserved, 0
|
||||
* 23..16 maximum input UChars
|
||||
* 15.. 8 maximum output UChars
|
||||
* 7.. 0 maximum UChars per byte
|
||||
*
|
||||
* [19] Bit field containing flags:
|
||||
* (extension table unicodeMask)
|
||||
* 1 UCNV_HAS_SURROGATES flag for the extension table
|
||||
* 0 UCNV_HAS_SUPPLEMENTARY flag for the extension table
|
||||
*
|
||||
* [20]..[30] reserved, 0
|
||||
* [31] number of bytes for the entire extension structure
|
||||
* [>31] reserved; there are indexes[0] indexes
|
||||
*
|
||||
*
|
||||
* uint32_t toUTable[];
|
||||
*
|
||||
* Array of byte/value pairs for lookups for toUnicode conversion.
|
||||
* The array is partitioned into sections like collation contraction tables.
|
||||
* Each section contains one word with the number of following words and
|
||||
* a default value for when the lookup in this section yields no match.
|
||||
*
|
||||
* A section is sorted in ascending order of input bytes,
|
||||
* allowing for fast linear or binary searches.
|
||||
* The builder may store entries for a contiguous range of byte values
|
||||
* (compare difference between the first and last one with count),
|
||||
* which then allows for direct array access.
|
||||
* The builder should always do this for the initial table section.
|
||||
*
|
||||
* Entries may have 0 values, see below.
|
||||
* No two entries in a section have the same byte values.
|
||||
*
|
||||
* Each uint32_t contains an input byte value in bits 31..24 and the
|
||||
* corresponding lookup value in bits 23..0.
|
||||
* Interpret the value as follows:
|
||||
* if(value==0) {
|
||||
* no match, see below
|
||||
* } else if(value<0x1f0000) {
|
||||
* partial match - use value as index to the next toUTable section
|
||||
* and match the next unit; (value indexes toUTable[value])
|
||||
* } else {
|
||||
* if(bit 23 set) {
|
||||
* roundtrip;
|
||||
* } else {
|
||||
* fallback;
|
||||
* }
|
||||
* unset value bit 23;
|
||||
* if(value<=0x2fffff) {
|
||||
* (value-0x1f0000) is a code point; (BMP: value<=0x1fffff)
|
||||
* } else {
|
||||
* bits 17..0 (value&0x3ffff) is an index to
|
||||
* the result UChars in toUUChars[]; (0 indexes toUUChars[0])
|
||||
* length of the result=((value>>18)-12); (length=0..19)
|
||||
* }
|
||||
* }
|
||||
*
|
||||
* The first word in a section contains the number of following words in the
|
||||
* input byte position (bits 31..24, number=1..0xff).
|
||||
* The value of the initial word is used when the current byte is not found
|
||||
* in this section.
|
||||
* If the value is not 0, then it represents a result as above.
|
||||
* If the value is 0, then the search has to return a shorter match with an
|
||||
* earlier default value as the result, or result in "unmappable" even for the
|
||||
* initial bytes.
|
||||
* If the value is 0 for the initial toUTable entry, then the initial byte
|
||||
* does not start any mapping input.
|
||||
*
|
||||
*
|
||||
* UChar toUUChars[];
|
||||
*
|
||||
* Contains toUnicode mapping results, stored as sequences of UChars.
|
||||
* Indexes and lengths stored in the toUTable[].
|
||||
*
|
||||
*
|
||||
* UChar fromUTableUChars[];
|
||||
* uint32_t fromUTableValues[];
|
||||
*
|
||||
* The fromUTable is split into two arrays, but works otherwise much like
|
||||
* the toUTable. The array is partitioned into sections like collation
|
||||
* contraction tables and toUTable.
|
||||
* A row in the table consists of same-index entries in fromUTableUChars[]
|
||||
* and fromUTableValues[].
|
||||
*
|
||||
* Interpret a value as follows:
|
||||
* if(value==0) {
|
||||
* no match, see below
|
||||
* } else if(value<=0xffffff) { (bits 31..24 are 0)
|
||||
* partial match - use value as index to the next fromUTable section
|
||||
* and match the next unit; (value indexes fromUTable[value])
|
||||
* } else {
|
||||
* if(value==0x80000001) {
|
||||
* return no mapping, but request for <subchar1>;
|
||||
* }
|
||||
* if(bit 31 set) {
|
||||
* roundtrip;
|
||||
* } else {
|
||||
* fallback;
|
||||
* }
|
||||
* // bits 30..29 reserved, 0
|
||||
* length=(value>>24)&0x1f; (bits 28..24)
|
||||
* if(length==1..3) {
|
||||
* bits 23..0 contain 1..3 bytes, padded with 00s on the left;
|
||||
* } else {
|
||||
* bits 23..0 (value&0xffffff) is an index to
|
||||
* the result bytes in fromUBytes[]; (0 indexes fromUBytes[0])
|
||||
* }
|
||||
* }
|
||||
*
|
||||
* The first pair in a section contains the number of following pairs in the
|
||||
* UChar position (16 bits, number=1..0xffff).
|
||||
* The value of the initial pair is used when the current UChar is not found
|
||||
* in this section.
|
||||
* If the value is not 0, then it represents a result as above.
|
||||
* If the value is 0, then the search has to return a shorter match with an
|
||||
* earlier default value as the result, or result in "unmappable" even for the
|
||||
* initial UChars.
|
||||
*
|
||||
* If the from Unicode trie is present, then the from Unicode search tables
|
||||
* are not used for initial code points.
|
||||
* In this case, the first entries (index 0) in the tables are not used
|
||||
* (reserved, set to 0) because a value of 0 is used in trie results
|
||||
* to indicate no mapping.
|
||||
*
|
||||
*
|
||||
* uint16_t fromUStage12[];
|
||||
*
|
||||
* Stages 1 & 2 of a trie that maps an initial code point.
|
||||
* Indexes in stage 1 are all offset by the length of stage 1 so that the
|
||||
* same array pointer can be used for both stages.
|
||||
* If (c>>10)>=(length of stage 1) then c does not start any mapping.
|
||||
* Same bit distribution as for regular conversion tries.
|
||||
*
|
||||
*
|
||||
* uint16_t fromUStage3[];
|
||||
* uint32_t fromUStage3b[];
|
||||
*
|
||||
* Stage 3 of the trie. The first array simply contains indexes to the second,
|
||||
* which contains words in the same format as fromUTableValues[].
|
||||
* Use a stage 3 granularity of 4, which allows for 256k stage 3 entries,
|
||||
* and 16-bit entries in stage 3 allow for 64k stage 3b entries.
|
||||
* The stage 3 granularity means that the stage 2 entry needs to be left-shifted.
|
||||
*
|
||||
* Two arrays are used because it is expected that more than half of the stage 3
|
||||
* entries will be zero. The 16-bit index stage 3 array saves space even
|
||||
* considering storing a total of 6 bytes per non-zero entry in both arrays
|
||||
* together.
|
||||
* Using a stage 3 granularity of >1 diminishes the compactability in that stage
|
||||
* but provides a larger effective addressing space in stage 2.
|
||||
* All but the final result stage use 16-bit entries to save space.
|
||||
*
|
||||
* fromUStage3b[] contains a zero for "no mapping" at its index 0,
|
||||
* and may contain UCNV_EXT_FROM_U_SUBCHAR1 at index 1 for "<subchar1> SUB mapping"
|
||||
* (i.e., "no mapping" with preference for <subchar1> rather than <subchar>),
|
||||
* and all other items are unique non-zero results.
|
||||
*
|
||||
* The default value of a fromUTableValues[] section that is referenced
|
||||
* _directly_ from a fromUStage3b[] item may also be UCNV_EXT_FROM_U_SUBCHAR1,
|
||||
* but this value must not occur anywhere else in fromUTableValues[]
|
||||
* because "no mapping" is always a property of a single code point,
|
||||
* never of multiple.
|
||||
*
|
||||
*
|
||||
* char fromUBytes[];
|
||||
*
|
||||
* Contains fromUnicode mapping results, stored as sequences of chars.
|
||||
* Indexes and lengths stored in the fromUTableValues[].
|
||||
*/
|
||||
|
||||
public final class UConverterDataReader implements ICUBinary.Authenticate {
|
||||
private final static boolean debug = ICUDebug.enabled("UConverterDataReader");
|
||||
|
||||
/*
|
||||
* public UConverterDataReader(UConverterDataReader r)
|
||||
{
|
||||
dataInputStream = new DataInputStream(r.dataInputStream);
|
||||
unicodeVersion = r.unicodeVersion;
|
||||
}
|
||||
*/
|
||||
|
||||
/**
|
||||
* <p>Protected constructor.</p>
|
||||
* @param inputStream ICU uprop.dat file input stream
|
||||
* @exception IOException throw if data file fails authentication
|
||||
* @draft 2.1
|
||||
*/
|
||||
protected UConverterDataReader(InputStream inputStream)
|
||||
throws IOException{
|
||||
if(debug) System.out.println("Bytes in inputStream " + inputStream.available());
|
||||
|
||||
unicodeVersion = ICUBinary.readHeader(inputStream, DATA_FORMAT_ID, this);
|
||||
|
||||
if(debug) System.out.println("Bytes left in inputStream " +inputStream.available());
|
||||
|
||||
dataInputStream = new DataInputStream(inputStream);
|
||||
|
||||
if(debug) System.out.println("Bytes left in dataInputStream " +dataInputStream.available());
|
||||
}
|
||||
|
||||
// protected methods -------------------------------------------------
|
||||
|
||||
protected void readStaticData(UConverterStaticData sd) throws IOException
|
||||
{
|
||||
sd.structSize = dataInputStream.readInt();
|
||||
byte[] name = new byte[UConverterConstants.MAX_CONVERTER_NAME_LENGTH];
|
||||
int length = dataInputStream.read(name);
|
||||
sd.name = new String(name, 0, length);
|
||||
sd.codepage = dataInputStream.readInt();
|
||||
sd.platform = dataInputStream.readByte();
|
||||
sd.conversionType = dataInputStream.readByte();
|
||||
sd.minBytesPerChar = dataInputStream.readByte();
|
||||
sd.maxBytesPerChar = dataInputStream.readByte();
|
||||
dataInputStream.read(sd.subChar);
|
||||
sd.subCharLen = dataInputStream.readByte();
|
||||
sd.hasToUnicodeFallback = dataInputStream.readByte();
|
||||
sd.hasFromUnicodeFallback = dataInputStream.readByte();
|
||||
sd.unicodeMask = (short)dataInputStream.readUnsignedByte();
|
||||
sd.subChar1 = dataInputStream.readByte();
|
||||
dataInputStream.read(sd.reserved);
|
||||
}
|
||||
|
||||
protected void readMBCSHeader(UConverterSharedData.MBCSHeader h) throws IOException
|
||||
{
|
||||
dataInputStream.read(h.version);
|
||||
h.countStates = dataInputStream.readInt();
|
||||
h.countToUFallbacks = dataInputStream.readInt();
|
||||
h.offsetToUCodeUnits = dataInputStream.readInt();
|
||||
h.offsetFromUTable = dataInputStream.readInt();
|
||||
h.offsetFromUBytes = dataInputStream.readInt();
|
||||
h.flags = dataInputStream.readInt();
|
||||
h.fromUBytesLength = dataInputStream.readInt();
|
||||
}
|
||||
|
||||
protected void readMBCSTable(int[][] stateTableArray, UConverterSharedData.MBCSToUFallback[] toUFallbacksArray, char[] unicodeCodeUnitsArray, char[] fromUnicodeTableArray, byte[] fromUnicodeBytesArray) throws IOException
|
||||
{
|
||||
int i, j;
|
||||
for(i = 0; i < stateTableArray.length; ++i)
|
||||
for(j = 0; j < stateTableArray[i].length; ++j)
|
||||
stateTableArray[i][j] = dataInputStream.readInt();
|
||||
for(i = 0; i < toUFallbacksArray.length; ++i) {
|
||||
toUFallbacksArray[i].offset = dataInputStream.readInt();
|
||||
toUFallbacksArray[i].codePoint = dataInputStream.readInt();
|
||||
}
|
||||
for(i = 0; i < unicodeCodeUnitsArray.length; ++i)
|
||||
unicodeCodeUnitsArray[i] = dataInputStream.readChar();
|
||||
for(i = 0; i < fromUnicodeTableArray.length; ++i)
|
||||
fromUnicodeTableArray[i] = dataInputStream.readChar();
|
||||
for(i = 0; i < fromUnicodeBytesArray.length; ++i)
|
||||
fromUnicodeBytesArray[i] = dataInputStream.readByte();
|
||||
}
|
||||
|
||||
protected String readBaseTableName() throws IOException
|
||||
{
|
||||
char c;
|
||||
StringBuffer name = new StringBuffer();
|
||||
while((c = (char)dataInputStream.readByte()) != 0)
|
||||
name.append(c);
|
||||
return name.toString();
|
||||
}
|
||||
|
||||
//protected int[] readExtIndexes(int skip) throws IOException
|
||||
protected ByteBuffer readExtIndexes(int skip) throws IOException
|
||||
{
|
||||
dataInputStream.skipBytes(skip);
|
||||
|
||||
int n = dataInputStream.readInt();
|
||||
int[] indexes = new int[n];
|
||||
indexes[0] = n;
|
||||
for(int i = 1; i < n; ++i) {
|
||||
indexes[i] = dataInputStream.readInt();
|
||||
}
|
||||
//return indexes;
|
||||
|
||||
ByteBuffer b = ByteBuffer.allocate(indexes[31]);
|
||||
for(int i = 0; i < n; ++i) {
|
||||
b.putInt(indexes[i]);
|
||||
}
|
||||
dataInputStream.read(b.array(), b.position(), b.remaining());
|
||||
return b;
|
||||
}
|
||||
|
||||
protected byte[] readExtTables(int n) throws IOException
|
||||
{
|
||||
byte[] tables = new byte[n];
|
||||
dataInputStream.read(tables);
|
||||
return tables;
|
||||
}
|
||||
|
||||
public byte[] getDataFormatVersion(){
|
||||
return DATA_FORMAT_VERSION;
|
||||
}
|
||||
|
||||
public boolean isDataVersionAcceptable(byte version[])
|
||||
{
|
||||
return version[0] == DATA_FORMAT_VERSION[0];
|
||||
}
|
||||
|
||||
public byte[] getUnicodeVersion(){
|
||||
return unicodeVersion;
|
||||
}
|
||||
// private data members -------------------------------------------------
|
||||
|
||||
/**
|
||||
* ICU data file input stream
|
||||
*/
|
||||
private DataInputStream dataInputStream;
|
||||
|
||||
private byte[] unicodeVersion;
|
||||
|
||||
/**
|
||||
* File format version that this class understands.
|
||||
* No guarantees are made if a older version is used
|
||||
* see store.c of gennorm for more information and values
|
||||
*/
|
||||
// DATA_FORMAT_ID_ values taken from icu4c isCnvAcceptable (ucnv_bld.c)
|
||||
private static final byte DATA_FORMAT_ID[] = {(byte)0x63, (byte)0x6e, (byte)0x76, (byte)0x74}; // dataFormat="cnvt"
|
||||
private static final byte DATA_FORMAT_VERSION[] = {(byte)0x6};
|
||||
|
||||
}
|
||||
|
545
icu4j/src/com/ibm/icu/impl/UConverterSharedData.java
Normal file
545
icu4j/src/com/ibm/icu/impl/UConverterSharedData.java
Normal file
|
@ -0,0 +1,545 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2006, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.impl;
|
||||
|
||||
import java.nio.ByteBuffer;
|
||||
|
||||
|
||||
/*
|
||||
* Defines the UConverterSharedData struct,
|
||||
* the immutable, shared part of UConverter.
|
||||
*/
|
||||
public class UConverterSharedData {
|
||||
//uint32_t structSize; /* Size of this structure */
|
||||
public int structSize; /* Size of this structure */
|
||||
//uint32_t referenceCounter; /* used to count number of clients, 0xffffffff for static SharedData */
|
||||
public int referenceCounter; /* used to count number of clients, 0xffffffff for static SharedData */
|
||||
public static final int MAX_VERSION_LENGTH=4;
|
||||
//agljport:todo const void *dataMemory; /* from udata_openChoice() - for cleanup */
|
||||
//agljport:todo void *table; /* Unused. This used to be a UConverterTable - Pointer to conversion data - see mbcs below */
|
||||
|
||||
//const UConverterStaticData *staticData; /* pointer to the static (non changing) data. */
|
||||
public UConverterStaticData staticData; /* pointer to the static (non changing) data. */
|
||||
|
||||
//UBool sharedDataCached; /* TRUE: shared data is in cache, don't destroy on close() if 0 ref. FALSE: shared data isn't in the cache, do attempt to clean it up if the ref is 0 */
|
||||
public boolean sharedDataCached; /* TRUE: shared data is in cache, don't destroy on close() if 0 ref. FALSE: shared data isn't in the cache, do attempt to clean it up if the ref is 0 */
|
||||
/*UBool staticDataOwned; TRUE if static data owned by shared data & should be freed with it, NEVER true for udata() loaded statics. This ignored variable was removed to make space for sharedDataCached. */
|
||||
|
||||
//const UConverterImpl *impl; /* vtable-style struct of mostly function pointers */
|
||||
//public UConverterImpl impl; /* vtable-style struct of mostly function pointers */
|
||||
|
||||
/*initial values of some members of the mutable part of object */
|
||||
//uint32_t toUnicodeStatus;
|
||||
public long toUnicodeStatus;
|
||||
|
||||
/*
|
||||
* Shared data structures currently come in two flavors:
|
||||
* - readonly for built-in algorithmic converters
|
||||
* - allocated for MBCS, with a pointer to an allocated UConverterTable
|
||||
* which always has a UConverterMBCSTable
|
||||
*
|
||||
* To eliminate one allocation, I am making the UConverterMBCSTable
|
||||
* a member of the shared data. It is the last member so that static
|
||||
* definitions of UConverterSharedData work as before.
|
||||
* The table field above also remains to avoid updating all static
|
||||
* definitions, but is now unused.
|
||||
*
|
||||
* markus 2003-nov-07
|
||||
*/
|
||||
public UConverterMBCSTable mbcs;
|
||||
|
||||
/**
 * Creates shared data with a new, empty MBCS table.
 * No other field is assigned here.
 */
public UConverterSharedData() {
    this.mbcs = new UConverterMBCSTable();
}
|
||||
|
||||
/**
 * Creates shared data with a new, empty MBCS table (via the no-argument
 * constructor) and stores the given values in the fields of the same names.
 * @param structSize_ value for structSize
 * @param referenceCounter_ value for referenceCounter
 * @param staticData_ value for staticData
 * @param sharedDataCached_ value for sharedDataCached
 * @param toUnicodeStatus_ value for toUnicodeStatus
 */
public UConverterSharedData(int structSize_, int referenceCounter_, UConverterStaticData staticData_, boolean sharedDataCached_,/* UConverterImpl impl_,*/ long toUnicodeStatus_)
{
    this();
    this.structSize = structSize_;
    this.referenceCounter = referenceCounter_;
    this.staticData = staticData_;
    this.sharedDataCached = sharedDataCached_;
    //impl = impl_;
    this.toUnicodeStatus = toUnicodeStatus_;
}
|
||||
|
||||
/**
|
||||
* UConverterImpl contains all the data and functions for a converter type.
|
||||
* Its function pointers work much like a C++ vtable.
|
||||
* Many converter types need to define only a subset of the functions;
|
||||
* when a function pointer is NULL, then a default action will be performed.
|
||||
*
|
||||
* Every converter type must implement toUnicode, fromUnicode, and getNextUChar,
|
||||
* otherwise the converter may crash.
|
||||
* Every converter type that has variable-length codepage sequences should
|
||||
* also implement toUnicodeWithOffsets and fromUnicodeWithOffsets for
|
||||
* correct offset handling.
|
||||
* All other functions may or may not be implemented - it depends only on
|
||||
* whether the converter type needs them.
|
||||
*
|
||||
* When open() fails, then close() will be called, if present.
|
||||
*/
|
||||
//public class UConverterImpl {
|
||||
//UConverterType type;
|
||||
//UConverterToUnicode toUnicode;
|
||||
/* protected void doToUnicode(UConverterToUnicodeArgs args, int[] pErrorCode)
|
||||
{
|
||||
}
|
||||
|
||||
public final void toUnicode(UConverterToUnicodeArgs args, int[] pErrorCode)
|
||||
{
|
||||
doToUnicode(args, pErrorCode);
|
||||
}
|
||||
|
||||
//UConverterFromUnicode fromUnicode;
|
||||
protected void doFromUnicode(UConverterFromUnicodeArgs args, int[] pErrorCode)
|
||||
{
|
||||
}
|
||||
|
||||
public final void fromUnicode(UConverterFromUnicodeArgs args, int[] pErrorCode)
|
||||
{
|
||||
doFromUnicode(args, pErrorCode);
|
||||
}
|
||||
|
||||
protected int doGetNextUChar(UConverterToUnicodeArgs args, int[] pErrorCode)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
//UConverterGetNextUChar getNextUChar;
|
||||
public final int getNextUChar(UConverterToUnicodeArgs args, int[] pErrorCode)
|
||||
{
|
||||
return doGetNextUChar(args, pErrorCode);
|
||||
}
|
||||
|
||||
//public interface UConverterImplLoadable extends UConverterImpl
|
||||
protected void doLoad(UConverterLoadArgs pArgs, short[] raw, int[] pErrorCode)
|
||||
{
|
||||
}
|
||||
|
||||
*/
|
||||
protected void doUnload()
|
||||
{
|
||||
}
|
||||
|
||||
/*
|
||||
//public interface UConverterImplOpenable extends UConverterImpl
|
||||
protected void doOpen(UConverter cnv, String name, String locale, long options, int[] pErrorCode)
|
||||
{
|
||||
}
|
||||
|
||||
//UConverterOpen open;
|
||||
public final void open(UConverter cnv, String name, String locale, long options, int[] pErrorCode)
|
||||
{
|
||||
doOpen(cnv, name, locale, options, pErrorCode);
|
||||
}
|
||||
|
||||
protected void doClose(UConverter cnv)
|
||||
{
|
||||
}
|
||||
|
||||
//UConverterClose close;
|
||||
public final void close(UConverter cnv)
|
||||
{
|
||||
doClose(cnv);
|
||||
}
|
||||
|
||||
protected void doReset(UConverter cnv, int choice)
|
||||
{
|
||||
}
|
||||
|
||||
//typedef void (*UConverterReset) (UConverter *cnv, UConverterResetChoice choice);
|
||||
//UConverterReset reset;
|
||||
public final void reset(UConverter cnv, int choice)
|
||||
{
|
||||
doReset(cnv, choice);
|
||||
}
|
||||
|
||||
//public interface UConverterImplVariableLength extends UConverterImpl
|
||||
protected void doToUnicodeWithOffsets(UConverterToUnicodeArgs args, int[] pErrorCode)
|
||||
{
|
||||
}
|
||||
|
||||
//UConverterToUnicode toUnicodeWithOffsets;
|
||||
public final void toUnicodeWithOffsets(UConverterToUnicodeArgs args, int[] pErrorCode)
|
||||
{
|
||||
doToUnicodeWithOffsets(args, pErrorCode);
|
||||
}
|
||||
|
||||
protected void doFromUnicodeWithOffsets(UConverterFromUnicodeArgs args, int[] pErrorCode)
|
||||
{
|
||||
}
|
||||
|
||||
//UConverterFromUnicode fromUnicodeWithOffsets;
|
||||
public final void fromUnicodeWithOffsets(UConverterFromUnicodeArgs args, int[] pErrorCode)
|
||||
{
|
||||
doFromUnicodeWithOffsets(args, pErrorCode);
|
||||
}
|
||||
|
||||
//public interface UConverterImplMisc extends UConverterImpl
|
||||
protected void doGetStarters(UConverter converter, boolean starters[], int[] pErrorCode)
|
||||
{
|
||||
}
|
||||
|
||||
//UConverterGetStarters getStarters;
|
||||
public final void getStarters(UConverter converter, boolean starters[], int[] pErrorCode)
|
||||
{
|
||||
doGetStarters(converter, starters, pErrorCode);
|
||||
}
|
||||
|
||||
protected String doGetName(UConverter cnv)
|
||||
{
|
||||
return "";
|
||||
}
|
||||
|
||||
//UConverterGetName getName;
|
||||
public final String getName(UConverter cnv)
|
||||
{
|
||||
return doGetName(cnv);
|
||||
}
|
||||
|
||||
protected void doWriteSub(UConverterFromUnicodeArgs pArgs, long offsetIndex, int[] pErrorCode)
|
||||
{
|
||||
}
|
||||
|
||||
//UConverterWriteSub writeSub;
|
||||
public final void writeSub(UConverterFromUnicodeArgs pArgs, long offsetIndex, int[] pErrorCode)
|
||||
{
|
||||
doWriteSub(pArgs, offsetIndex, pErrorCode);
|
||||
}
|
||||
|
||||
protected UConverter doSafeClone(UConverter cnv, byte[] stackBuffer, int[] pBufferSize, int[] status)
|
||||
{
|
||||
return new UConverter();
|
||||
}
|
||||
|
||||
//UConverterSafeClone safeClone;
|
||||
public final UConverter safeClone(UConverter cnv, byte[] stackBuffer, int[] pBufferSize, int[] status)
|
||||
{
|
||||
return doSafeClone(cnv, stackBuffer, pBufferSize, status);
|
||||
}
|
||||
|
||||
protected void doGetUnicodeSet(UConverter cnv, UnicodeSet /*USetAdder* / sa, int /*UConverterUnicodeSet* / which, int[] pErrorCode)
|
||||
{
|
||||
}
|
||||
|
||||
//UConverterGetUnicodeSet getUnicodeSet;
|
||||
//public final void getUnicodeSet(UConverter cnv, UnicodeSet /*USetAdder* / sa, int /*UConverterUnicodeSet* / which, int[] pErrorCode)
|
||||
//{
|
||||
// doGetUnicodeSet(cnv, sa, which, pErrorCode);
|
||||
//}
|
||||
|
||||
//}
|
||||
|
||||
static final String DATA_TYPE = "cnv";
|
||||
private static final int CNV_DATA_BUFFER_SIZE = 25000;
|
||||
public static final int sizeofUConverterSharedData = 100;
|
||||
|
||||
//static UDataMemoryIsAcceptable isCnvAcceptable;
|
||||
|
||||
/**
|
||||
* Load a non-algorithmic converter.
|
||||
* If pkg==NULL, then this function must be called inside umtx_lock(&cnvCacheMutex).
|
||||
|
||||
// UConverterSharedData * load(UConverterLoadArgs *pArgs, UErrorCode *err)
|
||||
public static final UConverterSharedData load(UConverterLoadArgs pArgs, int[] err)
|
||||
{
|
||||
UConverterSharedData mySharedConverterData = null;
|
||||
|
||||
if(err == null || ErrorCode.isFailure(err[0])) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if(pArgs.pkg != null && pArgs.pkg.length() != 0) {
|
||||
application-provided converters are not currently cached
|
||||
return UConverterSharedData.createConverterFromFile(pArgs, err);
|
||||
}
|
||||
|
||||
//agljport:fix mySharedConverterData = getSharedConverterData(pArgs.name);
|
||||
if (mySharedConverterData == null)
|
||||
{
|
||||
Not cached, we need to stream it in from file
|
||||
mySharedConverterData = UConverterSharedData.createConverterFromFile(pArgs, err);
|
||||
if (ErrorCode.isFailure(err[0]) || (mySharedConverterData == null))
|
||||
{
|
||||
return null;
|
||||
}
|
||||
else
|
||||
{
|
||||
share it with other library clients
|
||||
//agljport:fix shareConverterData(mySharedConverterData);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
The data for this converter was already in the cache.
|
||||
Update the reference counter on the shared data: one more client
|
||||
mySharedConverterData.referenceCounter++;
|
||||
}
|
||||
|
||||
return mySharedConverterData;
|
||||
}
|
||||
|
||||
Takes an alias name gets an actual converter file name
|
||||
*goes to disk and opens it.
|
||||
*allocates the memory and returns a new UConverter object
|
||||
|
||||
//static UConverterSharedData *createConverterFromFile(UConverterLoadArgs *pArgs, UErrorCode * err)
|
||||
public static final UConverterSharedData createConverterFromFile(UConverterLoadArgs pArgs, int[] err)
|
||||
{
|
||||
UDataMemory data = null;
|
||||
UConverterSharedData sharedData = null;
|
||||
|
||||
//agljport:todo UTRACE_ENTRY_OC(UTRACE_LOAD);
|
||||
|
||||
if (err == null || ErrorCode.isFailure(err[0])) {
|
||||
//agljport:todo UTRACE_EXIT_STATUS(*err);
|
||||
return null;
|
||||
}
|
||||
|
||||
//agljport:todo UTRACE_DATA2(UTRACE_OPEN_CLOSE, "load converter %s from package %s", pArgs->name, pArgs->pkg);
|
||||
|
||||
//agljport:fix data = udata_openChoice(pArgs.pkgArray, DATA_TYPE.getBytes(), pArgs.name, isCnvAcceptable, null, err);
|
||||
if(ErrorCode.isFailure(err[0]))
|
||||
{
|
||||
//agljport:todo UTRACE_EXIT_STATUS(*err);
|
||||
return null;
|
||||
}
|
||||
|
||||
sharedData = data_unFlattenClone(pArgs, data, err);
|
||||
if(ErrorCode.isFailure(err[0]))
|
||||
{
|
||||
//agljport:fix udata_close(data);
|
||||
//agljport:todo UTRACE_EXIT_STATUS(*err);
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
* TODO Store pkg in a field in the shared data so that delta-only converters
|
||||
* can load base converters from the same package.
|
||||
* If the pkg name is longer than the field, then either do not load the converter
|
||||
* in the first place, or just set the pkg field to "".
|
||||
|
||||
|
||||
return sharedData;
|
||||
}
|
||||
*/
|
||||
/** Data reader held by this shared data; null until something assigns it. */
UConverterDataReader dataReader;
|
||||
|
||||
|
||||
|
||||
/*returns a converter type from a string
|
||||
*/
|
||||
// static const UConverterSharedData * getAlgorithmicTypeFromName(const char *realName)
|
||||
public static final UConverterSharedData getAlgorithmicTypeFromName(String realName)
|
||||
{
|
||||
long mid, start, limit;
|
||||
long lastMid;
|
||||
int result;
|
||||
StringBuffer strippedName = new StringBuffer(UConverterConstants.MAX_CONVERTER_NAME_LENGTH);
|
||||
|
||||
/* Lower case and remove ignoreable characters. */
|
||||
UConverterAlias.io_stripForCompare(strippedName, realName);
|
||||
|
||||
/* do a binary search for the alias */
|
||||
start = 0;
|
||||
limit = cnvNameType.length;
|
||||
mid = limit;
|
||||
lastMid = UConverterAlias.UINT32_MAX;
|
||||
|
||||
for (;;) {
|
||||
mid = (long)((start + limit) / 2);
|
||||
if (lastMid == mid) { /* Have we moved? */
|
||||
break; /* We haven't moved, and it wasn't found. */
|
||||
}
|
||||
lastMid = mid;
|
||||
result = strippedName.substring(0).compareTo(cnvNameType[(int)mid].name);
|
||||
|
||||
if (result < 0) {
|
||||
limit = mid;
|
||||
} else if (result > 0) {
|
||||
start = mid;
|
||||
} else {
|
||||
return converterData[cnvNameType[(int)mid].type];
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
 * One fallback mapping to Unicode.
 * Fallbacks to Unicode are stored outside the normal state table and code
 * point structures, in a vector of items of this type that is sorted by
 * offset.
 */
public final class MBCSToUFallback {
    /** Sort key of the fallback vector. */
    int offset;

    /** Code point of this fallback. */
    int codePoint;
}
|
||||
|
||||
/**
|
||||
* This is the MBCS part of the UConverterTable union (a runtime data structure).
|
||||
* It keeps all the per-converter data and points into the loaded mapping tables.
|
||||
*/
|
||||
public final class UConverterMBCSTable {
|
||||
/* toUnicode */
|
||||
short countStates;
|
||||
byte dbcsOnlyState;
|
||||
boolean stateTableOwned;
|
||||
int countToUFallbacks;
|
||||
|
||||
int stateTable[/*countStates*/][/*256*/];
|
||||
int swapLFNLStateTable[/*countStates*/][/*256*/]; /* for swaplfnl */
|
||||
char unicodeCodeUnits[/*countUnicodeResults*/];
|
||||
MBCSToUFallback toUFallbacks[/*countToUFallbacks*/];
|
||||
|
||||
/* fromUnicode */
|
||||
char fromUnicodeTable[];
|
||||
byte fromUnicodeBytes[];
|
||||
byte swapLFNLFromUnicodeBytes[]; /* for swaplfnl */
|
||||
int fromUBytesLength;
|
||||
short outputType, unicodeMask;
|
||||
|
||||
/* converter name for swaplfnl */
|
||||
String swapLFNLName;
|
||||
|
||||
/* extension data */
|
||||
UConverterSharedData baseSharedData;
|
||||
//int extIndexes[];
|
||||
ByteBuffer extIndexes; // create int[] view etc. as needed
|
||||
|
||||
UConverterMBCSTable()
|
||||
{
|
||||
}
|
||||
|
||||
UConverterMBCSTable(UConverterMBCSTable t)
|
||||
{
|
||||
countStates = t.countStates;
|
||||
dbcsOnlyState = t.dbcsOnlyState;
|
||||
stateTableOwned = t.stateTableOwned;
|
||||
countToUFallbacks = t.countToUFallbacks;
|
||||
stateTable = t.stateTable;
|
||||
swapLFNLStateTable = t.swapLFNLStateTable;
|
||||
unicodeCodeUnits = t.unicodeCodeUnits;
|
||||
toUFallbacks = t.toUFallbacks;
|
||||
fromUnicodeTable = t.fromUnicodeTable;
|
||||
fromUnicodeBytes = t.fromUnicodeBytes;
|
||||
swapLFNLFromUnicodeBytes = t.swapLFNLFromUnicodeBytes;
|
||||
fromUBytesLength = t.fromUBytesLength;
|
||||
outputType = t.outputType;
|
||||
unicodeMask = t.unicodeMask;
|
||||
swapLFNLName = t.swapLFNLName;
|
||||
baseSharedData = t.baseSharedData;
|
||||
extIndexes = t.extIndexes;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* MBCS data header. See data format description above.
|
||||
*/
|
||||
public final class MBCSHeader {
|
||||
byte version[/*U_MAX_VERSION_LENGTH*/];
|
||||
int countStates, countToUFallbacks, offsetToUCodeUnits, offsetFromUTable, offsetFromUBytes;
|
||||
int flags;
|
||||
int fromUBytesLength;
|
||||
|
||||
public MBCSHeader()
|
||||
{
|
||||
version = new byte[MAX_VERSION_LENGTH];
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Enum for specifying basic types of converters
|
||||
* @see getType
|
||||
* @draft ICU 3.6
|
||||
*/
|
||||
public static final class UConverterType {
|
||||
public static final int UNSUPPORTED_CONVERTER = -1;
|
||||
public static final int SBCS = 0;
|
||||
public static final int DBCS = 1;
|
||||
public static final int MBCS = 2;
|
||||
public static final int LATIN_1 = 3;
|
||||
public static final int UTF8 = 4;
|
||||
public static final int UTF16_BigEndian = 5;
|
||||
public static final int UTF16_LittleEndian = 6;
|
||||
public static final int UTF32_BigEndian = 7;
|
||||
public static final int UTF32_LittleEndian = 8;
|
||||
public static final int EBCDIC_STATEFUL = 9;
|
||||
public static final int ISO_2022 = 10;
|
||||
|
||||
public static final int LMBCS_1 = 11;
|
||||
public static final int LMBCS_2 = LMBCS_1 + 1; //12
|
||||
public static final int LMBCS_3 = LMBCS_2 + 1; //13
|
||||
public static final int LMBCS_4 = LMBCS_3 + 1; //14
|
||||
public static final int LMBCS_5 = LMBCS_4 + 1; //15
|
||||
public static final int LMBCS_6 = LMBCS_5 + 1; //16
|
||||
public static final int LMBCS_8 = LMBCS_6 + 1; //17
|
||||
public static final int LMBCS_11 = LMBCS_8 + 1; //18
|
||||
public static final int LMBCS_16 = LMBCS_11 + 1; //19
|
||||
public static final int LMBCS_17 = LMBCS_16 + 1; //20
|
||||
public static final int LMBCS_18 = LMBCS_17 + 1; //21
|
||||
public static final int LMBCS_19 = LMBCS_18 + 1; //22
|
||||
public static final int LMBCS_LAST = LMBCS_19; //22
|
||||
public static final int HZ =LMBCS_LAST + 1; //23
|
||||
public static final int SCSU = HZ + 1; //24
|
||||
public static final int ISCII = SCSU + 1; //25
|
||||
public static final int US_ASCII = ISCII + 1; //26
|
||||
public static final int UTF7 = US_ASCII + 1; //27
|
||||
public static final int BOCU1 = UTF7 + 1; //28
|
||||
public static final int UTF16 = BOCU1 + 1; //29
|
||||
public static final int UTF32 = UTF16 + 1; //30
|
||||
public static final int CESU8 = UTF32 + 1; //31
|
||||
public static final int IMAP_MAILBOX = CESU8 + 1; //32
|
||||
public static final int MAC_ARABIC = IMAP_MAILBOX + 1; //33
|
||||
public static final int MAC_HEBREW = MAC_ARABIC + 1; //34
|
||||
|
||||
/* Number of converter types for which we have conversion routines. */
|
||||
public static final int NUMBER_OF_SUPPORTED_CONVERTER_TYPES = MAC_HEBREW + 1;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Enum for specifying which platform a converter ID refers to.
|
||||
* The use of platform/CCSID is not recommended. See openCCSID().
|
||||
* @draft ICU 3.6
|
||||
*/
|
||||
public static final class UConverterPlatform {
|
||||
public static final int UNKNOWN = -1;
|
||||
public static final int IBM = 0;
|
||||
}
|
||||
|
||||
static UConverterSharedData _MBCSData = null, /*_Latin1Data = null,*/ /*_UTF8Data = null,*/ /*_UTF16BEData = null,*/ /*_UTF16LEData = null,*/ /*_UTF32BEData = null,*/ /*_UTF32LEData = null,*/ /*_ISO2022Data = null,*/ _LMBCSData1 = null,_LMBCSData2 = null, _LMBCSData3 = null, _LMBCSData4 = null, _LMBCSData5 = null, _LMBCSData6 = null, _LMBCSData8 = null,_LMBCSData11 = null,_LMBCSData16 = null,_LMBCSData17 = null,_LMBCSData18 = null,_LMBCSData19 = null, _HZData = null, _SCSUData = null, /*_ISCIIData = null,*/ /*_ASCIIData = null,*/ _UTF7Data = null, _Bocu1Data = null, /*_UTF16Data = null, _UTF32Data = null,*/ _CESU8Data = null, _IMAPData = null;
|
||||
static UConverterSharedData[] converterData;
|
||||
static class cnvNameTypeClass {
|
||||
String name;
|
||||
int type;
|
||||
cnvNameTypeClass(String name_, int type_) { name = name_; type = type_; }
|
||||
}
|
||||
|
||||
static cnvNameTypeClass cnvNameType[];
|
||||
|
||||
static final String DATA_TYPE = "cnv";
|
||||
static final int CNV_DATA_BUFFER_SIZE = 25000;
|
||||
static final int SIZE_OF_UCONVERTER_SHARED_DATA = 100;
|
||||
|
||||
static final int MAXIMUM_UCS2 = 0x0000FFFF;
|
||||
static final int MAXIMUM_UTF = 0x0010FFFF;
|
||||
static final int MAXIMUM_UCS4 = 0x7FFFFFFF;
|
||||
static final int HALF_SHIFT = 10;
|
||||
static final int HALF_BASE = 0x0010000;
|
||||
static final int HALF_MASK = 0x3FF;
|
||||
static final int SURROGATE_HIGH_START = 0xD800;
|
||||
static final int SURROGATE_HIGH_END = 0xDBFF;
|
||||
static final int SURROGATE_LOW_START = 0xDC00;
|
||||
static final int SURROGATE_LOW_END = 0xDFFF;
|
||||
|
||||
/* -SURROGATE_LOW_START + HALF_BASE */
|
||||
static final int SURROGATE_LOW_BASE = 9216;
|
||||
}
|
61
icu4j/src/com/ibm/icu/impl/UConverterStaticData.java
Normal file
61
icu4j/src/com/ibm/icu/impl/UConverterStaticData.java
Normal file
|
@ -0,0 +1,61 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2006, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.impl;
|
||||
|
||||
public final class UConverterStaticData { /* +offset: size */
|
||||
public int structSize; /* +0: 4 Size of this structure */
|
||||
|
||||
public String name; /* +4: 60 internal name of the converter- invariant chars */
|
||||
|
||||
public int codepage; /* +64: 4 codepage # (now IBM-$codepage) */
|
||||
|
||||
public byte platform; /* +68: 1 platform of the converter (only IBM now) */
|
||||
public byte conversionType; /* +69: 1 conversion type */
|
||||
|
||||
public byte minBytesPerChar; /* +70: 1 Minimum # bytes per char in this codepage */
|
||||
public byte maxBytesPerChar; /* +71: 1 Maximum # bytes output per UChar in this codepage */
|
||||
|
||||
public byte subChar[/*UCNV_MAX_SUBCHAR_LEN*/]; /* +72: 4 [note: 4 and 8 byte boundary] */
|
||||
public byte subCharLen; /* +76: 1 */
|
||||
|
||||
public byte hasToUnicodeFallback; /* +77: 1 UBool needs to be changed to UBool to be consistent across platform */
|
||||
public byte hasFromUnicodeFallback; /* +78: 1 */
|
||||
public short unicodeMask; /* +79: 1 bit 0: has supplementary bit 1: has single surrogates */
|
||||
public byte subChar1; /* +80: 1 single-byte substitution character for IBM MBCS (0 if none) */
|
||||
public byte reserved[/*19*/]; /* +81: 19 to round out the structure */
|
||||
/* total size: 100 */
|
||||
public UConverterStaticData()
|
||||
{
|
||||
subChar = new byte[UConverterConstants.MAX_SUBCHAR_LEN];
|
||||
reserved = new byte[19];
|
||||
}
|
||||
|
||||
public UConverterStaticData(int structSize_, String name_, int codepage_, byte platform_, byte conversionType_, byte minBytesPerChar_, byte maxBytesPerChar_, byte[] subChar_, byte subCharLen_, byte hasToUnicodeFallback_, byte hasFromUnicodeFallback_, short unicodeMask_, byte subChar1_, byte[] reserved_)
|
||||
{
|
||||
structSize = structSize_;
|
||||
name = name_;
|
||||
codepage = codepage_;
|
||||
platform = platform_;
|
||||
conversionType = conversionType_;
|
||||
minBytesPerChar = minBytesPerChar_;
|
||||
maxBytesPerChar = maxBytesPerChar_;
|
||||
subChar = new byte[UConverterConstants.MAX_SUBCHAR_LEN];
|
||||
System.arraycopy(subChar_, 0, subChar, 0, (subChar.length < subChar_.length? subChar.length : subChar_.length));
|
||||
subCharLen = subCharLen_;
|
||||
hasToUnicodeFallback = hasToUnicodeFallback_;
|
||||
hasFromUnicodeFallback = hasFromUnicodeFallback_;
|
||||
unicodeMask = unicodeMask_;
|
||||
subChar1 = subChar1_;
|
||||
reserved = new byte[19];
|
||||
System.arraycopy(reserved_, 0, reserved, 0, (reserved.length < reserved_.length? reserved.length : reserved_.length));
|
||||
}
|
||||
|
||||
public static final int sizeofUConverterStaticData = 100;
|
||||
}
|
||||
|
Loading…
Add table
Reference in a new issue