From fac1ab82dc4238aded92b8c77d09cfee6baf3564 Mon Sep 17 00:00:00 2001 From: Kedar Rajwade Date: Tue, 10 Jun 2008 18:35:57 +0000 Subject: [PATCH] ICU-2147 SCSU and BOCU-1 converters X-SVN-Rev: 24141 --- .gitattributes | 2 + .../src/com/ibm/icu/charset/CharsetBOCU1.java | 1053 ++++++++++++++ .../src/com/ibm/icu/charset/CharsetSCSU.java | 1254 +++++++++++++++++ 3 files changed, 2309 insertions(+) create mode 100644 icu4j/src/com/ibm/icu/charset/CharsetBOCU1.java create mode 100644 icu4j/src/com/ibm/icu/charset/CharsetSCSU.java diff --git a/.gitattributes b/.gitattributes index c9b5cc8446f..5e76a7499dd 100644 --- a/.gitattributes +++ b/.gitattributes @@ -131,7 +131,9 @@ icu4j/localespi/src/META-INF/services/java.util.spi.TimeZoneNameProvider -text icu4j/localespi/src/com/ibm/icu/impl/javaspi/ICULocaleServiceProviderConfig.properties -text icu4j/preprocessor.txt -text icu4j/src/com/ibm/icu/ICUConfig.properties -text +icu4j/src/com/ibm/icu/charset/CharsetBOCU1.java -text icu4j/src/com/ibm/icu/charset/CharsetISO2022.java -text +icu4j/src/com/ibm/icu/charset/CharsetSCSU.java -text icu4j/src/com/ibm/icu/dev/data/rbbi/english.dict -text icu4j/src/com/ibm/icu/dev/data/testdata.jar -text icu4j/src/com/ibm/icu/dev/data/thai6.ucs -text diff --git a/icu4j/src/com/ibm/icu/charset/CharsetBOCU1.java b/icu4j/src/com/ibm/icu/charset/CharsetBOCU1.java new file mode 100644 index 00000000000..e749db77f86 --- /dev/null +++ b/icu4j/src/com/ibm/icu/charset/CharsetBOCU1.java @@ -0,0 +1,1053 @@ +/* + ******************************************************************************* + * Copyright (C) 2008, International Business Machines Corporation and * + * others. All Rights Reserved. 
* + ******************************************************************************* + */ +package com.ibm.icu.charset; + +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.IntBuffer; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CharsetEncoder; +import java.nio.charset.CoderResult; + + +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.lang.UCharacter; + +/** + * @author krajwade + * + */ +public class CharsetBOCU1 extends CharsetICU { + + /* BOCU constants and macros */ + + /* initial value for "prev": middle of the ASCII range */ + private static final byte BOCU1_ASCII_PREV = 0x40; + + /* bounding byte values for differences */ + private static final int BOCU1_MIN = 0x21; + private static final int BOCU1_MIDDLE = 0x90; + private static final int BOCU1_MAX_LEAD = 0xfe; + private static final int BOCU1_MAX_TRAIL = 0xff; + private static final int BOCU1_RESET = 0xff; + + /* number of lead bytes */ + private static final int BOCU1_COUNT = (BOCU1_MAX_LEAD-BOCU1_MIN+1); + + /* adjust trail byte counts for the use of some C0 control byte values */ + private static final int BOCU1_TRAIL_CONTROLS_COUNT = 20; + private static final int BOCU1_TRAIL_BYTE_OFFSET = (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT); + + /* number of trail bytes */ + private static final int BOCU1_TRAIL_COUNT =((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT); + + /* + * number of positive and negative single-byte codes + * (counting 0==BOCU1_MIDDLE among the positive ones) + */ + private static final int BOCU1_SINGLE = 64; + + /* number of lead bytes for positive and negative 2/3/4-byte sequences */ + private static final int BOCU1_LEAD_2 = 43; + private static final int BOCU1_LEAD_3 = 3; + private static final int BOCU1_LEAD_4 = 1; + + /* The difference value range for single-byters. 
*/ + private static final int BOCU1_REACH_POS_1 = (BOCU1_SINGLE-1); + private static final int BOCU1_REACH_NEG_1 = (-BOCU1_SINGLE); + + /* The difference value range for double-byters. */ + private static final int BOCU1_REACH_POS_2 = (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT); + private static final int BOCU1_REACH_NEG_2 = (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT); + + /* The difference value range for 3-byters. */ + private static final int BOCU1_REACH_POS_3 = + (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT); + + private static final int BOCU1_REACH_NEG_3 = (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT); + + /* The lead byte start values. */ + private static final int BOCU1_START_POS_2 = (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1); + private static final int BOCU1_START_POS_3 = (BOCU1_START_POS_2+BOCU1_LEAD_2); + private static final int BOCU1_START_POS_4 = (BOCU1_START_POS_3+BOCU1_LEAD_3); + /* ==BOCU1_MAX_LEAD */ + + private static final int BOCU1_START_NEG_2 = (BOCU1_MIDDLE+BOCU1_REACH_NEG_1); + private static final int BOCU1_START_NEG_3 = (BOCU1_START_NEG_2-BOCU1_LEAD_2); + private static final int BOCU1_START_NEG_4 = (BOCU1_START_NEG_3-BOCU1_LEAD_3); + /* ==BOCU1_MIN+1 */ + + /* The length of a byte sequence, according to the lead byte (!=BOCU1_RESET). */ + private static int BOCU1_LENGTH_FROM_LEAD(int lead) { + return ((BOCU1_START_NEG_2<=(lead) && (lead)>24 : 4); + } + + /* + * Byte value map for control codes, + * from external byte values 0x00..0x20 + * to trail byte values 0..19 (0..0x13) as used in the difference calculation. + * External byte values that are illegal as trail bytes are mapped to -1. 
+ */ + private static final int[] + bocu1ByteToTrail={ + /* 0 1 2 3 4 5 6 7 */ + -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1, + + /* 8 9 a b c d e f */ + -1, -1, -1, -1, -1, -1, -1, -1, + + /* 10 11 12 13 14 15 16 17 */ + 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, + + /* 18 19 1a 1b 1c 1d 1e 1f */ + 0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13, + + /* 20 */ + -1 + }; + + /* + * Byte value map for control codes, + * from trail byte values 0..19 (0..0x13) as used in the difference calculation + * to external byte values 0x00..0x20. + */ + private static final int[] + bocu1TrailToByte={ + /* 0 1 2 3 4 5 6 7 */ + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11, + + /* 8 9 a b c d e f */ + 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, + + /* 10 11 12 13 */ + 0x1c, 0x1d, 0x1e, 0x1f + }; + + + /* + * 12 commonly used C0 control codes (and space) are only used to encode + * themselves directly, + * which makes BOCU-1 MIME-usable and reasonably safe for + * ASCII-oriented software. + * + * These controls are + * 0 NUL + * + * 7 BEL + * 8 BS + * + * 9 TAB + * a LF + * b VT + * c FF + * d CR + * + * e SO + * f SI + * + * 1a SUB + * 1b ESC + * + * The other 20 C0 controls are also encoded directly (to preserve order) + * but are also used as trail bytes in difference encoding + * (for better compression). + */ + private static int BOCU1_TRAIL_TO_BYTE(int trail) { + return ((trail)>=BOCU1_TRAIL_CONTROLS_COUNT ? (trail)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[trail]); + } + + + + + + /* BOCU-1 implementation functions ------------------------------------------ */ + + private static int BOCU1_SIMPLE_PREV(int c){ + return (((c)&~0x7f)+BOCU1_ASCII_PREV); + } + + /** + * Compute the next "previous" value for differencing + * from the current code point. 
+ * + * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below) + * @return "previous code point" state value + */ + private static int bocu1Prev(int c) { + /* compute new prev */ + if(/* 0x3040<=c && */ c<=0x309f) { + /* Hiragana is not 128-aligned */ + return 0x3070; + } else if(0x4e00<=c && c<=0x9fa5) { + /* CJK Unihan */ + return 0x4e00-BOCU1_REACH_NEG_2; + } else if(0xac00<=c /* && c<=0xd7a3 */) { + /* Korean Hangul */ + return (0xd7a3+0xac00)/2; + } else { + /* mostly small scripts */ + return BOCU1_SIMPLE_PREV(c); + } + } + + /** Fast version of bocu1Prev() for most scripts. */ + private static int BOCU1_PREV(int c) { + return ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c)); + } + + protected byte[] fromUSubstitution = new byte[]{(byte)0x1A}; + + /* Faster versions of packDiff() for single-byte-encoded diff values. */ + + /** Is a diff value encodable in a single byte? */ + private static boolean DIFF_IS_SINGLE(int diff){ + return (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1); + } + + /** Encode a diff value in a single byte. */ + private static int PACK_SINGLE_DIFF(int diff){ + return (BOCU1_MIDDLE+(diff)); + } + + /** Is a diff value encodable in two bytes? 
*/ + private static boolean DIFF_IS_DOUBLE(int diff){ + return (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2); + } + + + public CharsetBOCU1(String icuCanonicalName, String javaCanonicalName, String[] aliases){ + super(icuCanonicalName, javaCanonicalName, aliases); + maxBytesPerChar = 4; + minBytesPerChar = 1; + maxCharsPerByte = 1; + } + + class CharsetEncoderBOCU extends CharsetEncoderICU { + public CharsetEncoderBOCU(CharsetICU cs) { + super(cs,fromUSubstitution); + } + + int sourceIndex, nextSourceIndex; + int prev, c , diff; + boolean checkNegative = false; + int targetCapacity; + /* label Values */ + private static final int fastSingle=0; + private static final int getTrail=1; + private static final int regularLoop=2; + + private boolean LabelLoop = true; + private int labelType = fastSingle; + + + /** + * Integer division and modulo with negative numerators + * yields negative modulo results and quotients that are one more than + * what we need here. + * This macro adjust the results so that the modulo-value m is always >=0. + * + * For positive n, the if() condition is always FALSE. + * + * @param n Number to be split into quotient and rest. + * Will be modified to contain the quotient. + * @param d Divisor. + * @param m Output variable for the rest (modulo result). + */ + + private int NEGDIVMOD(int n, int d, int m) { + diff = n; + (m)=(diff)%(d); + (diff)/=(d); + if((m)<0) { + --(diff); + (m)+=(d); + } + return m; + } + + /** + * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes + * and return a packed integer with them. + * + * The encoding favors small absolute differences with short encodings + * to compress runs of same-script characters. + * + * Optimized version with unrolled loops and fewer floating-point operations + * than the standard packDiff(). 
+ * + * @param diff difference value -0x10ffff..0x10ffff + * @return + * 0x010000zz for 1-byte sequence zz + * 0x0200yyzz for 2-byte sequence yy zz + * 0x03xxyyzz for 3-byte sequence xx yy zz + * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03) + */ + private int packDiff(int n) { + int result, m =0; + diff = n; + + if(diff>=BOCU1_REACH_NEG_1) { + /* mostly positive differences, and single-byte negative ones */ + if(diff<=BOCU1_REACH_POS_2) { + /* two bytes */ + diff-=BOCU1_REACH_POS_1+1; + result=0x02000000; + + m=diff%BOCU1_TRAIL_COUNT; + diff/=BOCU1_TRAIL_COUNT; + result|=BOCU1_TRAIL_TO_BYTE(m); + + result|=(BOCU1_START_POS_2+diff)<<8; + } else if(diff<=BOCU1_REACH_POS_3) { + /* three bytes */ + diff-=BOCU1_REACH_POS_2+1; + result=0x03000000; + + m=diff%BOCU1_TRAIL_COUNT; + diff/=BOCU1_TRAIL_COUNT; + result|=BOCU1_TRAIL_TO_BYTE(m); + + m=diff%BOCU1_TRAIL_COUNT; + diff/=BOCU1_TRAIL_COUNT; + result|=BOCU1_TRAIL_TO_BYTE(m)<<8; + + result|=(BOCU1_START_POS_3+diff)<<16; + } else { + /* four bytes */ + diff-=BOCU1_REACH_POS_3+1; + + m=diff%BOCU1_TRAIL_COUNT; + diff/=BOCU1_TRAIL_COUNT; + result=BOCU1_TRAIL_TO_BYTE(m); + + m=diff%BOCU1_TRAIL_COUNT; + diff/=BOCU1_TRAIL_COUNT; + result|=BOCU1_TRAIL_TO_BYTE(m)<<8; + + /* + * We know that / and % would deliver quotient 0 and rest=diff. + * Avoid division and modulo for performance. 
+ */ + result|=BOCU1_TRAIL_TO_BYTE(diff)<<16; + + result|=((BOCU1_START_POS_4&UConverterConstants.UNSIGNED_INT_MASK))<<24; + } + } else { + /* two- to four-byte negative differences */ + if(diff>=BOCU1_REACH_NEG_2) { + /* two bytes */ + diff-=BOCU1_REACH_NEG_1; + result=0x02000000; + + m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); + result|=BOCU1_TRAIL_TO_BYTE(m); + + result|=(BOCU1_START_NEG_2+diff)<<8; + } else if(diff>=BOCU1_REACH_NEG_3) { + /* three bytes */ + diff-=BOCU1_REACH_NEG_2; + result=0x03000000; + + m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); + result|=BOCU1_TRAIL_TO_BYTE(m); + + m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); + result|=BOCU1_TRAIL_TO_BYTE(m)<<8; + + result|=(BOCU1_START_NEG_3+diff)<<16; + } else { + /* four bytes */ + diff-=BOCU1_REACH_NEG_3; + + m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); + result=BOCU1_TRAIL_TO_BYTE(m); + + m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); + result|=BOCU1_TRAIL_TO_BYTE(m)<<8; + + /* + * We know that NEGDIVMOD would deliver + * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT. + * Avoid division and modulo for performance. + */ + m=diff+BOCU1_TRAIL_COUNT; + result|=BOCU1_TRAIL_TO_BYTE(m)<<16; + + result|=BOCU1_MIN<<24; + } + } + return result; + } + + + + protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush){ + CoderResult cr = CoderResult.UNDERFLOW; + + + /*set up the local pointers*/ + targetCapacity = target.limit() - target.position(); + c = fromUChar32; + prev = fromUnicodeStatus; + if(prev==0){ + prev = BOCU1_ASCII_PREV; + } + + /*sourceIndex ==-1 if the current characte began in the previous buffer*/ + sourceIndex = c ==0 ? 
0: -1; + nextSourceIndex = 0; + + while(LabelLoop){ + switch(labelType){ + case fastSingle: + labelType = fastSingle(source, target, offsets); + break; + case getTrail: + labelType = getTrail(source, target, offsets); + break; + case regularLoop: + labelType = regularLoop(source, target, offsets, cr); + break; + } + } + + return cr; + } + + + private int fastSingle(CharBuffer source, ByteBuffer target, IntBuffer offsets){ + + /*conversion loop*/ + if(c!=0 &&targetCapacity>0){ + labelType = getTrail; + return labelType; + } +//fastSingle: + + /*fast loop for single-byte differences*/ + /*use only one loop counter variable , targetCapacity, not also source*/ + diff = source.limit() - source.position(); + if(targetCapacity>diff){ + targetCapacity = diff; + } + while(targetCapacity>0 && (c=source.get(source.position()))<0x3000){ + if(c<=0x20){ + if(c!=0x20){ + prev = BOCU1_ASCII_PREV; + } + target.put((byte)c); + if(offsets!=null){ + offsets.put(nextSourceIndex++); + } + source.position(source.position()+1); + --targetCapacity; + }else { + diff = c-prev; + if(DIFF_IS_SINGLE(diff)){ + prev = BOCU1_SIMPLE_PREV(c); + target.put((byte)PACK_SINGLE_DIFF(diff)); + if(offsets!=null){ + offsets.put(nextSourceIndex++); + } + source.position(source.position()+1); + --targetCapacity; + }else { + break; + } + } + } + return regularLoop; + } + + private int getTrail(CharBuffer source, ByteBuffer target, IntBuffer offsets){ + if(source.hasRemaining()){ + /*test the following code unit*/ + char trail = source.get(source.position()); + if(UTF16.isTrailSurrogate(trail)){ + source.position(source.position()+1); + ++nextSourceIndex; + c=UCharacter.getCodePoint((char)c, trail); + } + } else { + /*no more input*/ + c = -c; /*negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else*/ + checkNegative = true; + } + return regularLoop; + } + + private int regularLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, CoderResult cr){ + /*restore real values*/ + 
targetCapacity = target.limit()-target.position(); + sourceIndex = nextSourceIndex; /*wrong if offsets==null but does not matter*/ + + /*regular loop for all classes*/ + while(source.hasRemaining()){ + if(targetCapacity>0){ + c = source.get(); + ++nextSourceIndex; + + if(c<=0x20){ + /* + * ISO C0 control & space: + * Encode directly for MIME compatibility, + * and reset state except for space, to not disrupt compression. + */ + if(c!=0x20) { + prev=BOCU1_ASCII_PREV; + } + target.put((byte)c); + if(offsets != null){ + offsets.put(sourceIndex++); + } + --targetCapacity; + + sourceIndex=nextSourceIndex; + continue; + } + if(UTF16.isLeadSurrogate((char)c)){ + getTrail(source, target, offsets); + if(checkNegative){ + break; + } + } + + /* + * all other Unicode code points c==U+0021..U+10ffff + * are encoded with the difference c-prev + * + * a new prev is computed from c, + * placed in the middle of a 0x80-block (for most small scripts) or + * in the middle of the Unihan and Hangul blocks + * to statistically minimize the following difference + */ + diff = c- prev; + prev = BOCU1_PREV(c); + if(DIFF_IS_SINGLE(diff)){ + target.put((byte)PACK_SINGLE_DIFF(diff)); + if(offsets!=null){ + offsets.put(sourceIndex++); + } + --targetCapacity; + sourceIndex=nextSourceIndex; + if(c<0x3000){ + labelType = fastSingle; + return labelType; + } + } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity){ + /*optimize 2 byte case*/ + int m = 0; + if(diff>=0){ + diff -= BOCU1_REACH_POS_1 +1; + m = diff%BOCU1_TRAIL_COUNT; + diff/=BOCU1_TRAIL_COUNT; + diff+=BOCU1_START_POS_2; + } else { + diff -= BOCU1_REACH_NEG_1; + m = NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); + diff+=BOCU1_START_NEG_2; + } + target.put((byte)diff); + target.put((byte)BOCU1_TRAIL_TO_BYTE(m)); + if(offsets!=null){ + offsets.put(sourceIndex); + offsets.put(sourceIndex); + } + targetCapacity -= 2; + sourceIndex = nextSourceIndex; + } else { + int length; /*will be 2..4*/ + diff = packDiff(diff); + length = 
BOCU1_LENGTH_FROM_PACKED(diff); + + /*write the output character bytes from diff and length*/ + /*from the first if in the loop we know that targetCapacity>0*/ + if(length<=targetCapacity){ + switch(length){ + /*each branch falls through the next one*/ + case 4: + target.put((byte)(diff>>24)); + if(offsets!= null){ + offsets.put(sourceIndex); + } + case 3: + target.put((byte)(diff>>16)); + if(offsets!= null){ + offsets.put(sourceIndex); + } + case 2: + target.put((byte)(diff>>8)); + if(offsets!= null){ + offsets.put(sourceIndex); + } + /*case 1 handled above*/ + target.put((byte)diff); + if(offsets!= null){ + offsets.put(sourceIndex); + } + default: + /*will never occur*/ + break; + } + targetCapacity -= length; + sourceIndex = nextSourceIndex; + } else { + ByteBuffer error = ByteBuffer.wrap(errorBuffer); + /* + * We actually do this backwards here: + * In order to save an intermediate variable, we output + * first to the overflow buffer what does not fit into the + * regular target. + */ + /* we know that 1<=targetCapacity>16)); + case 2: + error.put((byte)(diff>>8)); + case 1: + error.put((byte)diff); + default: + /* will never occur */ + break; + } + errorBufferLength = length; + + /* now output what fits into the regular target */ + diff>>=8*length; /* length was reduced by targetCapacity */ + switch(targetCapacity) { + /* each branch falls through to the next one */ + case 3: + target.put((byte)(diff>>16)); + if(offsets!= null){ + offsets.put(sourceIndex); + } + case 2: + target.put((byte)(diff>>8)); + if(offsets!= null){ + offsets.put(sourceIndex); + } + case 1: + target.put((byte)diff); + if(offsets!= null){ + offsets.put(sourceIndex); + } + default: + /* will never occur */ + break; + } + + /* target overflow */ + targetCapacity=0; + cr = CoderResult.OVERFLOW; + break; + } + } + } else{ + /*target is full*/ + cr = CoderResult.OVERFLOW; + break; + } + + } + /*set the converter state back into UConverter*/ + fromUChar32 = c<0 ? 
-c :0; + fromUnicodeStatus = prev; + LabelLoop = false; + labelType = fastSingle; + return labelType; + } + + } + + class CharsetDecoderBOCU extends CharsetDecoderICU{ + public CharsetDecoderBOCU(CharsetICU cs) { + super(cs); + } + + int byteIndex; + int sourceIndex, nextSourceIndex; + int prev, c , diff, count; + byte[] bytes; + int targetCapacity; + + /* label Values */ + private static final int fastSingle=0; + private static final int getTrail=1; + private static final int regularLoop=2; + private static final int endLoop=3; + + private boolean LabelLoop = true; + private boolean afterTrail = false; + private int labelType = fastSingle; + + + + /* + * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c. + * The UConverter fields are used as follows: + * + * fromUnicodeStatus encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV) + * + * toUnicodeStatus decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV) + * mode decoder's incomplete (diff<<2)|count (ignored when toULength==0) + */ + + /* BOCU-1-from-Unicode conversion functions --------------------------------- */ + + + + /** + * Function for BOCU-1 decoder; handles multi-byte lead bytes. 
+ * + * @param b lead byte; + * BOCU1_MIN<=b= BOCU1_START_NEG_2) { + /* positive difference */ + if(b < BOCU1_START_POS_3) { + /* two bytes */ + diff = (b - BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT + BOCU1_REACH_POS_1+1; + count = 1; + } else if(b < BOCU1_START_POS_4) { + /* three bytes */ + diff = (b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1; + count = 2; + } else { + /* four bytes */ + diff = BOCU1_REACH_POS_3+1; + count = 3; + } + } else { + /* negative difference */ + if(b >= BOCU1_START_NEG_3) { + /* two bytes */ + diff=(b -BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT + BOCU1_REACH_NEG_1; + count=1; + } else if(b>BOCU1_MIN) { + /* three bytes */ + diff=(b - BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT + BOCU1_REACH_NEG_2; + count = 2; + } else { + /* four bytes */ + diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3; + count=3; + } + } + + /* return the state for decoding the trail byte(s) */ + return (diff<<2)|count; + } + + /** + * Function for BOCU-1 decoder; handles multi-byte trail bytes. 
+ * + * @param count number of remaining trail bytes including this one + * @param b trail byte + * @return new delta for diff including b - <0 indicates an error + * + * @see decodeBocu1 + */ + private int decodeBocu1TrailByte(int count, int b) { + b = b&UConverterConstants.UNSIGNED_BYTE_MASK; + if((b)<=0x20) { + /* skip some C0 controls and make the trail byte range contiguous */ + b = bocu1ByteToTrail[b]; + /* b<0 for an illegal trail byte value will result in return<0 below */ + } else { + //b-= BOCU1_TRAIL_BYTE_OFFSET; + b = b - BOCU1_TRAIL_BYTE_OFFSET; + } + + /* add trail byte into difference and decrement count */ + if(count==1) { + return b; + } else if(count==2) { + return b*BOCU1_TRAIL_COUNT; + } else /* count==3 */ { + return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT); + } + } + + protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, + boolean flush){ + CoderResult cr = CoderResult.UNDERFLOW; + prev = toUnicodeStatus; + if(prev==0){ + prev = BOCU1_ASCII_PREV; + } + diff = mode; + count = diff&3; + diff>>=2; + + byteIndex = toULength; + bytes = toUBytesArray; + + /* sourceIndex=-1 if the current character began in the previous buffer */ + sourceIndex=byteIndex==0 ? 
0 : -1; + nextSourceIndex=0; + + /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */ + if(count>0 && byteIndex>0 && target.position()diff) { + count = diff; + } + while(count>0) { + if(BOCU1_START_NEG_2 <=(c=source.get(source.position())) && c< BOCU1_START_POS_2) { + c = prev + (c-BOCU1_MIDDLE); + if(c<0x3000) { + target.put((char)c); + if(offsets!=null){ + offsets.put(nextSourceIndex++); + } + prev = BOCU1_SIMPLE_PREV(c); + } else { + break; + } + } else if((c&UConverterConstants.UNSIGNED_BYTE_MASK) <= 0x20) { + if((c&UConverterConstants.UNSIGNED_BYTE_MASK) != 0x20) { + prev = BOCU1_ASCII_PREV; + } + target.put((char)c); + if(offsets!=null){ + offsets.put(nextSourceIndex++); + } + } else { + break; + } + source.position(source.position()+1); + --count; + } + sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */ + return labelType; + } + + private int getTrail(ByteBuffer source, CharBuffer target, IntBuffer offsets, CoderResult cr){ + labelType = regularLoop; + for(;;) { + if(source.position() >= source.limit()) { + labelType = endLoop; + return labelType; + } + ++nextSourceIndex; + c = bytes[byteIndex++] = source.get(); + + /* trail byte in any position */ + c = decodeBocu1TrailByte(count, c); + if(c<0) { + cr = CoderResult.malformedForLength(1); + labelType = endLoop; + return labelType; + } + + diff+=c; + if(--count==0) { + /* final trail byte, deliver a code point */ + byteIndex=0; + c = prev + diff; + if(c > 0x10ffff) { + cr = CoderResult.malformedForLength(1); + labelType = endLoop; + return labelType; + } + break; + } + } + return labelType; + + } + + + private int afterGetTrail(ByteBuffer source, CharBuffer target, IntBuffer offsets, CoderResult cr){ + /* decode a sequence of single and lead bytes */ + while(source.hasRemaining()) { + if(!afterTrail){ + if(target.position() >= target.limit()) { + /* target is full */ + cr = CoderResult.OVERFLOW; + break; + } + + ++nextSourceIndex; + c = 
source.get()&UConverterConstants.UNSIGNED_BYTE_MASK; + if(BOCU1_START_NEG_2 <= c && c < BOCU1_START_POS_2) { + /* Write a code point directly from a single-byte difference. */ + c = prev + (c-BOCU1_MIDDLE); + if(c<0x3000) { + target.put((char)c); + if(offsets!=null){ + offsets.put(sourceIndex); + } + prev = BOCU1_SIMPLE_PREV(c); + sourceIndex = nextSourceIndex; + labelType = fastSingle; + return labelType; + } + } else if(c <= 0x20) { + /* + * Direct-encoded C0 control code or space. + * Reset prev for C0 control codes but not for space. + */ + if(c != 0x20) { + prev=BOCU1_ASCII_PREV; + } + target.put((char)c); + if(offsets!=null){ + offsets.put(sourceIndex); + } + sourceIndex=nextSourceIndex; + continue; + } else if(BOCU1_START_NEG_3 <= c && c < BOCU1_START_POS_3 && source.hasRemaining()) { + /* Optimize two-byte case. */ + if(c >= BOCU1_MIDDLE) { + diff=(c - BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT + BOCU1_REACH_POS_1 + 1; + } else { + diff=(c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT + BOCU1_REACH_NEG_1; + } + + /* trail byte */ + ++nextSourceIndex; + c = decodeBocu1TrailByte(1, source.get()); + if(c<0 || ((c = prev + diff + c)&UConverterConstants.UNSIGNED_INT_MASK)>0x10ffff) { + bytes[0]= source.get(-2); + bytes[1]= source.get(-1); + byteIndex = 2; + cr = CoderResult.malformedForLength(1); + break; + } + } else if(c == BOCU1_RESET) { + /* only reset the state, no code point */ + prev=BOCU1_ASCII_PREV; + sourceIndex=nextSourceIndex; + continue; + } else { + /* + * For multi-byte difference lead bytes, set the decoder state + * with the partial difference value from the lead byte and + * with the number of trail bytes. 
+ */ + bytes[0]= (byte)c; + byteIndex = 1; + + diff = decodeBocu1LeadByte(c); + count = diff&3; + diff>>=2; + getTrail(source, target, offsets, cr); + if(labelType != regularLoop){ + return labelType; + } + } + } + + /* calculate the next prev and output c */ + prev = BOCU1_PREV(c); + if(c<=0xffff) { + target.put((char)c); + if(offsets!=null){ + offsets.put(sourceIndex); + } + } else { + /* output surrogate pair */ + target.put((char)UTF16.getLeadSurrogate(c)); + if(target.hasRemaining()) { + target.put((char)UTF16.getTrailSurrogate(c)); + if(offsets!=null){ + offsets.put(sourceIndex); + offsets.put(sourceIndex); + } + } else { + /* target overflow */ + if(offsets!=null){ + offsets.put(sourceIndex); + } + charErrorBufferArray[0] = UTF16.getTrailSurrogate(c); + charErrorBufferLength = 1; + cr = CoderResult.OVERFLOW; + break; + } + } + sourceIndex=nextSourceIndex; + } + labelType = endLoop; + return labelType; + } + + private void endLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, CoderResult cr){ + if(cr.isMalformed()) { + /* set the converter state in UConverter to deal with the next character */ + toUnicodeStatus = BOCU1_ASCII_PREV; + mode = 0; + } else { + /* set the converter state back into UConverter */ + toUnicodeStatus=prev; + mode=(diff<<2)|count; + } + toULength=byteIndex; + LabelLoop = false; + } + + } + + + public CharsetDecoder newDecoder() { + return new CharsetDecoderBOCU(this); + } + + public CharsetEncoder newEncoder() { + return new CharsetEncoderBOCU(this); + } + + void getUnicodeSetImpl( UnicodeSet setFillIn, int which){ + CharsetICU.getCompleteUnicodeSet(setFillIn); + } + +} diff --git a/icu4j/src/com/ibm/icu/charset/CharsetSCSU.java b/icu4j/src/com/ibm/icu/charset/CharsetSCSU.java new file mode 100644 index 00000000000..4a050bbfbd4 --- /dev/null +++ b/icu4j/src/com/ibm/icu/charset/CharsetSCSU.java @@ -0,0 +1,1254 @@ +/* + ******************************************************************************* + * Copyright (C) 2008, 
International Business Machines Corporation and * + * others. All Rights Reserved. * + ******************************************************************************* + */ +package com.ibm.icu.charset; + +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.IntBuffer; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CharsetEncoder; +import java.nio.charset.CoderResult; + + +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.lang.UCharacter; + +/** + * @author krajwade + * + */ +class CharsetSCSU extends CharsetICU{ + /* SCSU definitions --------------------------------------------------------- */ + + /* SCSU command byte values */ + //enum { + private static final short SQ0=0x01; /* Quote from window pair 0 */ + private static final short SQ7=0x08; /* Quote from window pair 7 */ + private static final short SDX=0x0B; /* Define a window as extended */ + private static final short Srs=0x0C; /* reserved */ + private static final short SQU=0x0E; /* Quote a single Unicode character */ + private static final short SCU=0x0F; /* Change to Unicode mode */ + private static final short SC0=0x10; /* Select window 0 */ + private static final short SC7=0x17; /* Select window 7 */ + private static final short SD0=0x18; /* Define and select window 0 */ + private static final short SD7=0x1F; /* Define and select window 7 */ + + private static final short UC0=0xE0; /* Select window 0 */ + private static final short UC7=0xE7; /* Select window 7 */ + private static final short UD0=0xE8; /* Define and select window 0 */ + private static final short UD7=0xEF; /* Define and select window 7 */ + private static final short UQU=0xF0; /* Quote a single Unicode character */ + private static final short UDX=0xF1; /* Define a Window as extended */ + private static final short Urs=0xF2; /* reserved */ + // }; + + // enum { + /* + * Unicode code points from 3400 to E000 are not adressible by + * dynamic window, since in 
these areas no short run alphabets are
     * found. Therefore add gapOffset to all values from gapThreshold.
     */
    private static final int gapThreshold=0x68;
    private static final int gapOffset = 0xAC00 ;
    /* values between reservedStart and fixedThreshold are reserved */
    private static final int reservedStart=0xA8;
    /*
     * use table of predefined fixed offsets for values from fixedThreshold.
     * The table below is labelled 0xF9..0xFF, the decoder indexes it with
     * (b - fixedThreshold) and the encoder emits 0xf9+i for entry i, so the
     * threshold has to be 0xF9 (it was 0xF, which indexes past the table).
     */
    private static final int fixedThreshold=0xF9;
    //};

    /* substitution byte sequence handed to the encoder's super constructor */
    protected byte[] fromUSubstitution = new byte[]{(byte)0x1A};

    /* constant offsets for the 8 static windows */
    private static final int staticOffsets[]={
        0x0000, /* ASCII for quoted tags */
        0x0080, /* Latin - 1 Supplement (for access to punctuation) */
        0x0100, /* Latin Extended-A */
        0x0300, /* Combining Diacritical Marks */
        0x2000, /* General Punctuation */
        0x2080, /* Currency Symbols */
        0x2100, /* Letterlike Symbols and Number Forms */
        0x3000  /* CJK Symbols and punctuation */
    };

    /* initial offsets for the 8 dynamic (sliding) windows */
    private static final int initialDynamicOffsets[]={
        0x0080, /* Latin-1 */
        0x00C0, /* Latin Extended A */
        0x0400, /* Cyrillic */
        0x0600, /* Arabic */
        0x0900, /* Devanagari */
        0x3040, /* Hiragana */
        0x30A0, /* Katakana */
        0xFF00  /* Fullwidth ASCII */
    };

    /* Table of fixed predefined Offsets, indexed by (offset byte - fixedThreshold) */
    private static final int fixedOffsets[]={
        /* 0xF9 */ 0x00C0, /* Latin-1 Letters + half of Latin Extended A */
        /* 0xFA */ 0x0250, /* IPA extensions */
        /* 0xFB */ 0x0370, /* Greek */
        /* 0xFC */ 0x0530, /* Armenian */
        /* 0xFD */ 0x3040, /* Hiragana */
        /* 0xFE */ 0x30A0, /* Katakana */
        /* 0xFF */ 0xFF60  /* Halfwidth Katakana */
    };

    /* state values of the toUnicode state machine */
    //enum {
    private static final int readCommand=0;
    private static final int quotePairOne=1;
    private static final int quotePairTwo=2;
    private static final int quoteOne=3;
    private static final int definePairOne=4;
    private static final int definePairTwo=5;
    private static final int defineOne=6;
    // };


    /**
     * Mutable converter state: the dynamic window offsets and state machine
     * variables for both conversion directions, plus the window usage list
     * used when a dynamic window has to be redefined.
     */
    private final class SCSUData{

        /* dynamic window offsets, initialized to default values from initialDynamicOffsets */
        int toUDynamicOffsets[] = new int[8] ;
        int fromUDynamicOffsets[] = new int[8] ;

        /* state machine state - toUnicode */
        boolean toUIsSingleByteMode;
        short toUState;
        byte toUQuoteWindow, toUDynamicWindow;
        short toUByteOne;
        short toUPadding[];

        /* state machine state - fromUnicode */
        boolean fromUIsSingleByteMode;
        byte fromUDynamicWindow;

        /*
         * windowUse[] keeps track of the use of the dynamic windows:
         * At nextWindowUseIndex there is the least recently used window,
         * and the following windows (in a wrapping manner) are more and more
         * recently used.
         * At nextWindowUseIndex-1 there is the most recently used window.
         */
        /* selects the initial windowUse[] order; initialize() reads it but does not set it */
        byte locale;
        byte nextWindowUseIndex;
        byte windowUse[] = new byte[8];

        SCSUData(){
            initialize();
        }

        /**
         * Puts both directions back into their start state: single-byte mode,
         * dynamic window 0, default window offsets and the initial window
         * usage order for the current locale value.
         */
        void initialize(){
            /* toUnicode side */
            System.arraycopy(initialDynamicOffsets, 0, this.toUDynamicOffsets, 0, 8);
            this.toUIsSingleByteMode = true;
            this.toUState = readCommand;
            this.toUQuoteWindow = 0;
            this.toUDynamicWindow = 0;
            this.toUByteOne = 0;

            /* fromUnicode side */
            this.fromUIsSingleByteMode = true;
            this.fromUDynamicWindow = 0;
            System.arraycopy(initialDynamicOffsets, 0, this.fromUDynamicOffsets, 0, 8);

            /* window usage order */
            this.nextWindowUseIndex = 0;
            switch(this.locale){
            case l_ja:
                System.arraycopy(initialWindowUse_ja, 0, this.windowUse, 0, 8);
                break;
            default:
                System.arraycopy(initialWindowUse, 0, this.windowUse, 0, 8);
            }
        }
    }

    /* initial contents of SCSUData.windowUse[]: generic order and the order used for l_ja */
    static final byte initialWindowUse[]={ 7, 0, 3, 2, 4, 5, 6, 1 };
    static final byte initialWindowUse_ja[]={ 3, 2, 4, 1, 0, 7, 5, 6 };

    /* values for SCSUData.locale */
    //enum {
    private static final int lGeneric = 0;
    private static final int l_ja = 1;
    //};

    /* state object created once in the constructor; the coders' implReset() re-initializes it */
    private SCSUData extraInfo = null;

    /**
     * Creates the SCSU charset; the three names are passed through to the
     * superclass constructor.
     */
    public CharsetSCSU(String icuCanonicalName, String javaCanonicalName, String[] aliases){
        super(icuCanonicalName, javaCanonicalName,
aliases); + maxBytesPerChar = 4; + minBytesPerChar = 1; + maxCharsPerByte = 1; + extraInfo = new SCSUData(); + } + + + class CharsetDecoderSCSU extends CharsetDecoderICU { + + /* label Values */ + private static final int FastSingle=0; + private static final int SingleByteMode=1; + private static final int EndLoop=2; + + /* Mode Type */ + private static final int ByteMode = 0; + private static final int UnicodeMode =1; + + + public CharsetDecoderSCSU(CharsetICU cs) { + super(cs); + implReset(); + } + + //private SCSUData data ; + protected void implReset(){ + super.implReset(); + toULength = 0; + extraInfo.initialize(); + } + + short b; + + //Get the state machine state + private boolean isSingleByteMode ; + private short state ; + private byte quoteWindow ; + private byte dynamicWindow ; + private short byteOne; + private boolean LabelLoop; + + //sourceIndex=-1 if the current character began in the previous buffer + private int sourceIndex ; + private int nextSourceIndex ; + + protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, + boolean flush){ + SCSUData data ; + data = extraInfo; + //Get the state machine state + isSingleByteMode = data.toUIsSingleByteMode; + state = data.toUState; + quoteWindow = data.toUQuoteWindow; + dynamicWindow = data.toUDynamicWindow; + byteOne = data.toUByteOne; + LabelLoop = true; + + //sourceIndex=-1 if the current character began in the previous buffer + sourceIndex = data.toUState == readCommand ? 
0: -1 ; + nextSourceIndex = 0; + + CoderResult cr = CoderResult.UNDERFLOW; + int labelType = 0; + while(LabelLoop){ + if(isSingleByteMode){ + switch(labelType){ + case FastSingle: + /*fast path for single-byte mode*/ + labelType = fastSingle(source, target, offsets, data, cr, ByteMode); + break; + case SingleByteMode: + /* normal state machine for single-byte mode, minus handling for what fastSingleCovers */ + labelType = singleByteMode(source, target, offsets, data,cr, ByteMode); + break; + case EndLoop: + endLoop(source, target, offsets,data, cr); + break; + } + }else{ + switch(labelType){ + case FastSingle: + /*fast path for single-byte mode*/ + labelType = fastSingle(source, target, offsets, data,cr, UnicodeMode); + break; + case SingleByteMode: + /* normal state machine for single-byte mode, minus handling for what fastSingleCovers */ + labelType = singleByteMode(source, target, offsets, data,cr, UnicodeMode); + break; + case EndLoop: + endLoop(source, target, offsets, data, cr); + break; + } + //LabelLoop = false; + } + } + return cr; + } + + private int fastSingle(ByteBuffer source, CharBuffer target, IntBuffer offsets, SCSUData data,CoderResult cr, int modeType){ + int label = 0; + if(modeType==ByteMode){ + + if(state==readCommand){ + while(source.hasRemaining() && target.hasRemaining() && (b=(short)(source.get(source.position()) & UConverterConstants.UNSIGNED_BYTE_MASK)) >= 0x20){ + source.position(source.position()+1); + ++nextSourceIndex; + if(b <= 0x7f){ + /*Write US graphic character or DEL*/ + target.put((char)b); + if(offsets != null){ + offsets.put(sourceIndex); + } + }else{ + /*Write from dynamic window*/ + int c = data.toUDynamicOffsets[dynamicWindow] + (b&0x7f); + if(c <= 0xffff){ + target.put((char)c); + if(offsets != null){ + offsets.put(sourceIndex); + } + }else{ + /*Output surrogate pair */ + target.put((char)(0xd7c0 + (c>>10))); + if(target.hasRemaining()){ + target.put((char)(0xdc00 | (c&0x3ff))); + if(offsets != null){ + 
offsets.put(sourceIndex); + offsets.put(sourceIndex); + } + }else{ + /* target overflow */ + if(offsets != null){ + offsets.put(sourceIndex); + } + charErrorBufferArray[0] = (char)(0xdc00 | (c&0x3ff)); + charErrorBufferLength = 1; + label = EndLoop; + cr = CoderResult.OVERFLOW; + LabelLoop = false; + return label; + } + } + } + sourceIndex = nextSourceIndex; + } + // label = SingleByteMode; + } + }else if(modeType==UnicodeMode){ + /* fast path for unicode mode */ + if(state == readCommand){ + while((source.position()+1)(Urs-UC0)){ + target.put((char)((b<<8)|(source.get(source.position()+1)&UConverterConstants.UNSIGNED_BYTE_MASK))); + if(offsets != null){ + offsets.put(sourceIndex); + } + sourceIndex = nextSourceIndex; + nextSourceIndex+=2; + source.position(source.position()+2); + } + } + } + label = SingleByteMode; + return label; + } + + private int singleByteMode(ByteBuffer source, CharBuffer target, IntBuffer offsets, SCSUData data,CoderResult cr, int modeType){ + int label = SingleByteMode; + if(modeType == ByteMode){ + while(source.hasRemaining()){ + if(!target.hasRemaining()){ + cr = CoderResult.OVERFLOW; + LabelLoop = false; + return label; + } + //b = (short)source.get(); + b = (short)(source.get() & UConverterConstants.UNSIGNED_BYTE_MASK); + ++nextSourceIndex; + switch(state){ + case readCommand: + /*redundant conditions are commented out */ + if(((1L<>10))); + if(target.hasRemaining()){ + target.put((char)(0xdc00 | (c&0x3ff))); + if(offsets != null){ + offsets.put(sourceIndex); + offsets.put(sourceIndex); + } + }else { + /* target overflow */ + if(offsets != null){ + offsets.put(sourceIndex); + } + charErrorBufferArray[0] = (char)(0xdc00 | (c&0x3ff)); + charErrorBufferLength = 1; + label = EndLoop; + cr = CoderResult.OVERFLOW; + LabelLoop = false; + return label; + } + } + } + sourceIndex = nextSourceIndex; + state = readCommand; + label = FastSingle; + return label; + case definePairOne: + dynamicWindow = (byte)((b>>5)&7); + byteOne = (byte)(b&0x1f); + 
toUBytesArray[1] = (byte)b; + toULength = 2; + state = definePairTwo; + break; + case definePairTwo: + data.toUDynamicOffsets[dynamicWindow] = 0x10000 + (byteOne<<15L | b<<7L); + sourceIndex = nextSourceIndex; + state = readCommand; + label = FastSingle; + return label; + case defineOne: + if(b==0){ + /*callback (illegal)*/ + toUBytesArray[1] = (byte)b; + toULength =2; + label = EndLoop; + return label; + }else if(b=fixedThreshold){ + data.toUDynamicOffsets[dynamicWindow] = fixedOffsets[b-fixedThreshold]; + }else{ + /*callback (illegal)*/ + toUBytesArray[1] = (byte)b; + toULength =2; + label = EndLoop; + return label; + } + sourceIndex = nextSourceIndex; + state = readCommand; + label = FastSingle; + return label; + } + //} + } + + }else if(modeType==UnicodeMode){ + while(source.hasRemaining()){ + if(!target.hasRemaining()){ + cr = CoderResult.OVERFLOW; + LabelLoop = false; + return label; + } + // if(target.hasRemaining()){ + b = source.get(); + ++nextSourceIndex; + switch(state){ + case readCommand: + if((byte)(b -UC0)>(Urs - UC0)){ + byteOne = b; + toUBytesArray[0] = (byte)b; + toULength = 1; + state = quotePairOne; + }else if((b&UConverterConstants.UNSIGNED_BYTE_MASK) <= UC7){ + dynamicWindow = (byte)(b - UC0); + sourceIndex = nextSourceIndex; + isSingleByteMode = true; + label = FastSingle; + return label; + }else if((b&UConverterConstants.UNSIGNED_BYTE_MASK) <= UD7){ + dynamicWindow = (byte)(b - UD0); + isSingleByteMode = true; + toUBytesArray[0] = (byte)b; + toULength = 1; + state = defineOne; + label = SingleByteMode; + return label; + }else if((b&UConverterConstants.UNSIGNED_BYTE_MASK) == UDX){ + isSingleByteMode = true; + toUBytesArray[0] = (byte)b; + toULength = 1; + state = definePairOne; + label = SingleByteMode; + return label; + }else if((b&UConverterConstants.UNSIGNED_BYTE_MASK) == UQU){ + toUBytesArray[0] = (byte)b; + toULength = 1; + state = quotePairOne; + }else { + /* callback (illegal)*/ + cr = CoderResult.malformedForLength(1); + 
toUBytesArray[0] = (byte)b; + toULength = 1; + label = EndLoop; + return label; + } + break; + case quotePairOne: + byteOne = b; + toUBytesArray[1] = (byte)b; + toULength = 2; + state = quotePairTwo; + break; + case quotePairTwo: + target.put((char)((byteOne<<8) | b)); + if(offsets != null){ + offsets.put(sourceIndex); + } + sourceIndex = nextSourceIndex; + state = readCommand; + label = FastSingle; + return label; + } + // } + } + } + //LabelLoop = false; + label = EndLoop; + return label; + } + + private void endLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, SCSUData data, CoderResult cr){ + if(state == readCommand){ + toULength = 0; + } + data.toUIsSingleByteMode = isSingleByteMode; + data.toUState = state; + data.toUQuoteWindow = quoteWindow; + data.toUDynamicWindow = dynamicWindow; + data.toUByteOne = byteOne; + LabelLoop = false; + } + + } + + + class CharsetEncoderSCSU extends CharsetEncoderICU{ + public CharsetEncoderSCSU(CharsetICU cs) { + super(cs, fromUSubstitution); + implReset(); + } + + //private SCSUData data; + protected void implReset() { + super.implReset(); + extraInfo.initialize(); + } + + /* label Values */ + private static final int Loop=0; + private static final int GetTrailUnicode=1; + private static final int OutputBytes=2; + private static final int EndLoop =3; + + int delta; + int length; + + ///variables of compression heuristics + int offset; + char lead, trail; + int code; + byte window; + + //Get the state machine state + private boolean isSingleByteMode; + private byte dynamicWindow ; + private boolean LabelLoop; + private int currentOffset; + int c = fromUChar32; + + + //sourceIndex=-1 if the current character began in the previous buffer + private int sourceIndex = c== 0 ? 
0: -1 ; + private int nextSourceIndex = 0; + private int targetCapacity; + + protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) { + CoderResult[] cr = {CoderResult.UNDERFLOW}; + SCSUData data ; + data = extraInfo; + //Get the state machine state + isSingleByteMode = data.fromUIsSingleByteMode; + dynamicWindow = data.fromUDynamicWindow; + LabelLoop = true; + currentOffset = data.fromUDynamicOffsets[dynamicWindow]; + int c = fromUChar32; + + targetCapacity = target.limit()-target.position(); + //sourceIndex=-1 if the current character began in the previous buffer + sourceIndex = c== 0 ? 0: -1 ; + nextSourceIndex = 0; + + int labelType = 0; + while(LabelLoop){ + switch(labelType){ + case Loop: + labelType = loop(source, target, offsets, data,cr); + break; + case GetTrailUnicode: + labelType = getTrailUnicode(source, target, offsets, data,cr); + break; + case OutputBytes: + labelType = outputBytes(source, target, offsets, data, cr); + break; + case EndLoop: + endLoop(source, target, offsets, data, cr); + break; + } + } + return cr[0]; + } + + private byte getWindow(int[] offsets, int c){ + int i; + for (i=0;i<8;i++){ + if((int)((c-offsets[i]) & UConverterConstants.UNSIGNED_SHORT_MASK) <= 0x7f){ + return (byte)i; + } + } + return -1; + } + + private boolean isInOffsetWindowOrDirect(int offset, int c){ + return (boolean)((c & UConverterConstants.UNSIGNED_SHORT_MASK)<=(offset & UConverterConstants.UNSIGNED_SHORT_MASK)+0x7f & + ((c & UConverterConstants.UNSIGNED_SHORT_MASK)>=(offset & UConverterConstants.UNSIGNED_SHORT_MASK) || + ((c & UConverterConstants.UNSIGNED_SHORT_MASK)<=0x7f && ((c & UConverterConstants.UNSIGNED_SHORT_MASK)>=0x20 + || ((1L<<(c & UConverterConstants.UNSIGNED_SHORT_MASK))&0x2601)!=0)))); + } + + private byte getNextDynamicWindow(SCSUData scsu){ + byte window = scsu.windowUse[scsu.nextWindowUseIndex]; + if(++scsu.nextWindowUseIndex==8){ + scsu.nextWindowUseIndex=0; + } + return window; + } + + 
private void useDynamicWindow(SCSUData scsu, byte window){ + /*first find the index of the window*/ + int i,j; + i = scsu.nextWindowUseIndex; + do{ + if(--i<0){ + i=7; + } + }while(scsu.windowUse[i]!=window); + + /*now copy each window[i+1] to [i]*/ + j= i+1; + if(j==8){ + j=0; + } + while(j!=scsu.nextWindowUseIndex){ + scsu.windowUse[i] = scsu.windowUse[j]; + i=j; + if(++j==8){ + j=0; + } + } + + /*finally, set the window into the most recently used index*/ + scsu.windowUse[i]= window; + } + + + private int getDynamicOffset(int c){ + int i; + for(i=0;i<7;++i){ + if(((c-fixedOffsets[i])&UConverterConstants.UNSIGNED_SHORT_MASK)<=0x7f){ + offset = fixedOffsets[i]; + return 0xf9+i; + } + } + if((c&UConverterConstants.UNSIGNED_SHORT_MASK)<0x80){ + /*No dynamic window for US-ASCII*/ + return -1; + }else if(c<0x3400 || ((c-0x10000)&UConverterConstants.UNSIGNED_INT_MASK)<(0x14000-0x10000) || + ((c-0x1d000)&UConverterConstants.UNSIGNED_INT_MASK)<=(0x1ffff-0x1d000)){ + /*This character is in the code range for a "small", i.e, reasonably windowable, script*/ + offset = (c&UConverterConstants.UNSIGNED_SHORT_MASK)&0x7fffff80; + return (int)(c>>7); + }else if(0xe000<=c && c!=0xfeff && c < 0xfff0){ + /*for these characters we need to take the gapOffset into account*/ + offset=(c&UConverterConstants.UNSIGNED_SHORT_MASK)&0x7fffff80; + return (int)((c-gapOffset)>>7); + }else{ + return -1; + } + + } + + private int loop(CharBuffer source, ByteBuffer target, IntBuffer offsets, SCSUData data,CoderResult[] cr){ + int label = 0; + //int targetCapacity = target.limit()-target.position(); + if(isSingleByteMode){ + if(c!=0 && targetCapacity>0){ + label = getTrail(source, target, offsets,data, cr, c); + return label; + } + /*state machine for single byte mode*/ + while(source.hasRemaining()){ + if(target.capacity()<=0){ + /*target is full*/ + cr[0] = CoderResult.OVERFLOW; + LabelLoop = false; + break; + } + c = source.get(); + ++nextSourceIndex; + if((c -0x20)<=0x5f){ + /*pass US-ASCII 
graphic character through*/ + target.put((byte)c); + if(offsets!=null){ + offsets.put(sourceIndex); + } + --targetCapacity; + }else if((c & UConverterConstants.UNSIGNED_SHORT_MASK)<0x20){ + if(((1L<<(c & UConverterConstants.UNSIGNED_SHORT_MASK))&0x2601)!=0){ + /*CR/LF/TAB/NUL*/ + target.put((byte)c); + if(offsets!=null){ + offsets.put(sourceIndex); + } + --targetCapacity; + } else { + /*quote c0 control character*/ + c|=SQ0<<8; + length = 2; + label = OutputBytes; + return label; + } + } else if(((delta=(c-currentOffset))&UConverterConstants.UNSIGNED_INT_MASK)<=0x7f){ + /*use the current dynamic window*/ + target.put((byte)(delta|0x80)); + if(offsets!=null){ + offsets.put(sourceIndex); + } + --targetCapacity; + } else if(UTF16.isSurrogate((char)c)){ + if(UTF16.isLeadSurrogate((char)c)){ + label = getTrail(source, target, offsets,data,cr, c); + if(label==EndLoop){ + return label; + } + } else { + /*this is unmatched lead code unit (2nd Surrogate)*/ + /*callback(illegal)*/ + cr[0] = CoderResult.malformedForLength(1); + label = EndLoop; + } + + /*Compress supplementary character U+10000...U+10ffff */ + if(((delta=(c-currentOffset))&UConverterConstants.UNSIGNED_INT_MASK)<=0x7f){ + /*use the current dynamic window*/ + target.put((byte)(delta|0x80)); + if(offsets!=null){ + offsets.put(sourceIndex); + } + --targetCapacity; + } else if((window=getWindow(data.fromUDynamicOffsets, c))>=0){ + /*there is a dynamic window that contains this character, change to it*/ + dynamicWindow = window; + currentOffset = data.fromUDynamicOffsets[dynamicWindow]; + useDynamicWindow(data, dynamicWindow); + c = (((int)(SC0+dynamicWindow))<<8 | (c-currentOffset)|0x80); + length = 2; + label = OutputBytes; + return label; + } else if((code=getDynamicOffset(c))>=0){ + /*might check if there are come character in this window to come */ + /*define an extended window with this character*/ + code-=0x200; + dynamicWindow=getNextDynamicWindow(data); + currentOffset = 
data.fromUDynamicOffsets[dynamicWindow]=offset; + useDynamicWindow(data, dynamicWindow); + c = ((int)(SDX<<25) | (int)(dynamicWindow<<21)| + (int)(code<<8)| (c- currentOffset) |0x80 ); + length = 4; + label = OutputBytes; + return label; + } else { + /*change to unicode mode and output this (lead, trail) pair*/ + isSingleByteMode = false; + target.put((byte)SCU); + if(offsets!=null){ + offsets.put(sourceIndex); + } + --targetCapacity; + c = ((int)(lead<<16))|trail; + length = 4; + label = OutputBytes; + return label; + } + } else if(c<0xa0){ + /*quote C1 control character*/ + c = (c&0x7f) | (SQ0+1)<<8; /*SQ0+1 == SQ1*/ + length = 2; + label = OutputBytes; + return label; + } else if(c==0xfeff || c>= 0xfff0){ + /*quote signature character = byte order mark and specials*/ + c |= SQU<<16; + length = 3; + label = OutputBytes; + return label; + } else { + /*compress all other BMP characters*/ + if((window=getWindow(data.fromUDynamicOffsets, c))>=0){ + /*there is a window defined that contains this character - switch to it or quote from it*/ + if(source.position()>=source.limit() || isInOffsetWindowOrDirect(data.fromUDynamicOffsets[window], source.get(source.position()))){ + /*change to dynamic window*/ + dynamicWindow = window; + currentOffset = data.fromUDynamicOffsets[dynamicWindow]; + useDynamicWindow(data, dynamicWindow); + c = ((int)((SC0+window)<<8)) | (c- currentOffset) | 0x80; + length = 2; + label = OutputBytes; + return label; + } else { + /*quote from dynamic window*/ + c = ((int)((SQ0+window)<<8)) | (c - data.fromUDynamicOffsets[window]) | + 0x80; + length = 2; + label = OutputBytes; + return label; + } + } else if((window = getWindow(staticOffsets, c))>=0){ + /*quote from static window*/ + c = ((int)((SQ0+window)<<8)) | (c - staticOffsets[window]); + length = 2; + label = OutputBytes; + return label; + }else if((code=getDynamicOffset(c))>=0){ + /*define a dynamic window with this character*/ + dynamicWindow = getNextDynamicWindow(data); + currentOffset = 
data.fromUDynamicOffsets[dynamicWindow]=offset; + useDynamicWindow(data, dynamicWindow); + c = ((int)((SD0+dynamicWindow)<<16)) | (int)(code<<8)| + (c- currentOffset) | 0x80; + length = 3; + label = OutputBytes; + return label; + } else if(((int)((c-0x3400)&UConverterConstants.UNSIGNED_SHORT_MASK))<(0xd800-0x3400) && (source.position()>=source.limit() || + ((int)((source.get(source.position())-0x3400)&UConverterConstants.UNSIGNED_SHORT_MASK))< (0xd800 - 0x3400))){ + /* + * this character is not compressible (a BMP ideograph of similar) + * switch to Unicode mode if this is the last character in the block + * or there is at least one more ideograph following immediately + */ + isSingleByteMode = false; + c|=SCU<<16; + length =3; + label = OutputBytes; + return label; + } else { + /*quote Unicode*/ + c|=SQU<<16; + length = 3; + label = OutputBytes; + return label; + } + } + /*normal end of conversion : prepare for new character */ + c = 0; + sourceIndex = nextSourceIndex; + } + } else { + if(c!=0 && targetCapacity>0){ + label = GetTrailUnicode; + return label; + } + + /*state machine for Unicode*/ + /*unicodeByteMode*/ + while(source.hasRemaining()){ + if(target.capacity()<=0){ + /*target is full*/ + cr[0] = CoderResult.OVERFLOW; + LabelLoop = false; + break; + } + c = source.get(); + ++nextSourceIndex; + + if((((c-0x3400)& UConverterConstants.UNSIGNED_INT_MASK))<(0xd800-0x3400)){ + /*not compressible, write character directly */ + if(targetCapacity>=2){ + target.put((byte)(c>>8)); + target.put((byte)c); + if(offsets!=null){ + offsets.put(sourceIndex); + offsets.put(sourceIndex); + } + targetCapacity-=2; + } else { + length =2; + label = OutputBytes; + return label; + } + } else if((((c-0x3400)& UConverterConstants.UNSIGNED_INT_MASK))>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300*/){ + /*compress BMP character if the following one is not an uncompressible ideograph*/ + if(!(source.hasRemaining() && (((source.get(source.position())-0x3400)& 
UConverterConstants.UNSIGNED_INT_MASK))<(0xd800-0x3400))){ + if(((((c-0x30)&UConverterConstants.UNSIGNED_INT_MASK))<10 || (((c-0x61)&UConverterConstants.UNSIGNED_INT_MASK))<26 + || (((c-0x41)&UConverterConstants.UNSIGNED_INT_MASK))<26)){ + /*ASCII digit or letter*/ + isSingleByteMode = true; + c |=((int)((UC0+dynamicWindow)<<8))|c; + length = 2; + label = OutputBytes; + return label; + } else if((window=getWindow(data.fromUDynamicOffsets, c))>=0){ + /*there is a dynamic window that contains this character, change to it*/ + isSingleByteMode = true; + dynamicWindow = window; + currentOffset = data.fromUDynamicOffsets[dynamicWindow]; + useDynamicWindow(data, dynamicWindow); + c = ((int)((UC0+dynamicWindow)<<8)) | (c- currentOffset) | 0x80; + length = 2; + label = OutputBytes; + return label; + } else if((code=getDynamicOffset(c))>=0){ + /*define a dynamic window with this character*/ + isSingleByteMode = true; + dynamicWindow = getNextDynamicWindow(data); + currentOffset = data.fromUDynamicOffsets[dynamicWindow]=offset; + useDynamicWindow(data, dynamicWindow); + c = ((int)((UD0+dynamicWindow)<<16)) | (int)(code<<8) + |(c- currentOffset) | 0x80; + length = 3; + label = OutputBytes; + return label; + } + } + + /*don't know how to compress these character, just write it directly*/ + length = 2; + label = OutputBytes; + return label; + } else if(c<0xe000){ + label = GetTrailUnicode; + return label; + } else { + /*quote to avoid SCSU tags*/ + c|=UQU<<16; + length = 3; + label = OutputBytes; + return label; + } + + /*normal end of conversion, prepare for a new character*/ + c = 0; + sourceIndex = nextSourceIndex; + } + } + label = EndLoop; + return label; + } + + private int getTrail(CharBuffer source, ByteBuffer target, IntBuffer offsets, SCSUData data,CoderResult[] cr, int b){ + lead = (char)b; + int label = Loop; + if(source.hasRemaining()){ + /*test the following code unit*/ + trail = source.get(source.position()); + if(UTF16.isTrailSurrogate((char)trail)){ + 
source.position(source.position()+1); + ++nextSourceIndex; + c = UCharacter.getCodePoint((char)b, trail); + label = Loop; + } else { + /*this is unmatched lead code unit (1st Surrogate)*/ + /*callback(illegal)*/ + cr[0] = CoderResult.malformedForLength(1); + label = EndLoop; + } + }else { + /*no more input*/ + label = EndLoop; + } + return label; + } + + private int getTrailUnicode(CharBuffer source, ByteBuffer target, IntBuffer offsets, SCSUData data,CoderResult[] cr){ + int label = EndLoop; + /*c is surrogate*/ + if(UTF16.isLeadSurrogate((char)c)){ + // getTrailUnicode: + lead = (char)c; + if(source.hasRemaining()){ + /*test the following code unit*/ + trail = source.get(source.position()); + if(UTF16.isTrailSurrogate(trail)){ + source.get(); + ++nextSourceIndex; + c = UCharacter.getCodePoint((char)c, trail); + /*convert this surrogate code point*/ + /*exit this condition tree*/ + } else { + /*this is unmatched lead code unit(1st surrogate)*/ + /*callback(illegal)*/ + cr[0] = CoderResult.malformedForLength(1); + label = EndLoop; + return label; + } + } else { + /*no more input*/ + label = EndLoop; + return label; + } + } else { + /*this is an unmatched trail code point (2nd surrogate)*/ + /*callback (illegal)*/ + cr[0] = CoderResult.malformedForLength(1); + label = EndLoop; + return label; + } + + /*compress supplementary character*/ + if((window=getWindow(data.fromUDynamicOffsets, c))>=0 && + !(source.hasRemaining() && ((source.get(source.position())-0x3400)&UConverterConstants.UNSIGNED_INT_MASK) < + (0xd800 - 0x3400))){ + /* + * this is the dynamic window that contains this character and the following + * character is not uncompressible, + * change to the window + */ + isSingleByteMode = true; + dynamicWindow = window; + currentOffset = data.fromUDynamicOffsets[dynamicWindow]; + useDynamicWindow(data, dynamicWindow); + c = ((UC0+dynamicWindow)<<8 | (c-currentOffset) | 0x80); + length = 2; + label = OutputBytes; + return label; + } else if(source.hasRemaining() 
&& lead == source.get(source.position()) && (code=getDynamicOffset(c))>=0){ + /*two supplementary characters in (probably) the same window - define an extended one*/ + isSingleByteMode = true; + dynamicWindow = getNextDynamicWindow(data); + currentOffset = data.fromUDynamicOffsets[dynamicWindow] = offset; + useDynamicWindow(data, dynamicWindow); + c = (UDX<<24) | (dynamicWindow<<21) |(code<<8) |(c - currentOffset) | 0x80; + length = 4; + label = OutputBytes; + return label; + } else { + /*don't know how to compress this character, just write it directly*/ + c = (lead<<16)|trail; + length = 4; + label = OutputBytes; + return label; + } + + } + + private void endLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, SCSUData data,CoderResult[] cr){ + /*set the converter state back to UConverter*/ + data.fromUIsSingleByteMode = isSingleByteMode; + data.fromUDynamicWindow = dynamicWindow; + fromUChar32 = c; + LabelLoop = false; + } + + private int outputBytes(CharBuffer source, ByteBuffer target, IntBuffer offsets, SCSUData data,CoderResult[] cr){ + int label; + //int targetCapacity = target.limit()-target.position(); + /*write the output character byte from c and length*/ + /*from the first if in the loop we know that targetCapacity>0*/ + if(length<=targetCapacity){ + if(offsets==null){ + switch(length){ + /*each branch falls through the next one*/ + case 4: + target.put((byte)(c>>24)); + case 3: + target.put((byte)(c>>16)); + case 2: + target.put((byte)(c>>8)); + case 1: + target.put((byte)c); + default: + /*will never occur*/ + break; + } + }else { + switch(length){ + /*each branch falls through to the next one*/ + case 4: + target.put((byte)(c>>24)); + if(offsets!=null){ + offsets.put(sourceIndex); + } + case 3: + target.put((byte)(c>>16)); + if(offsets!=null){ + offsets.put(sourceIndex); + } + case 2: + target.put((byte)(c>>8)); + if(offsets!=null){ + offsets.put(sourceIndex); + } + case 1: + target.put((byte)c); + if(offsets!=null){ + 
offsets.put(sourceIndex); + } + default: + /*will never occur*/ + break; + } + } + targetCapacity-=length; + + /*normal end of conversion: prepare for a new character*/ + c = 0; + sourceIndex = nextSourceIndex; + label = Loop; + return label; + } else { + ByteBuffer p = ByteBuffer.wrap(errorBuffer); + /* + * We actually do this backwards here: + * In order to save an intermediate variable, we output + * first to the overflow buffer what does not fit into the + * regular target + */ + /* we know that 0<=targetCapacity>24)); + case 3: + p.put((byte)(c>>16)); + case 2: + p.put((byte)(c>>8)); + case 1: + p.put((byte)c); + default: + /*will never occur*/ + break; + } + errorBufferLength = length; + + /*now output what fits into the regular target*/ + c>>=8*length; //length was reduced by targetCapacity + switch(targetCapacity){ + /*each branch falls through the next one*/ + case 3: + target.put((byte)(c>>16)); + if(offsets!=null){ + offsets.put(sourceIndex); + } + case 2: + target.put((byte)(c>>8)); + if(offsets!=null){ + offsets.put(sourceIndex); + } + case 1: + target.put((byte)c); + if(offsets!=null){ + offsets.put(sourceIndex); + } + default: + break; + } + + /*target overflow*/ + targetCapacity = 0; + cr[0] = CoderResult.OVERFLOW; + c = 0; + label = EndLoop; + return label; + } + } + + } + + + public CharsetDecoder newDecoder() { + return new CharsetDecoderSCSU(this); + } + + public CharsetEncoder newEncoder() { + return new CharsetEncoderSCSU(this); + } + + void getUnicodeSetImpl( UnicodeSet setFillIn, int which){ + CharsetICU.getCompleteUnicodeSet(setFillIn); + } + +}