ICU-1763 Synchronize with ICU4C

X-SVN-Rev: 8278
This commit is contained in:
Ram Viswanadha 2002-03-28 01:51:50 +00:00
parent e6ca8550ef
commit 59c4ad0ada
4 changed files with 284 additions and 83 deletions

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/NormalizerDataReader.java,v $
* $Date: 2002/03/13 05:56:29 $
* $Revision: 1.2 $
* $Date: 2002/03/28 01:50:59 $
* $Revision: 1.3 $
*******************************************************************************
*/
@ -225,24 +225,55 @@ import com.ibm.icu.impl.ICUDebug;
*
* - Auxiliary trie and data
*
* The auxiliary 32-bit trie contains data for additional properties.
*
* The auxiliary 16-bit trie contains data for additional properties.
* Bits
* 31 set if lead surrogate offset
* 30 composition exclusion
* 29..20 index into extraData[] to FC_NFKC_Closure string (bit 31==0),
* or lead surrogate offset (bit 31==1)
* 19..16 skippable flags
* 15 reserved
* 14 flag: not a safe starter for canonical closure
* 13.. 0 index to serialized USet for canonical closure
* the set lists the code points whose decompositions start with
* the one that this data is for
* for how USets are serialized see uset.c
* 15..12 reserved (for skippable flags, see NormalizerTransliterator)
* 11 flag: not a safe starter for canonical closure
* 10 composition exclusion
* 9.. 0 index into extraData[] to FC_NFKC_Closure string
* (not for lead surrogate),
* or lead surrogate offset (for lead surrogate, if 9..0 not zero)
*
* - FC_NFKC_Closure strings in extraData[]
*
* Strings are either stored as a single code unit or as the length
* followed by that many units.
*
* - structure inside canonStartSets[]
*
* This array maps from code points c to sets of code points (USerializedSet).
* The result sets are the code points whose canonical decompositions start
* with c.
*
* canonStartSets[] contains the following sub-arrays:
*
* indexes[_NORM_SET_INDEX_TOP]
* - contains lengths of sub-arrays etc.
*
* startSets[indexes[_NORM_SET_INDEX_CANON_SETS_LENGTH]-_NORM_SET_INDEX_TOP]
* - contains serialized sets (USerializedSet) of canonical starters for
* enumerating canonically equivalent strings
* indexes[_NORM_SET_INDEX_CANON_SETS_LENGTH] includes _NORM_SET_INDEX_TOP
* for details about the structure see uset.c
*
* bmpTable[indexes[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]]
* - a sorted search table for BMP code points whose results are
* either indexes to USerializedSets or single code points for
* single-code point sets;
* each entry is a pair of { code point, result } with result=(binary) yy xxxxxx xxxxxxxx
* if yy==01 then there is a USerializedSet at canonStartSets+x
* else build a USerializedSet with result as the single code point
*
* suppTable[indexes[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]]
* - a sorted search table for supplementary code points whose results are
* either indexes to USerializedSets or single code points for
* single-code point sets;
* each entry is a triplet of { high16(cp), low16(cp), result }
* each code point's high-word may contain extra data in bits 15..5:
* if the high word has bit 15 set, then build a set with a single code point
* which is ((high16(cp)&0x1f00)<<8)|result;
* else there is a USerializedSet at canonStartSets+result
*/
final class NormalizerDataReader {
private final static boolean debug = ICUDebug.enabled("NormalizerDataReader");
@ -278,36 +309,10 @@ final class NormalizerDataReader {
*/
protected void read(NormalizerImpl impl)
throws IOException{
/*
* - Overall partition
*
* unorm.dat customarily begins with a UDataInfo structure, see udata.h and .c.
* After that there are the following structures:
*
* char indexes[INDEX_TOP]; -- INDEX_TOP=32, see enum in this file
*
* Trie normTrie; -- size in bytes=indexes[INDEX_TRIE_SIZE]
*
* char extraData[extraDataTop]; -- extraDataTop=indexes[INDEX_UCHAR_COUNT]
* extraData[0] contains the number of units for
* FC_NFKC_Closure (formatVersion>=2.1)
*
* char combiningTable[combiningTableTop]; -- combiningTableTop=indexes[INDEX_COMBINE_DATA_COUNT]
* combiningTableTop may include one 16-bit padding unit
* to make sure that fcdTrie is 32-bit-aligned
*
* Trie fcdTrie; -- size in bytes=indexes[INDEX_FCD_TRIE_SIZE]
*
* Trie auxTrie; -- size in bytes=indexes[INDEX_AUX_TRIE_SIZE]
*
* char canonStartSets[canonStartSetsTop] -- canonStartSetsTop=indexes[INDEX_CANON_SET_COUNT]
* serialized USets, see uset.c
*
*/
//Read the indexes
int[] indexes = new int[NormalizerImpl.INDEX_TOP];
for (int i = 0; i <indexes.length ; i ++) {
for (int i = 0; i <indexes.length ; i++) {
indexes[i] = dataInputStream.readInt();
}
@ -343,16 +348,32 @@ final class NormalizerDataReader {
ByteArrayInputStream auxTrieStream= new ByteArrayInputStream(auxBytes);
//Read the canonical start sets
char[] canonStartSets=new char[indexes[NormalizerImpl.INDEX_CANON_SET_COUNT]];
for(int i=0; i<canonStartSets.length; i++){
canonStartSets[i]=dataInputStream.readChar();
Object[] canonStartSets=new Object[NormalizerImpl.CANON_SET_MAX_CANON_SETS];
int[] canonStartSetsIndexes = new int[NormalizerImpl.SET_INDEX_TOP];
for(int i=0; i<canonStartSetsIndexes.length; i++){
canonStartSetsIndexes[i]=dataInputStream.readChar();
}
char[] startSets = new char[canonStartSetsIndexes[NormalizerImpl.SET_INDEX_CANON_SETS_LENGTH]-NormalizerImpl.SET_INDEX_TOP];
for(int i=0; i<startSets.length; i++){
startSets[i]=dataInputStream.readChar();
}
char[] bmpTable = new char[canonStartSetsIndexes[NormalizerImpl.SET_INDEX_CANON_BMP_TABLE_LENGTH]];
for(int i=0; i<bmpTable.length; i++){
bmpTable[i]=dataInputStream.readChar();
}
char[] suppTable = new char[canonStartSetsIndexes[NormalizerImpl.SET_INDEX_CANON_SUPP_TABLE_LENGTH]];
for(int i=0; i<suppTable.length; i++){
suppTable[i]=dataInputStream.readChar();
}
canonStartSets[NormalizerImpl.CANON_SET_INDICIES_INDEX ] = canonStartSetsIndexes;
canonStartSets[NormalizerImpl.CANON_SET_START_SETS_INDEX] = startSets;
canonStartSets[NormalizerImpl.CANON_SET_BMP_TABLE_INDEX ] = bmpTable;
canonStartSets[NormalizerImpl.CANON_SET_SUPP_TABLE_INDEX] = suppTable;
//Now set the tries
impl.normTrieImpl.normTrie = new IntTrie( normTrieStream,impl.normTrieImpl );
impl.fcdTrieImpl.fcdTrie = new CharTrie(fcdTrieStream,impl.fcdTrieImpl );
impl.auxTrieImpl.auxTrie = new IntTrie( auxTrieStream, impl.auxTrieImpl );
impl.auxTrieImpl.auxTrie = new CharTrie( auxTrieStream, impl.auxTrieImpl );
impl.indexes = indexes;
impl.extraData = extraData;
impl.combiningTable = combiningTable;

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/NormalizerImpl.java,v $
* $Date: 2002/03/13 05:56:31 $
* $Revision: 1.3 $
* $Date: 2002/03/28 01:50:59 $
* $Revision: 1.4 $
*******************************************************************************
*/
@ -105,23 +105,33 @@ public final class NormalizerImpl {
static final int INDEX_CANON_SET_COUNT = 12; /* number of uint16_t in the array of serialized USet */
static final int INDEX_TOP = 32; /* changing this requires a new formatVersion */
/* AUX constants */
/* value constants for auxTrie */
/* value constants for auxTrie */
static final int AUX_UNSAFE_SHIFT = 11;
static final int AUX_COMP_EX_SHIFT = 10;
static final int AUX_UNSAFE_SHIFT = 14;
static final int AUX_FNC_SHIFT = 20;
static final int AUX_COMP_EX_SHIFT = 30;
static final int AUX_IS_LEAD_SHIFT = 31;
static final int AUX_MAX_CANON_SET = (1<<AUX_UNSAFE_SHIFT) & UNSIGNED_INT_MASK;
static final int AUX_MAX_FNC = (1<<(AUX_COMP_EX_SHIFT-AUX_FNC_SHIFT));
static final int AUX_CANON_SET_MASK = (AUX_MAX_CANON_SET-1);
static final int AUX_MAX_FNC = ((int)1<<AUX_COMP_EX_SHIFT);
static final int AUX_UNSAFE_MASK = (1<<AUX_UNSAFE_SHIFT) & UNSIGNED_INT_MASK;
static final int AUX_FNC_MASK = ((AUX_MAX_FNC-1)<<AUX_FNC_SHIFT) & UNSIGNED_INT_MASK;
static final int AUX_FNC_MASK = (AUX_MAX_FNC-1) & UNSIGNED_INT_MASK;
static final int AUX_COMP_EX_MASK = (1<<AUX_COMP_EX_SHIFT) & UNSIGNED_INT_MASK;
static final int AUX_IS_LEAD_MASK = (1<<AUX_IS_LEAD_SHIFT) & UNSIGNED_INT_MASK;
/* canonStartSets[0..31] contains indexes for what is in the array */
static final int SET_INDEX_CANON_SETS_LENGTH = 0; /* number of uint16_t in canonical starter sets */
static final int SET_INDEX_CANON_BMP_TABLE_LENGTH = 1; /* number of uint16_t in the BMP search table (contains pairs) */
static final int SET_INDEX_CANON_SUPP_TABLE_LENGTH = 2; /* number of uint16_t in the supplementary search table (contains triplets) */
static final int SET_INDEX_TOP = 32;/* changing this requires a new formatVersion */
static final int CANON_SET_INDICIES_INDEX = 0;
static final int CANON_SET_START_SETS_INDEX = 1;
static final int CANON_SET_BMP_TABLE_INDEX = 2;
static final int CANON_SET_SUPP_TABLE_INDEX = 3;
static final int CANON_SET_MAX_CANON_SETS = 0x0004; /* 14 bit indexes to canonical USerializedSets */
/* single-code point BMP sets are encoded directly in the search table except if result=0x4000..0x7fff */
static final int CANON_SET_BMP_MASK = 0xc000;
static final int CANON_SET_BMP_IS_INDEX = 0x4000;
/*******************************/
@ -158,7 +168,7 @@ public final class NormalizerImpl {
}
static final class AuxTrieImpl implements Trie.DataManipulate{
static IntTrie auxTrie = null;
static CharTrie auxTrie = null;
/**
* Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
* data the index array offset of the indexes for that lead surrogate.
@ -167,11 +177,7 @@ public final class NormalizerImpl {
* @return data offset or 0 if there is no data for the lead surrogate
*/
public int getFoldingOffset(int value){
if(value<0) {
return (value & AUX_FNC_MASK)>>(AUX_FNC_SHIFT-5);
} else {
return 0;
}
return (value&AUX_FNC_MASK)<<5;
}
}
@ -184,7 +190,7 @@ public final class NormalizerImpl {
static int[] indexes;
static char[] combiningTable;
static char[] extraData;
static char[] canonStartSets;
static Object[] canonStartSets;
static boolean isDataLoaded;
static boolean isFormatVersion_2_1;
@ -441,8 +447,8 @@ public final class NormalizerImpl {
public static boolean isFullCompositionExclusion(int c) {
if(isFormatVersion_2_1) {
int aux32=auxTrieImpl.auxTrie.getCodePointValue(c);
return (boolean)((aux32&AUX_COMP_EX_MASK)!=0);
int aux =auxTrieImpl.auxTrie.getCodePointValue(c);
return (boolean)((aux & AUX_COMP_EX_MASK)!=0);
} else {
return false;
}
@ -450,8 +456,8 @@ public final class NormalizerImpl {
public static boolean isCanonSafeStart(int c) {
if(isFormatVersion_2_1) {
int aux32 = auxTrieImpl.auxTrie.getValue(c);
return (boolean)((aux32&AUX_UNSAFE_MASK)==0);
int aux = auxTrieImpl.auxTrie.getCodePointValue(c);
return (boolean)((aux & AUX_UNSAFE_MASK)==0);
} else {
return false;
}
@ -460,15 +466,88 @@ public final class NormalizerImpl {
/**
 * Looks up code point c in one of two sorted search tables held in
 * canonStartSets (one for BMP code points, one for supplementary ones) and,
 * when an entry is found, fills fillSet either from a serialized set or with
 * a single code point.
 *
 * NOTE(review): the statements that use aux32 and the first
 * "} else { return false;" look like leftovers of the earlier
 * trie-based lookup mixed in with the table-based one - confirm which
 * lines belong to the current implementation.
 *
 * @param c       code point to look up
 * @param fillSet set object that receives the result; nothing is done when null
 * @return true if a set was stored in fillSet, false if c was not found
 */
public static boolean getCanonStartSet(int c, USerializedSet fillSet) {
if(fillSet!=null && canonStartSets!=null) {
int aux32=auxTrieImpl.auxTrie.getValue(c);
aux32&=AUX_CANON_SET_MASK;
/*
* binary search for c
*
* There are two search tables,
* one for BMP code points and one for supplementary ones.
* See unormimp.h for details.
*/
char[] table;
int i, start, limit;
return aux32!=0 &&
fillSet.getSet(canonStartSets,indexes[INDEX_CANON_SET_COUNT]-aux32);
} else {
return false;
if(c<=0xffff) {
/* BMP code point: search the BMP table stored in the canonStartSets slot array */
table=(char[]) canonStartSets[CANON_SET_BMP_TABLE_INDEX];
start=0;
limit=table.length;
/* each entry is a pair { c, result } */
/*
 * NOTE(review): the midpoint below is not rounded to an even index although
 * entries are pairs, and once limit==start+1 the midpoint equals start, so
 * "start=i" makes no progress when c>=table[start] - this loop looks like it
 * cannot terminate in that case. The supplementary search below stops at
 * limit-3 and steps in whole triplets; confirm the intended pair-wise form.
 */
while(start<limit) {
i=(char)((start+limit)/2);
if(c<table[i]) {
limit=i;
} else {
start=i;
}
}
/* found? */
/* NOTE(review): table[start] is read even when the table is empty (length 0) */
if(c==table[start]) {
i=table[start+1];
if((i&CANON_SET_BMP_MASK)==CANON_SET_BMP_IS_INDEX) {
/* result 01xxxxxx xxxxxxxx contains index x to a USerializedSet */
/*
 * NOTE(review): CANON_SET_MAX_CANON_SETS is declared as 0x0004 in this class,
 * so this mask keeps only the low 2 bits, not the 14-bit index described above.
 * The index is also resolved against the search table itself; presumably the
 * start-sets array was intended - confirm against USerializedSet.getSet.
 */
i&=(CANON_SET_MAX_CANON_SETS-1);
return fillSet.getSet(table,i);
} else {
/* other result values are BMP code points for single-code point sets */
fillSet.setSerializedToOne(i);
return true;
}
}
} else {
/* supplementary code point: search the triplet table */
char high, low, h;
table=(char[]) canonStartSets[CANON_SET_SUPP_TABLE_INDEX];
start=0;
limit=table.length;
/* split c into its upper and lower 16 bits for comparison with the table units */
high=(char)(c>>16);
low=(char)c;
/* each entry is a triplet { high(c), low(c), result } */
while(start<limit-3) {
i=(char)(((start+limit)/6)*3); /* (start+limit)/2 and address triplets */
h=(char)(table[i]&0x1f); /* high word */
if(high<h || (high==h && low<table[i+1])) {
limit=i;
} else {
start=i;
}
}
/* found? */
/* NOTE(review): as in the BMP branch, table[start] is read even for an empty table */
h=table[start];
if(high==(h&0x1f) && low==table[start+2-1]) {
i=table[start+2];
if((h&0x8000)==0) {
/* the result is an index to a USerializedSet */
/* NOTE(review): resolved against the search table, as in the BMP branch - confirm */
return fillSet.getSet(table,i);
} else {
/*
* single-code point set {x} in
* triplet { 100xxxxx 000hhhhh llllllll llllllll xxxxxxxx xxxxxxxx }
*/
i|=((int)h&0x1f00)<<8; /* add high bits from high(c) */
fillSet.setSerializedToOne((int)i);
return true;
}
}
}
}
return false; /* not found */
}
/**
* Internal API, used by collation code.
* Get access to the internal FCD trie table to be able to perform

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/USerializedSet.java,v $
* $Date: 2002/03/12 17:49:15 $
* $Revision: 1.1 $
* $Date: 2002/03/28 01:50:59 $
* $Revision: 1.2 $
*
*****************************************************************************************
*/
@ -83,7 +83,11 @@ public final class USerializedSet {
if(rangeIndex<0) {
return false;
}
if(array==null){
array = new char[8];
}
range=new int[2];
rangeIndex*=2; /* address start/limit pairs */
if(rangeIndex<bmpLength) {
range[0]=array[arrayOffset+rangeIndex++];
@ -114,6 +118,103 @@ public final class USerializedSet {
}
}
}
/**
 * Resets this set so that it contains exactly the single code point
 * {@code c}, stored as the boundary pair { c, c+1 } starting at unit 0.
 *
 * <p>Layout written, depending on {@code c}:
 * <ul>
 *   <li>{@code c < 0xffff}: two BMP units { c, c+1 }</li>
 *   <li>{@code c == 0xffff}: one BMP unit { 0xffff } followed by the
 *       two-unit supplementary limit { 1, 0 } (0x10000)</li>
 *   <li>{@code 0xffff < c < 0x10ffff}: two supplementary boundaries of two
 *       units each, { high(c), low(c), high(c+1), low(c+1) }</li>
 *   <li>{@code c == 0x10ffff}: a single supplementary start { 0x10, 0xffff }
 *       with no stored limit</li>
 * </ul>
 *
 * <p>Values outside 0..0x10ffff are ignored and leave the set unchanged.
 *
 * @param c the code point that becomes the only member of the set
 */
public final void setSerializedToOne(int c) {
    if (c < 0 || 0x10ffff < c) {
        return;
    }

    // The backing field has no initializer, so writing into it directly can
    // fail on a fresh object. Use a private buffer large enough for the
    // longest form (four units). A fresh buffer also avoids writing into
    // whatever array the field currently refers to - presumably one that was
    // supplied from outside by another method; TODO confirm.
    array = new char[4];
    // The units are written from index 0, so readers that add arrayOffset
    // must not see a stale offset from an earlier state.
    arrayOffset = 0;

    if (c < 0xffff) {
        bmpLength = length = 2;
        array[0] = (char) c;
        array[1] = (char) (c + 1);
    } else if (c == 0xffff) {
        // start is a BMP unit, limit 0x10000 needs two units
        bmpLength = 1;
        length = 3;
        array[0] = 0xffff;
        array[1] = 1;
        array[2] = 0;
    } else if (c < 0x10ffff) {
        bmpLength = 0;
        length = 4;
        array[0] = (char) (c >> 16);
        array[1] = (char) c;
        final int limit = c + 1;
        array[2] = (char) (limit >> 16);
        array[3] = (char) limit;
    } else /* c==0x10ffff */ {
        // the range runs to the end of the code point space: no limit stored
        bmpLength = 0;
        length = 2;
        array[0] = 0x10;
        array[1] = 0xffff;
    }
}
/**
 * Retrieves one range of the serialized set.
 *
 * <p>The set is stored as an ascending list of boundaries that alternate
 * between range start and range limit. The first {@code bmpLength} units are
 * single-unit (BMP) boundaries; the remaining units up to {@code length} are
 * supplementary boundaries of two units each (high word, low word).
 *
 * @param rangeIndex zero-based index of the range to fetch
 * @param range      receives the result: {@code range[0]} is the first code
 *                   point of the range, {@code range[1]} is the limit
 *                   (one past the last code point, 0x110000 when the range is
 *                   open-ended). If it is null or shorter than two elements
 *                   the values are discarded and only the return value is
 *                   meaningful.
 * @return true if a range with this index exists, false otherwise
 */
public final boolean getSerializedRange(int rangeIndex, int[] range) {
    if (rangeIndex < 0) {
        return false;
    }
    if (array == null) {
        array = new char[8];
    }
    // Write into the caller's array. Reassigning the parameter would leave the
    // caller's array untouched, because Java passes the reference by value.
    final int[] out = (range != null && range.length >= 2) ? range : new int[2];

    int unit = rangeIndex * 2; /* address start/limit pairs */
    if (unit < bmpLength) {
        // start is a BMP boundary; arrayOffset is applied here as in the
        // supplementary branch below
        out[0] = array[arrayOffset + unit];
        ++unit;
        if (unit < bmpLength) {
            out[1] = array[arrayOffset + unit];
        } else if (unit < length) {
            // the limit is the first supplementary boundary (two units)
            out[1] = (((int) array[arrayOffset + unit]) << 16) | array[arrayOffset + unit + 1];
        } else {
            out[1] = 0x110000;
        }
        return true;
    }

    // Supplementary part. Use locals: the object's length field must not
    // change as a side effect of a read.
    final int suppBase = arrayOffset + bmpLength;
    final int suppLength = length - bmpLength;
    unit = (unit - bmpLength) * 2; /* address pairs of pairs of units */
    if (unit >= suppLength) {
        return false;
    }
    out[0] = (((int) array[suppBase + unit]) << 16) | array[suppBase + unit + 1];
    unit += 2;
    if (unit < suppLength) {
        out[1] = (((int) array[suppBase + unit]) << 16) | array[suppBase + unit + 1];
    } else {
        out[1] = 0x110000;
    }
    return true;
}
/**
 * Tests whether the serialized set contains the code point {@code c}.
 *
 * <p>The stored units are walked in order, counting how many range boundaries
 * are less than or equal to {@code c}; the code point is a member when that
 * count is odd. BMP boundaries occupy one unit each, supplementary boundaries
 * two units each.
 *
 * @param c the code point to test
 * @return true if {@code c} lies inside one of the set's ranges;
 *         always false for values above 0x10ffff
 */
public final boolean serializedContains(int c) {
    if (c > 0x10ffff) {
        return false;
    }

    int pos;
    if (c > 0xffff) {
        /* find c in the supplementary part */
        final char lead = (char) (c >> 16);
        final char trail = (char) c;
        pos = bmpLength;
        while (pos < length) {
            final char boundaryHigh = array[pos];
            if (lead < boundaryHigh || (lead == boundaryHigh && trail < array[pos + 1])) {
                break; // first boundary greater than c
            }
            pos += 2;
        }
        /* each supplementary boundary takes two units: bit 1 of the sum gives the parity of all boundaries passed */
        return ((pos + bmpLength) & 2) != 0;
    }

    /* find c in the BMP part */
    final char unit = (char) c;
    pos = 0;
    while (pos < bmpLength && unit >= array[pos]) {
        pos++;
    }
    return (pos & 1) != 0;
}
/**
 * Returns the number of ranges in the serialized set.
 *
 * @return half the number of stored boundaries, rounded up (a trailing start
 *         boundary without a limit still counts as one range)
 */
public final int countSerializedRanges() {
    // supplementary boundaries take two units each, BMP boundaries one
    final int supplementaryBoundaries = (length - bmpLength) / 2;
    final int boundaries = bmpLength + supplementaryBoundaries;
    return (boundaries + 1) / 2;
}
// Backing storage of 16-bit units for the serialized set. It has no
// initializer, so it is null until a method assigns it.
private char array[];
// arrayOffset: base index that the range accessors add to positions in array.
// bmpLength:   number of leading units that are single-unit (BMP) boundaries.
// length:      total number of units in use (BMP units plus two-unit
//              supplementary boundaries).
private int arrayOffset, bmpLength, length;

View file

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:52b68530dbb9f4a2e2a089c50726005f8aa87b6b5ba466f1fbcae06316bad4c6
size 172384
oid sha256:665f02a0fd842a47ca65ecf36c1d301ef5cae01990b68f05695cfc693a783406
size 106300