ICU-1763 Synchronize with ICU4C

X-SVN-Rev: 8278
2025-04-14 17:24:01 +00:00 · 2002-03-28 01:51:50 +00:00 · 2002-03-28 01:51:50 +00:00 · 59c4ad0ada
commit 59c4ad0ada
parent e6ca8550ef
4 changed files with 284 additions and 83 deletions
--- a/icu4j/src/com/ibm/icu/impl/NormalizerDataReader.java
+++ b/icu4j/src/com/ibm/icu/impl/NormalizerDataReader.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/NormalizerDataReader.java,v $
- * $Date: 2002/03/13 05:56:29 $
- * $Revision: 1.2 $
+ * $Date: 2002/03/28 01:50:59 $
+ * $Revision: 1.3 $
 *******************************************************************************
 */
 
@ -225,24 +225,55 @@ import com.ibm.icu.impl.ICUDebug;
 	 *
 	 * - Auxiliary trie and data
 	 *
-	 * The auxiliary 32-bit trie contains data for additional properties.
+	 *
+	 * The auxiliary 16-bit trie contains data for additional properties.
 	 * Bits
-	 *     31   set if lead surrogate offset
-	 *     30   composition exclusion
-	 * 29..20   index into extraData[] to FC_NFKC_Closure string (bit 31==0),
-	 *          or lead surrogate offset (bit 31==1)
-	 * 19..16   skippable flags
-	 *     15   reserved
-	 *     14   flag: not a safe starter for canonical closure
-	 * 13.. 0   index to serialized USet for canonical closure
-	 *            the set lists the code points whose decompositions start with
-	 *            the one that this data is for
-	 *          for how USets are serialized see uset.c
+	 * 15..12   reserved (for skippable flags, see NormalizerTransliterator)
+	 *     11   flag: not a safe starter for canonical closure
+	 *     10   composition exclusion
+	 *  9.. 0   index into extraData[] to FC_NFKC_Closure string
+	 *          (not for lead surrogate),
+	 *          or lead surrogate offset (for lead surrogate, if 9..0 not zero)
 	 *
 	 * - FC_NFKC_Closure strings in extraData[]
 	 *
 	 * Strings are either stored as a single code unit or as the length
 	 * followed by that many units.
+	 * 
+     * - structure inside canonStartSets[]
+	 *
+	 * This array maps from code points c to sets of code points (USerializedSet).
+	 * The result sets are the code points whose canonical decompositions start
+	 * with c.
+	 *
+	 * canonStartSets[] contains the following sub-arrays:
+	 *
+	 * indexes[_NORM_SET_INDEX_TOP]
+	 *   - contains lengths of sub-arrays etc.
+	 *
+	 * startSets[indexes[_NORM_SET_INDEX_CANON_SETS_LENGTH]-_NORM_SET_INDEX_TOP]
+	 *   - contains serialized sets (USerializedSet) of canonical starters for
+	 *     enumerating canonically equivalent strings
+	 *     indexes[_NORM_SET_INDEX_CANON_SETS_LENGTH] includes _NORM_SET_INDEX_TOP
+	 *     for details about the structure see uset.c
+	 *
+	 * bmpTable[indexes[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]]
+	 *   - a sorted search table for BMP code points whose results are
+	 *     either indexes to USerializedSets or single code points for
+	 *     single-code point sets;
+	 *     each entry is a pair of { code point, result } with result=(binary) yy xxxxxx xxxxxxxx
+	 *     if yy==01 then there is a USerializedSet at canonStartSets+x
+	 *     else build a USerializedSet with result as the single code point
+	 *
+	 * suppTable[indexes[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]]
+	 *   - a sorted search table for supplementary code points whose results are
+	 *     either indexes to USerializedSets or single code points for
+	 *     single-code point sets;
+	 *     each entry is a triplet of { high16(cp), low16(cp), result }
+	 *     each code point's high-word may contain extra data in bits 15..5:
+	 *     if the high word has bit 15 set, then build a set with a single code point
+	 *     which is (((high16(cp)&0x1f00)<<8)|result;
+	 *     else there is a USerializedSet at canonStartSets+result
 	 */
 final class NormalizerDataReader {
 	private final static boolean debug = ICUDebug.enabled("NormalizerDataReader");
@ -278,36 +309,10 @@ final class NormalizerDataReader {
    */
    protected void read(NormalizerImpl impl) 
    		throws IOException{
-		/*
-		 * - Overall partition
-		 *
-		 * unorm.dat customarily begins with a UDataInfo structure, see udata.h and .c.
-		 * After that there are the following structures:
-		 *
-		 * char indexes[INDEX_TOP];           -- INDEX_TOP=32, see enum in this file
-		 *
-		 * Trie normTrie;                              -- size in bytes=indexes[INDEX_TRIE_SIZE]
-		 * 
-		 * char extraData[extraDataTop];            -- extraDataTop=indexes[INDEX_UCHAR_COUNT]
-		 *                                                 extraData[0] contains the number of units for
-		 *                                                 FC_NFKC_Closure (formatVersion>=2.1)
-		 *
-		 * char combiningTable[combiningTableTop];  -- combiningTableTop=indexes[INDEX_COMBINE_DATA_COUNT]
-		 *                                                 combiningTableTop may include one 16-bit padding unit
-		 *                                                 to make sure that fcdTrie is 32-bit-aligned
-		 *
-		 * Trie fcdTrie;                               -- size in bytes=indexes[INDEX_FCD_TRIE_SIZE]
-		 *
-		 * Trie auxTrie;                               -- size in bytes=indexes[INDEX_AUX_TRIE_SIZE]
-		 *
-		 * char canonStartSets[canonStartSetsTop]   -- canonStartSetsTop=indexes[INDEX_CANON_SET_COUNT]
-		 *                                                 serialized USets, see uset.c
-		 *
-		 */
 	 
 	 	//Read the indexes
 	 	int[] indexes = new int[NormalizerImpl.INDEX_TOP];
-        for (int i = 0; i <indexes.length ; i ++) {
+        for (int i = 0; i <indexes.length ; i++) {
             indexes[i] = dataInputStream.readInt();
        }
 	
@ -343,16 +348,32 @@ final class NormalizerDataReader {
 	 	ByteArrayInputStream auxTrieStream= new ByteArrayInputStream(auxBytes);
 		
 		//Read the canonical start sets
-		char[] canonStartSets=new char[indexes[NormalizerImpl.INDEX_CANON_SET_COUNT]];
-        for(int i=0; i<canonStartSets.length; i++){
-	 		canonStartSets[i]=dataInputStream.readChar();
+		Object[] canonStartSets=new Object[NormalizerImpl.CANON_SET_MAX_CANON_SETS];
+		int[] canonStartSetsIndexes = new int[NormalizerImpl.SET_INDEX_TOP];
+		for(int i=0; i<canonStartSetsIndexes.length; i++){
+	 		canonStartSetsIndexes[i]=dataInputStream.readChar();
 	 	}
-	 	
+		char[] startSets = new char[canonStartSetsIndexes[NormalizerImpl.SET_INDEX_CANON_SETS_LENGTH]-NormalizerImpl.SET_INDEX_TOP];
+        for(int i=0; i<startSets.length; i++){
+	 		startSets[i]=dataInputStream.readChar();
+	 	}
+	 	char[] bmpTable  = new char[canonStartSetsIndexes[NormalizerImpl.SET_INDEX_CANON_BMP_TABLE_LENGTH]];
+        for(int i=0; i<bmpTable.length; i++){
+	 		bmpTable[i]=dataInputStream.readChar();
+	 	}		
+		char[] suppTable = new char[canonStartSetsIndexes[NormalizerImpl.SET_INDEX_CANON_SUPP_TABLE_LENGTH]];
+        for(int i=0; i<suppTable.length; i++){
+	 		suppTable[i]=dataInputStream.readChar();
+	 	}
+	 	canonStartSets[NormalizerImpl.CANON_SET_INDICIES_INDEX  ] = canonStartSetsIndexes;
+	 	canonStartSets[NormalizerImpl.CANON_SET_START_SETS_INDEX] = startSets;
+	 	canonStartSets[NormalizerImpl.CANON_SET_BMP_TABLE_INDEX	] = bmpTable;
+	 	canonStartSets[NormalizerImpl.CANON_SET_SUPP_TABLE_INDEX] = suppTable;	 	
 	 	 	
 	 	//Now set the tries 
 	 	impl.normTrieImpl.normTrie  	= new IntTrie( normTrieStream,impl.normTrieImpl	);
 	 	impl.fcdTrieImpl.fcdTrie   		= new CharTrie(fcdTrieStream,impl.fcdTrieImpl	);
-	 	impl.auxTrieImpl.auxTrie		= new IntTrie( auxTrieStream, impl.auxTrieImpl	);
+	 	impl.auxTrieImpl.auxTrie		= new CharTrie( auxTrieStream, impl.auxTrieImpl	);
 	 	impl.indexes   					= indexes;
 	 	impl.extraData 					= extraData;
 	 	impl.combiningTable 			= combiningTable;
--- a/icu4j/src/com/ibm/icu/impl/NormalizerImpl.java
+++ b/icu4j/src/com/ibm/icu/impl/NormalizerImpl.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/NormalizerImpl.java,v $
- * $Date: 2002/03/13 05:56:31 $
- * $Revision: 1.3 $
+ * $Date: 2002/03/28 01:50:59 $
+ * $Revision: 1.4 $
 *******************************************************************************
 */
 
@ -105,23 +105,33 @@ public final class NormalizerImpl {
    static final int INDEX_CANON_SET_COUNT    = 12;    /* number of uint16_t in the array of serialized USet */

 	static final int INDEX_TOP                = 32;    /* changing this requires a new formatVersion */
-
+	
+	
 	/* AUX constants */
-	/* value constants for auxTrie */
+	/* value constants for auxTrie */	
+	static final int AUX_UNSAFE_SHIFT	= 11;
+	static final int AUX_COMP_EX_SHIFT	= 10;
 	
-	static final int AUX_UNSAFE_SHIFT	= 14;
-	static final int AUX_FNC_SHIFT		= 20;
-	static final int AUX_COMP_EX_SHIFT	= 30;
-	static final int AUX_IS_LEAD_SHIFT	= 31;
-	
-	static final int AUX_MAX_CANON_SET  =   (1<<AUX_UNSAFE_SHIFT) & UNSIGNED_INT_MASK;
-	static final int AUX_MAX_FNC        =   (1<<(AUX_COMP_EX_SHIFT-AUX_FNC_SHIFT));
-
-	static final int AUX_CANON_SET_MASK =   (AUX_MAX_CANON_SET-1);
+	static final int AUX_MAX_FNC        =   ((int)1<<AUX_COMP_EX_SHIFT);
 	static final int AUX_UNSAFE_MASK    =   (1<<AUX_UNSAFE_SHIFT) & UNSIGNED_INT_MASK;
-	static final int AUX_FNC_MASK       =   ((AUX_MAX_FNC-1)<<AUX_FNC_SHIFT) & UNSIGNED_INT_MASK;
+	static final int AUX_FNC_MASK       =   (AUX_MAX_FNC-1) & UNSIGNED_INT_MASK;
 	static final int AUX_COMP_EX_MASK   =   (1<<AUX_COMP_EX_SHIFT) & UNSIGNED_INT_MASK;
-	static final int AUX_IS_LEAD_MASK   =   (1<<AUX_IS_LEAD_SHIFT) & UNSIGNED_INT_MASK;
+	
+	/* canonStartSets[0..31] contains indexes for what is in the array */
+    static final int SET_INDEX_CANON_SETS_LENGTH		= 0; /* number of uint16_t in canonical starter sets */
+    static final int SET_INDEX_CANON_BMP_TABLE_LENGTH	= 1; /* number of uint16_t in the BMP search table (contains pairs) */
+    static final int SET_INDEX_CANON_SUPP_TABLE_LENGTH  = 2; /* number of uint16_t in the supplementary search table (contains triplets) */
+    static final int SET_INDEX_TOP						= 32;/* changing this requires a new formatVersion */
+	
+	static final int CANON_SET_INDICIES_INDEX  			= 0;
+	static final int CANON_SET_START_SETS_INDEX			= 1;
+	static final int CANON_SET_BMP_TABLE_INDEX			= 2;
+	static final int CANON_SET_SUPP_TABLE_INDEX			= 3;
+	
+	static final int CANON_SET_MAX_CANON_SETS     		= 0x0004; /* 14 bit indexes to canonical USerializedSets */
+	/* single-code point BMP sets are encoded directly in the search table except if result=0x4000..0x7fff */
+	static final int CANON_SET_BMP_MASK        			= 0xc000;
+	static final int CANON_SET_BMP_IS_INDEX    			= 0x4000;
 	
 	/*******************************/
 	
@ -158,7 +168,7 @@ public final class NormalizerImpl {
 	}
 	
 	static final class AuxTrieImpl implements Trie.DataManipulate{
-		static IntTrie auxTrie = null;
+		static CharTrie auxTrie = null;
 	   /**
 	    * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's 
 	    * data the index array offset of the indexes for that lead surrogate.
@ -167,11 +177,7 @@ public final class NormalizerImpl {
 	    * @return data offset or 0 if there is no data for the lead surrogate
 	    */
 	    public int getFoldingOffset(int value){
-		    if(value<0) {
-		        return (value & AUX_FNC_MASK)>>(AUX_FNC_SHIFT-5);
-		    } else {
-		        return 0;
-		    }
+	        return (value&AUX_FNC_MASK)<<5;
 	    }
 	}
 		 
@ -184,7 +190,7 @@ public final class NormalizerImpl {
 	static int[] indexes;
 	static char[] combiningTable;
 	static char[] extraData;
-	static char[] canonStartSets;
+	static Object[] canonStartSets;
 	
 	static boolean isDataLoaded;
 	static boolean isFormatVersion_2_1;
@ -441,8 +447,8 @@ public final class NormalizerImpl {
 	
 	public static boolean isFullCompositionExclusion(int c) {
 	    if(isFormatVersion_2_1) {
-	        int aux32=auxTrieImpl.auxTrie.getCodePointValue(c);
-	        return (boolean)((aux32&AUX_COMP_EX_MASK)!=0);
+	        int aux =auxTrieImpl.auxTrie.getCodePointValue(c);
+	        return (boolean)((aux & AUX_COMP_EX_MASK)!=0);
 	    } else {
 	        return false;
 	    }
@ -450,8 +456,8 @@ public final class NormalizerImpl {
 	
 	public static boolean isCanonSafeStart(int c) {
 	    if(isFormatVersion_2_1) {
-	        int aux32 = auxTrieImpl.auxTrie.getValue(c);
-	        return (boolean)((aux32&AUX_UNSAFE_MASK)==0);
+	        int aux = auxTrieImpl.auxTrie.getCodePointValue(c);
+	        return (boolean)((aux & AUX_UNSAFE_MASK)==0);
 	    } else {
 	        return false;
 	    }
@ -460,15 +466,88 @@ public final class NormalizerImpl {
 	public static boolean getCanonStartSet(int c, USerializedSet fillSet) {

 	    if(fillSet!=null && canonStartSets!=null) {
-	        int aux32=auxTrieImpl.auxTrie.getValue(c);
-	        aux32&=AUX_CANON_SET_MASK;
+	 		/*
+	         * binary search for c
+	         *
+	         * There are two search tables,
+	         * one for BMP code points and one for supplementary ones.
+	         * See unormimp.h for details.
+	         */
+	        char[] table;
+	        int i, start, limit;
 	        
-	        return aux32!=0 &&
-	            fillSet.getSet(canonStartSets,indexes[INDEX_CANON_SET_COUNT]-aux32);
-	    } else {
-	        return false;
+	        if(c<=0xffff) {
+	            table=(char[]) canonStartSets[CANON_SET_BMP_TABLE_INDEX];
+	            start=0;
+	            limit=table.length;
+	
+	            /* each entry is a pair { c, result } */
+	            while(start<limit) {
+	                i=(char)((start+limit)/2); 
+	                if(c<table[i]) {
+	                    limit=i;
+	                } else {
+	                    start=i;
+	                }
+	            }
+	
+	            /* found? */
+	            if(c==table[start]) {
+	                i=table[start+1];
+	                if((i&CANON_SET_BMP_MASK)==CANON_SET_BMP_IS_INDEX) {
+	                    /* result 01xxxxxx xxxxxx contains index x to a USerializedSet */
+	                    i&=(CANON_SET_MAX_CANON_SETS-1);
+	                    return fillSet.getSet(table,i);
+	                } else {
+	                    /* other result values are BMP code points for single-code point sets */
+	                    fillSet.setSerializedToOne(i);
+	                    return true;
+	                }
+	            }
+	        } else {
+	            char high, low, h;
+	
+	            table=(char[]) canonStartSets[CANON_SET_SUPP_TABLE_INDEX];
+	            start=0;
+	            limit=table.length;
+	
+	            high=(char)(c>>16);
+	            low=(char)c;
+	
+	            /* each entry is a triplet { high(c), low(c), result } */
+	            while(start<limit-3) {
+	                i=(char)(((start+limit)/6)*3); /* (start+limit)/2 and address triplets */
+	                h=(char)(table[i]&0x1f); /* high word */
+	                if(high<h || (high==h && low<table[i+1])) {
+	                    limit=i;
+	                } else {
+	                    start=i;
+	                }
+	            }
+	
+	            /* found? */
+	            h=table[start];
+	            if(high==(h&0x1f) && low==table[start+1]) {
+	                i=table[start+2];
+	                if((h&0x8000)==0) {
+	                    /* the result is an index to a USerializedSet */
+	                    return fillSet.getSet(table,i);
+	                } else {
+	                    /*
+	                     * single-code point set {x} in
+	                     * triplet { 100xxxxx 000hhhhh  llllllll llllllll  xxxxxxxx xxxxxxxx }
+	                     */
+	                    i|=((int)h&0x1f00)<<8; /* add high bits from high(c) */
+	                    fillSet.setSerializedToOne((int)i);
+	                    return true;
+	                }
+	            }
+	        }
 	    }
+	
+	    return false; /* not found */
 	}
+	
 	/**
 	 * Internal API, used by collation code.
 	 * Get access to the internal FCD trie table to be able to perform
--- a/icu4j/src/com/ibm/icu/impl/USerializedSet.java
+++ b/icu4j/src/com/ibm/icu/impl/USerializedSet.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/USerializedSet.java,v $ 
- * $Date: 2002/03/12 17:49:15 $ 
- * $Revision: 1.1 $
+ * $Date: 2002/03/28 01:50:59 $ 
+ * $Revision: 1.2 $
 *
 *****************************************************************************************
 */
@ -83,7 +83,11 @@ public final class USerializedSet {
        if(rangeIndex<0) {
            return false;
        }
-
+		if(array==null){
+			array = new char[8];
+		}
+		range=new int[2];
+		
        rangeIndex*=2; /* address start/limit pairs */
        if(rangeIndex<bmpLength) {
            range[0]=array[arrayOffset+rangeIndex++];
@ -114,6 +118,103 @@ public final class USerializedSet {
            }
        }
    }
+	public final void setSerializedToOne(int c) {
+	    if( 0x10ffff<c) {
+	        return;
+	    }
+	
+	    if(c<0xffff) {
+	        bmpLength=length=2;
+	        array[0]=(char)c;
+	        array[1]=(char)(c+1);
+	    } else if(c==0xffff) {
+	        bmpLength=1;
+	        length=3;
+	        array[0]=0xffff;
+	        array[1]=1;
+	        array[2]=0;
+	    } else if(c<0x10ffff) {
+	        bmpLength=0;
+	        length=4;
+	        array[0]=(char)(c>>16);
+	        array[1]=(char)c;
+	        ++c;
+	        array[2]=(char)(c>>16);
+	        array[3]=(char)c;
+	    } else /* c==0x10ffff */ {
+	        bmpLength=0;
+	        length=2;
+	        array[0]=0x10;
+	        array[1]=0xffff;
+	    }
+	}
+	
+	
+	public final boolean getSerializedRange( int rangeIndex,int[] range) {
+	    if( rangeIndex<0) {
+	        return false;
+	    }
+	    if(array==null){
+			array = new char[8];
+		}
+	    range=new int[2];
+        rangeIndex*=2; /* address start/limit pairs */
+	    if(rangeIndex<bmpLength) {
+	        range[0]=array[rangeIndex++];
+	        if(rangeIndex<bmpLength) {
+	            range[1]=array[rangeIndex];
+	        } else if(rangeIndex<length) {
+	            range[1]=(((int)array[rangeIndex])<<16)|array[rangeIndex+1];
+	        } else {
+	            range[1]=0x110000;
+	        }
+	        return true;
+	    } else {
+	        rangeIndex-=bmpLength;
+	        rangeIndex*=2; /* address pairs of pairs of units */
+	        length-=bmpLength;
+	        if(rangeIndex<length) {
+	            int offset=arrayOffset+bmpLength;
+	            range[0]=(((int)array[offset+rangeIndex])<<16)|array[offset+rangeIndex+1];
+	            rangeIndex+=2;
+	            if(rangeIndex<length) {
+	                range[1]=(((int)array[offset+rangeIndex])<<16)|array[offset+rangeIndex+1];
+	            } else {
+	                range[1]=0x110000;
+	            }
+	            return false;
+	        } else {
+	            return false;
+	        }
+	    }
+	}
+	public final boolean serializedContains(int c) {
+	
+	    if(c>0x10ffff) {
+	        return false;
+	    }
+	    
+	    if(c<=0xffff) {
+	    	int i;
+	        /* find c in the BMP part */
+	        for(i=0; i<bmpLength && (char)c>=array[i]; ++i) {}
+	        return (boolean)((i&1) != 0);
+	    } else {
+	    	int i;
+	        /* find c in the supplementary part */
+	        char high=(char)(c>>16), low=(char)c;
+	        for(i=bmpLength;
+	            i<length && (high>array[i] || (high==array[i] && low>=array[i+1]));
+	            i+=2) {}
+	
+	        /* count pairs of 16-bit units even per BMP and check if the number of pairs is odd */
+	        return (boolean)(((i+bmpLength)&2)!=0);
+	    }
+	}
+	
+	public final int countSerializedRanges() {
+	    return (bmpLength+(length-bmpLength)/2+1)/2;
+	}

    private char array[];
    private int arrayOffset, bmpLength, length;
--- a/icu4j/src/com/ibm/icu/impl/data/unorm.dat
+++ b/icu4j/src/com/ibm/icu/impl/data/unorm.dat
@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:52b68530dbb9f4a2e2a089c50726005f8aa87b6b5ba466f1fbcae06316bad4c6
-size 172384
+oid sha256:665f02a0fd842a47ca65ecf36c1d301ef5cae01990b68f05695cfc693a783406
+size 106300