ICU-7273 build data for CanonicalIterator start sets on the fly; replace remaining uses of NormalizerImpl

X-SVN-Rev: 27561
This commit is contained in:
Markus Scherer 2010-02-13 22:13:37 +00:00
parent 0ec6c28016
commit b15f884b16
12 changed files with 183 additions and 986 deletions

View file

@ -33,6 +33,8 @@ public final class Normalizer2Impl {
public static final int JAMO_L_LIMIT=JAMO_L_BASE+JAMO_L_COUNT;
public static final int JAMO_V_LIMIT=JAMO_V_BASE+JAMO_V_COUNT;
public static final int JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT;
public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT;
public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT;
@ -502,25 +504,19 @@ public final class Normalizer2Impl {
canonStartSets=new ArrayList<UnicodeSet>();
Iterator<Trie2.Range> trieIterator=normTrie.iterator();
while(trieIterator.hasNext()) {
Trie2.Range range=trieIterator.next();
int norm16=range.value;
if(norm16==0) {
continue; // inert
}
if(norm16==minYesNo) {
// Hangul LV & LVT: Set has-compositions for all syllables
// to minimize the trie size, although only LV syllables
// do have compositions. Handle at runtime.
// Set the same value for the whole range because
// there cannot be other data. Hangul syllables are segment starters,
// and since they decompose they cannot have canonStartSets.
// (There is no decomposable character in a decomposition mapping.)
range.value=CANON_HAS_COMPOSITIONS;
newData.setRange(range, true);
final Trie2.Range range=trieIterator.next();
final int norm16=range.value;
if(range.leadSurrogate || norm16==0 || (minYesNo<=norm16 && norm16<minNoNo)) {
// Inert, or 2-way mapping (including Hangul syllable).
// We do not write a canonStartSet for any yesNo character.
// Composites from 2-way mappings are added at runtime from the
// starter's compositions list, and the other characters in
// 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
// "maybe" characters.
continue;
}
for(int c=range.startCodePoint; c<=range.endCodePoint; ++c) {
int oldValue=newData.get(c);
final int oldValue=newData.get(c);
int newValue=oldValue;
if(norm16>=minMaybeYes) {
// not a segment starter if it occurs in a decomposition or has cc!=0
@ -531,36 +527,39 @@ public final class Normalizer2Impl {
} else if(norm16<minYesNo) {
newValue|=CANON_HAS_COMPOSITIONS;
} else {
// c has a decomposition
// c has a one-way decomposition
int c2=c;
while(limitNoNo<=norm16 && norm16<minMaybeYes) {
c2=this.mapAlgorithmic(c2, norm16);
norm16=getNorm16(c2);
int norm16_2=norm16;
while(limitNoNo<=norm16_2 && norm16_2<minMaybeYes) {
c2=this.mapAlgorithmic(c2, norm16_2);
norm16_2=getNorm16(c2);
}
if(minYesNo<=norm16 && norm16<limitNoNo) {
if(minYesNo<=norm16_2 && norm16_2<limitNoNo) {
// c decomposes, get everything from the variable-length extra data
int firstUnit=extraData.charAt(norm16++);
if(c==c2 && (firstUnit&MAPPING_PLUS_COMPOSITION_LIST)!=0) {
newValue|=CANON_HAS_COMPOSITIONS; // original c has compositions
}
int firstUnit=extraData.charAt(norm16_2++);
int length=firstUnit&MAPPING_LENGTH_MASK;
if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
if(c==c2 && (extraData.charAt(norm16)&0xff)!=0) {
if(c==c2 && (extraData.charAt(norm16_2)&0xff)!=0) {
newValue|=CANON_NOT_SEGMENT_STARTER; // original c has cc!=0
}
++norm16;
++norm16_2;
}
// Skip empty mappings (no characters in the decomposition).
if(length!=0) {
// add c to first code point's start set
int limit=norm16+length;
c2=extraData.codePointAt(norm16);
int limit=norm16_2+length;
c2=extraData.codePointAt(norm16_2);
addToStartSet(newData, c, c2);
// set CANON_NOT_SEGMENT_STARTER for each remaining code point
while((norm16+=Character.charCount(c2))<limit) {
c2=extraData.codePointAt(norm16);
int c2Value=newData.get(c2);
if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
newData.set(c2, c2Value|CANON_NOT_SEGMENT_STARTER);
// Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
// one-way mapping. A 2-way mapping is possible here after
// intermediate algorithmic mapping.
if(norm16_2>=minNoNo) {
while((norm16_2+=Character.charCount(c2))<limit) {
c2=extraData.codePointAt(norm16_2);
int c2Value=newData.get(c2);
if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
newData.set(c2, c2Value|CANON_NOT_SEGMENT_STARTER);
}
}
}
}
@ -692,6 +691,29 @@ public final class Normalizer2Impl {
/**
 * Tests whether c starts a segment for the CanonicalIterator.
 * A negative canonIterData trie value carries the
 * CANON_NOT_SEGMENT_STARTER flag, so non-negative means "segment starter".
 */
public boolean isCanonSegmentStarter(int c) {
    final int canonValue = canonIterData.get(c);
    return 0 <= canonValue;
}
/**
 * Fills the set with the characters whose canonical decompositions start
 * with c and, when c has compositions, with c's composites.
 *
 * @param c code point
 * @param set receives the canonical start set; cleared first
 * @return true if there is any canon-start-set data for c, false otherwise
 */
public boolean getCanonStartSet(int c, UnicodeSet set) {
// Strip the not-segment-starter flag; the remaining bits say what data exists for c.
int canonValue=canonIterData.get(c)&~CANON_NOT_SEGMENT_STARTER;
if(canonValue==0) {
return false;
}
set.clear();
int value=canonValue&CANON_VALUE_MASK;
if((canonValue&CANON_HAS_SET)!=0) {
// value is an index into the list of pre-built start sets.
set.addAll(canonStartSets.get(value));
} else if(value!=0) {
// value itself is a single code point: a one-element start set.
set.add(value);
}
if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {
int norm16=getNorm16(c);
if(norm16==JAMO_L) {
// c is a lead jamo: it starts exactly the syllables of its LV/LVT row.
int syllable=Hangul.HANGUL_BASE+(c-Hangul.JAMO_L_BASE)*Hangul.JAMO_VT_COUNT;
set.add(syllable, syllable+Hangul.JAMO_VT_COUNT-1);
} else {
// Add all composites reachable from c's compositions list.
addComposites(getCompositionsList(norm16), set);
}
}
return true;
}
public static final int MIN_CCC_LCCC_CP=0x300;
@ -1503,7 +1525,7 @@ public final class Normalizer2Impl {
/**
* @return index into maybeYesCompositions, or -1
*/
private int getCompositionsListForDecompYesAndZeroCC(int norm16) {
private int getCompositionsListForDecompYes(int norm16) {
if(norm16==0 || MIN_NORMAL_MAYBE_YES<=norm16) {
return -1;
} else {
@ -1527,6 +1549,15 @@ public final class Normalizer2Impl {
(firstUnit&MAPPING_LENGTH_MASK)+ // + mapping length
((firstUnit>>7)&1); // +1 if MAPPING_HAS_CCC_LCCC_WORD
}
/**
 * Returns the compositions-list offset for a character that is known to
 * have compositions.
 *
 * @param norm16 the character's norm16 trie value; the character must
 *               have compositions
 * @return index into maybeYesCompositions
 */
private int getCompositionsList(int norm16) {
    if (isDecompYes(norm16)) {
        return getCompositionsListForDecompYes(norm16);
    }
    return getCompositionsListForComposite(norm16);
}
// Decompose a short piece of text which is likely to contain characters that
// fail the quick check loop and/or where the quick check loop's overhead
@ -1639,6 +1670,29 @@ public final class Normalizer2Impl {
}
return -1;
}
/**
 * Recursively adds to the set every composite reachable from one
 * compositions list.
 *
 * @param list index into maybeYesCompositions of a starter's compositions list
 * @param set receives the composites; composites that themselves combine
 *            forward contribute their own composites recursively
 */
private void addComposites(int list, UnicodeSet set) {
int firstUnit, compositeAndFwd;
do {
// Each list entry is either a pair (2 units) or, when COMP_1_TRIPLE is
// set, a triple (3 units) encoding a composite above 0xffff.
firstUnit=maybeYesCompositions.charAt(list);
if((firstUnit&COMP_1_TRIPLE)==0) {
compositeAndFwd=maybeYesCompositions.charAt(list+1);
list+=2;
} else {
compositeAndFwd=(((int)maybeYesCompositions.charAt(list+1)&~COMP_2_TRAIL_MASK)<<16)|
maybeYesCompositions.charAt(list+2);
list+=3;
}
// Bit 0 flags a composite that combines forward: recurse into its
// own compositions list before adding the composite itself.
int composite=compositeAndFwd>>1;
if((compositeAndFwd&1)!=0) {
addComposites(getCompositionsListForComposite(getNorm16(composite)), set);
}
set.add(composite);
} while((firstUnit&COMP_1_LAST_TUPLE)==0);
}
/*
* Recomposes the buffer text starting at recomposeStartIndex
* (which is in NFD - decomposed and canonically ordered),
@ -1777,7 +1831,7 @@ public final class Normalizer2Impl {
// If c did not combine, then check if it is a starter.
if(cc==0) {
// Found a new starter.
if((compositionsList=getCompositionsListForDecompYesAndZeroCC(norm16))>=0) {
if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) {
// It may combine with something, prepare for it.
if(c<=0xffff) {
starterIsSupplementary=false;

View file

@ -1,427 +0,0 @@
/*
*******************************************************************************
* Copyright (C) 1996-2008, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.impl;
import java.io.*;
import com.ibm.icu.impl.ICUDebug;
/**
* @version 1.0
* @author Ram Viswanadha
*/
/*
* Description of the format of unorm.icu version 2.1.
*
* Main change from version 1 to version 2:
* Use of new, common Trie instead of normalization-specific tries.
* Change to version 2.1: add third/auxiliary trie with associated data.
*
* For more details of how to use the data structures see the code
* in unorm.cpp (runtime normalization code) and
* in gennorm.c and gennorm/store.c (build-time data generation).
*
* For the serialized format of Trie see Trie.c/TrieHeader.
*
* - Overall partition
*
* unorm.icu customarily begins with a UDataInfo structure, see udata.h and .c.
* After that there are the following structures:
*
* char indexes[INDEX_TOP]; -- INDEX_TOP=32, see enum in this file
*
* Trie normTrie; -- size in bytes=indexes[INDEX_TRIE_SIZE]
*
* char extraData[extraDataTop]; -- extraDataTop=indexes[INDEX_UCHAR_COUNT]
* extraData[0] contains the number of units for
* FC_NFKC_Closure (formatVersion>=2.1)
*
* char combiningTable[combiningTableTop]; -- combiningTableTop=indexes[INDEX_COMBINE_DATA_COUNT]
* combiningTableTop may include one 16-bit padding unit
* to make sure that fcdTrie is 32-bit-aligned
*
* Trie fcdTrie; -- size in bytes=indexes[INDEX_FCD_TRIE_SIZE]
*
* Trie auxTrie; -- size in bytes=indexes[INDEX_AUX_TRIE_SIZE]
*
* char canonStartSets[canonStartSetsTop] -- canonStartSetsTop=indexes[INDEX_CANON_SET_COUNT]
* serialized USets, see uset.c
*
*
* The indexes array contains lengths and sizes of the following arrays and structures
* as well as the following values:
* indexes[INDEX_COMBINE_FWD_COUNT]=combineFwdTop
* -- one more than the highest combining index computed for forward-only-combining characters
* indexes[INDEX_COMBINE_BOTH_COUNT]=combineBothTop-combineFwdTop
* -- number of combining indexes computed for both-ways-combining characters
* indexes[INDEX_COMBINE_BACK_COUNT]=combineBackTop-combineBothTop
* -- number of combining indexes computed for backward-only-combining characters
*
* indexes[INDEX_MIN_NF*_NO_MAYBE] (where *={ C, D, KC, KD })
* -- first code point with a quick check NF* value of NO/MAYBE
*
*
* - Tries
*
* The main structures are two Trie tables ("compact arrays"),
* each with one index array and one data array.
* See Trie.h and Trie.c.
*
*
* - Tries in unorm.icu
*
* The first trie (normTrie above)
* provides data for the NF* quick checks and normalization.
* The second trie (fcdTrie above) provides data just for FCD checks.
*
*
* - norm32 data words from the first trie
*
* The norm32Table contains one 32-bit word "norm32" per code point.
* It contains the following bit fields:
* 31..16 extra data index, EXTRA_SHIFT is used to shift this field down
* if this index is <EXTRA_INDEX_TOP then it is an index into
* extraData[] where variable-length normalization data for this
* code point is found
* if this index is <EXTRA_INDEX_TOP+EXTRA_SURROGATE_TOP
* then this is a norm32 for a leading surrogate, and the index
* value is used together with the following trailing surrogate
* code unit in the second trie access
* if this index is >=EXTRA_INDEX_TOP+EXTRA_SURROGATE_TOP
* then this is a norm32 for a "special" character,
* i.e., the character is a Hangul syllable or a Jamo
* see EXTRA_HANGUL etc.
* generally, instead of extracting this index from the norm32 and
* comparing it with the above constants,
* the normalization code compares the entire norm32 value
* with MIN_SPECIAL, SURROGATES_TOP, MIN_HANGUL etc.
*
* 15..8 combining class (cc) according to UnicodeData.txt
*
* 7..6 COMBINES_ANY flags, used in composition to see if a character
* combines with any following or preceding character(s)
* at all
* 7 COMBINES_BACK
* 6 COMBINES_FWD
*
* 5..0 quick check flags, set for "no" or "maybe", with separate flags for
* each normalization form
* the higher bits are "maybe" flags; for NF*D there are no such flags
* the lower bits are "no" flags for all forms, in the same order
* as the "maybe" flags,
* which is (MSB to LSB): NFKD NFD NFKC NFC
* 5..4 QC_ANY_MAYBE
* 3..0 QC_ANY_NO
* see further related constants
*
*
* - Extra data per code point
*
* "Extra data" is referenced by the index in norm32.
* It is variable-length data. It is only present, and only those parts
* of it are, as needed for a given character.
* The norm32 extra data index is added to the beginning of extraData[]
* to get to a vector of 16-bit words with data at the following offsets:
*
* [-1] Combining index for composition.
* Stored only if norm32&COMBINES_ANY .
* [0] Lengths of the canonical and compatibility decomposition strings.
* Stored only if there are decompositions, i.e.,
* if norm32&(QC_NFD|QC_NFKD)
* High byte: length of NFKD, or 0 if none
* Low byte: length of NFD, or 0 if none
* Each length byte also has another flag:
* Bit 7 of a length byte is set if there are non-zero
* combining classes (cc's) associated with the respective
* decomposition. If this flag is set, then the decomposition
* is preceded by a 16-bit word that contains the
* leading and trailing cc's.
* Bits 6..0 of a length byte are the length of the
* decomposition string, not counting the cc word.
* [1..n] NFD
* [n+1..] NFKD
*
* Each of the two decompositions consists of up to two parts:
* - The 16-bit words with the leading and trailing cc's.
* This is only stored if bit 7 of the corresponding length byte
* is set. In this case, at least one of the cc's is not zero.
* High byte: leading cc==cc of the first code point in the decomposition string
* Low byte: trailing cc==cc of the last code point in the decomposition string
* - The decomposition string in UTF-16, with length code units.
*
*
* - Combining indexes and combiningTable[]
*
* Combining indexes are stored at the [-1] offset of the extra data
* if the character combines forward or backward with any other characters.
* They are used for (re)composition in NF*C.
* Values of combining indexes are arranged according to whether a character
* combines forward, backward, or both ways:
* forward-only < both ways < backward-only
*
* The index values for forward-only and both-ways combining characters
* are indexes into the combiningTable[].
* The index values for backward-only combining characters are simply
* incremented from the preceding index values to be unique.
*
* In the combiningTable[], a variable-length list
* of variable-length (back-index, code point) pair entries is stored
* for each forward-combining character.
*
* These back-indexes are the combining indexes of both-ways or backward-only
* combining characters that the forward-combining character combines with.
*
* Each list is sorted in ascending order of back-indexes.
* Each list is terminated with the last back-index having bit 15 set.
*
* Each pair (back-index, code point) takes up either 2 or 3
* 16-bit words.
* The first word of a list entry is the back-index, with its bit 15 set if
* this is the last pair in the list.
*
* The second word contains flags in bits 15..13 that determine
* if there is a third word and how the combined character is encoded:
* 15 set if there is a third word in this list entry
* 14 set if the result is a supplementary character
* 13 set if the result itself combines forward
*
* According to these bits 15..14 of the second word,
* the result character is encoded as follows:
* 00 or 01 The result is <=0x1fff and stored in bits 12..0 of
* the second word.
* 10 The result is 0x2000..0xffff and stored in the third word.
* Bits 12..0 of the second word are not used.
* 11 The result is a supplementary character.
* Bits 9..0 of the leading surrogate are in bits 9..0 of
* the second word.
* Add 0xd800 to these bits to get the complete surrogate.
* Bits 12..10 of the second word are not used.
* The trailing surrogate is stored in the third word.
*
*
* - FCD trie
*
* The FCD trie is very simple.
* It is a folded trie with 16-bit data words.
* In each word, the high byte contains the leading cc of the character,
* and the low byte contains the trailing cc of the character.
* These cc's are the cc's of the first and last code points in the
* canonical decomposition of the character.
*
* Since all 16 bits are used for cc's, lead surrogates must be tested
* by checking the code unit instead of the trie data.
* This is done only if the 16-bit data word is not zero.
* If the code unit is a leading surrogate and the data word is not zero,
* then instead of cc's it contains the offset for the second trie lookup.
*
*
* - Auxiliary trie and data
*
*
* The auxiliary 16-bit trie contains data for additional properties.
* Bits
* 15..13 reserved
* 12 not NFC_Skippable (f) (formatVersion>=2.2)
* 11 flag: not a safe starter for canonical closure
* 10 composition exclusion
* 9.. 0 index into extraData[] to FC_NFKC_Closure string
* (not for lead surrogate),
* or lead surrogate offset (for lead surrogate, if 9..0 not zero)
*
* Conditions for "NF* Skippable" from Mark Davis' com.ibm.text.UCD.NFSkippable:
* (used in NormalizerTransliterator)
*
* A skippable character is
* a) unassigned, or ALL of the following:
* b) of combining class 0.
* c) not decomposed by this normalization form.
* AND if NFC or NFKC,
* d) can never compose with a previous character.
* e) can never compose with a following character.
* f) can never change if another character is added.
* Example: a-breve might satisfy all but f, but if you
* add an ogonek it changes to a-ogonek + breve
*
* a)..e) must be tested from norm32.
* Since f) is more complicated, the (not-)NFC_Skippable flag (f) is built
* into the auxiliary trie.
* The same bit is used for NFC and NFKC; (c) differs for them.
* As usual, we build the "not skippable" flags so that unassigned
* code points get a 0 bit.
* This bit is only valid after (a)..(e) test FALSE; test NFD_NO before (f) as well.
* Test Hangul LV syllables entirely in code.
*
*
* - FC_NFKC_Closure strings in extraData[]
*
* Strings are either stored as a single code unit or as the length
* followed by that many units.
*
* - structure inside canonStartSets[]
*
* This array maps from code points c to sets of code points (USerializedSet).
* The result sets are the code points whose canonical decompositions start
* with c.
*
* canonStartSets[] contains the following sub-arrays:
*
* indexes[_NORM_SET_INDEX_TOP]
* - contains lengths of sub-arrays etc.
*
* startSets[indexes[_NORM_SET_INDEX_CANON_SETS_LENGTH]-_NORM_SET_INDEX_TOP]
* - contains serialized sets (USerializedSet) of canonical starters for
* enumerating canonically equivalent strings
* indexes[_NORM_SET_INDEX_CANON_SETS_LENGTH] includes _NORM_SET_INDEX_TOP
* for details about the structure see uset.c
*
* bmpTable[indexes[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]]
* - a sorted search table for BMP code points whose results are
* either indexes to USerializedSets or single code points for
* single-code point sets;
* each entry is a pair of { code point, result } with result=(binary) yy xxxxxx xxxxxxxx
* if yy==01 then there is a USerializedSet at canonStartSets+x
* else build a USerializedSet with result as the single code point
*
* suppTable[indexes[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]]
* - a sorted search table for supplementary code points whose results are
* either indexes to USerializedSets or single code points for
* single-code point sets;
* each entry is a triplet of { high16(cp), low16(cp), result }
* each code point's high-word may contain extra data in bits 15..5:
* if the high word has bit 15 set, then build a set with a single code point
* which is (((high16(cp)&0x1f00)<<8)|result;
* else there is a USerializedSet at canonStartSets+result
*/
/**
 * Binary reader for the legacy unorm.icu normalization data file.
 * Authenticates the ICU data header, then reads the file's sections
 * (tries, extra data, combining table, canonical start sets) into
 * caller-provided arrays in the exact order they appear in the file.
 */
final class NormalizerDataReader implements ICUBinary.Authenticate {
// Debug tracing is toggled via the ICUDebug system property mechanism.
private final static boolean debug = ICUDebug.enabled("NormalizerDataReader");
/**
* <p>Protected constructor.</p>
* @param inputStream unorm.icu data file input stream
* @exception IOException thrown if the data file fails authentication
*/
protected NormalizerDataReader(InputStream inputStream)
throws IOException{
if(debug) System.out.println("Bytes in inputStream " + inputStream.available());
// Validates the data format ID and version; also yields the Unicode version bytes.
unicodeVersion = ICUBinary.readHeader(inputStream, DATA_FORMAT_ID, this);
if(debug) System.out.println("Bytes left in inputStream " +inputStream.available());
dataInputStream = new DataInputStream(inputStream);
if(debug) System.out.println("Bytes left in dataInputStream " +dataInputStream.available());
}
// protected methods -------------------------------------------------
/**
* Reads {@code length} big-endian 32-bit indexes from the data file.
* @param length number of int indexes to read
* @return the newly read indexes array
* @throws IOException if reading fails
*/
protected int[] readIndexes(int length)throws IOException{
int[] indexes = new int[length];
//Read the indexes
for (int i = 0; i <length ; i++) {
indexes[i] = dataInputStream.readInt();
}
return indexes;
}
/**
* <p>Reads unorm.icu, parse it into blocks of data to be stored in
* NormalizerImpl.</p>
* @param normBytes receives the serialized normalization trie
* @param fcdBytes receives the serialized FCD trie
* @param auxBytes receives the serialized auxiliary trie
* @param extraData receives the variable-length extra data
* @param combiningTable receives the combining-class table
* @param canonStartSets receives the four canonical-start-set sub-arrays
* @exception IOException thrown when data reading fails
*/
protected void read(byte[] normBytes, byte[] fcdBytes, byte[] auxBytes,
char[] extraData, char[] combiningTable,
Object[] canonStartSets)
throws IOException
{
// NOTE: the sections must be read in exactly this order; it mirrors the
// layout written by gennorm (see the format description above).
// Read the bytes that make up the normTrie
dataInputStream.readFully(normBytes);
// normTrieStream= new ByteArrayInputStream(normBytes);
// Read the extra data
for (int i = 0; i < extraData.length; i++) {
extraData[i] = dataInputStream.readChar();
}
// Read the combining class table
for (int i = 0; i < combiningTable.length; i++) {
combiningTable[i] = dataInputStream.readChar();
}
// Read the fcdTrie
dataInputStream.readFully(fcdBytes);
// Read the AuxTrie
dataInputStream.readFully(auxBytes);
// Read the canonical start sets: first the 32 16-bit set indexes, then
// the three variable-length tables whose lengths come from those indexes.
int[] canonStartSetsIndexes = new int[NormalizerImpl.SET_INDEX_TOP];
for (int i = 0; i < canonStartSetsIndexes.length; i++) {
canonStartSetsIndexes[i] = dataInputStream.readChar();
}
char[] startSets = new char[canonStartSetsIndexes[NormalizerImpl.SET_INDEX_CANON_SETS_LENGTH] - NormalizerImpl.SET_INDEX_TOP];
for (int i = 0; i < startSets.length; i++) {
startSets[i] = dataInputStream.readChar();
}
char[] bmpTable = new char[canonStartSetsIndexes[NormalizerImpl.SET_INDEX_CANON_BMP_TABLE_LENGTH]];
for (int i = 0; i < bmpTable.length; i++) {
bmpTable[i] = dataInputStream.readChar();
}
char[] suppTable = new char[canonStartSetsIndexes[NormalizerImpl.SET_INDEX_CANON_SUPP_TABLE_LENGTH]];
for (int i = 0; i < suppTable.length; i++) {
suppTable[i] = dataInputStream.readChar();
}
// Hand the four parsed sub-arrays back in their well-known slots.
canonStartSets[NormalizerImpl.CANON_SET_INDICIES_INDEX] = canonStartSetsIndexes;
canonStartSets[NormalizerImpl.CANON_SET_START_SETS_INDEX] = startSets;
canonStartSets[NormalizerImpl.CANON_SET_BMP_TABLE_INDEX] = bmpTable;
canonStartSets[NormalizerImpl.CANON_SET_SUPP_TABLE_INDEX] = suppTable;
}
/** @return the 4-byte data format version this reader understands */
public byte[] getDataFormatVersion(){
return DATA_FORMAT_VERSION;
}
/**
* Header-authentication callback: accepts a file whose version bytes at
* indexes 0, 2 and 3 match. Index 1 is deliberately not compared —
* presumably a compatible minor revision; see gennorm's store.c.
*/
public boolean isDataVersionAcceptable(byte version[])
{
return version[0] == DATA_FORMAT_VERSION[0]
&& version[2] == DATA_FORMAT_VERSION[2]
&& version[3] == DATA_FORMAT_VERSION[3];
}
/** @return the 4-byte Unicode version from the authenticated file header */
public byte[] getUnicodeVersion(){
return unicodeVersion;
}
// private data members -------------------------------------------------
/**
* ICU data file input stream
*/
private DataInputStream dataInputStream;
// Unicode version bytes read from the authenticated data file header.
private byte[] unicodeVersion;
/** Data format ID: "Norm" in ASCII (0x4E 0x6F 0x72 0x6D). */
private static final byte DATA_FORMAT_ID[] = {(byte)0x4E, (byte)0x6F,
(byte)0x72, (byte)0x6D};
/**
* File format version that this class understands.
* No guarantees are made if an older version is used;
* see store.c of gennorm for more information and values.
*/
private static final byte DATA_FORMAT_VERSION[] = {(byte)0x2, (byte)0x2,
(byte)0x5, (byte)0x2};
}

View file

@ -1,384 +0,0 @@
/*
*******************************************************************************
* Copyright (C) 1996-2010, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.impl;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.BufferedInputStream;
import java.io.InputStream;
import java.util.MissingResourceException;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.RangeValueIterator;
import com.ibm.icu.util.VersionInfo;
/**
* @version 1.0
* @author Ram Viswanadha
*/
public final class NormalizerImpl {
// Static block for the class to initialize its own singleton instance.
// Data loading happens exactly once here; any failure surfaces as an
// unchecked MissingResourceException so static initialization fails loudly.
static final NormalizerImpl IMPL;
static {
    try {
        IMPL = new NormalizerImpl();
    } catch (Exception e) {
        // Fix: preserve the original failure as the cause instead of
        // discarding it (MissingResourceException has no cause constructor,
        // so chain it via initCause()).
        MissingResourceException mre =
            new MissingResourceException(e.getMessage(), "", "");
        mre.initCause(e);
        throw mre;
    }
}
/* masks for treating byte/int values as unsigned */
static final int UNSIGNED_BYTE_MASK =0xFF;
static final long UNSIGNED_INT_MASK = 0xffffffffL;
/*
 * This new implementation of the normalization code loads its data from
 * unorm.icu, which is generated with the gennorm tool.
 * The format of that file is described at the end of this file.
 */
private static final String DATA_FILE_NAME = ICUResourceBundle.ICU_BUNDLE+"/unorm.icu";
/* indexes[] value names */
/* number of bytes in normalization trie */
static final int INDEX_TRIE_SIZE = 0;
/* number of chars in extra data */
static final int INDEX_CHAR_COUNT = 1;
/* number of uint16_t words for combining data */
static final int INDEX_COMBINE_DATA_COUNT = 2;
/* number of code points that combine forward */
static final int INDEX_COMBINE_FWD_COUNT = 3;
/* number of code points that combine forward and backward */
static final int INDEX_COMBINE_BOTH_COUNT = 4;
/* number of code points that combine backward */
static final int INDEX_COMBINE_BACK_COUNT = 5;
/* first code point with quick check NFC NO/MAYBE */
public static final int INDEX_MIN_NFC_NO_MAYBE = 6;
/* first code point with quick check NFKC NO/MAYBE */
public static final int INDEX_MIN_NFKC_NO_MAYBE = 7;
/* first code point with quick check NFD NO/MAYBE */
public static final int INDEX_MIN_NFD_NO_MAYBE = 8;
/* first code point with quick check NFKD NO/MAYBE */
public static final int INDEX_MIN_NFKD_NO_MAYBE = 9;
/* number of bytes in FCD trie */
static final int INDEX_FCD_TRIE_SIZE = 10;
/* number of bytes in the auxiliary trie */
static final int INDEX_AUX_TRIE_SIZE = 11;
/* number of uint16_t in the array of serialized USet */
static final int INDEX_CANON_SET_COUNT = 12;
/* changing this requires a new formatVersion */
static final int INDEX_TOP = 32;
/* AUX constants */
/* value constants for auxTrie */
private static final int AUX_UNSAFE_SHIFT = 11;
private static final int AUX_COMP_EX_SHIFT = 10;
private static final int AUX_MAX_FNC = 1<<AUX_COMP_EX_SHIFT;
private static final int AUX_UNSAFE_MASK = (int)((1<<AUX_UNSAFE_SHIFT) & UNSIGNED_INT_MASK);
private static final int AUX_FNC_MASK = (int)((AUX_MAX_FNC-1) & UNSIGNED_INT_MASK);
/* canonStartSets[0..31] contains indexes for what is in the array */
/* number of uint16_t in canonical starter sets */
static final int SET_INDEX_CANON_SETS_LENGTH = 0;
/* number of uint16_t in the BMP search table (contains pairs) */
static final int SET_INDEX_CANON_BMP_TABLE_LENGTH = 1;
/* number of uint16_t in the supplementary search table(contains triplets)*/
static final int SET_INDEX_CANON_SUPP_TABLE_LENGTH = 2;
/* changing this requires a new formatVersion */
static final int SET_INDEX_TOP = 32;
/* slots of the canonStartSets Object[] as filled by NormalizerDataReader.read()
 * (the "INDICIES" misspelling is part of the existing name and kept as-is) */
static final int CANON_SET_INDICIES_INDEX = 0;
static final int CANON_SET_START_SETS_INDEX = 1;
static final int CANON_SET_BMP_TABLE_INDEX = 2;
static final int CANON_SET_SUPP_TABLE_INDEX = 3;
/* 14 bit indexes to canonical USerializedSets */
static final int CANON_SET_MAX_CANON_SETS = 0x4000;
/* single-code point BMP sets are encoded directly in the search table
 * except if result=0x4000..0x7fff
 */
static final int CANON_SET_BMP_MASK = 0xc000;
static final int CANON_SET_BMP_IS_INDEX = 0x4000;
/**
* Internal option for cmpEquivFold() for decomposing.
* If not set, just do strcasecmp().
* @internal
*/
public static final int COMPARE_EQUIV = 0x80000;
/*******************************/
/* Wrappers for Trie implementations */
/** Trie wrapper that supplies the surrogate-folding rule for the auxiliary trie. */
static final class AuxTrieImpl implements Trie.DataManipulate {
    static CharTrie auxTrie = null;

    /**
     * Called by com.ibm.icu.util.Trie to extract, from a lead surrogate's
     * data value, the index-array offset of the indexes for that lead
     * surrogate. For auxTrie, the folding offset lives in bits 9..0 of the
     * 16-bit trie result.
     *
     * @param value data value for a surrogate from the trie, including the
     *              folding offset
     * @return data offset, or 0 if there is no data for the lead surrogate
     */
    public int getFoldingOffset(int value) {
        final int foldingBits = value & AUX_FNC_MASK;
        return foldingBits << SURROGATE_BLOCK_BITS;
    }
}
/****************************************************/
// Shared parsed-data state; loaded once by the private constructor.
private static AuxTrieImpl auxTrieImpl;
private static int[] indexes;
private static char[] combiningTable;
private static char[] extraData;
private static Object[] canonStartSets;
// true after the data file has been fully read and parsed
private static boolean isDataLoaded;
// true when the loaded file is format version >= 2.1
private static boolean isFormatVersion_2_1;
// 4-byte Unicode version from the data file header
private static byte[] unicodeVersion;
/**
* Default buffer size of datafile
*/
private static final int DATA_BUFFER_SIZE = 25000;
/**
* FCD check: everything below this code point is known to have a 0
* lead combining class
*/
public static final int MIN_WITH_LEAD_CC=0x300;
/** Number of bits of a trail surrogate that are used in index table
* lookups.
*/
private static final int SURROGATE_BLOCK_BITS=10-Trie.INDEX_STAGE_1_SHIFT_;
// protected constructor ---------------------------------------------
/**
 * Constructor: loads and parses unorm.icu on first use. All parsed data
 * is kept in static fields, so construction is a no-op after the first
 * successful load.
 * @throws IOException thrown when data reading fails or data corrupted
 */
private NormalizerImpl() throws IOException {
//data should be loaded only once
if(!isDataLoaded){
// jar access
InputStream i = ICUData.getRequiredStream(DATA_FILE_NAME);
BufferedInputStream b = new BufferedInputStream(i,DATA_BUFFER_SIZE);
NormalizerDataReader reader = new NormalizerDataReader(b);
// read the indexes
indexes = reader.readIndexes(NormalizerImpl.INDEX_TOP);
// Allocate each section's array using the sizes recorded in the indexes.
byte[] normBytes = new byte[indexes[NormalizerImpl.INDEX_TRIE_SIZE]];
int combiningTableTop = indexes[NormalizerImpl.INDEX_COMBINE_DATA_COUNT];
combiningTable = new char[combiningTableTop];
int extraDataTop = indexes[NormalizerImpl.INDEX_CHAR_COUNT];
extraData = new char[extraDataTop];
byte[] fcdBytes = new byte[indexes[NormalizerImpl.INDEX_FCD_TRIE_SIZE]];
byte[] auxBytes = new byte[indexes[NormalizerImpl.INDEX_AUX_TRIE_SIZE]];
// NOTE(review): only slots 0..3 of this array are ever used (see the
// CANON_SET_*_INDEX constants); sizing it CANON_SET_MAX_CANON_SETS
// (0x4000) looks unintended — confirm before changing.
canonStartSets=new Object[NormalizerImpl.CANON_SET_MAX_CANON_SETS];
auxTrieImpl = new AuxTrieImpl();
// load the rest of the data and initialize the data members
reader.read(normBytes, fcdBytes,auxBytes, extraData, combiningTable,
canonStartSets);
AuxTrieImpl.auxTrie = new CharTrie( new ByteArrayInputStream(auxBytes),auxTrieImpl );
// we reached here without any exceptions so the data is fully
// loaded set the variable to true
isDataLoaded = true;
// get the data format version
byte[] formatVersion = reader.getDataFormatVersion();
// Format 2.1 added the auxiliary trie with associated data
// (see the file-format description in NormalizerDataReader).
isFormatVersion_2_1 =( formatVersion[0]>2
||
(formatVersion[0]==2 && formatVersion[1]>=1)
);
unicodeVersion = reader.getUnicodeVersion();
b.close();
}
}
/* ---------------------------------------------------------------------- */
/* Korean Hangul and Jamo constants */
public static final int JAMO_L_BASE=0x1100; /* "lead" jamo */
public static final int JAMO_V_BASE=0x1161; /* "vowel" jamo */
public static final int JAMO_T_BASE=0x11a7; /* "trail" jamo */
public static final int HANGUL_BASE=0xac00; /* first precomposed Hangul syllable */
public static final int JAMO_L_COUNT=19;
public static final int JAMO_V_COUNT=21;
public static final int JAMO_T_COUNT=28;
/* 19*21*28 = 11172 precomposed syllables */
public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT;
/* data access primitives ----------------------------------------------- */
/** Returns the Unicode version, repackaging the four raw bytes read from the data file header. */
public static VersionInfo getUnicodeVersion() {
    final byte[] uv = unicodeVersion;
    return VersionInfo.getInstance(uv[0], uv[1], uv[2], uv[3]);
}
/**
 * Tests whether c is a safe starter for canonical closure.
 * The "unsafe" flag only exists in format version 2.1+ data; without it,
 * conservatively report every character as not safe.
 */
public static boolean isCanonSafeStart(int c) {
    if (!isFormatVersion_2_1) {
        return false;
    }
    final int aux = AuxTrieImpl.auxTrie.getCodePointValue(c);
    return (aux & AUX_UNSAFE_MASK) == 0;
}
/**
 * Fills fillSet with the canonical-start set for code point c: the set of
 * code points whose canonical decompositions start with c.
 * Performs a binary search in the serialized lookup tables loaded from the
 * normalization data file (one table for BMP code points, one for
 * supplementary ones; layout documented in unormimp.h).
 *
 * @param c       code point to look up
 * @param fillSet output parameter; receives the serialized set on success
 * @return true if a canonical-start set exists for c and fillSet was filled
 */
public static boolean getCanonStartSet(int c, USerializedSet fillSet) {
if(fillSet!=null && canonStartSets!=null) {
/*
 * binary search for c
 *
 * There are two search tables,
 * one for BMP code points and one for supplementary ones.
 * See unormimp.h for details.
 */
char[] table;
int i=0, start, limit;
// idxs: indexes into the serialized-set storage; startSets: the storage itself.
int[] idxs = (int[]) canonStartSets[CANON_SET_INDICIES_INDEX];
char[] startSets = (char[]) canonStartSets[CANON_SET_START_SETS_INDEX];
if(c<=0xffff) {
table=(char[]) canonStartSets[CANON_SET_BMP_TABLE_INDEX];
start=0;
limit=table.length;
/* each entry is a pair { c, result } */
while(start<limit-2) {
// midpoint rounded down to a pair boundary (/4*2 keeps i even)
i=(char)(((start+limit)/4)*2);
if(c<table[i]) {
limit=i;
} else {
start=i;
}
}
//System.out.println(i);
/* found? */
if(c==table[start]) {
i=table[start+1];
if((i & CANON_SET_BMP_MASK)==CANON_SET_BMP_IS_INDEX) {
/* result 01xxxxxx xxxxxx contains index x to a
 * USerializedSet */
i&=(CANON_SET_MAX_CANON_SETS-1);
// index is relative to the start of the sets storage (after the
// index array) -- presumably; mirrors the supplementary branch below
return fillSet.getSet(startSets,(i-idxs.length));
} else {
/* other result values are BMP code points for
 * single-code point sets */
fillSet.setToOne(i);
return true;
}
}
} else {
// Supplementary code point: search the triplet table keyed by
// (high 5 bits of the plane word, low 16 bits of c).
char high, low, h,j=0;
table=(char[]) canonStartSets[CANON_SET_SUPP_TABLE_INDEX];
start=0;
limit=table.length;
high=(char)(c>>16);
low=(char)c;
/* each entry is a triplet { high(c), low(c), result } */
while(start<limit-3) {
/* take (start+limit)/2 but rounded down to a triplet boundary */
i=(char)(((start+limit)/6)*3);
j=(char)(table[i]&0x1f); /* high word */
int tableVal = table[i+1];
int lowInt = low;
// compare (high, low) lexicographically against the entry at i
if(high<j || ((tableVal>lowInt) && (high==j))) {
limit=i;
} else {
start=i;
}
//System.err.println("\t((high==j) && (table[i+1]>low)) == " + ((high==j) && (tableVal>lowInt)) );
// KLUDGE: IBM JIT in 1.4.0 is sooo broken
// The below lines make TestExhaustive pass
// NOTE(review): do not remove this debug block without re-running
// TestExhaustive; the loop's behavior on that JIT depends on it.
if(ICUDebug.enabled()){
System.err.println("\t\t j = " + Utility.hex(j,4) +
"\t i = " + Utility.hex(i,4) +
"\t high = "+ Utility.hex(high) +
"\t low = " + Utility.hex(lowInt,4) +
"\t table[i+1]: "+ Utility.hex(tableVal,4)
);
}
}
/* found? */
h=table[start];
//System.err.println("c: \\U"+ Integer.toHexString(c)+" i : "+Integer.toHexString(i) +" h : " + Integer.toHexString(h));
int tableVal1 = table[start+1];
int lowInt = low;
// match: low 5 bits of the first unit hold high(c), second unit holds low(c)
if(high==(h&0x1f) && lowInt==tableVal1) {
int tableVal2 = table[start+2];
i=tableVal2;
if((h&0x8000)==0) {
/* the result is an index to a USerializedSet */
return fillSet.getSet(startSets,(i-idxs.length));
} else {
/*
 * single-code point set {x} in
 * triplet { 100xxxxx 000hhhhh llllllll llllllll xxxxxxxx xxxxxxxx }
 */
//i|=((int)h & 0x1f00)<<8; /* add high bits from high(c) */
int temp = ((int)h & 0x1f00)<<8;
i|=temp; /* add high bits from high(c) */
fillSet.setToOne(i);
return true;
}
}
}
}
return false; /* not found */
}
/**
 * Adds the start code point of each same-value range of the auxiliary trie
 * (when format 2.1+ data is loaded) plus the Hangul boundary code points
 * needed for normalization-property enumeration.
 *
 * @param set the set to add the code points to
 * @return set, for call chaining
 */
public static UnicodeSet addPropertyStarts(UnicodeSet set) {
    /* add the start code point of each same-value range of the aux trie */
    if (isFormatVersion_2_1) {
        //utrie_enum(&auxTrie, NULL, _enumPropertyStartsRange, set);
        TrieIterator rangeIter = new TrieIterator(AuxTrieImpl.auxTrie);
        RangeValueIterator.Element rangeElement = new RangeValueIterator.Element();
        while (rangeIter.next(rangeElement)) {
            set.add(rangeElement.start);
        }
    }
    /* add Hangul LV syllables and LV+1 because of skippables */
    final int hangulLimit = HANGUL_BASE + HANGUL_COUNT;
    for (int lv = HANGUL_BASE; lv < hangulLimit; lv += JAMO_T_COUNT) {
        set.add(lv);
        set.add(lv + 1);
    }
    /* add Hangul+1 to continue with other properties */
    set.add(hangulLimit);
    return set; // for chaining
}
}

View file

@ -103,24 +103,22 @@ public final class UCharacterProperty
public static final int SRC_PROPSVEC=2;
/** From unames.c/unames.icu */
public static final int SRC_NAMES=3;
/** From unorm.cpp/unorm.icu */
public static final int SRC_NORM=4;
/** From ucase.c/ucase.icu */
public static final int SRC_CASE=5;
public static final int SRC_CASE=4;
/** From ubidi_props.c/ubidi.icu */
public static final int SRC_BIDI=6;
public static final int SRC_BIDI=5;
/** From uchar.c/uprops.icu main trie as well as properties vectors trie */
public static final int SRC_CHAR_AND_PROPSVEC=7;
public static final int SRC_CHAR_AND_PROPSVEC=6;
/** From ucase.c/ucase.icu as well as unorm.cpp/unorm.icu */
public static final int SRC_CASE_AND_NORM=8;
public static final int SRC_CASE_AND_NORM=7;
/** From normalizer2impl.cpp/nfc.nrm */
public static final int SRC_NFC=9;
public static final int SRC_NFC=8;
/** From normalizer2impl.cpp/nfkc.nrm */
public static final int SRC_NFKC=10;
public static final int SRC_NFKC=9;
/** From normalizer2impl.cpp/nfkc_cf.nrm */
public static final int SRC_NFKC_CF=11;
public static final int SRC_NFKC_CF=10;
/** One more than the highest UPropertySource (SRC_) constant. */
public static final int SRC_COUNT=12;
public static final int SRC_COUNT=11;
// public methods ----------------------------------------------------
@ -310,7 +308,7 @@ public final class UCharacterProperty
new BinaryProperties( SRC_NFKC, 0 ), /* UCHAR_NFKD_INERT */
new BinaryProperties( SRC_NFC, 0 ), /* UCHAR_NFC_INERT */
new BinaryProperties( SRC_NFKC, 0 ), /* UCHAR_NFKC_INERT */
new BinaryProperties( SRC_NORM, 0 ), /* UCHAR_SEGMENT_STARTER */
new BinaryProperties( SRC_NFC, 0 ), /* UCHAR_SEGMENT_STARTER */
new BinaryProperties( 1, ( 1 << PATTERN_SYNTAX) ),
new BinaryProperties( 1, ( 1 << PATTERN_WHITE_SPACE) ),
new BinaryProperties( SRC_CHAR_AND_PROPSVEC, 0 ), /* UCHAR_POSIX_ALNUM */
@ -372,25 +370,25 @@ public final class UCharacterProperty
} catch (IOException e) {
return false;
}
} else if(column==SRC_NORM) {
/* normalization properties from unorm.icu */
switch(which) {
case UProperty.SEGMENT_STARTER:
return NormalizerImpl.isCanonSafeStart(c);
default:
break;
}
} else if(column==SRC_NFC || column==SRC_NFKC) {
} else if(column==SRC_NFC) {
/* normalization properties from nfc.nrm */
switch(which) {
case UProperty.FULL_COMPOSITION_EXCLUSION: {
// By definition, Full_Composition_Exclusion is the same as NFC_QC=No.
Normalizer2Impl impl=Norm2AllModes.getNFCInstanceNoIOException().impl;
return impl.isCompNo(impl.getNorm16(c));
}
case UProperty.SEGMENT_STARTER:
return Norm2AllModes.getNFCInstanceNoIOException().impl.
ensureCanonIterData().isCanonSegmentStarter(c);
default:
// UCHAR_NF..._INERT properties
// UCHAR_NF[CD]_INERT properties
return Norm2AllModes.getN2WithImpl(which-UProperty.NFD_INERT).isInert(c);
}
} else if(column==SRC_NFKC) {
/* normalization properties from nfkc.nrm */
// UCHAR_NFK[CD]_INERT properties
return Norm2AllModes.getN2WithImpl(which-UProperty.NFD_INERT).isInert(c);
} else if(column==SRC_NFKC_CF) {
// currently only for UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED
Normalizer2Impl kcf=Norm2AllModes.getNFKC_CFInstanceNoIOException().impl;

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2002-2009, International Business Machines
* Copyright (C) 2002-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
*/
@ -112,28 +112,26 @@ public final class USerializedSet {
if(rangeIndex<bmpLength) {
range[0]=array[rangeIndex++];
if(rangeIndex<bmpLength) {
range[1]=array[rangeIndex];
range[1]=array[rangeIndex]-1;
} else if(rangeIndex<length) {
range[1]=(((int)array[rangeIndex])<<16)|array[rangeIndex+1];
range[1]=((((int)array[rangeIndex])<<16)|array[rangeIndex+1])-1;
} else {
range[1]=0x110000;
range[1]=0x10ffff;
}
range[1]-=1;
return true;
} else {
rangeIndex-=bmpLength;
rangeIndex*=2; /* address pairs of pairs of units */
length-=bmpLength;
if(rangeIndex<length) {
int suppLength=length-bmpLength;
if(rangeIndex<suppLength) {
int offset=arrayOffset+bmpLength;
range[0]=(((int)array[offset+rangeIndex])<<16)|array[offset+rangeIndex+1];
rangeIndex+=2;
if(rangeIndex<length) {
range[1]=(((int)array[offset+rangeIndex])<<16)|array[offset+rangeIndex+1];
if(rangeIndex<suppLength) {
range[1]=((((int)array[offset+rangeIndex])<<16)|array[offset+rangeIndex+1])-1;
} else {
range[1]=0x110000;
range[1]=0x10ffff;
}
range[1]-=1;
return true;
} else {
return false;

View file

@ -12,8 +12,8 @@ import java.util.Iterator;
import java.util.List;
import java.util.Set;
import com.ibm.icu.impl.NormalizerImpl;
import com.ibm.icu.impl.USerializedSet;
import com.ibm.icu.impl.Norm2AllModes;
import com.ibm.icu.impl.Normalizer2Impl;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
@ -48,6 +48,9 @@ public final class CanonicalIterator {
* @stable ICU 2.4
*/
public CanonicalIterator(String source) {
Norm2AllModes allModes = Norm2AllModes.getNFCInstanceNoIOException();
nfd = allModes.decomp;
nfcImpl = allModes.impl.ensureCanonIterData();
setSource(source);
}
@ -110,7 +113,7 @@ public final class CanonicalIterator {
* @stable ICU 2.4
*/
public void setSource(String newSource) {
source = Normalizer.normalize(newSource, Normalizer.NFD);
source = nfd.normalize(newSource);
done = false;
// catch degenerate case
@ -131,9 +134,9 @@ public final class CanonicalIterator {
int i = UTF16.findOffsetFromCodePoint(source, 1);
for (; i < source.length(); i += UTF16.getCharCount(cp)) {
cp = UTF16.charAt(source, i);
if (NormalizerImpl.isCanonSafeStart(cp)) {
for (; i < source.length(); i += Character.charCount(cp)) {
cp = source.codePointAt(i);
if (nfcImpl.isCanonSegmentStarter(cp)) {
segmentList.add(source.substring(start, i)); // add up to i
start = i;
}
@ -226,6 +229,8 @@ public final class CanonicalIterator {
private static boolean SKIP_ZEROS = true;
// fields
private final Normalizer2 nfd;
private final Normalizer2Impl nfcImpl;
private String source;
private boolean done;
private String[][] pieces;
@ -286,37 +291,30 @@ public final class CanonicalIterator {
result.add(segment);
StringBuffer workingBuffer = new StringBuffer();
UnicodeSet starts = new UnicodeSet();
// cycle through all the characters
int cp=0;
int[] range = new int[2];
for (int i = 0; i < segment.length(); i += UTF16.getCharCount(cp)) {
int cp;
for (int i = 0; i < segment.length(); i += Character.charCount(cp)) {
// see if any character is at the start of some decomposition
cp = UTF16.charAt(segment, i);
USerializedSet starts = new USerializedSet();
if (!NormalizerImpl.getCanonStartSet(cp, starts)) {
cp = segment.codePointAt(i);
if (!nfcImpl.getCanonStartSet(cp, starts)) {
continue;
}
int j=0;
// if so, see which decompositions match
int rangeCount = starts.countRanges();
for(j = 0; j < rangeCount; ++j) {
starts.getRange(j, range);
int end=range[1];
for (int cp2 = range[0]; cp2 <= end; ++cp2) {
Set<String> remainder = extract(cp2, segment, i, workingBuffer);
if (remainder == null) {
continue;
}
for(UnicodeSetIterator iter = new UnicodeSetIterator(starts); iter.next();) {
int cp2 = iter.codepoint;
Set<String> remainder = extract(cp2, segment, i, workingBuffer);
if (remainder == null) {
continue;
}
// there were some matches, so add all the possibilities to the set.
String prefix= segment.substring(0,i);
prefix += UTF16.valueOf(cp2);
for (String item : remainder) {
result.add(prefix + item);
}
// there were some matches, so add all the possibilities to the set.
String prefix= segment.substring(0,i);
prefix += UTF16.valueOf(cp2);
for (String item : remainder) {
result.add(prefix + item);
}
}
}
@ -368,8 +366,10 @@ public final class CanonicalIterator {
if (PROGRESS) System.out.println(" extract: " + Utility.hex(UTF16.valueOf(comp))
+ ", " + Utility.hex(segment.substring(segmentPos)));
//String decomp = Normalizer.normalize(UTF16.valueOf(comp), Normalizer.DECOMP, 0);
String decomp = Normalizer.normalize(comp, Normalizer.NFD);
String decomp = nfcImpl.getDecomposition(comp);
if (decomp == null) {
decomp = UTF16.valueOf(comp);
}
// See if it matches the start of segment (at segmentPos)
boolean ok = false;

View file

@ -6,11 +6,9 @@
*/
package com.ibm.icu.text;
import com.ibm.icu.impl.Normalizer2Impl;
import com.ibm.icu.impl.NormalizerImpl;
import com.ibm.icu.impl.Norm2AllModes;
import com.ibm.icu.impl.UCaseProps;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.util.VersionInfo;
import java.io.IOException;
import java.nio.CharBuffer;
@ -2403,14 +2401,6 @@ public final class Normalizer implements Cloneable {
}
}
/**
* Fetches the Unicode version burned into the Normalization data file
* @return VersionInfo version information of the normalizer
*/
static VersionInfo getUnicodeVersion() {
return NormalizerImpl.getUnicodeVersion();
}
/**
* An Appendable that writes into a char array with a capacity that may be
* less than array.length.

View file

@ -17,6 +17,7 @@ import com.ibm.icu.impl.ICUData;
import com.ibm.icu.impl.ICUResourceBundle;
import com.ibm.icu.impl.StringPrepDataReader;
import com.ibm.icu.impl.UBiDiProps;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UCharacterDirection;
import com.ibm.icu.util.VersionInfo;
@ -297,7 +298,7 @@ public final class StringPrep {
checkBiDi = ((indexes[OPTIONS] & CHECK_BIDI_ON) > 0);
sprepUniVer = getVersionInfo(reader.getUnicodeVersion());
normCorrVer = getVersionInfo(indexes[NORM_CORRECTNS_LAST_UNI_VERSION]);
VersionInfo normUniVer = Normalizer.getUnicodeVersion();
VersionInfo normUniVer = UCharacter.getUnicodeVersion();
if(normUniVer.compareTo(sprepUniVer) < 0 && /* the Unicode version of SPREP file must be less than the Unicode Vesion of the normalization data */
normUniVer.compareTo(normCorrVer) < 0 && /* the Unicode version of the NormalizationCorrections.txt file should be less than the Unicode Vesion of the normalization data */
((indexes[OPTIONS] & NORMALIZATION_ON) > 0) /* normalization turned on*/

View file

@ -17,7 +17,6 @@ import java.util.TreeSet;
import com.ibm.icu.impl.BMPSet;
import com.ibm.icu.impl.Norm2AllModes;
import com.ibm.icu.impl.NormalizerImpl;
import com.ibm.icu.impl.RuleCharacterIterator;
import com.ibm.icu.impl.SortedSetRelation;
import com.ibm.icu.impl.UBiDiProps;
@ -3092,11 +3091,8 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
UCharacterProperty.INSTANCE.addPropertyStarts(incl);
UCharacterProperty.INSTANCE.upropsvec_addPropertyStarts(incl);
break;
case UCharacterProperty.SRC_NORM:
NormalizerImpl.addPropertyStarts(incl);
break;
case UCharacterProperty.SRC_CASE_AND_NORM:
NormalizerImpl.addPropertyStarts(incl);
Norm2AllModes.getNFCInstanceNoIOException().impl.addPropertyStarts(incl);
UCaseProps.getSingleton().addPropertyStarts(incl);
break;
case UCharacterProperty.SRC_NFC:

View file

@ -14,12 +14,12 @@ import java.util.Locale;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.dev.test.TestUtil;
import com.ibm.icu.impl.NormalizerImpl;
import com.ibm.icu.impl.Norm2AllModes;
import com.ibm.icu.impl.Normalizer2Impl;
import com.ibm.icu.impl.UBiDiProps;
import com.ibm.icu.impl.UCaseProps;
import com.ibm.icu.impl.UCharacterName;
import com.ibm.icu.impl.UCharacterProperty;
import com.ibm.icu.impl.USerializedSet;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UCharacterCategory;
@ -2204,19 +2204,6 @@ public final class UCharacterTest extends TestFmwk
}
}
/* add characters from a serialized set to a normal one */
private static void _setAddSerialized(UnicodeSet set, USerializedSet sset) {
// int start, end;
int i, count;
count=sset.countRanges();
int[] range = new int[2];
for(i=0; i<count; ++i) {
sset.getRange(i,range);
set.add(range[0],range[1]);
}
}
private boolean showADiffB(UnicodeSet a, UnicodeSet b,
String a_name, String b_name,
boolean expect,
@ -2284,7 +2271,6 @@ public final class UCharacterTest extends TestFmwk
public void TestConsistency() throws IOException {
UnicodeSet set1, set2, set3, set4;
USerializedSet sset;
int start, end;
int i, length;
@ -2368,10 +2354,9 @@ public final class UCharacterTest extends TestFmwk
*/
Normalizer2 norm2=Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE);
set1=new UnicodeSet();
Norm2AllModes.getNFCInstanceNoIOException().impl.
ensureCanonIterData().getCanonStartSet(0x49, set1);
set2=new UnicodeSet();
sset = new USerializedSet();
NormalizerImpl.getCanonStartSet(0x49,sset);
_setAddSerialized(set1, sset);
/* enumerate all characters that are plausible to be latin letters */
for(start=0xa0; start<0x2000; ++start) {
@ -2869,7 +2854,7 @@ public final class UCharacterTest extends TestFmwk
}
// Testing when "if(ch<NormalizerImpl.JAMO_L_BASE)" is true
for(int i=NormalizerImpl.JAMO_L_BASE-5; i<NormalizerImpl.JAMO_L_BASE; i++){
for(int i=Normalizer2Impl.Hangul.JAMO_L_BASE-5; i<Normalizer2Impl.Hangul.JAMO_L_BASE; i++){
if(UCharacter.getIntPropertyValue(i, UProperty.HANGUL_SYLLABLE_TYPE) != 0){
errln("UCharacter.getIntPropertyValue(ch, type) was suppose to return 0 " +
"when passing ch: " + i + "and type of Property.HANGUL_SYLLABLE_TYPE");
@ -2878,7 +2863,7 @@ public final class UCharacterTest extends TestFmwk
}
// Testing when "else if((ch-=NormalizerImpl.HANGUL_BASE)<0)" is true
for(int i=NormalizerImpl.HANGUL_BASE-5; i<NormalizerImpl.HANGUL_BASE; i++){
for(int i=Normalizer2Impl.Hangul.HANGUL_BASE-5; i<Normalizer2Impl.Hangul.HANGUL_BASE; i++){
if(UCharacter.getIntPropertyValue(i, UProperty.HANGUL_SYLLABLE_TYPE) != 0){
errln("UCharacter.getIntPropertyValue(ch, type) was suppose to return 0 " +
"when passing ch: " + i + "and type of Property.HANGUL_SYLLABLE_TYPE");

View file

@ -11,12 +11,11 @@ import java.text.StringCharacterIterator;
import java.util.Random;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.impl.NormalizerImpl;
import com.ibm.icu.impl.Norm2AllModes;
import com.ibm.icu.impl.Normalizer2Impl;
import com.ibm.icu.impl.USerializedSet;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.*;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UCharacterCategory;
import com.ibm.icu.text.Normalizer;
import com.ibm.icu.text.UCharacterIterator;
import com.ibm.icu.text.UTF16;
@ -1998,21 +1997,17 @@ public class BasicTest extends TestFmwk {
// test cases with i and I to make sure Turkic works
char[] iI= new char[]{ 0x49, 0x69, 0x130, 0x131 };
USerializedSet sset=new USerializedSet();
UnicodeSet set = new UnicodeSet();
UnicodeSet set = new UnicodeSet(), iSet = new UnicodeSet();
Normalizer2Impl nfcImpl = Norm2AllModes.getNFCInstanceNoIOException().impl;
nfcImpl.ensureCanonIterData();
String s1, s2;
int start, end;
// collect all sets into one for contiguous output
int[] startEnd = new int[2];
for(i=0; i<iI.length; ++i) {
if(NormalizerImpl.getCanonStartSet(iI[i], sset)) {
count=sset.countRanges();
for(j=0; j<count; ++j) {
sset.getRange(j, startEnd);
set.add(startEnd[0], startEnd[1]);
}
if(nfcImpl.getCanonStartSet(iI[i], iSet)) {
set.addAll(iSet);
}
}
@ -2771,20 +2766,24 @@ public class BasicTest extends TestFmwk {
USerializedSet sset=new USerializedSet();
UnicodeSet set = new UnicodeSet();
int start, end;
char[] serialized = {
0x8007, // length
3, // bmpLength
0xc0, 0xfe, 0xfffc,
1, 9, 0x10, 0xfffc
};
sset.getSet(serialized, 0);
// collect all sets into one for contiguous output
int[] startEnd = new int[2];
if(NormalizerImpl.getCanonStartSet(0x0130, sset)) {
int count=sset.countRanges();
for(int j=0; j<count; ++j) {
sset.getRange(j, startEnd);
set.add(startEnd[0], startEnd[1]);
}
int count=sset.countRanges();
for(int j=0; j<count; ++j) {
sset.getRange(j, startEnd);
set.add(startEnd[0], startEnd[1]);
}
// test all of these precomposed characters
// test all of these characters
UnicodeSetIterator it = new UnicodeSetIterator(set);
while(it.nextRange() && it.codepoint!=UnicodeSetIterator.IS_STRING) {
start=it.codepoint;
@ -2793,10 +2792,11 @@ public class BasicTest extends TestFmwk {
if(!sset.contains(start)){
errln("USerializedSet.contains failed for "+Utility.hex(start,8));
}
++start;
}
}
}
public void TestReturnFailure(){
char[] term = {'r','\u00e9','s','u','m','\u00e9' };
char[] decomposed_term = new char[10 + term.length + 2];

View file

@ -12,9 +12,6 @@ import java.util.SortedSet;
import java.util.TreeSet;
import java.util.Set;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.impl.Norm2AllModes;
import com.ibm.icu.impl.Normalizer2Impl;
import com.ibm.icu.impl.NormalizerImpl;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.CanonicalIterator;
@ -41,17 +38,6 @@ public class TestCanonicalIterator extends TestFmwk {
{"x\u0307\u0327", "x\u0307\u0327, x\u0327\u0307, \u1E8B\u0327"},
};
public void TestOldAndNew() {
Normalizer2Impl nfcImpl = Norm2AllModes.getNFCInstanceNoIOException().impl;
nfcImpl.ensureCanonIterData();
for (int c = 0; c <= 0x10ffff; ++c) {
if (nfcImpl.isCanonSegmentStarter(c) != NormalizerImpl.isCanonSafeStart(c)) {
errln(String.format("old!=new segment starter for U+%04x: old %b new %b",
c, NormalizerImpl.isCanonSafeStart(c), nfcImpl.isCanonSegmentStarter(c)));
}
}
}
public void TestExhaustive() {
int counter = 0;
CanonicalIterator it = new CanonicalIterator("");