ICU-3184 CODAN java port

X-SVN-Rev: 12968
2025-04-21 04:29:31 +00:00 · 2003-08-27 22:28:45 +00:00 · 2003-08-27 22:28:45 +00:00 · c8a4b87a90
commit c8a4b87a90
parent 0b1267d260
7 changed files with 697 additions and 22 deletions
--- a/icu4j/src/com/ibm/icu/dev/test/collator/CollationMiscTest.java
+++ b/icu4j/src/com/ibm/icu/dev/test/collator/CollationMiscTest.java
@ -20,8 +20,9 @@ package com.ibm.icu.dev.test.collator;
 import com.ibm.icu.dev.test.*;
 import com.ibm.icu.text.*;
 import com.ibm.icu.impl.Utility;
+import com.ibm.icu.impl.ICULocaleData;
 import java.util.Locale;
-
+import java.util.ResourceBundle;

 public class CollationMiscTest extends TestFmwk{

@ -30,6 +31,95 @@ public class CollationMiscTest extends TestFmwk{
        // new CollationMiscTest().TestLocaleRuleBasedCollators(); 
    }
    
+    // Length constant; nothing in the visible part of this file reads it.
+    private static final int NORM_BUFFER_TEST_LEN_ = 32;
+    /**
+     * Holder for one TestComposeDecompose case: a code point and the two
+     * strings whose collation is compared for it.
+     */
+    private static final class Tester 
+    {
+        int u;      // code point the case was built from
+        String NFC; // composed form, or the code point's own string
+        String NFD; // decomposed form
+    };
+    
+    /**
+     * Tells whether the locale data of a locale has a "CollationElements"
+     * entry. The entry is looked up in the bundle returned by
+     * ICULocaleData.getLocaleElements.
+     * @param locale locale whose data is inspected
+     * @return true if the bundle is not null and holds a non-null
+     *         "CollationElements" object, false otherwise (including when
+     *         the lookup throws an exception)
+     */
+    private static final boolean hasCollationElements(Locale locale)
+    {
+        ResourceBundle rb = ICULocaleData.getLocaleElements(locale);
+        if (rb != null) {
+            try {
+                 Object elements = rb.getObject("CollationElements");
+                 if (elements != null) {
+                     return true;
+                 }
+            } catch (Exception e) { // any lookup failure falls through to "return false"
+            }
+        }
+        return false;
+    }
+    
+    /**
+     * Checks that composed and decomposed forms of code points collate as
+     * equal.
+     * For every code point below 0x30000 the NFC and NFD forms are computed;
+     * a case is recorded when they differ from each other or when the code
+     * point's own string differs from its NFD form (in the latter case the
+     * code point's string is used in place of the NFC form).
+     * Each recorded pair is then compared with the English collator and with
+     * the collator of every available locale for which hasCollationElements
+     * returns true, the latter at IDENTICAL strength.
+     */
+    public void TestComposeDecompose() 
+    {
+        Tester t[] = new Tester[0x30000];
+        t[0] = new Tester();
+        logln("Testing UCA extensively\n");
+        RuleBasedCollator coll;
+        try {
+            coll = (RuleBasedCollator)Collator.getInstance(Locale.ENGLISH);
+        } 
+        catch (Exception e) {
+            errln("Error opening collator\n");
+            return;
+        }
+    
+        int noCases = 0;
+        for (int u = 0; u < 0x30000; u ++) { // t[noCases] is a scratch entry, kept only if the test below holds
+            String comp = UTF16.valueOf(u);
+            int len = comp.length();
+            t[noCases].NFC = Normalizer.normalize(u, Normalizer.NFC);
+            t[noCases].NFD = Normalizer.normalize(u, Normalizer.NFD);
+    
+            if (t[noCases].NFC.length() != t[noCases].NFD.length() 
+                || (t[noCases].NFC.compareTo(t[noCases].NFD) != 0) 
+                || (len != t[noCases].NFD.length())
+                || (comp.compareTo(t[noCases].NFD) != 0)) {
+                t[noCases].u = u;
+                if (len != t[noCases].NFD.length() 
+                    || (comp.compareTo(t[noCases].NFD) != 0)) {
+                    t[noCases].NFC = comp; // compare the raw code point against its NFD form
+                }
+                noCases ++;
+                t[noCases] = new Tester(); // fresh scratch entry for the next code point
+            } 
+        }
+    
+        for (int u = 0; u < noCases; u ++) { // here u indexes the recorded cases, not code points
+            if (!coll.equals(t[u].NFC, t[u].NFD)) {
+                errln("Failure: codePoint \\u" + Integer.toHexString(t[u].u) 
+                      + " fails TestComposeDecompose in the UCA");
+                CollationTest.doTest(this, coll, t[u].NFC, t[u].NFD, 0);
+            }
+        }
+    
+        logln("Testing locales, number of cases = " + noCases);
+        Locale loc[] = Collator.getAvailableLocales();
+        for (int i = 0; i < loc.length; i ++) {
+            if (hasCollationElements(loc[i])) {
+                logln("Testing locale " + loc[i].getDisplayName());
+                coll = (RuleBasedCollator)Collator.getInstance(loc[i]);
+                coll.setStrength(Collator.IDENTICAL);
+    
+                for (int u = 0; u < noCases; u ++) {
+                    if (!coll.equals(t[u].NFC, t[u].NFD)) {
+                        errln("Failure: codePoint \\u" 
+                              + Integer.toHexString(t[u].u)
+                              + " fails TestComposeDecompose for locale "
+                              + loc[i].getDisplayName());
+                        // this tests for the iterators too
+                        CollationTest.doTest(this, coll, t[u].NFC, t[u].NFD, 
+                                             0);
+                    }
+                }
+            }
+        }
+    }
+    
    public void TestRuleOptions() {
       // values here are hardcoded and are correct for the current UCA when 
       // the UCA changes, one might be forced to change these values. 
@ -426,6 +516,9 @@ public class CollationMiscTest extends TestFmwk{
                coll.setAlternateHandlingShifted(((Boolean)values[i]
                                                            ).booleanValue());
            }
+            else if (attrs[i].equals("NumericCollation")) {
+                coll.setNumericCollation(((Boolean)values[i]).booleanValue());
+            }
        }
        
        genericOrderingTest(coll, s);
@ -1698,4 +1791,73 @@ public class CollationMiscTest extends TestFmwk{
        CollationTest.doTest(this, collator, "a", "a ", 0); // inconsistent results
    }
    
+    /**
+     * Test for numeric collation (CODAN, collate digits as numbers) with an
+     * English collator. String lists made of ASCII digits, supplementary
+     * digits and other non-ASCII digits are passed to
+     * genericLocaleStarterWithOptions with the "NumericCollation" option
+     * on, and numbers that differ only in prepended zeroes are expected to
+     * compare as equal.
+     */
+    public void TestNumericCollation()
+    {
+        String basicTestStrings[] = {"hello1", "hello2", "hello123456"};
+        String preZeroTestStrings[] = {"avery1",
+                                       "avery01",
+                                       "avery001",
+                                       "avery0001"};
+        String thirtyTwoBitNumericStrings[] = {"avery42949672960",
+                                               "avery42949672961",
+                                               "avery42949672962",
+                                               "avery429496729610"};
+    
+        String supplementaryDigits[] = {"\uD835\uDFCE", // 0 
+                                        "\uD835\uDFCF", // 1 
+                                        "\uD835\uDFD0", // 2 
+                                        "\uD835\uDFD1", // 3 
+                                        "\uD835\uDFCF\uD835\uDFCE", // 10 
+                                        "\uD835\uDFCF\uD835\uDFCF", // 11 
+                                        "\uD835\uDFCF\uD835\uDFD0", // 12 
+                                        "\uD835\uDFD0\uD835\uDFCE", // 20 
+                                        "\uD835\uDFD0\uD835\uDFCF", // 21 
+                                        "\uD835\uDFD0\uD835\uDFD0" // 22 
+                                       };
+    
+        String foreignDigits[] = {"\u0661",
+                                  "\u0662",
+                                  "\u0663",
+                                  "\u0661\u0660",
+                                  "\u0661\u0662",
+                                  "\u0661\u0663",
+                                  "\u0662\u0660",
+                                  "\u0662\u0662",
+                                  "\u0662\u0663",
+                                  "\u0663\u0660",
+                                  "\u0663\u0662",
+                                  "\u0663\u0663"
+                                 };
+    
+        // Open our collator; it is only used for the prepended zeroes test.
+        RuleBasedCollator coll 
+                    = (RuleBasedCollator)Collator.getInstance(Locale.ENGLISH);
+        String att[] = {"NumericCollation"};
+        Boolean val[] = {new Boolean(true)};
+        genericLocaleStarterWithOptions(Locale.ENGLISH, basicTestStrings, att,
+                                        val);
+        genericLocaleStarterWithOptions(Locale.ENGLISH, 
+                                        thirtyTwoBitNumericStrings, att, val);
+        genericLocaleStarterWithOptions(Locale.ENGLISH, foreignDigits, att, 
+                                        val);
+        genericLocaleStarterWithOptions(Locale.ENGLISH, supplementaryDigits, 
+                                        att, val);    
+    
+        // Setting up our collator to do digits.
+        coll.setNumericCollation(true);
+    
+        // Testing that prepended zeroes still yield the correct collation 
+        // behavior. 
+        // We expect every pair of elements in preZeroTestStrings to be equal.
+        for (int i = 0; i < preZeroTestStrings.length - 1; i ++) {
+            for (int j = i + 1; j < preZeroTestStrings.length; j ++) {
+                CollationTest.doTest(this, coll, preZeroTestStrings[i], 
+                                     preZeroTestStrings[j],0);
+            }
+        }
+    }
 }
--- a/icu4j/src/com/ibm/icu/impl/data/ICULocaleData.jar
+++ b/icu4j/src/com/ibm/icu/impl/data/ICULocaleData.jar
@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2e8932a08fe0f724eba842b654f00fd88ae65f631f3ef4a5896eeaba34200e7d
-size 1138811
+oid sha256:4c81d588b4c428cd79002dde1e44c7f8f5b9583f43bf85027aa6996f28615b78
+size 1223293
--- a/icu4j/src/com/ibm/icu/impl/data/ucadata.icu
+++ b/icu4j/src/com/ibm/icu/impl/data/ucadata.icu
--- a/icu4j/src/com/ibm/icu/text/CollationElementIterator.java
+++ b/icu4j/src/com/ibm/icu/text/CollationElementIterator.java
@ -652,6 +652,13 @@ public final class CollationElementIterator
     * will cause this value to be reset to 0.
     */
    int m_CEBufferSize_;
+    static final int CE_NOT_FOUND_ = 0xF0000000;
+    static final int CE_EXPANSION_TAG_ = 1;
+    static final int CE_CONTRACTION_TAG_ = 2;
+    /** 
+     * Collate Digits As Numbers (CODAN) implementation
+     */
+    static final int CE_DIGIT_TAG_ = 13;

    // package private methods ----------------------------------------------

@ -862,7 +869,7 @@ public final class CollationElementIterator
    private static final int SECOND_LAST_BYTE_SHIFT_ = 8;

    // special ce values and tags -------------------------------------------
-    /*private*/ static final int CE_NOT_FOUND_ = 0xF0000000;
+    
    private static final int CE_EXPANSION_ = 0xF1000000;
    private static final int CE_CONTRACTION_ = 0xF2000000;
    private static final int CE_THAI_ = 0xF3000000;
@ -876,8 +883,6 @@ public final class CollationElementIterator
    private static final int CE_NO_MORE_CES_TERTIARY_ = 0x00000001;

    private static final int CE_NOT_FOUND_TAG_ = 0;
-    /*private*/ static final int CE_EXPANSION_TAG_ = 1;
-    /*private*/ static final int CE_CONTRACTION_TAG_ = 2;
    private static final int CE_THAI_TAG_ = 3;
    /**
     * Charset processing, not yet implemented
@ -907,7 +912,8 @@ public final class CollationElementIterator
     * space without affecting the performance (hopefully).
     */
    private static final int CE_LONG_PRIMARY_TAG_ = 12;
-    private static final int CE_CE_TAGS_COUNT = 13;
+                        
+    private static final int CE_CE_TAGS_COUNT = 14;
    private static final int CE_BYTE_COMMON_ = 0x05;

    // end special ce values and tags ---------------------------------------
@ -2005,6 +2011,193 @@ public final class CollationElementIterator
        }
        return m_CEBuffer_[0];
    }
+    
+    /**
+     * Gets the next ce for a code point that carries the digit tag.
+     * If numeric collation is on in the collator, the run of digits that
+     * starts at cp is consumed and turned into a generated sequence of CEs
+     * that is stored in m_CEBuffer_; otherwise the CE stored in the
+     * expansion table for this digit is returned.
+     * @param collator current collator
+     * @param ce current collation element
+     * @param cp current codepoint
+     * @return first generated ce of the number when numeric collation is
+     *         on, else the ce from the expansion table
+     */
+    private int nextDigit(RuleBasedCollator collator, int ce, int cp)
+    {
+        // We do a check to see if we want to collate digits as numbers; 
+        // if so we generate a custom collation key. Otherwise we pull out 
+        // the value stored in the expansion table.
+
+        if (collator.m_isNumericCollation_){
+            int collateVal = 0;
+            int trailingZeroIndex = 0;
+            boolean nonZeroValReached = false;
+
+            // I just need a temporary place to store my generated CEs.
+            // icu4c uses an unsigned byte array; a StringBuffer is used here
+            // to avoid dealing with sign problems and array allocation.
+            // Set the initial string buffer length.
+            m_utilStringBuffer_.setLength(3);
+        
+            // We parse the source string until we hit a char that's NOT a 
+            // digit.
+            // Use this u_charDigitValue. This might be slow because we have 
+            // to handle surrogates...
+            int digVal = UCharacter.digit(cp); 
+            // if we have arrived here, we have already processed possible 
+            // supplementaries that triggered the digit tag -
+            // all supplementaries are marked in the UCA.
+            // We pad a zero in front of the first element anyways. 
+            // This takes care of the (probably) most common case where 
+            // people are sorting things followed by a single digit
+            int digIndx = 1;
+            for (;;) {
+                // Make sure we have enough space.
+                if (digIndx >= ((m_utilStringBuffer_.length() - 2) << 1)) {
+                    m_utilStringBuffer_.setLength(m_utilStringBuffer_.length() 
+                                                  << 1);
+                }
+                // Skipping over leading zeroes.        
+                if (digVal != 0 || nonZeroValReached) {
+                    if (digVal != 0 && !nonZeroValReached) {
+                        nonZeroValReached = true;
+                    }    
+                    // We parse the digit string into base 100 numbers 
+                    // (this fits into a byte).
+                    // We only add to the buffer in twos: when digIndx is 
+                    // even the digit is the 'tens' digit, and when 
+                    // digIndx is odd it is the 'ones' digit that 
+                    // completes the pair. We dump the parsed base 100 
+                    // value (collateVal) into the buffer. 
+                    // We multiply each collateVal by 2 (to give us room) 
+                    // and add 6 (to avoid overlapping magic CE byte 
+                    // values). From the last byte we subtract 1 to ensure 
+                    // it is less than all the other bytes.
+                    if (digIndx % 2 == 1) {
+                        collateVal += digVal;  
+                        // This removes trailing zeroes.
+                        if (collateVal == 0 && trailingZeroIndex == 0) {
+                            trailingZeroIndex = ((digIndx - 1) >>> 1) + 2;
+                        }
+                        else if (trailingZeroIndex != 0) {
+                            trailingZeroIndex = 0;
+                        }
+                        m_utilStringBuffer_.setCharAt(
+                                            ((digIndx - 1) >>> 1) + 2,
+                                            (char)((collateVal << 1) + 6));
+                        collateVal = 0;
+                    }
+                    else {
+                        // We drop the collation value into the buffer so if 
+                        // we need to do a "front patch" we don't have to 
+                        // check to see if we're hitting the last element.
+                        collateVal = digVal * 10;
+                        m_utilStringBuffer_.setCharAt((digIndx >>> 1) + 2, 
+                                                (char)((collateVal << 1) + 6));
+                    }
+                    digIndx ++;
+                }
+            
+                // Get next character.
+                if (!isEnd()){
+                    backupInternalState(m_utilSpecialBackUp_);
+                    char ch = nextChar();
+                    int char32 = ch;
+                    if (UTF16.isLeadSurrogate(ch)){
+                        if (!isEnd()) {
+                            char trail = nextChar();
+                            if (UTF16.isTrailSurrogate(trail)) {
+                               char32 = UCharacterProperty.getRawSupplementary(
+                                                                   ch, trail);
+                            } 
+                            else {
+                                goBackOne();
+                            }
+                        }
+                    }
+                    
+                    digVal = UCharacter.digit(char32);
+                    if (digVal == -1) {
+                        // Resetting position to point to the next unprocessed 
+                        // char. We overshot it when doing our test/set for 
+                        // numbers.
+                        updateInternalState(m_utilSpecialBackUp_);
+                        break;
+                    }
+                } 
+                else {
+                    break;
+                }
+            }
+        
+            if (nonZeroValReached == false){
+                digIndx = 2;
+                m_utilStringBuffer_.setCharAt(2, (char)6);
+            }
+        
+            int endIndex = trailingZeroIndex != 0 ? trailingZeroIndex 
+                                             : (digIndx >>> 1) + 2;              
+            if (digIndx % 2 != 0){
+                // We missed a value. Since digIndx isn't even, we stuck too 
+                // many values into the buffer (this is what we get for 
+                // padding the first byte with a zero). "Front-patch" now by 
+                // pushing all nybbles forward.
+                // Doing it this way ensures that at least 50% of the time 
+                // (statistically speaking) we'll only be doing a single pass 
+                // and optimizes for strings with single digits. I'm just 
+                // assuming that's the more common case.
+                for (int i = 2; i < endIndex; i ++){
+                    m_utilStringBuffer_.setCharAt(i, 
+                        (char)((((((m_utilStringBuffer_.charAt(i) - 6) >>> 1) 
+                                  % 10) * 10) 
+                                 + (((m_utilStringBuffer_.charAt(i + 1) - 6) 
+                                      >>> 1) / 10) << 1) + 6));
+                }
+                -- digIndx;
+            }
+        
+            // Subtract one off of the last byte. 
+            m_utilStringBuffer_.setCharAt(endIndex - 1, 
+                         (char)(m_utilStringBuffer_.charAt(endIndex - 1) - 1));            
+                
+            // We want to skip over the first two slots in the buffer. 
+            // The first slot is reserved for the header byte 0x1B. 
+            // The second slot is for the sign/exponent byte: 
+            // 0x80 + (decimalPos/2) & 7f.
+            m_utilStringBuffer_.setCharAt(0, (char)0x1B);
+            m_utilStringBuffer_.setCharAt(1, 
+                                     (char)(0x80 + ((digIndx >>> 1) & 0x7F)));
+        
+            // Now transfer the collation key to the CE buffer (m_CEBuffer_).
+            // The number of CEs is endIndex bumped up to the next even value, divided by two.
+            ce = (((m_utilStringBuffer_.charAt(0) << 8)
+                       // Primary weight 
+                       | m_utilStringBuffer_.charAt(1)) 
+                                    << RuleBasedCollator.CE_PRIMARY_SHIFT_)
+                       //  Secondary weight 
+                       | (RuleBasedCollator.BYTE_COMMON_ 
+                          << RuleBasedCollator.CE_SECONDARY_SHIFT_) 
+                       | RuleBasedCollator.BYTE_COMMON_; // Tertiary weight.
+            int i = 2; // Reset the index into the buffer.
+            
+            m_CEBuffer_[0] = ce;
+            m_CEBufferSize_ = 1;
+            m_CEBufferOffset_ = 1;
+            while (i < endIndex)
+            {
+                int primWeight = m_utilStringBuffer_.charAt(i ++) << 8;
+                if (i < endIndex) {
+                    primWeight |= m_utilStringBuffer_.charAt(i ++);
+                }
+                m_CEBuffer_[m_CEBufferSize_ ++] 
+                    = (primWeight << RuleBasedCollator.CE_PRIMARY_SHIFT_) 
+                      | RuleBasedCollator.CE_CONTINUATION_MARKER_;
+            }
+            return ce;
+        } 
+        
+        // no numeric mode, we'll just switch to whatever we stashed and 
+        // continue
+        // find the offset to expansion table
+        return collator.m_expansion_[getExpansionOffset(collator, ce)];
+    }

    /**
     * Gets the next implicit ce for codepoints
@ -2157,6 +2350,9 @@ public final class CollationElementIterator
                    return nextLongPrimary(ce);
                case CE_EXPANSION_TAG_:
                    return nextExpansion(collator, ce);
+                case CE_DIGIT_TAG_:
+                    ce = nextDigit(collator, ce, codepoint);
+                    break;
                    // various implicits optimization
                case CE_CJK_IMPLICIT_TAG_:
                    // 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
@ -2180,7 +2376,8 @@ public final class CollationElementIterator
                    break;
                }
            }
-        } finally {
+        } 
+        finally {
            m_utilSpecialEntryBackUp_ = entrybackup;
        }
        return ce;
@ -2469,6 +2666,185 @@ public final class CollationElementIterator
        m_CEBufferOffset_ = m_CEBufferSize_ - 1;
        return m_CEBuffer_[m_CEBufferOffset_];
    }
+    
+    /**
+     * Gets the digit collation elements when iterating backwards.
+     * If numeric collation is on in the collator, the run of digits that
+     * ends at ch is consumed backwards and turned into a generated sequence
+     * of CEs that is stored in m_CEBuffer_; otherwise the CE stored in the
+     * expansion table for this digit is returned.
+     * NOTE(review): an unpaired trail surrogate met inside the loop is
+     * undone by restoring the backed up state, while the one met before the
+     * loop uses goForwardOne() - confirm that the difference is intended.
+     * @param collator current collator
+     * @param ce current collation element
+     * @param ch current code unit; a trail surrogate is combined with the
+     *           lead surrogate that precedes it
+     * @return last ce in the generated buffer when numeric collation is
+     *         on, else the ce from the expansion table
+     */
+    private int previousDigit(RuleBasedCollator collator, int ce, char ch)
+    {
+        // We do a check to see if we want to collate digits as numbers; if so we generate
+        // a custom collation key. Otherwise we pull out the value stored in the expansion table.
+        if (collator.m_isNumericCollation_){
+            int leadingZeroIndex = 0;
+            int collateVal = 0;
+            boolean nonZeroValReached = false;
+
+            // set initial string buffer length
+            m_utilStringBuffer_.setLength(3);
+        
+            // We parse the source string until we hit a char that's NOT a digit
+            // Use this u_charDigitValue. This might be slow because we have to 
+            // handle surrogates...
+            int char32 = ch;
+            if (UTF16.isTrailSurrogate(ch)) {
+                if (!isBackwardsStart()){
+                    char lead = previousChar();
+                    if (UTF16.isLeadSurrogate(lead)) {
+                        char32 = UCharacterProperty.getRawSupplementary(lead,
+                                                                        ch);
+                    } 
+                    else {
+                        goForwardOne();
+                    }
+                }
+            } 
+            int digVal = UCharacter.digit(char32);
+            int digIndx = 0;
+            for (;;) {
+                // Make sure we have enough space.
+                if (digIndx >= ((m_utilStringBuffer_.length() - 2) << 1)) {
+                    m_utilStringBuffer_.setLength(m_utilStringBuffer_.length() 
+                                                  << 1);
+                }
+                // Skipping over "trailing" zeroes but we still add to digIndx.
+                if (digVal != 0 || nonZeroValReached) {
+                    if (digVal != 0 && !nonZeroValReached) {
+                        nonZeroValReached = true;
+                    }
+                
+                    // We parse the digit string into base 100 numbers (this 
+                    // fits into a byte).
+                    // We only add to the buffer in twos, thus if we are 
+                    // parsing an odd character, that serves as the 'tens' 
+                    // digit while if we are parsing an even one, that is 
+                    // the 'ones' digit. We dump the parsed base 100 value 
+                    // (collateVal) into a buffer. We multiply each collateVal 
+                    // by 2 (to give us room) and add 6 (to avoid overlapping 
+                    // magic CE byte values). From the last byte we subtract 1 
+                    // to ensure it is less than all the other bytes. 
+                    // Since we're doing this in reverse we want to put the 
+                    // first digit encountered into the ones place and the 
+                    // second digit encountered into the tens place.
+                
+                    if (digIndx % 2 == 1){
+                        collateVal += digVal * 10;
+                    
+                        // This removes leading zeroes.
+                        if (collateVal == 0 && leadingZeroIndex == 0) {
+                           leadingZeroIndex = ((digIndx - 1) >>> 1) + 2;
+                        }
+                        else if (leadingZeroIndex != 0) {
+                            leadingZeroIndex = 0;
+                        }
+                                            
+                        m_utilStringBuffer_.setCharAt(((digIndx - 1) >>> 1) + 2, 
+                                                (char)((collateVal << 1) + 6));
+                        collateVal = 0;
+                    }
+                    else {
+                        collateVal = digVal;    
+                    }
+                }
+                digIndx ++;
+            
+                if (!isBackwardsStart()){
+                    backupInternalState(m_utilSpecialBackUp_);
+                    ch = previousChar();
+                    char32 = ch;
+                    if (UTF16.isTrailSurrogate(ch)){
+                        if (!isBackwardsStart()) {
+                            char lead = previousChar();
+                            if (UTF16.isLeadSurrogate(lead)) {
+                                char32 
+                                    = UCharacterProperty.getRawSupplementary(
+                                                                    lead, ch);
+                            } 
+                            else {
+                                updateInternalState(m_utilSpecialBackUp_);
+                            }
+                        }
+                    }
+                    
+                    digVal = UCharacter.digit(char32);
+                    if (digVal == -1) {
+                        updateInternalState(m_utilSpecialBackUp_);
+                        break;
+                    }
+                }
+                else {
+                    break;
+                }
+            }
+
+            if (nonZeroValReached == false) {
+                digIndx = 2;
+                m_utilStringBuffer_.setCharAt(2, (char)6);
+            }
+            
+            if (digIndx % 2 != 0) {
+                if (collateVal == 0 && leadingZeroIndex == 0) {
+                    // This removes the leading 0 in an odd number sequence of 
+                    // numbers e.g. avery001
+                    leadingZeroIndex = ((digIndx - 1) >>> 1) + 2;
+                }
+                else {
+                    // this is not a leading 0, we add it in
+                    m_utilStringBuffer_.setCharAt((digIndx >>> 1) + 2,
+                                                (char)((collateVal << 1) + 6));
+                    digIndx ++; 
+                }               
+            }
+                     
+            int endIndex = leadingZeroIndex != 0 ? leadingZeroIndex 
+                                               : ((digIndx >>> 1) + 2) ;  
+            digIndx = ((endIndex - 2) << 1) + 1; // removing initial zeros         
+            // Subtract one off of the last byte. 
+            // Really the first byte here, but it's reversed...
+            m_utilStringBuffer_.setCharAt(2, 
+                                    (char)(m_utilStringBuffer_.charAt(2) - 1));          
+            // We want to skip over the first two slots in the buffer. 
+            // The first slot is reserved for the header byte 0x1B. 
+            // The second slot is for the sign/exponent byte: 
+            // 0x80 + (decimalPos/2) & 7f.
+            m_utilStringBuffer_.setCharAt(0, (char)0x1B);
+            m_utilStringBuffer_.setCharAt(1, 
+                                    (char)(0x80 + ((digIndx >>> 1) & 0x7F)));
+        
+            // Now transfer the collation key to the CE buffer (m_CEBuffer_).
+            // The number of CEs is endIndex bumped up to the next even value, 
+            // divided by two.
+            m_CEBufferSize_ = 0;
+            m_CEBuffer_[m_CEBufferSize_ ++] 
+                        = (((m_utilStringBuffer_.charAt(0) << 8)
+                            // Primary weight 
+                            | m_utilStringBuffer_.charAt(1)) 
+                              << RuleBasedCollator.CE_PRIMARY_SHIFT_)
+                            // Secondary weight 
+                            | (RuleBasedCollator.BYTE_COMMON_ 
+                               << RuleBasedCollator.CE_SECONDARY_SHIFT_)
+                            // Tertiary weight. 
+                            | RuleBasedCollator.BYTE_COMMON_; 
+             int i = endIndex - 1; // Reset the index into the buffer.
+             while (i >= 2) {
+                int primWeight = m_utilStringBuffer_.charAt(i --) << 8;
+                if (i >= 2) {
+                    primWeight |= m_utilStringBuffer_.charAt(i --);
+                }
+                m_CEBuffer_[m_CEBufferSize_ ++] 
+                    = (primWeight << RuleBasedCollator.CE_PRIMARY_SHIFT_) 
+                      | RuleBasedCollator.CE_CONTINUATION_MARKER_;
+             }
+             m_CEBufferOffset_ = m_CEBufferSize_ - 1;
+             return m_CEBuffer_[m_CEBufferOffset_];
+         }
+         else {
+             return collator.m_expansion_[getExpansionOffset(collator, ce)];
+         }
+    } 

    /**
     * Returns previous hangul ces
@ -2600,6 +2976,9 @@ public final class CollationElementIterator
                return previousLongPrimary(ce);
            case CE_EXPANSION_TAG_: // always returns
                return previousExpansion(collator, ce);
+            case CE_DIGIT_TAG_:
+                ce = previousDigit(collator, ce, ch);
+                break;
            case CE_HANGUL_SYLLABLE_TAG_: // AC00-D7AF
                return previousHangul(collator, ch);
            case CE_LEAD_SURROGATE_TAG_:  // D800-DBFF
@ -2728,4 +3107,45 @@ public final class CollationElementIterator
            return m_source_.current();
        }
    }
+    
+    /**
+     * Moves back 1 position in the source string. This is slightly less 
+     * complicated than previousChar in that it doesn't normalize while 
+     * moving back. Boundary checks are not performed.
+     * If m_bufferOffset_ is not negative it is decremented, otherwise the
+     * index of m_source_ is moved back by one.
+     * This method is to be used with caution, with the assumption that 
+     * moving back one position will not exceed the source limits.
+     * Use only with nextChar() and never call this API twice in a row without
+     * nextChar() in the middle.
+     */
+    private void goBackOne() 
+    {
+        if (m_bufferOffset_ >= 0) {
+            m_bufferOffset_ --;
+        }
+        else {
+            m_source_.setIndex(m_source_.getIndex() - 1);
+        }
+    }
+    
+    /**
+     * Moves forward 1 position in the source string. This is slightly less 
+     * complicated than nextChar in that it doesn't normalize while 
+     * moving forward. Boundary checks are not performed.
+     * This method is to be used with caution, with the assumption that 
+     * moving forward one position will not exceed the source limits.
+     * Use only with previousChar() and never call this API twice in a row 
+     * without previousChar() in the middle.
+     */
+    private void goForwardOne() 
+    {
+        if (m_bufferOffset_ < 0) {
+            // we're working on the source and not normalizing. fast path.
+            // note Thai pre-vowel reordering uses buffer too
+            m_source_.setIndex(m_source_.getIndex() + 1);
+        }
+        else {
+            // we are in the buffer, buffer offset will never be 0 here
+            m_bufferOffset_ ++;
+        }
+    }
 }
--- a/icu4j/src/com/ibm/icu/text/CollationParsedRuleBuilder.java
+++ b/icu4j/src/com/ibm/icu/text/CollationParsedRuleBuilder.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/CollationParsedRuleBuilder.java,v $ 
-* $Date: 2003/08/20 00:20:37 $ 
-* $Revision: 1.23 $
+* $Date: 2003/08/27 22:28:45 $ 
+* $Revision: 1.24 $
 *
 *******************************************************************************
 */
@ -24,6 +24,7 @@ import com.ibm.icu.impl.TrieBuilder;
 import com.ibm.icu.impl.IntTrieBuilder;
 import com.ibm.icu.impl.TrieIterator;
 import com.ibm.icu.impl.Utility;
+import com.ibm.icu.impl.UCharacterProperty;
 import com.ibm.icu.lang.UCharacter;
 import com.ibm.icu.lang.UCharacterCategory;
 import com.ibm.icu.impl.NormalizerImpl;
@ -1865,6 +1866,7 @@ final class CollationParsedRuleBuilder
    private int addAnElement(BuildTable t, Elements element) 
    {
  		Vector expansions = t.m_expansions_;
+        element.m_mapCE_ = 0;
        if (element.m_CELength_ == 1) {
 	    	if (element.m_isThai_ == false) {
 	            element.m_mapCE_ = element.m_CEs_[0];
@ -1941,6 +1943,41 @@ final class CollationParsedRuleBuilder
 			    }
 		    }
 	    }
+        
+        // We treat digits differently - they are "uber special" and should be
+        // processed differently if numeric collation is on. 
+        int uniChar = 0;
+        if ((element.m_uchars_.length() == 2) 
+            && UTF16.isLeadSurrogate(element.m_uchars_.charAt(0))) {
+            uniChar = UCharacterProperty.getRawSupplementary(
+                                                element.m_uchars_.charAt(0), 
+                                                element.m_uchars_.charAt(1));      
+        } 
+        else if (element.m_uchars_.length() == 1) {
+            uniChar = element.m_uchars_.charAt(0);
+        }
+        
+        // Here, we either have one normal CE OR mapCE is set. Therefore, we 
+        // stuff only one element to the expansion buffer. When we encounter a 
+        // digit and we don't do numeric collation, we will just pick the CE 
+        // we have and break out of case (see ucol.cpp ucol_prv_getSpecialCE 
+        // && ucol_prv_getSpecialPrevCE). If we picked a special, further 
+        // processing will occur. If it's a simple CE, we'll return due
+        // to how the loop is constructed.
+        if (uniChar != 0 && UCharacter.isDigit(uniChar)) {
+            // prepare the element
+            int expansion = RuleBasedCollator.CE_SPECIAL_FLAG_ 
+                            | (CollationElementIterator.CE_DIGIT_TAG_
+                               << RuleBasedCollator.CE_TAG_SHIFT_) | 1; 
+            if (element.m_mapCE_ != 0) { 
+                // if there is an expansion, we'll pick it here
+                expansion |= (addExpansion(expansions, element.m_mapCE_) << 4);
+            } 
+            else {
+                expansion |= (addExpansion(expansions, element.m_CEs_[0]) << 4);
+            }
+            element.m_mapCE_ = expansion;
+        }
 	
 	    // here we want to add the prefix structure.
 	    // I will try to process it as a reverse contraction, if possible.
--- a/icu4j/src/com/ibm/icu/text/CollatorReader.java
+++ b/icu4j/src/com/ibm/icu/text/CollatorReader.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/CollatorReader.java,v $ 
-* $Date: 2003/06/03 18:49:34 $ 
-* $Revision: 1.13 $
+* $Date: 2003/08/27 22:28:45 $ 
+* $Revision: 1.14 $
 *
 *******************************************************************************
 */
@ -91,8 +91,8 @@ final class CollatorReader
    */
    protected void readHeader(RuleBasedCollator rbc) throws IOException
    {
-    	int size = m_dataInputStream_.readInt();
-    	// all the offsets are in bytes
+        int size = m_dataInputStream_.readInt();
+        // all the offsets are in bytes
      	// to get the address add to the header address and cast properly 
      	// Default options int options
        m_dataInputStream_.skip(4); // options
@ -166,7 +166,7 @@ final class CollatorReader
     */
    protected void readOptions(RuleBasedCollator rbc) throws IOException
    {
-    	rbc.m_defaultVariableTopValue_ = m_dataInputStream_.readInt();
+        rbc.m_defaultVariableTopValue_ = m_dataInputStream_.readInt();
    	rbc.m_defaultIsFrenchCollation_ = (m_dataInputStream_.readInt()
    	                                == RuleBasedCollator.AttributeValue.ON_);
        rbc.m_defaultIsAlternateHandlingShifted_ 
@ -186,6 +186,8 @@ final class CollatorReader
    	rbc.m_defaultStrength_ = m_dataInputStream_.readInt();
    	rbc.m_defaultIsHiragana4_ = (m_dataInputStream_.readInt() 
    	                             == RuleBasedCollator.AttributeValue.ON_);
+        rbc.m_defaultIsNumericCollation_ = (m_dataInputStream_.readInt() 
+                                      == RuleBasedCollator.AttributeValue.ON_);
        m_dataInputStream_.skip(64); // reserved for future use
    }
    
@ -206,7 +208,7 @@ final class CollatorReader
    {
    	readHeader(rbc);
    	readOptions(rbc);
-    	m_expansionSize_ >>= 2;
+        m_expansionSize_ >>= 2;
    	rbc.m_expansion_ = new int[m_expansionSize_];
    	for (int i = 0; i < m_expansionSize_; i ++) {
    		rbc.m_expansion_[i] = m_dataInputStream_.readInt();
--- a/icu4j/src/com/ibm/icu/text/RuleBasedCollator.java
+++ b/icu4j/src/com/ibm/icu/text/RuleBasedCollator.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/RuleBasedCollator.java,v $
-* $Date: 2003/08/25 23:23:12 $
-* $Revision: 1.44 $
+* $Date: 2003/08/27 22:28:45 $
+* $Revision: 1.45 $
 *
 *******************************************************************************
 */
@ -445,6 +445,20 @@ public final class RuleBasedCollator extends Collator
        setStrength(m_defaultStrength_);
    }
    
+    /**
+     * Method to set numeric collation to its default value.
+     * When numeric collation is turned on, this Collator generates a collation 
+     * key for the numeric value of substrings of digits. This is a way to get 
+     * '100' to sort AFTER '2'
+     * @see #getNumericCollation
+     * @see #setNumericCollation
+     * @draft ICU 2.8
+     */
+    public void setNumericCollationDefault()
+    {
+        this.setNumericCollation(this.m_defaultIsNumericCollation_); // reset
+    }
+
    /**
     * Sets the mode for the direction of SECONDARY weights to be used in
     * French collation.
@ -625,6 +639,21 @@ public final class RuleBasedCollator extends Collator
        m_variableTopValue_ = (varTop & CE_PRIMARY_MASK_) >> 16;
    }
    
+    /**
+     * When numeric collation is turned on, this Collator generates a collation 
+     * key for the numeric value of substrings of digits. This is a way to get 
+     * '100' to sort AFTER '2'
+     * @param flag true to turn numeric collation on and false to turn it off
+     * @see #getNumericCollation
+     * @see #setNumericCollationDefault
+     * @draft ICU 2.8
+     */
+    public void setNumericCollation(boolean flag)
+    {
+        // when on, substrings of digits are sorted by numeric value
+        this.m_isNumericCollation_ = flag;
+    }
+
    // public getters --------------------------------------------------------

    /**
@ -863,6 +892,21 @@ public final class RuleBasedCollator extends Collator
          return m_variableTopValue_ << 16;
    }
    
+    /** 
+     * Method to retrieve the numeric collation value.
+     * When numeric collation is turned on, this Collator generates a collation 
+     * key for the numeric value of substrings of digits. This is a way to get 
+     * '100' to sort AFTER '2'
+     * @see #setNumericCollation
+     * @see #setNumericCollationDefault
+     * @return true if numeric collation is turned on, false otherwise
+     * @draft ICU 2.8
+     */
+    public boolean getNumericCollation()
+    {
+        return this.m_isNumericCollation_;   // current setting, not default
+    }
+    
    // public other methods -------------------------------------------------

    /**
@ -1324,6 +1368,7 @@ public final class RuleBasedCollator extends Collator
    boolean m_isJamoSpecial_;

    // Collator options ------------------------------------------------------
+    
    int m_defaultVariableTopValue_;
    boolean m_defaultIsFrenchCollation_;
    boolean m_defaultIsAlternateHandlingShifted_;
@ -1332,6 +1377,8 @@ public final class RuleBasedCollator extends Collator
    int m_defaultDecomposition_;
    int m_defaultStrength_;
    boolean m_defaultIsHiragana4_;
+    boolean m_defaultIsNumericCollation_;
+    
    /**
     * Value of the variable top
     */
@ -1344,6 +1391,10 @@ public final class RuleBasedCollator extends Collator
     * Case sorting customization
     */
    int m_caseFirst_;
+    /**
+     * Numeric collation option
+     */
+    boolean m_isNumericCollation_;

    // end Collator options --------------------------------------------------

@ -1515,10 +1566,9 @@ public final class RuleBasedCollator extends Collator
                Object elements = rb.getObject("CollationElements");
                if (elements != null) {
                    Object[][] rules = (Object[][])elements;
-                    m_rules_ = (String)rules[1][1];
                    // %%CollationBin
                    if(rules[0][1] instanceof byte[]){
-
+                        m_rules_ = (String)rules[1][1];
                        byte map[] = (byte [])rules[0][1];
                        BufferedInputStream input =
                                                 new BufferedInputStream(
@ -1547,7 +1597,8 @@ public final class RuleBasedCollator extends Collator
                        // due to resource redirection ICUListResourceBundle does not
                        // raise missing resource error
                        //throw new MissingResourceException("Could not get resource for constructing RuleBasedCollator","com.ibm.icu.impl.data.LocaleElements_"+locale.toString(), "%%CollationBin");
-                        init((String)rules[1][1]);
+                        m_rules_ = (String)rules[0][1];
+                        init(m_rules_);
                        return;
                    }
                }
@ -1606,6 +1657,7 @@ public final class RuleBasedCollator extends Collator
        m_defaultIsHiragana4_ = UCA_.m_defaultIsHiragana4_;
        m_defaultStrength_ = UCA_.m_defaultStrength_;
        m_defaultVariableTopValue_ = UCA_.m_defaultVariableTopValue_;
+        m_defaultIsNumericCollation_ = UCA_.m_defaultIsNumericCollation_;
        m_expansionOffset_ = UCA_.m_expansionOffset_;
        m_isAlternateHandlingShifted_ = UCA_.m_isAlternateHandlingShifted_;
        m_isCaseLevel_ = UCA_.m_isCaseLevel_;
@ -1621,6 +1673,7 @@ public final class RuleBasedCollator extends Collator
        m_top3_ = UCA_.m_top3_;
        m_topCount3_ = UCA_.m_topCount3_;
        m_variableTopValue_ = UCA_.m_variableTopValue_;
+        m_isNumericCollation_ = UCA_.m_isNumericCollation_;
        setWithUCATables();
        latinOneFailed_ = false;
    }
@ -1818,7 +1871,7 @@ public final class RuleBasedCollator extends Collator
     * Minimum size required for the binary collation data in bytes.
     * Size of UCA header + size of options to 4 bytes
     */
-    private static final int MIN_BINARY_DATA_SIZE_ = (42 + 24) << 2;
+    private static final int MIN_BINARY_DATA_SIZE_ = (42 + 25) << 2;

    /**
     * If this collator is to generate only simple tertiaries for fast path
@ -3679,6 +3732,7 @@ public final class RuleBasedCollator extends Collator
        m_isCaseLevel_ = m_defaultIsCaseLevel_;
        m_caseFirst_ = m_defaultCaseFirst_;
        m_isHiragana4_ = m_defaultIsHiragana4_;
+        m_isNumericCollation_ = m_defaultIsNumericCollation_;
        latinOneFailed_ = false;
        updateInternalState();
    }