ICU-11042 Han radical-stroke order data in the root collator instead of CJK tailorings, adjusted code & tests

X-SVN-Rev: 36163
2025-04-07 22:44:49 +00:00 · 2014-08-14 18:49:59 +00:00 · 2014-08-14 18:49:59 +00:00 · 438c1ac7b8
commit 438c1ac7b8
parent ec55298e96
9 changed files with 315 additions and 69111 deletions
--- a/icu4j/main/classes/collate/src/com/ibm/icu/impl/coll/CollationBuilder.java
+++ b/icu4j/main/classes/collate/src/com/ibm/icu/impl/coll/CollationBuilder.java
@ -348,12 +348,9 @@ public final class CollationBuilder extends CollationRuleParser.Sink {
            ce = rootElements.firstCEWithPrimaryAtLeast(
                baseData.getFirstPrimaryForGroup(UScript.HAN));
            break;
-        case FIRST_IMPLICIT: {
-            int ce32 = baseData.getCE32(0x4e00);
-            assert(Collation.hasCE32Tag(ce32, Collation.OFFSET_TAG));
-            ce = baseData.getCEFromOffsetCE32(0x4e00, ce32);
+        case FIRST_IMPLICIT:
+            ce = baseData.getSingleCE(0x4e00);
            break;
-        }
        case LAST_IMPLICIT:
            // We do not support tailoring to an unassigned-implicit CE.
            throw new UnsupportedOperationException(
--- a/icu4j/main/classes/collate/src/com/ibm/icu/impl/coll/CollationData.java
+++ b/icu4j/main/classes/collate/src/com/ibm/icu/impl/coll/CollationData.java
@ -100,6 +100,74 @@ public final class CollationData {
        return Collation.makeCE(Collation.getThreeBytePrimaryForOffsetData(c, dataCE));
    }

+    /**
+     * Returns the single CE that c maps to.
+     * Throws UnsupportedOperationException if c does not map to a single CE.
+     *
+     * <p>The CE32 for c is read from this data first; if it is
+     * Collation.FALLBACK_CE32, the lookup is repeated in the base data, and all
+     * later table reads (ce32s, ces, offset data) use that same data object.
+     * Special CE32s are then resolved by tag until a CE can be returned;
+     * a non-special CE32 is converted with Collation.ceFromSimpleCE32().
+     *
+     * <p>NOTE(review): the fallback path dereferences base without a null check;
+     * presumably only tailoring data can yield FALLBACK_CE32 -- confirm.
+     *
+     * @param c the code point to look up
+     * @return the one 64-bit CE for c
+     * @throws UnsupportedOperationException if the CE32 has a Latin-expansion,
+     *         builder-data, prefix, contraction, Hangul or lead-surrogate tag,
+     *         or is an expansion whose length is not 1
+     * @throws AssertionError if the CE32 has the FALLBACK_TAG or RESERVED_TAG_3 tag
+     */
+    long getSingleCE(int c) {
+        CollationData d;  // the data object whose tables the current ce32 indexes into
+        int ce32 = getCE32(c);
+        if(ce32 == Collation.FALLBACK_CE32) {
+            d = base;
+            ce32 = base.getCE32(c);
+        } else {
+            d = this;
+        }
+        while(Collation.isSpecialCE32(ce32)) {  // loop: some tags only yield another CE32
+            switch(Collation.tagFromCE32(ce32)) {
+            case Collation.LATIN_EXPANSION_TAG:
+            case Collation.BUILDER_DATA_TAG:
+            case Collation.PREFIX_TAG:
+            case Collation.CONTRACTION_TAG:
+            case Collation.HANGUL_TAG:
+            case Collation.LEAD_SURROGATE_TAG:
+                // These tags are all reported as "not exactly one CE".
+                throw new UnsupportedOperationException(String.format(
+                        "there is not exactly one collation element for U+%04X (CE32 0x%08x)",
+                        c, ce32));
+            case Collation.FALLBACK_TAG:
+            case Collation.RESERVED_TAG_3:
+                // Treated as an internal error rather than as an unsupported input.
+                throw new AssertionError(String.format(
+                        "unexpected CE32 tag for U+%04X (CE32 0x%08x)", c, ce32));
+            case Collation.LONG_PRIMARY_TAG:
+                return Collation.ceFromLongPrimaryCE32(ce32);
+            case Collation.LONG_SECONDARY_TAG:
+                return Collation.ceFromLongSecondaryCE32(ce32);
+            case Collation.EXPANSION32_TAG:
+                if(Collation.lengthFromCE32(ce32) == 1) {
+                    // A one-element expansion: fetch its only CE32 and resolve that in the next iteration.
+                    ce32 = d.ce32s[Collation.indexFromCE32(ce32)];
+                    break;
+                } else {
+                    throw new UnsupportedOperationException(String.format(
+                            "there is not exactly one collation element for U+%04X (CE32 0x%08x)",
+                            c, ce32));
+                }
+            case Collation.EXPANSION_TAG: {
+                if(Collation.lengthFromCE32(ce32) == 1) {
+                    // A one-element expansion stored as a full CE: return it directly.
+                    return d.ces[Collation.indexFromCE32(ce32)];
+                } else {
+                    throw new UnsupportedOperationException(String.format(
+                            "there is not exactly one collation element for U+%04X (CE32 0x%08x)",
+                            c, ce32));
+                }
+            }
+            case Collation.DIGIT_TAG:
+                // Fetch the non-numeric-collation CE32 and continue.
+                ce32 = d.ce32s[Collation.indexFromCE32(ce32)];
+                break;
+            case Collation.U0000_TAG:
+                assert(c == 0);
+                // Fetch the normal ce32 for U+0000 and continue.
+                ce32 = d.ce32s[0];
+                break;
+            case Collation.OFFSET_TAG:
+                // Delegates to d so that the offset data comes from the same object as ce32.
+                return d.getCEFromOffsetCE32(c, ce32);
+            case Collation.IMPLICIT_TAG:
+                return Collation.unassignedCEFromCodePoint(c);
+            }
+        }
+        // Not (or no longer) a special CE32: convert it directly.
+        return Collation.ceFromSimpleCE32(ce32);
+    }
+
    /**
     * Returns the FCD16 value for code point c. c must be >= 0.
     */
--- a/icu4j/main/classes/collate/src/com/ibm/icu/text/AlphabeticIndex.java
+++ b/icu4j/main/classes/collate/src/com/ibm/icu/text/AlphabeticIndex.java
@ -481,7 +481,7 @@ public final class AlphabeticIndex<V> implements Iterable<Bucket<V>> {
            }
        }

-        // if the result is still too large, cut down to maxCount elements, by removing every nth element
+        // if the result is still too large, cut down to maxLabelCount elements, by removing every nth element

        final int size = indexCharacters.size() - 1;
        if (size > maxLabelCount) {
--- a/icu4j/main/shared/data/icudata.jar
+++ b/icu4j/main/shared/data/icudata.jar
@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:11004912c7ce72161feb732fdd36dad4d12914d8bfb378eec90eeab91951aef0
-size 10507681
+oid sha256:ccc875602d498466d04f994030ba87a6c2101f617064891c6b064c31fc84366f
+size 10647078
--- a/icu4j/main/tests/collate/src/com/ibm/icu/dev/data/CollationTest_NON_IGNORABLE_SHORT.txt
+++ b/icu4j/main/tests/collate/src/com/ibm/icu/dev/data/CollationTest_NON_IGNORABLE_SHORT.txt
--- a/icu4j/main/tests/collate/src/com/ibm/icu/dev/data/CollationTest_SHIFTED_SHORT.txt
+++ b/icu4j/main/tests/collate/src/com/ibm/icu/dev/data/CollationTest_SHIFTED_SHORT.txt
--- a/icu4j/main/tests/collate/src/com/ibm/icu/dev/test/collator/AlphabeticIndexTest.java
+++ b/icu4j/main/tests/collate/src/com/ibm/icu/dev/test/collator/AlphabeticIndexTest.java
@ -551,7 +551,13 @@ public class AlphabeticIndexTest extends TestFmwk {
                if (locale.getCountry().length() != 0) {
                    continue;
                }
+                boolean isUnihan = collationValue.contains("unihan");
                AlphabeticIndex alphabeticIndex = new AlphabeticIndex(locale);
+                if (isUnihan) {
+                    // Unihan tailorings have a label per radical, and there are at least 214,
+                    // if not more when simplified radicals are distinguished.
+                    alphabeticIndex.setMaxLabelCount(500);
+                }
                final Collection mainChars = alphabeticIndex.getBucketLabels();
                String mainCharString = mainChars.toString();
                if (mainCharString.length() > 500) {
@ -559,7 +565,7 @@ public class AlphabeticIndexTest extends TestFmwk {
                }
                logln(mainChars.size() + "\t" + locale + "\t" + locale.getDisplayName(ULocale.ENGLISH));
                logln("Index:\t" + mainCharString);
-                if (mainChars.size() > 100) {
+                if (!isUnihan && mainChars.size() > 100) {
                    errln("Index character set too large: " +
                            locale + " [" + mainChars.size() + "]:\n    " + mainChars);
                }
@ -1013,4 +1019,27 @@ public class AlphabeticIndexTest extends TestFmwk {
        assertEquals("same strength as input Collator",
                Collator.IDENTICAL, index.getCollator().getStrength());
    }
+
+    /**
+     * Checks the index built for the locale "zh-u-co-unihan": after raising the
+     * maximum label count to 500, the immutable index must have at least 216
+     * buckets, and two sample Han characters (U+4E5D and U+7527) must land in
+     * bucket indexes 5 and 100 respectively.
+     * If there are fewer than 216 buckets, the test reports an error and returns
+     * without checking the bucket indexes.
+     */
+    public void TestChineseUnihan() {
+        AlphabeticIndex index = new AlphabeticIndex(new ULocale("zh-u-co-unihan"));
+        index.setMaxLabelCount(500);  // ICU 54 default is 99.
+        AlphabeticIndex.ImmutableIndex immIndex = index.buildImmutableIndex();
+        int bucketCount = immIndex.getBucketCount();
+        if(bucketCount < 216) {
+            // There should be at least an underflow and overflow label,
+            // and one for each of 214 radicals,
+            // and maybe additional labels for simplified radicals.
+            // (ICU4C: dataerrln(), prints only a warning if the data is missing)
+            errln("too few buckets/labels for Chinese/unihan: " + bucketCount +
+                    " (is zh/unihan data available?)");
+            return;
+        } else {
+            logln("Chinese/unihan has " + bucketCount + " buckets/labels");
+        }
+        // bucketIndex = radical number, adjusted for simplified radicals in lower buckets.
+        // Note: these lookups go through the mutable index, not immIndex.
+        int bucketIndex = index.getBucketIndex("\u4e5d");
+        assertEquals("getBucketIndex(U+4E5D)", 5, bucketIndex);
+        bucketIndex = index.getBucketIndex("\u7527");
+        assertEquals("getBucketIndex(U+7527)", 100, bucketIndex);
+    }
 }
--- a/icu4j/main/tests/collate/src/com/ibm/icu/dev/test/collator/CollationServiceTest.java
+++ b/icu4j/main/tests/collate/src/com/ibm/icu/dev/test/collator/CollationServiceTest.java
@ -445,7 +445,16 @@ public class CollationServiceTest extends TestFmwk {
 //                }
 //        }
 //    }
-    
+
+    /**
+     * Returns true if at least one element of array is equal to s.
+     * Elements are compared with s.equals(element), so null elements never match,
+     * and s must not be null when the array is non-empty.
+     *
+     * @param array the strings to search; must not be null
+     * @param s the string to look for
+     * @return true if s occurs in array, false otherwise (including for an empty array)
+     */
+    private static boolean arrayContains(String[] array, String s) {
+        for (int i = 0; i < array.length; ++i) {
+            if (s.equals(array[i])) {
+                return true;
+            }
+        }
+        return false;
+    }
+
    public void TestGetKeywordValues(){
        final String[][] PREFERRED = {
            {"und",             "standard", "eor", "search"},
@ -453,12 +462,12 @@ public class CollationServiceTest extends TestFmwk {
            {"en_029",          "standard", "eor", "search"},
            {"de_DE",           "standard", "phonebook", "search", "eor"},
            {"de_Latn_DE",      "standard", "phonebook", "search", "eor"},
-            {"zh",              "pinyin", "big5han", "gb2312han", "stroke", "zhuyin", "eor", "search", "standard"},
-            {"zh_Hans",         "pinyin", "big5han", "gb2312han", "stroke", "zhuyin", "eor", "search", "standard"},
-            {"zh_CN",           "pinyin", "big5han", "gb2312han", "stroke", "zhuyin", "eor", "search", "standard"},
-            {"zh_Hant",         "stroke", "big5han", "gb2312han", "pinyin", "zhuyin", "eor", "search", "standard"},
-            {"zh_TW",           "stroke", "big5han", "gb2312han", "pinyin", "zhuyin", "eor", "search", "standard"},
-            {"zh__PINYIN",      "pinyin", "big5han", "gb2312han", "stroke", "zhuyin", "eor", "search", "standard"},
+            {"zh",              "pinyin", "stroke", "eor", "search", "standard"},
+            {"zh_Hans",         "pinyin", "stroke", "eor", "search", "standard"},
+            {"zh_CN",           "pinyin", "stroke", "eor", "search", "standard"},
+            {"zh_Hant",         "stroke", "pinyin", "eor", "search", "standard"},
+            {"zh_TW",           "stroke", "pinyin", "eor", "search", "standard"},
+            {"zh__PINYIN",      "pinyin", "stroke", "eor", "search", "standard"},
            {"es_ES",           "standard", "search", "traditional", "eor"},
            {"es__TRADITIONAL", "traditional", "search", "standard", "eor"},
            {"und@collation=phonebook",     "standard", "eor", "search"},
@ -467,29 +476,19 @@ public class CollationServiceTest extends TestFmwk {
        };

        for (int i = 0; i < PREFERRED.length; i++) {
-            ULocale loc = new ULocale(PREFERRED[i][0]);
-            String[] expected = new String[PREFERRED[i].length - 1];
-            System.arraycopy(PREFERRED[i], 1, expected, 0, expected.length);
-
+            String locale = PREFERRED[i][0];
+            ULocale loc = new ULocale(locale);
+            String[] expected = PREFERRED[i];
            String[] pref = Collator.getKeywordValuesForLocale("collation", loc, true);
-            boolean matchPref = false;
-            if (pref.length == expected.length) {
-                matchPref = true;
-                for (int j = 0; j < pref.length; j++) {
-                    if (!pref[j].equals(expected[j])) {
-                        matchPref = false;
-                    }
+            for (int j = 1; j < expected.length; ++j) {
+                if (!arrayContains(pref, expected[j])) {
+                    errln("Keyword value " + expected[j] + " missing for locale: " + locale);
                }
            }
-            if (!matchPref) {
-                errln("FAIL: Preferred values for locale " + loc 
-                        + " got:" + Arrays.toString(pref) + " expected:" + Arrays.toString(expected));
-            }
 
-            String[] all = Collator.getKeywordValuesForLocale("collation", loc, true);
-
            // Collator.getKeywordValues return the same contents for both commonlyUsed
            // true and false.
+            String[] all = Collator.getKeywordValuesForLocale("collation", loc, false);
            boolean matchAll = false;
            if (pref.length == all.length) {
                matchAll = true;
--- a/icu4j/main/tests/collate/src/com/ibm/icu/dev/test/collator/CollationTest.java
+++ b/icu4j/main/tests/collate/src/com/ibm/icu/dev/test/collator/CollationTest.java
@ -313,6 +313,16 @@ public class CollationTest extends TestFmwk {
        UnicodeSet unassigned = new UnicodeSet("[[:Cn:][:Cs:][:Co:]]");
        unassigned.remove(0xfffe, 0xffff);  // These have special CLDR root mappings.

+        // Starting with CLDR 26/ICU 54, the root Han order may instead be
+        // the Unihan radical-stroke order.
+        // The tests should pass either way, so we only test the order of a small set of Han characters
+        // whose radical-stroke order is the same as their code point order.
+        UnicodeSet someHanInCPOrder = new UnicodeSet(
+                "[\\u4E00-\\u4E16\\u4E18-\\u4E2B\\u4E2D-\\u4E3C\\u4E3E-\\u4E48" +
+                "\\u4E4A-\\u4E60\\u4E63-\\u4E8F\\u4E91-\\u4F63\\u4F65-\\u50F1\\u50F3-\\u50F6]");
+        UnicodeSet inOrder = new UnicodeSet(someHanInCPOrder);
+        inOrder.addAll(unassigned).freeze();
+
        UnicodeSet[] sets = { coreHan, otherHan, unassigned };
        int prev = 0;
        long prevPrimary = 0;
@ -337,7 +347,7 @@ public class CollationTest extends TestFmwk {
                    continue;
                }
                long primary = ce >>> 32;
-                if (!(primary > prevPrimary)) {
+                if (!(primary > prevPrimary) && inOrder.contains(c) && inOrder.contains(prev)) {
                    errln("CE(U+" + Utility.hex(c) + ")=0x" + Utility.hex(primary)
                            + ".. not greater than CE(U+" + Utility.hex(prev)
                            + ")=0x" + Utility.hex(prevPrimary) + "..");