ICU-11042 Han radical-stroke order data in the root collator instead of CJK tailorings, adjusted code & tests

X-SVN-Rev: 36163
This commit is contained in:
Markus Scherer 2014-08-14 18:49:59 +00:00
parent ec55298e96
commit 438c1ac7b8
9 changed files with 315 additions and 69111 deletions

View file

@ -348,12 +348,9 @@ public final class CollationBuilder extends CollationRuleParser.Sink {
ce = rootElements.firstCEWithPrimaryAtLeast(
baseData.getFirstPrimaryForGroup(UScript.HAN));
break;
case FIRST_IMPLICIT: {
int ce32 = baseData.getCE32(0x4e00);
assert(Collation.hasCE32Tag(ce32, Collation.OFFSET_TAG));
ce = baseData.getCEFromOffsetCE32(0x4e00, ce32);
case FIRST_IMPLICIT:
ce = baseData.getSingleCE(0x4e00);
break;
}
case LAST_IMPLICIT:
// We do not support tailoring to an unassigned-implicit CE.
throw new UnsupportedOperationException(

View file

@ -100,6 +100,74 @@ public final class CollationData {
return Collation.makeCE(Collation.getThreeBytePrimaryForOffsetData(c, dataCE));
}
/**
 * Returns the single collation element (CE) that code point c maps to.
 *
 * <p>If this data yields {@code Collation.FALLBACK_CE32} for c, the CE32 is fetched
 * from {@code base} instead, and {@code base} then also supplies the table entries
 * that the CE32 refers to. Special CE32s are resolved one step at a time
 * until a CE can be returned.
 *
 * @param c the code point to look up
 * @return the one CE for c
 * @throws UnsupportedOperationException if c does not map to exactly one CE
 * @throws AssertionError if the CE32 carries a tag that is not expected here
 */
long getSingleCE(int c) {
    // "source" is the data object whose tables the current CE32 indexes into.
    CollationData source = this;
    int value = getCE32(c);
    if (value == Collation.FALLBACK_CE32) {
        source = base;
        value = base.getCE32(c);
    }
    for (;;) {
        if (!Collation.isSpecialCE32(value)) {
            return Collation.ceFromSimpleCE32(value);
        }
        switch (Collation.tagFromCE32(value)) {
        // Tags that yield a CE right away.
        case Collation.LONG_PRIMARY_TAG:
            return Collation.ceFromLongPrimaryCE32(value);
        case Collation.LONG_SECONDARY_TAG:
            return Collation.ceFromLongSecondaryCE32(value);
        case Collation.OFFSET_TAG:
            return source.getCEFromOffsetCE32(c, value);
        case Collation.IMPLICIT_TAG:
            return Collation.unassignedCEFromCodePoint(c);
        case Collation.EXPANSION_TAG:
            if (Collation.lengthFromCE32(value) != 1) {
                throw new UnsupportedOperationException(String.format(
                        "there is not exactly one collation element for U+%04X (CE32 0x%08x)",
                        c, value));
            }
            return source.ces[Collation.indexFromCE32(value)];

        // Tags that point at another CE32; a break leaves the switch and re-runs the loop.
        case Collation.EXPANSION32_TAG:
            if (Collation.lengthFromCE32(value) != 1) {
                throw new UnsupportedOperationException(String.format(
                        "there is not exactly one collation element for U+%04X (CE32 0x%08x)",
                        c, value));
            }
            value = source.ce32s[Collation.indexFromCE32(value)];
            break;
        case Collation.DIGIT_TAG:
            // Use the non-numeric-collation CE32 and keep going.
            value = source.ce32s[Collation.indexFromCE32(value)];
            break;
        case Collation.U0000_TAG:
            assert(c == 0);
            // Use the normal ce32 for U+0000 and keep going.
            value = source.ce32s[0];
            break;

        // Tags that cannot be reduced to exactly one CE.
        case Collation.LATIN_EXPANSION_TAG:
        case Collation.BUILDER_DATA_TAG:
        case Collation.PREFIX_TAG:
        case Collation.CONTRACTION_TAG:
        case Collation.HANGUL_TAG:
        case Collation.LEAD_SURROGATE_TAG:
            throw new UnsupportedOperationException(String.format(
                    "there is not exactly one collation element for U+%04X (CE32 0x%08x)",
                    c, value));

        // Tags that are treated as an internal error when seen here.
        case Collation.FALLBACK_TAG:
        case Collation.RESERVED_TAG_3:
            throw new AssertionError(String.format(
                    "unexpected CE32 tag for U+%04X (CE32 0x%08x)", c, value));
        }
    }
}
/**
* Returns the FCD16 value for code point c. c must be >= 0.
*/

View file

@ -481,7 +481,7 @@ public final class AlphabeticIndex<V> implements Iterable<Bucket<V>> {
}
}
// if the result is still too large, cut down to maxCount elements, by removing every nth element
// if the result is still too large, cut down to maxLabelCount elements, by removing every nth element
final int size = indexCharacters.size() - 1;
if (size > maxLabelCount) {

View file

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:11004912c7ce72161feb732fdd36dad4d12914d8bfb378eec90eeab91951aef0
size 10507681
oid sha256:ccc875602d498466d04f994030ba87a6c2101f617064891c6b064c31fc84366f
size 10647078

View file

@ -551,7 +551,13 @@ public class AlphabeticIndexTest extends TestFmwk {
if (locale.getCountry().length() != 0) {
continue;
}
boolean isUnihan = collationValue.contains("unihan");
AlphabeticIndex alphabeticIndex = new AlphabeticIndex(locale);
if (isUnihan) {
// Unihan tailorings have a label per radical, and there are at least 214,
// if not more when simplified radicals are distinguished.
alphabeticIndex.setMaxLabelCount(500);
}
final Collection mainChars = alphabeticIndex.getBucketLabels();
String mainCharString = mainChars.toString();
if (mainCharString.length() > 500) {
@ -559,7 +565,7 @@ public class AlphabeticIndexTest extends TestFmwk {
}
logln(mainChars.size() + "\t" + locale + "\t" + locale.getDisplayName(ULocale.ENGLISH));
logln("Index:\t" + mainCharString);
if (mainChars.size() > 100) {
if (!isUnihan && mainChars.size() > 100) {
errln("Index character set too large: " +
locale + " [" + mainChars.size() + "]:\n " + mainChars);
}
@ -1013,4 +1019,27 @@ public class AlphabeticIndexTest extends TestFmwk {
assertEquals("same strength as input Collator",
Collator.IDENTICAL, index.getCollator().getStrength());
}
/**
 * Builds an index for "zh-u-co-unihan" with a raised label limit, checks that it has
 * at least 216 buckets, and then checks the bucket indexes of U+4E5D and U+7527.
 */
public void TestChineseUnihan() {
    AlphabeticIndex index = new AlphabeticIndex(new ULocale("zh-u-co-unihan"));
    index.setMaxLabelCount(500);  // ICU 54 default is 99.
    int bucketCount = index.buildImmutableIndex().getBucketCount();

    // Expected minimum: an underflow and an overflow label, plus one label for each
    // of 214 radicals, and maybe additional labels for simplified radicals.
    if (bucketCount < 216) {
        // (ICU4C: dataerrln(), prints only a warning if the data is missing)
        errln("too few buckets/labels for Chinese/unihan: " + bucketCount +
                " (is zh/unihan data available?)");
        return;
    }
    logln("Chinese/unihan has " + bucketCount + " buckets/labels");

    // Bucket index = radical number, adjusted for simplified radicals in lower buckets.
    assertEquals("getBucketIndex(U+4E5D)", 5, index.getBucketIndex("\u4e5d"));
    assertEquals("getBucketIndex(U+7527)", 100, index.getBucketIndex("\u7527"));
}
}

View file

@ -445,7 +445,16 @@ public class CollationServiceTest extends TestFmwk {
// }
// }
// }
/**
 * Tells whether s occurs in array, comparing with {@code s.equals(element)}.
 *
 * @param array the strings to search
 * @param s the string to look for
 * @return true if some element of array equals s
 */
private static boolean arrayContains(String[] array, String s) {
    for (String element : array) {
        if (s.equals(element)) {
            return true;
        }
    }
    return false;
}
public void TestGetKeywordValues(){
final String[][] PREFERRED = {
{"und", "standard", "eor", "search"},
@ -453,12 +462,12 @@ public class CollationServiceTest extends TestFmwk {
{"en_029", "standard", "eor", "search"},
{"de_DE", "standard", "phonebook", "search", "eor"},
{"de_Latn_DE", "standard", "phonebook", "search", "eor"},
{"zh", "pinyin", "big5han", "gb2312han", "stroke", "zhuyin", "eor", "search", "standard"},
{"zh_Hans", "pinyin", "big5han", "gb2312han", "stroke", "zhuyin", "eor", "search", "standard"},
{"zh_CN", "pinyin", "big5han", "gb2312han", "stroke", "zhuyin", "eor", "search", "standard"},
{"zh_Hant", "stroke", "big5han", "gb2312han", "pinyin", "zhuyin", "eor", "search", "standard"},
{"zh_TW", "stroke", "big5han", "gb2312han", "pinyin", "zhuyin", "eor", "search", "standard"},
{"zh__PINYIN", "pinyin", "big5han", "gb2312han", "stroke", "zhuyin", "eor", "search", "standard"},
{"zh", "pinyin", "stroke", "eor", "search", "standard"},
{"zh_Hans", "pinyin", "stroke", "eor", "search", "standard"},
{"zh_CN", "pinyin", "stroke", "eor", "search", "standard"},
{"zh_Hant", "stroke", "pinyin", "eor", "search", "standard"},
{"zh_TW", "stroke", "pinyin", "eor", "search", "standard"},
{"zh__PINYIN", "pinyin", "stroke", "eor", "search", "standard"},
{"es_ES", "standard", "search", "traditional", "eor"},
{"es__TRADITIONAL", "traditional", "search", "standard", "eor"},
{"und@collation=phonebook", "standard", "eor", "search"},
@ -467,29 +476,19 @@ public class CollationServiceTest extends TestFmwk {
};
for (int i = 0; i < PREFERRED.length; i++) {
ULocale loc = new ULocale(PREFERRED[i][0]);
String[] expected = new String[PREFERRED[i].length - 1];
System.arraycopy(PREFERRED[i], 1, expected, 0, expected.length);
String locale = PREFERRED[i][0];
ULocale loc = new ULocale(locale);
String[] expected = PREFERRED[i];
String[] pref = Collator.getKeywordValuesForLocale("collation", loc, true);
boolean matchPref = false;
if (pref.length == expected.length) {
matchPref = true;
for (int j = 0; j < pref.length; j++) {
if (!pref[j].equals(expected[j])) {
matchPref = false;
}
for (int j = 1; j < expected.length; ++j) {
if (!arrayContains(pref, expected[j])) {
errln("Keyword value " + expected[j] + " missing for locale: " + locale);
}
}
if (!matchPref) {
errln("FAIL: Preferred values for locale " + loc
+ " got:" + Arrays.toString(pref) + " expected:" + Arrays.toString(expected));
}
String[] all = Collator.getKeywordValuesForLocale("collation", loc, true);
// Collator.getKeywordValues returns the same contents for both commonlyUsed
// true and false.
String[] all = Collator.getKeywordValuesForLocale("collation", loc, false);
boolean matchAll = false;
if (pref.length == all.length) {
matchAll = true;

View file

@ -313,6 +313,16 @@ public class CollationTest extends TestFmwk {
UnicodeSet unassigned = new UnicodeSet("[[:Cn:][:Cs:][:Co:]]");
unassigned.remove(0xfffe, 0xffff); // These have special CLDR root mappings.
// Starting with CLDR 26/ICU 54, the root Han order may instead be
// the Unihan radical-stroke order.
// The tests should pass either way, so we only test the order of a small set of Han characters
// whose radical-stroke order is the same as their code point order.
UnicodeSet someHanInCPOrder = new UnicodeSet(
"[\\u4E00-\\u4E16\\u4E18-\\u4E2B\\u4E2D-\\u4E3C\\u4E3E-\\u4E48" +
"\\u4E4A-\\u4E60\\u4E63-\\u4E8F\\u4E91-\\u4F63\\u4F65-\\u50F1\\u50F3-\\u50F6]");
UnicodeSet inOrder = new UnicodeSet(someHanInCPOrder);
inOrder.addAll(unassigned).freeze();
UnicodeSet[] sets = { coreHan, otherHan, unassigned };
int prev = 0;
long prevPrimary = 0;
@ -337,7 +347,7 @@ public class CollationTest extends TestFmwk {
continue;
}
long primary = ce >>> 32;
if (!(primary > prevPrimary)) {
if (!(primary > prevPrimary) && inOrder.contains(c) && inOrder.contains(prev)) {
errln("CE(U+" + Utility.hex(c) + ")=0x" + Utility.hex(primary)
+ ".. not greater than CE(U+" + Utility.hex(prev)
+ ")=0x" + Utility.hex(prevPrimary) + "..");