mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-07 22:44:49 +00:00
ICU-11042 Han radical-stroke order data in the root collator instead of CJK tailorings, adjusted code & tests
X-SVN-Rev: 36163
This commit is contained in:
parent
ec55298e96
commit
438c1ac7b8
9 changed files with 315 additions and 69111 deletions
|
@ -348,12 +348,9 @@ public final class CollationBuilder extends CollationRuleParser.Sink {
|
|||
ce = rootElements.firstCEWithPrimaryAtLeast(
|
||||
baseData.getFirstPrimaryForGroup(UScript.HAN));
|
||||
break;
|
||||
case FIRST_IMPLICIT: {
|
||||
int ce32 = baseData.getCE32(0x4e00);
|
||||
assert(Collation.hasCE32Tag(ce32, Collation.OFFSET_TAG));
|
||||
ce = baseData.getCEFromOffsetCE32(0x4e00, ce32);
|
||||
case FIRST_IMPLICIT:
|
||||
ce = baseData.getSingleCE(0x4e00);
|
||||
break;
|
||||
}
|
||||
case LAST_IMPLICIT:
|
||||
// We do not support tailoring to an unassigned-implicit CE.
|
||||
throw new UnsupportedOperationException(
|
||||
|
|
|
@ -100,6 +100,74 @@ public final class CollationData {
|
|||
return Collation.makeCE(Collation.getThreeBytePrimaryForOffsetData(c, dataCE));
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the single CE that c maps to.
|
||||
* Throws UnsupportedOperationException if c does not map to a single CE.
|
||||
*/
|
||||
long getSingleCE(int c) {
|
||||
CollationData d;
|
||||
int ce32 = getCE32(c);
|
||||
if(ce32 == Collation.FALLBACK_CE32) {
|
||||
d = base;
|
||||
ce32 = base.getCE32(c);
|
||||
} else {
|
||||
d = this;
|
||||
}
|
||||
while(Collation.isSpecialCE32(ce32)) {
|
||||
switch(Collation.tagFromCE32(ce32)) {
|
||||
case Collation.LATIN_EXPANSION_TAG:
|
||||
case Collation.BUILDER_DATA_TAG:
|
||||
case Collation.PREFIX_TAG:
|
||||
case Collation.CONTRACTION_TAG:
|
||||
case Collation.HANGUL_TAG:
|
||||
case Collation.LEAD_SURROGATE_TAG:
|
||||
throw new UnsupportedOperationException(String.format(
|
||||
"there is not exactly one collation element for U+%04X (CE32 0x%08x)",
|
||||
c, ce32));
|
||||
case Collation.FALLBACK_TAG:
|
||||
case Collation.RESERVED_TAG_3:
|
||||
throw new AssertionError(String.format(
|
||||
"unexpected CE32 tag for U+%04X (CE32 0x%08x)", c, ce32));
|
||||
case Collation.LONG_PRIMARY_TAG:
|
||||
return Collation.ceFromLongPrimaryCE32(ce32);
|
||||
case Collation.LONG_SECONDARY_TAG:
|
||||
return Collation.ceFromLongSecondaryCE32(ce32);
|
||||
case Collation.EXPANSION32_TAG:
|
||||
if(Collation.lengthFromCE32(ce32) == 1) {
|
||||
ce32 = d.ce32s[Collation.indexFromCE32(ce32)];
|
||||
break;
|
||||
} else {
|
||||
throw new UnsupportedOperationException(String.format(
|
||||
"there is not exactly one collation element for U+%04X (CE32 0x%08x)",
|
||||
c, ce32));
|
||||
}
|
||||
case Collation.EXPANSION_TAG: {
|
||||
if(Collation.lengthFromCE32(ce32) == 1) {
|
||||
return d.ces[Collation.indexFromCE32(ce32)];
|
||||
} else {
|
||||
throw new UnsupportedOperationException(String.format(
|
||||
"there is not exactly one collation element for U+%04X (CE32 0x%08x)",
|
||||
c, ce32));
|
||||
}
|
||||
}
|
||||
case Collation.DIGIT_TAG:
|
||||
// Fetch the non-numeric-collation CE32 and continue.
|
||||
ce32 = d.ce32s[Collation.indexFromCE32(ce32)];
|
||||
break;
|
||||
case Collation.U0000_TAG:
|
||||
assert(c == 0);
|
||||
// Fetch the normal ce32 for U+0000 and continue.
|
||||
ce32 = d.ce32s[0];
|
||||
break;
|
||||
case Collation.OFFSET_TAG:
|
||||
return d.getCEFromOffsetCE32(c, ce32);
|
||||
case Collation.IMPLICIT_TAG:
|
||||
return Collation.unassignedCEFromCodePoint(c);
|
||||
}
|
||||
}
|
||||
return Collation.ceFromSimpleCE32(ce32);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the FCD16 value for code point c. c must be >= 0.
|
||||
*/
|
||||
|
|
|
@ -481,7 +481,7 @@ public final class AlphabeticIndex<V> implements Iterable<Bucket<V>> {
|
|||
}
|
||||
}
|
||||
|
||||
// if the result is still too large, cut down to maxCount elements, by removing every nth element
|
||||
// if the result is still too large, cut down to maxLabelCount elements, by removing every nth element
|
||||
|
||||
final int size = indexCharacters.size() - 1;
|
||||
if (size > maxLabelCount) {
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:11004912c7ce72161feb732fdd36dad4d12914d8bfb378eec90eeab91951aef0
|
||||
size 10507681
|
||||
oid sha256:ccc875602d498466d04f994030ba87a6c2101f617064891c6b064c31fc84366f
|
||||
size 10647078
|
||||
|
|
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
|
@ -551,7 +551,13 @@ public class AlphabeticIndexTest extends TestFmwk {
|
|||
if (locale.getCountry().length() != 0) {
|
||||
continue;
|
||||
}
|
||||
boolean isUnihan = collationValue.contains("unihan");
|
||||
AlphabeticIndex alphabeticIndex = new AlphabeticIndex(locale);
|
||||
if (isUnihan) {
|
||||
// Unihan tailorings have a label per radical, and there are at least 214,
|
||||
// if not more when simplified radicals are distinguished.
|
||||
alphabeticIndex.setMaxLabelCount(500);
|
||||
}
|
||||
final Collection mainChars = alphabeticIndex.getBucketLabels();
|
||||
String mainCharString = mainChars.toString();
|
||||
if (mainCharString.length() > 500) {
|
||||
|
@ -559,7 +565,7 @@ public class AlphabeticIndexTest extends TestFmwk {
|
|||
}
|
||||
logln(mainChars.size() + "\t" + locale + "\t" + locale.getDisplayName(ULocale.ENGLISH));
|
||||
logln("Index:\t" + mainCharString);
|
||||
if (mainChars.size() > 100) {
|
||||
if (!isUnihan && mainChars.size() > 100) {
|
||||
errln("Index character set too large: " +
|
||||
locale + " [" + mainChars.size() + "]:\n " + mainChars);
|
||||
}
|
||||
|
@ -1013,4 +1019,27 @@ public class AlphabeticIndexTest extends TestFmwk {
|
|||
assertEquals("same strength as input Collator",
|
||||
Collator.IDENTICAL, index.getCollator().getStrength());
|
||||
}
|
||||
|
||||
public void TestChineseUnihan() {
|
||||
AlphabeticIndex index = new AlphabeticIndex(new ULocale("zh-u-co-unihan"));
|
||||
index.setMaxLabelCount(500); // ICU 54 default is 99.
|
||||
AlphabeticIndex.ImmutableIndex immIndex = index.buildImmutableIndex();
|
||||
int bucketCount = immIndex.getBucketCount();
|
||||
if(bucketCount < 216) {
|
||||
// There should be at least an underflow and overflow label,
|
||||
// and one for each of 214 radicals,
|
||||
// and maybe additional labels for simplified radicals.
|
||||
// (ICU4C: dataerrln(), prints only a warning if the data is missing)
|
||||
errln("too few buckets/labels for Chinese/unihan: " + bucketCount +
|
||||
" (is zh/unihan data available?)");
|
||||
return;
|
||||
} else {
|
||||
logln("Chinese/unihan has " + bucketCount + " buckets/labels");
|
||||
}
|
||||
// bucketIndex = radical number, adjusted for simplified radicals in lower buckets.
|
||||
int bucketIndex = index.getBucketIndex("\u4e5d");
|
||||
assertEquals("getBucketIndex(U+4E5D)", 5, bucketIndex);
|
||||
bucketIndex = index.getBucketIndex("\u7527");
|
||||
assertEquals("getBucketIndex(U+7527)", 100, bucketIndex);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -445,7 +445,16 @@ public class CollationServiceTest extends TestFmwk {
|
|||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
|
||||
private static boolean arrayContains(String[] array, String s) {
|
||||
for (int i = 0; i < array.length; ++i) {
|
||||
if (s.equals(array[i])) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public void TestGetKeywordValues(){
|
||||
final String[][] PREFERRED = {
|
||||
{"und", "standard", "eor", "search"},
|
||||
|
@ -453,12 +462,12 @@ public class CollationServiceTest extends TestFmwk {
|
|||
{"en_029", "standard", "eor", "search"},
|
||||
{"de_DE", "standard", "phonebook", "search", "eor"},
|
||||
{"de_Latn_DE", "standard", "phonebook", "search", "eor"},
|
||||
{"zh", "pinyin", "big5han", "gb2312han", "stroke", "zhuyin", "eor", "search", "standard"},
|
||||
{"zh_Hans", "pinyin", "big5han", "gb2312han", "stroke", "zhuyin", "eor", "search", "standard"},
|
||||
{"zh_CN", "pinyin", "big5han", "gb2312han", "stroke", "zhuyin", "eor", "search", "standard"},
|
||||
{"zh_Hant", "stroke", "big5han", "gb2312han", "pinyin", "zhuyin", "eor", "search", "standard"},
|
||||
{"zh_TW", "stroke", "big5han", "gb2312han", "pinyin", "zhuyin", "eor", "search", "standard"},
|
||||
{"zh__PINYIN", "pinyin", "big5han", "gb2312han", "stroke", "zhuyin", "eor", "search", "standard"},
|
||||
{"zh", "pinyin", "stroke", "eor", "search", "standard"},
|
||||
{"zh_Hans", "pinyin", "stroke", "eor", "search", "standard"},
|
||||
{"zh_CN", "pinyin", "stroke", "eor", "search", "standard"},
|
||||
{"zh_Hant", "stroke", "pinyin", "eor", "search", "standard"},
|
||||
{"zh_TW", "stroke", "pinyin", "eor", "search", "standard"},
|
||||
{"zh__PINYIN", "pinyin", "stroke", "eor", "search", "standard"},
|
||||
{"es_ES", "standard", "search", "traditional", "eor"},
|
||||
{"es__TRADITIONAL", "traditional", "search", "standard", "eor"},
|
||||
{"und@collation=phonebook", "standard", "eor", "search"},
|
||||
|
@ -467,29 +476,19 @@ public class CollationServiceTest extends TestFmwk {
|
|||
};
|
||||
|
||||
for (int i = 0; i < PREFERRED.length; i++) {
|
||||
ULocale loc = new ULocale(PREFERRED[i][0]);
|
||||
String[] expected = new String[PREFERRED[i].length - 1];
|
||||
System.arraycopy(PREFERRED[i], 1, expected, 0, expected.length);
|
||||
|
||||
String locale = PREFERRED[i][0];
|
||||
ULocale loc = new ULocale(locale);
|
||||
String[] expected = PREFERRED[i];
|
||||
String[] pref = Collator.getKeywordValuesForLocale("collation", loc, true);
|
||||
boolean matchPref = false;
|
||||
if (pref.length == expected.length) {
|
||||
matchPref = true;
|
||||
for (int j = 0; j < pref.length; j++) {
|
||||
if (!pref[j].equals(expected[j])) {
|
||||
matchPref = false;
|
||||
}
|
||||
for (int j = 1; j < expected.length; ++j) {
|
||||
if (!arrayContains(pref, expected[j])) {
|
||||
errln("Keyword value " + expected[j] + " missing for locale: " + locale);
|
||||
}
|
||||
}
|
||||
if (!matchPref) {
|
||||
errln("FAIL: Preferred values for locale " + loc
|
||||
+ " got:" + Arrays.toString(pref) + " expected:" + Arrays.toString(expected));
|
||||
}
|
||||
|
||||
String[] all = Collator.getKeywordValuesForLocale("collation", loc, true);
|
||||
|
||||
// Collator.getKeywordValues returns the same contents for both commonlyUsed
|
||||
// true and false.
|
||||
String[] all = Collator.getKeywordValuesForLocale("collation", loc, false);
|
||||
boolean matchAll = false;
|
||||
if (pref.length == all.length) {
|
||||
matchAll = true;
|
||||
|
|
|
@ -313,6 +313,16 @@ public class CollationTest extends TestFmwk {
|
|||
UnicodeSet unassigned = new UnicodeSet("[[:Cn:][:Cs:][:Co:]]");
|
||||
unassigned.remove(0xfffe, 0xffff); // These have special CLDR root mappings.
|
||||
|
||||
// Starting with CLDR 26/ICU 54, the root Han order may instead be
|
||||
// the Unihan radical-stroke order.
|
||||
// The tests should pass either way, so we only test the order of a small set of Han characters
|
||||
// whose radical-stroke order is the same as their code point order.
|
||||
UnicodeSet someHanInCPOrder = new UnicodeSet(
|
||||
"[\\u4E00-\\u4E16\\u4E18-\\u4E2B\\u4E2D-\\u4E3C\\u4E3E-\\u4E48" +
|
||||
"\\u4E4A-\\u4E60\\u4E63-\\u4E8F\\u4E91-\\u4F63\\u4F65-\\u50F1\\u50F3-\\u50F6]");
|
||||
UnicodeSet inOrder = new UnicodeSet(someHanInCPOrder);
|
||||
inOrder.addAll(unassigned).freeze();
|
||||
|
||||
UnicodeSet[] sets = { coreHan, otherHan, unassigned };
|
||||
int prev = 0;
|
||||
long prevPrimary = 0;
|
||||
|
@ -337,7 +347,7 @@ public class CollationTest extends TestFmwk {
|
|||
continue;
|
||||
}
|
||||
long primary = ce >>> 32;
|
||||
if (!(primary > prevPrimary)) {
|
||||
if (!(primary > prevPrimary) && inOrder.contains(c) && inOrder.contains(prev)) {
|
||||
errln("CE(U+" + Utility.hex(c) + ")=0x" + Utility.hex(primary)
|
||||
+ ".. not greater than CE(U+" + Utility.hex(prev)
|
||||
+ ")=0x" + Utility.hex(prevPrimary) + "..");
|
||||
|
|
Loading…
Add table
Reference in a new issue