ICU-4229 Enhanced the checking further.

X-SVN-Rev: 38128
This commit is contained in:
Mark Davis 2015-12-14 16:14:40 +00:00
parent bb50b15953
commit 565abe4808
2 changed files with 132 additions and 40 deletions

View file

@ -93,7 +93,7 @@ public class LocaleValidityChecker {
if (!isValidT(locale.getExtension(c), where)) return false;
break;
case u:
if (!isValidU(locale.getExtension(c), where)) return false;
if (!isValidU(locale, locale.getExtension(c), where)) return false;
break;
}
} catch (Exception e) {
@ -104,12 +104,14 @@ public class LocaleValidityChecker {
}
enum SpecialCase {
normal, anything, reorder, codepoints;
normal, anything, reorder, codepoints, subdivision;
static SpecialCase get(String key) {
if (key.equals("kr")) {
return SpecialCase.reorder;
} else if (key.equals("vt")) {
return SpecialCase.codepoints;
} else if (key.equals("sd")) {
return subdivision;
} else if (key.equals("x0")) {
return anything;
} else {
@ -118,15 +120,17 @@ public class LocaleValidityChecker {
}
}
/**
* @param locale
* @param extension
* @param where
* @return
*/
private boolean isValidU(String extensionString, Where where) {
private boolean isValidU(ULocale locale, String extensionString, Where where) {
String key = "";
int typeCount = 0;
ValueType valueType = null;
SpecialCase specialCase = null;
StringBuilder prefix = new StringBuilder();
// TODO: is empty -u- valid?
for (String subtag : SEPARATOR.split(extensionString)) {
if (subtag.length() == 2) {
@ -142,8 +146,20 @@ public class LocaleValidityChecker {
typeCount = 0;
} else {
++typeCount;
if (valueType == ValueType.single && typeCount > 1) {
return where.set(Datatype.u, key+"-"+subtag);
switch (valueType) {
case single:
if (typeCount > 1) {
return where.set(Datatype.u, key+"-"+subtag);
}
break;
case incremental:
if (typeCount == 1) {
prefix.setLength(0);
prefix.append(subtag);
} else {
prefix.append('-').append(subtag);
subtag = prefix.toString();
}
}
switch (specialCase) {
case anything:
@ -162,8 +178,13 @@ public class LocaleValidityChecker {
return where.set(Datatype.u, key+"-"+subtag);
}
continue;
case subdivision:
if (!isSubdivision(locale, subtag)) {
return where.set(Datatype.u, key+"-"+subtag);
}
continue;
}
// en-u-sd-usca
// en-US-u-sd-usca
Output<Boolean> isKnownKey = new Output<Boolean>();
@ -180,6 +201,33 @@ public class LocaleValidityChecker {
return true;
}
/**
 * Tests whether a subdivision value is both a known subdivision code and
 * consistent with the region of the locale it is attached to.
 *
 * <p>The value is split into a region prefix and a subdivision suffix: the
 * prefix is 3 characters long when the first character is {@code <= '9'}
 * (presumably a numeric region code) and 2 characters long otherwise, so
 * {@code "usca"} splits into {@code "us"} and {@code "ca"}.
 *
 * @param locale the locale carrying the value; its country is used for the
 *               consistency check, and when that is empty the country of
 *               {@code ULocale.addLikelySubtags(locale)} is used instead
 * @param subtag the subdivision value to check (region prefix followed by
 *               the subdivision suffix)
 * @return {@code true} only if the value has at least 3 characters, the
 *         (region, subdivision) pair is accepted by
 *         {@code ValidIdentifiers.isValid} for the checker's datasubtypes,
 *         and the region prefix equals the locale's region ignoring case
 */
private boolean isSubdivision(ULocale locale, String subtag) {
    // Too short to hold a region prefix plus anything else.
    if (subtag.length() < 3) {
        return false;
    }

    // Step 1: the value itself must be a recognized region + subdivision pair.
    final int prefixLength = subtag.charAt(0) <= '9' ? 3 : 2;
    final String regionPart = subtag.substring(0, prefixLength);
    final String subdivisionPart = subtag.substring(prefixLength);
    final boolean recognized =
            ValidIdentifiers.isValid(Datatype.subdivision, datasubtypes, regionPart, subdivisionPart) != null;
    if (!recognized) {
        return false;
    }

    // Step 2: the region prefix must agree with the locale's region; when the
    // locale has no explicit region, use the one from its likely subtags.
    String expectedRegion = locale.getCountry();
    if (expectedRegion.isEmpty()) {
        expectedRegion = ULocale.addLikelySubtags(locale).getCountry();
    }
    return regionPart.equalsIgnoreCase(expectedRegion);
}
static final Set<String> REORDERING_INCLUDE = new HashSet<String>(Arrays.asList("space", "punct", "symbol", "currency", "digit", "others"));
static final Set<String> REORDERING_EXCLUDE = new HashSet<String>(Arrays.asList("zinh", "zyyy"));
/**
@ -194,12 +242,12 @@ public class LocaleValidityChecker {
return false;
}
return ValidIdentifiers.isValid(Datatype.script, datasubtypes, subtag) != null;
// space, punct, symbol, currency, digit - core groups of characters below 'a'
// any script code except Common and Inherited.
// sc ; Zinh ; Inherited ; Qaai
// sc ; Zyyy ; Common
// Some pairs of scripts sort primary-equal and always reorder together. For example, Katakana characters are always reordered with Hiragana.
// others - where all codes not explicitly mentioned should be ordered. The script code Zzzz (Unknown Script) is a synonym for others. return false;
// space, punct, symbol, currency, digit - core groups of characters below 'a'
// any script code except Common and Inherited.
// sc ; Zinh ; Inherited ; Qaai
// sc ; Zyyy ; Common
// Some pairs of scripts sort primary-equal and always reorder together. For example, Katakana characters are always reordered with Hiragana.
// others - where all codes not explicitly mentioned should be ordered. The script code Zzzz (Unknown Script) is a synonym for others. return false;
}
/**
@ -235,14 +283,14 @@ public class LocaleValidityChecker {
}
public enum ValueType {
single, multiple, specific;
single, multiple, incremental;
private static Set<String> multipleValueTypes = new HashSet<String>(Arrays.asList("x0", "kr", "vt"));
private static Set<String> specificValueTypes = new HashSet<String>(Arrays.asList("ca"));
static ValueType get(String key) {
if (multipleValueTypes.contains(key)) {
return multiple;
} else if (specificValueTypes.contains(key)) {
return specific;
return incremental;
} else {
return single;
}

View file

@ -34,15 +34,11 @@ public class TestLocaleValidity extends TestFmwk {
public void testBasic() {
String[][] tests = {
{"OK", "en-u-kr-latn-digit"},
{"Incomplete extension 'u' [at index 3]", "en-u"},
{"Incomplete extension 't' [at index 3]", "en-t"},
{"OK", "en-u-ca-chinese"},
{"OK", "en-x-abcdefg"},
{"OK", "x-abcdefg"},
{"OK", "en-u-sd-usca"},
{"OK", "en-US-u-sd-usca"},
{"OK", "en-AQ-u-sd-usca"},
{"OK", "en-t-it"},
{"OK", "und-Cyrl-t-und-latn"},
{"OK", "root"},
@ -53,22 +49,9 @@ public class TestLocaleValidity extends TestFmwk {
{"OK", "zh-Hant"},
{"OK", "zh-Hant-AQ"},
{"OK", "x-abcdefg-g-foobar"},
{"Empty subtag [at index 0]", ""},
{"{u, ca-chinesx}", "en-u-ca-chinesx"},
{"{illegal, q}", "en-q-abcdefg"},
{"Incomplete privateuse [at index 0]", "x-abc$defg"},
{"{script, Latx}", "und-Cyrl-t-und-latx"},
{"{variant, FOOBAR}", "zh-Hant-1606nict-1694acad-foobar"},
{"{region, AB}", "zh-Hant-AB"},
{"{language, ex}", "ex"},
{"{script, Hanx}", "zh-Hanx"},
{"{language, qaa}", "qaa"},
{"Invalid subtag: $ [at index 3]", "EN-$"},
{"Invalid subtag: $ [at index 0]", "$"},
// too many items
{"{u, cu-usd}", "en-u-cu-adp-usd"},
{"OK", "en-u-ca-buddhist"},
{"OK", "en-u-ca-islamic-umalqura"}, // additive
{"OK", "en-u-cf-account"},
{"OK", "en-u-co-big5han"},
{"OK", "en-u-cu-adp"},
@ -80,17 +63,80 @@ public class TestLocaleValidity extends TestFmwk {
{"OK", "en-u-kf-false"},
{"OK", "en-u-kk-false"},
{"OK", "en-u-kn-false"},
{"OK", "en-u-kr-latn-digit-symbol"},
{"OK", "en-u-kr-latn-digit-symbol"}, // reorder codes, multiple
{"OK", "en-u-ks-identic"},
{"OK", "en-u-kv-currency"},
{"OK", "en-u-nu-ahom"},
{"OK", "en-u-sd-usny"},
{"OK", "en-u-tz-adalv"},
{"OK", "en-u-va-posix"},
{"{u, ca-civil}", "en-u-ca-islamicc"}, // deprecated
// really long case
{"OK", "en-u-ca-buddhist-ca-islamic-umalqura-cf-account-co-big5han-cu-adp-fw-fri-hc-h11-ka-noignore-kb-false-kc-false-kf-false-kk-false-kn-false-kr-latn-digit-symbol-ks-identic-kv-currency-nu-ahom-sd-usny-tz-adalv-va-posix"},
// deprecated, but turned into valid by ULocale.Builder()
{"OK", "en-u-ca-islamicc"}, // deprecated
{"OK", "en-u-tz-aqams"}, // deprecated
// Bad syntax (caught by ULocale.Builder())
{"Incomplete extension 'u' [at index 3]", "en-u"},
{"Incomplete extension 't' [at index 3]", "en-t"},
{"Empty subtag [at index 0]", ""},
{"Incomplete privateuse [at index 0]", "x-abc$defg"},
{"Invalid subtag: $ [at index 3]", "EN-$"},
{"Invalid subtag: $ [at index 0]", "$"},
// bad extension
{"{illegal, q}", "en-q-abcdefg"},
// bad subtags
{"{variant, FOOBAR}", "zh-Hant-1606nict-1694acad-foobar"},
{"{region, AB}", "zh-Hant-AB"},
{"{language, ex}", "ex"},
{"{script, Hanx}", "zh-Hanx"},
{"{language, qaa}", "qaa"},
// bad types for keys
{"{u, ca-chinesx}", "en-u-ca-chinesx"},
{"{script, Latx}", "und-Cyrl-t-und-latx"},
{"{u, sd-usca}", "en-AQ-u-sd-usca"},
{"{u, ca-buddhisx}", "en-u-ca-buddhisx"},
{"{u, ca-islamic-umalqurx}", "en-u-ca-islamic-umalqurx"}, // additive
{"{u, cf-accounx}", "en-u-cf-accounx"},
{"{u, co-big5hax}", "en-u-co-big5hax"},
{"{u, cu-adx}", "en-u-cu-adx"},
{"{u, fw-frx}", "en-u-fw-frx"},
{"{u, hc-h1x}", "en-u-hc-h1x"},
{"{u, ka-noignorx}", "en-u-ka-noignorx"},
{"{u, kb-falsx}", "en-u-kb-falsx"},
{"{u, kc-falsx}", "en-u-kc-falsx"},
{"{u, kf-falsx}", "en-u-kf-falsx"},
{"{u, kk-falsx}", "en-u-kk-falsx"},
{"{u, kn-falsx}", "en-u-kn-falsx"},
{"{u, kr-symbox}", "en-u-kr-latn-digit-symbox"}, // reorder codes, multiple
{"{u, ks-identix}", "en-u-ks-identix"},
{"{u, kv-currencx}", "en-u-kv-currencx"},
{"{u, nu-ahox}", "en-u-nu-ahox"},
{"{u, sd-usnx}", "en-u-sd-usnx"},
{"{u, tz-adalx}", "en-u-tz-adalx"},
{"{u, va-posit}", "en-u-va-posit"},
// too many items
{"{u, cu-usd}", "en-u-cu-adp-usd"},
// use deprecated subtags. testDeprecated checks if they work when Datasubtype.deprecated is added
//{"{u, ca-civil}", "en-u-ca-islamicc"}, // deprecated, but turns into valid
{"{u, co-direct}", "en-u-co-direct"}, // deprecated
{"{u, kh}", "en-u-kh-false"}, // deprecated
{"{u, tz-aqams}", "en-u-tz-aqams"}, // deprecated
{"{u, tz-camtr}", "en-u-tz-camtr"}, // deprecated
{"{u, vt}", "en-u-vt-0020-0041"}, // deprecated
};
check(tests, Datasubtype.regular, Datasubtype.unknown);
@ -120,12 +166,10 @@ public class TestLocaleValidity extends TestFmwk {
}
public void testDeprecated() {
LocaleValidityChecker regularAndDeprecated = new LocaleValidityChecker(EnumSet.of(Datasubtype.regular, Datasubtype.deprecated));
String[][] tests = {
{"OK", "en-u-ca-islamicc"}, // deprecated
{"OK", "en-u-co-direct"}, // deprecated
{"OK", "en-u-kh-false"}, // deprecated
{"OK", "en-u-tz-aqams"}, // deprecated
{"OK", "en-u-tz-camtr"}, // deprecated
{"OK", "en-u-vt-0020"}, // deprecated
};
check(tests, Datasubtype.regular, Datasubtype.unknown, Datasubtype.deprecated);
@ -133,9 +177,9 @@ public class TestLocaleValidity extends TestFmwk {
private void check(String[][] tests, Datasubtype... datasubtypes) {
int count = 0;
LocaleValidityChecker regularAndUnknown = new LocaleValidityChecker(datasubtypes);
LocaleValidityChecker localeValidityChecker = new LocaleValidityChecker(datasubtypes);
for (String[] test : tests) {
check(++count, regularAndUnknown, test[0], test[1]);
check(++count, localeValidityChecker, test[0], test[1]);
}
}