ICU-4229 Enhanced the checking further.

X-SVN-Rev: 38125
This commit is contained in:
Mark Davis 2015-12-13 21:37:37 +00:00
parent fcd9bcbe71
commit bdfe1a68cf
4 changed files with 539 additions and 76 deletions

View file

@ -8,13 +8,13 @@ package com.ibm.icu.impl;
import java.util.Collections;
import java.util.EnumMap;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import com.ibm.icu.impl.locale.AsciiUtil;
import com.ibm.icu.util.UResourceBundle;
import com.ibm.icu.util.UResourceBundleIterator;
@ -32,6 +32,10 @@ public class ValidIdentifiers {
subdivision,
unit,
variant,
u,
t,
x,
illegal
}
public enum Datasubtype {
@ -43,7 +47,7 @@ public class ValidIdentifiers {
macroregion,
}
static class ValiditySet {
public static class ValiditySet {
public final Set<String> regularData;
public final Map<String,Set<String>> subdivisionData;
public ValiditySet(Set<String> plainData, boolean makeMap) {
@ -105,7 +109,7 @@ public class ValidIdentifiers {
}
}
static class ValidityData {
private static class ValidityData {
static final Map<Datatype,Map<Datasubtype,ValiditySet>> data;
static {
Map<Datatype, Map<Datasubtype, ValiditySet>> _data = new EnumMap<Datatype,Map<Datasubtype,ValiditySet>>(Datatype.class);
@ -141,6 +145,7 @@ public class ValidIdentifiers {
data = Collections.unmodifiableMap(_data);
}
private static void addRange(String string, Set<String> subvalues) {
string = AsciiUtil.toLowerString(string);
int pos = string.indexOf('~');
if (pos < 0) {
subvalues.add(string);
@ -148,84 +153,44 @@ public class ValidIdentifiers {
StringRange.expand(string.substring(0,pos), string.substring(pos+1), false, subvalues);
}
}
/**
 * Exposes the validity table that the static initializer stored in {@code data}.
 *
 * @return the shared map from datatype to its per-subtype validity sets
 */
static Map<Datatype, Map<Datasubtype, ValiditySet>> getData() {
    final Map<Datatype, Map<Datasubtype, ValiditySet>> table = data;
    return table;
}
/**
 * Finds the first requested subtype whose validity set contains the code.
 *
 * @param datatype the kind of identifier; selects which sub-table is searched
 * @param datasubtypes the subtypes to probe, visited in the set's iteration order
 * @param code the identifier to look up, passed to the validity set unchanged
 * @return the first matching Datasubtype, or null if the datatype has no table
 *         or none of the requested subtypes contains the code
 */
static Datasubtype isValid(Datatype datatype, Set<Datasubtype> datasubtypes, String code) {
    final Map<Datasubtype, ValiditySet> perSubtype = data.get(datatype);
    if (perSubtype == null) {
        return null;
    }
    for (final Datasubtype candidate : datasubtypes) {
        final ValiditySet members = perSubtype.get(candidate);
        if (members != null && members.contains(code)) {
            return candidate;
        }
    }
    return null;
}
/**
 * Two-part variant of the lookup: finds the first requested subtype whose
 * validity set contains the (code, value) pair.
 *
 * @param datatype the kind of identifier; selects which sub-table is searched
 * @param datasubtypes the subtypes to probe, visited in the set's iteration order
 * @param code the first part of the identifier
 * @param value the second part of the identifier
 * @return the first matching Datasubtype, or null if there is none
 */
static Datasubtype isValid(Datatype datatype, Set<Datasubtype> datasubtypes, String code, String value) {
    final Map<Datasubtype, ValiditySet> perSubtype = data.get(datatype);
    if (perSubtype == null) {
        return null;
    }
    for (final Datasubtype candidate : datasubtypes) {
        final ValiditySet members = perSubtype.get(candidate);
        if (members != null && members.contains(code, value)) {
            return candidate;
        }
    }
    return null;
}
}
// Quick testing for now
/**
 * Ad-hoc smoke run: prints the lookup result for a few script and
 * subdivision codes, then dumps the whole validity table.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    // The same single-element subtype set is used by every call but the first.
    final Set<Datasubtype> regularOnly = EnumSet.of(Datasubtype.regular);

    showValid(Datatype.script, EnumSet.of(Datasubtype.regular, Datasubtype.unknown), "Zzzz");
    showValid(Datatype.script, regularOnly, "Zzzz");

    showValid(Datatype.subdivision, regularOnly, "US-CA");
    showValid(Datatype.subdivision, regularOnly, "US", "CA");
    showValid(Datatype.subdivision, regularOnly, "US-?");
    showValid(Datatype.subdivision, regularOnly, "US", "?");

    showAll();
}
private static void showAll() {
Map<Datatype, Map<Datasubtype, ValiditySet>> data = ValidityData.getData();
for (Entry<Datatype, Map<Datasubtype, ValiditySet>> e1 : data.entrySet()) {
System.out.println(e1.getKey());
for (Entry<Datasubtype, ValiditySet> e2 : e1.getValue().entrySet()) {
System.out.println("\t" + e2.getKey());
System.out.println("\t\t" + e2.getValue());
}
}
public static Map<Datatype, Map<Datasubtype, ValiditySet>> getData() {
return ValidityData.data;
}
/**
* @param script
* @param of
* @param string
* Returns the Datasubtype containing the code, or null if there is none.
*/
private static void showValid(Datatype datatype, Set<Datasubtype> datasubtypes, String code) {
Datasubtype value = ValidityData.isValid(datatype, datasubtypes, code);
System.out.println(datatype + ", " + datasubtypes + ", " + code + " => " + value);
/**
 * Returns the Datasubtype containing the code, or null if there is none.
 * The code is ASCII-lowercased before it is compared with the validity sets.
 *
 * @param datatype the kind of identifier; selects which sub-table is searched
 * @param datasubtypes the subtypes to probe, visited in the set's iteration order
 * @param code the identifier to look up, in any case
 * @return the first matching Datasubtype, or null if the datatype has no table
 *         or none of the requested subtypes contains the code
 */
public static Datasubtype isValid(Datatype datatype, Set<Datasubtype> datasubtypes, String code) {
    Map<Datasubtype, ValiditySet> subtable = ValidityData.data.get(datatype);
    if (subtable != null) {
        // Normalize once, before the loop, exactly as the (code, value) overload does;
        // the lowered form does not depend on the subtype being probed.
        code = AsciiUtil.toLowerString(code);
        for (Datasubtype datasubtype : datasubtypes) {
            ValiditySet validitySet = subtable.get(datasubtype);
            if (validitySet != null && validitySet.contains(code)) {
                return datasubtype;
            }
        }
    }
    return null;
}
private static void showValid(Datatype datatype, Set<Datasubtype> datasubtypes, String code, String value2) {
Datasubtype value = ValidityData.isValid(datatype, datasubtypes, code, value2);
System.out.println(datatype + ", " + datasubtypes + ", " + code + ", " + value + " => " + value);
/**
 * Returns the Datasubtype containing the (code, value) pair, or null if there
 * is none. Both parts are ASCII-lowercased before the lookup.
 *
 * @param datatype the kind of identifier; selects which sub-table is searched
 * @param datasubtypes the subtypes to probe, visited in the set's iteration order
 * @param code the first part of the identifier, in any case
 * @param value the second part of the identifier, in any case
 * @return the first matching Datasubtype, or null if there is none
 */
public static Datasubtype isValid(Datatype datatype, Set<Datasubtype> datasubtypes, String code, String value) {
    final Map<Datasubtype, ValiditySet> perSubtype = ValidityData.data.get(datatype);
    if (perSubtype == null) {
        return null;
    }
    final String loweredCode = AsciiUtil.toLowerString(code);
    final String loweredValue = AsciiUtil.toLowerString(value);
    for (final Datasubtype candidate : datasubtypes) {
        final ValiditySet members = perSubtype.get(candidate);
        if (members != null && members.contains(loweredCode, loweredValue)) {
            return candidate;
        }
    }
    return null;
}
}

View file

@ -1,11 +1,13 @@
/*
*******************************************************************************
* Copyright (C) 2014, International Business Machines Corporation and
* Copyright (C) 2014-2015, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.impl.locale;
import java.util.Arrays;
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
@ -88,6 +90,15 @@ public class KeyTypeData {
return null;
}
// public static boolean isValid(String key, String type) {
// key = AsciiUtil.toLowerString(key);
// KeyData keyData = KEYMAP.get(key);
// if (keyData != null) {
// return keyData.bcpId;
// }
// return false;
// }
public static String toLegacyKey(String key) {
key = AsciiUtil.toLowerString(key);
KeyData keyData = KEYMAP.get(key);
@ -539,4 +550,22 @@ public class KeyTypeData {
initFromResourceBundle();
}
/**
 * Tells whether the key is listed in the hard-coded set of deprecated keys.
 * The key is compared exactly as given (no case folding is done here).
 *
 * @param key the extension key to test
 * @return true if {@code DEPRECATED_HACK_SET} contains the key
 */
public static boolean isDeprecated(String key) {
    final boolean listed = DEPRECATED_HACK_SET.contains(key);
    return listed;
}
/**
 * Tells whether the type is listed as deprecated for the given key in the
 * hard-coded {@code DEPRECATED_HACK} table. Both strings are compared exactly
 * as given (no case folding is done here).
 *
 * @param key the extension key
 * @param type the type value under that key
 * @return true if the key has an entry and that entry contains the type
 */
public static boolean isDeprecated(String key, String type) {
    final Set<String> deprecatedTypes = DEPRECATED_HACK.get(key);
    if (deprecatedTypes == null) {
        return false;
    }
    return deprecatedTypes.contains(type);
}
// Until LDML2ICU is updated: deprecation data is hard-coded here.
// DEPRECATED_HACK maps a key to the individual types that are deprecated under it;
// DEPRECATED_HACK_SET lists keys that are deprecated as a whole.
// Both references are final so the tables cannot be swapped out after class initialization.
static final Map<String,Set<String>> DEPRECATED_HACK = new HashMap<String,Set<String>>();
static final Set<String> DEPRECATED_HACK_SET = new HashSet<String>();
static {
    DEPRECATED_HACK.put("ca", Collections.singleton("islamicc"));
    DEPRECATED_HACK.put("co", Collections.singleton("direct"));
    DEPRECATED_HACK.put("tz", new HashSet<String>(Arrays.asList("aqams", "camtr", "cnckg", "cnhrb", "cnkhg", "usnavajo")));
    DEPRECATED_HACK_SET.addAll(Arrays.asList("kh", "vt"));
}
}

View file

@ -0,0 +1,261 @@
/*
*******************************************************************************
* Copyright (C) 2015, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.impl.locale;
import java.util.Arrays;
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import com.ibm.icu.impl.ValidIdentifiers;
import com.ibm.icu.impl.ValidIdentifiers.Datasubtype;
import com.ibm.icu.impl.ValidIdentifiers.Datatype;
import com.ibm.icu.util.IllformedLocaleException;
import com.ibm.icu.util.Output;
import com.ibm.icu.util.ULocale;
/**
* @author markdavis
*
*/
public class LocaleValidityChecker {
private final Set<Datasubtype> datasubtypes;
private final boolean allowsDeprecated;
/**
 * Out-parameter that records which field and which code made validation fail.
 * Both fields are null while no failure has been recorded.
 */
public static class Where {
    /** The datatype of the failing field, or null when nothing failed. */
    public Datatype fieldFailure;
    /** The offending code within that field, or null when nothing failed. */
    public String codeFailure;

    /**
     * Stores the given field and code.
     *
     * @param datatype the failing field (null clears the record)
     * @param code the failing code
     * @return always false, so a caller can write {@code return where.set(...)}
     */
    public boolean set(Datatype datatype, String code) {
        this.fieldFailure = datatype;
        this.codeFailure = code;
        return false;
    }

    /**
     * @return "OK" when no field is recorded, otherwise "{field, code}"
     */
    @Override
    public String toString() {
        if (fieldFailure == null) {
            return "OK";
        }
        return "{" + fieldFailure + ", " + codeFailure + "}";
    }
}
public LocaleValidityChecker(Set<Datasubtype> datasubtypes) {
this.datasubtypes = EnumSet.copyOf(datasubtypes);
allowsDeprecated = datasubtypes.contains(Datasubtype.deprecated);
}
public LocaleValidityChecker(Datasubtype... datasubtypes) {
this.datasubtypes = EnumSet.copyOf(Arrays.asList(datasubtypes));
allowsDeprecated = this.datasubtypes.contains(Datasubtype.deprecated);
}
/**
 * Returns a fresh copy of the subtypes this checker accepts; modifying the
 * returned set does not affect the checker.
 *
 * @return the datasubtypes
 */
public Set<Datasubtype> getDatasubtypes() {
    final Set<Datasubtype> snapshot = EnumSet.copyOf(datasubtypes);
    return snapshot;
}
/** Matches one subtag separator, either '-' or '_'. Compiled once; final so it cannot be reassigned. */
static final Pattern SEPARATOR = Pattern.compile("[-_]");
/**
 * Checks language, script, region, each variant and each extension of the
 * locale against the subtypes this checker accepts.
 *
 * @param locale the locale to check
 * @param where receives the failing field and code; it is cleared first, so
 *        after a successful check it prints as "OK". Must not be null.
 * @return true if every checked field is acceptable, false otherwise
 */
public boolean isValid(ULocale locale, Where where) {
    where.set(null, null);

    final String language = locale.getLanguage();
    if (!isValid(Datatype.language, language, where)) {
        // special case x
        if (language.equals("x")) {
            where.set(null, null);
            // TODO check syntax is ok, only alphanum{1,8}
            return true;
        }
        return false;
    }
    if (!isValid(Datatype.script, locale.getScript(), where)) {
        return false;
    }
    if (!isValid(Datatype.region, locale.getCountry(), where)) {
        return false;
    }

    final String variants = locale.getVariant();
    if (!variants.isEmpty()) {
        for (final String variant : SEPARATOR.split(variants)) {
            if (!isValid(Datatype.variant, variant, where)) {
                return false;
            }
        }
    }

    for (final Character extensionKey : locale.getExtensionKeys()) {
        final String keyName = String.valueOf(extensionKey);
        try {
            // valueOf throws for any letter that is not a Datatype constant.
            final Datatype datatype = Datatype.valueOf(keyName);
            if (datatype == Datatype.x) {
                // TODO : check that the rest is syntactic
                return true;
            } else if (datatype == Datatype.t) {
                if (!isValidT(locale.getExtension(extensionKey), where)) {
                    return false;
                }
            } else if (datatype == Datatype.u) {
                if (!isValidU(locale.getExtension(extensionKey), where)) {
                    return false;
                }
            }
        } catch (Exception e) {
            // Covers unknown extension letters and any exception raised while
            // checking the extension body; both are reported as an illegal extension.
            return where.set(Datatype.illegal, keyName);
        }
    }
    return true;
}
/**
 * How the type values of a -u- key are checked: {@code reorder} for "kr",
 * {@code codepoints} for "vt", {@code anything} for "x0", and {@code normal}
 * for every other key.
 */
enum SpecialCase {
    normal, anything, reorder, codepoints;

    /**
     * Maps a key to its special-case handling.
     *
     * @param key the key, compared case-sensitively; must not be null
     * @return the handling for that key, {@code normal} if it has none
     */
    static SpecialCase get(String key) {
        return key.equals("kr") ? reorder
                : key.equals("vt") ? codepoints
                : key.equals("x0") ? anything
                : normal;
    }
}
/**
 * Validates the body of a -u- extension, given as its subtags joined by
 * separators (without the leading "u").
 * <p>
 * A two-character subtag is treated as a key; any other subtag is treated as a
 * type value of the most recent key. Keys are resolved with
 * {@code KeyTypeData.toBcpKey}. Values are checked according to the key:
 * "x0" accepts anything, "vt" requires hexadecimal values no greater than
 * 0x10FFFF, "kr" requires reorder codes, and all other keys are looked up with
 * {@code KeyTypeData.toBcpType}. Deprecated keys and types fail unless this
 * checker allows deprecated data.
 *
 * @param extensionString the extension body to check
 * @param where receives {@code Datatype.u} and the failing key or "key-type" on failure
 * @return true if every key and value is acceptable, false otherwise
 */
private boolean isValidU(String extensionString, Where where) {
    String key = "";
    int typeCount = 0;
    ValueType valueType = null;
    SpecialCase specialCase = null;
    // TODO: is empty -u- valid?
    for (String subtag : SEPARATOR.split(extensionString)) {
        if (subtag.length() == 2) {
            key = KeyTypeData.toBcpKey(subtag);
            if (key == null) {
                return where.set(Datatype.u, subtag);
            }
            if (!allowsDeprecated && KeyTypeData.isDeprecated(key)) {
                return where.set(Datatype.u, key);
            }
            valueType = ValueType.get(key);
            specialCase = SpecialCase.get(key);
            typeCount = 0;
        } else {
            if (specialCase == null) {
                // A value subtag appeared before any key (for example an attribute).
                // There is no key to validate it against, so report the subtag itself
                // instead of failing with a NullPointerException in the switch below.
                return where.set(Datatype.u, subtag);
            }
            ++typeCount;
            if (valueType == ValueType.single && typeCount > 1) {
                return where.set(Datatype.u, key+"-"+subtag);
            }
            switch (specialCase) {
            case anything:
                continue;
            case codepoints:
                try {
                    if (Integer.parseInt(subtag,16) > 0x10FFFF) {
                        return where.set(Datatype.u, key+"-"+subtag);
                    }
                } catch (NumberFormatException e) {
                    return where.set(Datatype.u, key+"-"+subtag);
                }
                continue;
            case reorder:
                if (!isScriptReorder(subtag)) {
                    return where.set(Datatype.u, key+"-"+subtag);
                }
                continue;
            default:
                // normal: fall through to the key/type table lookup below
                break;
            }

            // en-u-sd-usca
            // en-US-u-sd-usca
            Output<Boolean> isKnownKey = new Output<Boolean>();
            Output<Boolean> isSpecialType = new Output<Boolean>();
            String type = KeyTypeData.toBcpType(key, subtag, isKnownKey, isSpecialType);
            if (type == null) {
                return where.set(Datatype.u, key+"-"+subtag);
            }
            if (!allowsDeprecated && KeyTypeData.isDeprecated(key, subtag)) {
                return where.set(Datatype.u, key+"-"+subtag);
            }
        }
    }
    return true;
}
/** Reorder codes that are accepted although they are not script codes. */
static final Set<String> REORDERING_INCLUDE = new HashSet<String>(Arrays.asList("space", "punct", "symbol", "currency", "digit", "others"));
/** Script codes that are rejected as reorder codes: Inherited (zinh) and Common (zyyy). */
static final Set<String> REORDERING_EXCLUDE = new HashSet<String>(Arrays.asList("zinh", "zyyy"));

/**
 * Tells whether a subtag is an acceptable collation reorder code. The subtag is
 * ASCII-lowercased, then accepted if it is one of the special group names,
 * rejected if it is one of the excluded scripts, and otherwise accepted only if
 * it is a valid script code for this checker's subtypes.
 * <p>
 * Background notes:
 * <ul>
 * <li>space, punct, symbol, currency, digit - core groups of characters below 'a'</li>
 * <li>any script code except Common and Inherited
 *     (sc ; Zinh ; Inherited ; Qaai and sc ; Zyyy ; Common)</li>
 * <li>Some pairs of scripts sort primary-equal and always reorder together. For
 *     example, Katakana characters are always reordered with Hiragana.</li>
 * <li>others - where all codes not explicitly mentioned should be ordered. The
 *     script code Zzzz (Unknown Script) is a synonym for others.</li>
 * </ul>
 *
 * @param subtag the candidate reorder code, in any case
 * @return true if the subtag is an acceptable reorder code
 */
private boolean isScriptReorder(String subtag) {
    final String code = AsciiUtil.toLowerString(subtag);
    if (REORDERING_INCLUDE.contains(code)) {
        return true;
    }
    if (REORDERING_EXCLUDE.contains(code)) {
        return false;
    }
    final Datasubtype match = ValidIdentifiers.isValid(Datatype.script, datasubtypes, code);
    return match != null;
}
/**
 * Validates the body of a -t- extension by parsing it as a language tag and
 * running the normal locale check on the result.
 * <p>
 * Note that the nested check clears and reuses {@code where}, so a failure
 * inside the embedded tag is reported with that field's own datatype.
 *
 * @param extensionString the extension body to check
 * @param where receives the failure; for a tag that cannot be parsed it is
 *        {@code Datatype.t} plus the subtag at the reported error position
 * @return true if the embedded tag is acceptable, false otherwise
 */
private boolean isValidT(String extensionString, Where where) {
    // TODO: is empty -t- valid?
    // TODO stop at first tag ([a-z][0-9]) and check their validity separately
    try {
        ULocale locale = new ULocale.Builder().setLanguageTag(extensionString).build();
        return isValid(locale, where);
    } catch (IllformedLocaleException e) {
        // The error index may be negative when it is unknown; substring() would then
        // throw from inside this handler and bypass the catch below. Fall back to the
        // whole extension in that case.
        int startIndex = e.getErrorIndex();
        String rest = startIndex >= 0 && startIndex <= extensionString.length()
                ? extensionString.substring(startIndex)
                : extensionString;
        // split() returns an empty array when the text consists only of separators.
        String[] list = SEPARATOR.split(rest);
        return where.set(Datatype.t, list.length > 0 ? list[0] : rest);
    } catch (Exception e) {
        return where.set(Datatype.t, e.getMessage());
    }
}
/**
 * Checks one field of a locale. "root" (in any case) is always accepted as a
 * language, and an empty code is always accepted; anything else must belong
 * to one of this checker's subtypes for the given datatype.
 *
 * @param datatype the field being checked
 * @param code the field's value; must not be null
 * @param where receives the failing field and code; may be null, in which case
 *        nothing is recorded
 * @return true if the code is acceptable, false otherwise
 */
private boolean isValid(Datatype datatype, String code, Where where) {
    if (datatype == Datatype.language && code.equalsIgnoreCase("root")) {
        return true;
    }
    if (code.isEmpty()) {
        return true;
    }
    if (ValidIdentifiers.isValid(datatype, datasubtypes, code) != null) {
        return true;
    }
    if (where == null) {
        return false;
    }
    return where.set(datatype, code);
}
/**
 * How many type values a key may carry: {@code multiple} for "x0", "kr" and
 * "vt", {@code specific} for "ca", and {@code single} for every other key.
 */
public enum ValueType {
    single, multiple, specific;

    private static final Set<String> MULTIPLE_VALUE_KEYS = new HashSet<String>(Arrays.asList("x0", "kr", "vt"));
    private static final Set<String> SPECIFIC_VALUE_KEYS = new HashSet<String>(Arrays.asList("ca"));

    /**
     * Classifies a key.
     *
     * @param key the key, compared case-sensitively
     * @return the value type for that key, {@code single} if it is not listed
     */
    static ValueType get(String key) {
        return MULTIPLE_VALUE_KEYS.contains(key) ? multiple
                : SPECIFIC_VALUE_KEYS.contains(key) ? specific
                : single;
    }
}
/*
Type: any multiple
{"OK", "en-t-x0-SPECIAL"}
{"OK", "en-u-kr-REORDER_CODE"}, // Collation reorder codes; One or more collation reorder codes, see LDML Part 5: Collation
{"OK", "en-u-vt-CODEPOINTS"}, // deprecated Collation parameter key for variable top; The variable top (one or more Unicode code points: LDML Appendix Q)
Multiple-values, specific sequences
<type name="islamic-umalqura" description="Islamic calendar, Umm al-Qura" since="24"/>
*/
}

View file

@ -0,0 +1,208 @@
/*
*******************************************************************************
* Copyright (C) 2015, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.dev.test.util;
import java.util.EnumSet;
import java.util.Map;
import java.util.Set;
import java.util.Map.Entry;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.impl.ValidIdentifiers;
import com.ibm.icu.impl.ValidIdentifiers.Datasubtype;
import com.ibm.icu.impl.ValidIdentifiers.Datatype;
import com.ibm.icu.impl.ValidIdentifiers.ValiditySet;
import com.ibm.icu.impl.locale.LocaleValidityChecker;
import com.ibm.icu.impl.locale.LocaleValidityChecker.Where;
import com.ibm.icu.util.ULocale;
/**
* @author markdavis
*
*/
public class TestLocaleValidity extends TestFmwk {
/**
 * Command-line entry point: hands the arguments to a new instance of this
 * test class via {@code run}.
 */
public static void main(String[] args) {
    final TestLocaleValidity suite = new TestLocaleValidity();
    suite.run(args);
}
/**
 * Checks a broad list of language tags with the regular and unknown subtypes
 * enabled. Each row is {expected, languageTag}; expected is "OK", the
 * "{field, code}" failure report, or the message of the exception thrown
 * while building the locale.
 */
public void testBasic() {
    final String[][] cases = {
            {"OK", "en-u-kr-latn-digit"},
            {"Incomplete extension 'u' [at index 3]", "en-u"},
            {"Incomplete extension 't' [at index 3]", "en-t"},
            {"OK", "en-u-ca-chinese"},
            {"OK", "en-x-abcdefg"},
            {"OK", "x-abcdefg"},
            {"OK", "en-u-sd-usca"},
            {"OK", "en-US-u-sd-usca"},
            {"OK", "en-AQ-u-sd-usca"},
            {"OK", "en-t-it"},
            {"OK", "und-Cyrl-t-und-latn"},
            {"OK", "root"},
            {"OK", "und"},
            {"OK", "en"},
            {"OK", "en-Hant"},
            {"OK", "zh-Hant-1606nict-1694acad"},
            {"OK", "zh-Hant"},
            {"OK", "zh-Hant-AQ"},
            {"OK", "x-abcdefg-g-foobar"},

            {"Empty subtag [at index 0]", ""},
            {"{u, ca-chinesx}", "en-u-ca-chinesx"},
            {"{illegal, q}", "en-q-abcdefg"},
            {"Incomplete privateuse [at index 0]", "x-abc$defg"},
            {"{script, Latx}", "und-Cyrl-t-und-latx"},
            {"{variant, FOOBAR}", "zh-Hant-1606nict-1694acad-foobar"},
            {"{region, AB}", "zh-Hant-AB"},
            {"{language, ex}", "ex"},
            {"{script, Hanx}", "zh-Hanx"},
            {"{language, qaa}", "qaa"},
            {"Invalid subtag: $ [at index 3]", "EN-$"},
            {"Invalid subtag: $ [at index 0]", "$"},
            // too many items
            {"{u, cu-usd}", "en-u-cu-adp-usd"},

            {"OK", "en-u-ca-buddhist"},
            {"OK", "en-u-cf-account"},
            {"OK", "en-u-co-big5han"},
            {"OK", "en-u-cu-adp"},
            {"OK", "en-u-fw-fri"},
            {"OK", "en-u-hc-h11"},
            {"OK", "en-u-ka-noignore"},
            {"OK", "en-u-kb-false"},
            {"OK", "en-u-kc-false"},
            {"OK", "en-u-kf-false"},
            {"OK", "en-u-kk-false"},
            {"OK", "en-u-kn-false"},
            {"OK", "en-u-kr-latn-digit-symbol"},
            {"OK", "en-u-ks-identic"},
            {"OK", "en-u-kv-currency"},
            {"OK", "en-u-nu-ahom"},
            {"OK", "en-u-sd-usny"},
            {"OK", "en-u-tz-adalv"},
            {"OK", "en-u-va-posix"},

            {"{u, ca-civil}", "en-u-ca-islamicc"}, // deprecated
            {"{u, co-direct}", "en-u-co-direct"}, // deprecated
            {"{u, kh}", "en-u-kh-false"}, // deprecated
            {"{u, tz-aqams}", "en-u-tz-aqams"}, // deprecated
            {"{u, vt}", "en-u-vt-0020-0041"}, // deprecated
    };
    check(cases, Datasubtype.regular, Datasubtype.unknown);
}
/**
 * Checks the "lb", "lw", "ms" and "ss" keys, each with one value, with the
 * regular and unknown subtypes enabled. Every row expects "OK".
 */
public void testMissing() {
    final String[][] cases = {
            {"OK", "en-u-lb-loose"},
            {"OK", "en-u-lw-breakall"},
            {"OK", "en-u-ms-metric"},
            {"OK", "en-u-ss-none"},
    };
    check(cases, Datasubtype.regular, Datasubtype.unknown);
}
/**
 * Placeholder for -t- field checks. Every row is currently commented out, so
 * the table is empty and nothing is asserted.
 */
public void testTSubtags() {
    final String[][] cases = {
            //                {"OK", "und-Cyrl-t-und-latn-m0-ungegn-2007"},
            //                {"{t, ungegg}", "und-Cyrl-t-und-latn-m0-ungegg-2007"},
            //                {"OK", "en-t-i0-handwrit"},
            //                {"OK", "en-t-k0-101key"},
            //                {"OK", "en-t-m0-alaloc"},
            //                {"OK", "en-t-t0-und"},
            //                {"OK", "en-t-x0-anythin"},
    };
    check(cases, Datasubtype.regular, Datasubtype.unknown);
}
/**
 * Checks that deprecated keys and types are accepted once the deprecated
 * subtype is enabled alongside regular and unknown. The same kinds of tags are
 * expected to fail in {@code testBasic}, where deprecated is not enabled.
 */
public void testDeprecated() {
    // The checker is created inside check(...) from the subtypes passed below;
    // a separately constructed, never-used checker was removed from here.
    String[][] tests = {
            {"OK", "en-u-ca-islamicc"}, // deprecated
            {"OK", "en-u-co-direct"}, // deprecated
            {"OK", "en-u-kh-false"}, // deprecated
            {"OK", "en-u-tz-aqams"}, // deprecated
            {"OK", "en-u-vt-0020"}, // deprecated
    };
    check(tests, Datasubtype.regular, Datasubtype.unknown, Datasubtype.deprecated);
}
/**
 * Runs every row of a test table against one checker built from the given
 * subtypes. Rows are numbered from 1 for the assertion messages.
 *
 * @param tests rows of {expected, languageTag}
 * @param datasubtypes the subtypes the checker should accept
 */
private void check(String[][] tests, Datasubtype... datasubtypes) {
    final LocaleValidityChecker checker = new LocaleValidityChecker(datasubtypes);
    for (int row = 0; row < tests.length; ++row) {
        check(row + 1, checker, tests[row][0], tests[row][1]);
    }
}
/**
 * Checks one language tag. If the tag cannot be built into a locale, the
 * exception message is compared with the expectation; otherwise the checker's
 * failure report ("OK" when valid) is compared with it.
 *
 * @param count row number, used as a prefix of the assertion message
 * @param all the checker to run
 * @param expected "OK", a "{field, code}" report, or a builder error message
 * @param locale the language tag under test
 */
private void check(int count, LocaleValidityChecker all, String expected, String locale) {
    final ULocale parsed;
    try {
        parsed = new ULocale.Builder().setLanguageTag(locale).build();
    } catch (Exception e) {
        assertEquals(count + ". " + locale, expected, e.getMessage());
        return;
    }
    final Where failure = new Where();
    all.isValid(parsed, failure);
    assertEquals(count + ". " + locale, expected, failure.toString());

    //        ULocale ulocale2 = ULocale.forLanguageTag(locale);
    //        final String languageTag2 = ulocale2.toLanguageTag();
    //
    //        if (languageTag.equals(languageTag2)) {
    //            return;
    //        }
    //        all.isValid(ulocale2, where);
    //        assertEquals(ulocale2 + ", " + ulocale2.toLanguageTag(), expected, where.toString());

    // problem: ULocale("$").toLanguageTag() becomes valid
}
/**
 * Quick testing for now: looks up a few script and subdivision codes directly
 * in the validity data and, in verbose mode, prints the whole table.
 */
public void testValidIdentifierData() {
    // The same single-element subtype set is used by every call but the first.
    final Set<Datasubtype> regularOnly = EnumSet.of(Datasubtype.regular);

    showValid(Datasubtype.unknown, Datatype.script, EnumSet.of(Datasubtype.regular, Datasubtype.unknown), "Zzzz");
    showValid(null, Datatype.script, regularOnly, "Zzzz");

    showValid(Datasubtype.regular, Datatype.subdivision, regularOnly, "US-CA");
    showValid(Datasubtype.regular, Datatype.subdivision, regularOnly, "US", "CA");
    showValid(null, Datatype.subdivision, regularOnly, "US-?");
    showValid(null, Datatype.subdivision, regularOnly, "US", "?");

    if (!isVerbose()) {
        return;
    }
    showAll();
}
/**
 * Prints the whole validity table to standard output: each datatype on its own
 * line, then each of its subtypes indented by one tab, then that subtype's
 * validity set indented by two tabs.
 */
private static void showAll() {
    for (final Entry<Datatype, Map<Datasubtype, ValiditySet>> byType : ValidIdentifiers.getData().entrySet()) {
        System.out.println(byType.getKey());
        final Map<Datasubtype, ValiditySet> bySubtype = byType.getValue();
        for (final Entry<Datasubtype, ValiditySet> entry : bySubtype.entrySet()) {
            System.out.println("\t" + entry.getKey());
            System.out.println("\t\t" + entry.getValue());
        }
    }
}
/**
 * Asserts that looking up a single code yields the expected subtype.
 *
 * @param expected the subtype the lookup should return, or null for "not found"
 * @param datatype the kind of identifier to look up
 * @param datasubtypes the subtypes to search
 * @param code the identifier to look up
 */
private void showValid(Datasubtype expected, Datatype datatype, Set<Datasubtype> datasubtypes, String code) {
    final Datasubtype actual = ValidIdentifiers.isValid(datatype, datasubtypes, code);
    final String label = datatype + ", " + datasubtypes + ", " + code;
    assertEquals(label, expected, actual);
}
/**
 * Asserts that looking up a two-part code yields the expected subtype.
 *
 * @param expected the subtype the lookup should return, or null for "not found"
 * @param datatype the kind of identifier to look up
 * @param datasubtypes the subtypes to search
 * @param code the first part of the identifier
 * @param code2 the second part of the identifier
 */
private void showValid(Datasubtype expected, Datatype datatype, Set<Datasubtype> datasubtypes, String code, String code2) {
    final Datasubtype actual = ValidIdentifiers.isValid(datatype, datasubtypes, code, code2);
    final String label = datatype + ", " + datasubtypes + ", " + code + ", " + code2;
    assertEquals(label, expected, actual);
}
}