mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-14 17:24:01 +00:00
ICU-7125 Add API for language matching for data that will be coming from CLDR.
X-SVN-Rev: 26589
This commit is contained in:
parent
beeba02f88
commit
42d4b38e15
8 changed files with 1020 additions and 9 deletions
|
@ -6,9 +6,8 @@
|
|||
* Author: Mark Davis
|
||||
**********************************************************************
|
||||
*/
|
||||
package com.ibm.icu.dev.test.util;
|
||||
package com.ibm.icu.impl;
|
||||
|
||||
import com.ibm.icu.impl.Utility;
|
||||
import com.ibm.icu.util.Freezable;
|
||||
|
||||
|
|
@ -1,9 +1,9 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2004-2009, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
****************************************************************************************
|
||||
* Copyright (C) 2009, Google, Inc.; International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
****************************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.util;
|
||||
|
||||
import java.util.MissingResourceException;
|
||||
|
|
586
icu4j/main/classes/core/src/com/ibm/icu/util/LocaleMatcher.java
Normal file
586
icu4j/main/classes/core/src/com/ibm/icu/util/LocaleMatcher.java
Normal file
|
@ -0,0 +1,586 @@
|
|||
/*
|
||||
****************************************************************************************
|
||||
* Copyright (C) 2009, Google, Inc.; International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
****************************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.util;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import com.ibm.icu.impl.Row;
|
||||
import com.ibm.icu.impl.Row.R2;
|
||||
import com.ibm.icu.impl.Row.R3;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
|
||||
/**
|
||||
* Provides a way to match the languages (locales) supported by a product to the
|
||||
* languages (locales) acceptable to a user, and get the best match. For
|
||||
* example:
|
||||
*
|
||||
* <pre>
|
||||
* LocaleMatcher matcher = new LocaleMatcher("fr, en-GB, en");
|
||||
*
|
||||
* // afterwards:
|
||||
* matcher.getBestMatch(ULocale.US).equals(ULocale.ENGLISH)
|
||||
* </pre>
|
||||
*
|
||||
* It takes into account when languages are close to one another, such as fil
|
||||
* and tl, and when language regional variants are close, like en-GB and en-AU.
|
||||
* It also handles scripts, like zh-Hant vs zh-TW. For examples, see the test
|
||||
* file.
|
||||
* <p>All classes implementing this interface should be immutable. Often a
|
||||
* product will just need one static instance, built with the languages
|
||||
* that it supports. However, it may want multiple instances with different
|
||||
* default languages based on additional information, such as the domain.
|
||||
*
|
||||
* @author markdavis@google.com
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
public class LocaleMatcher {
|
||||
private static final boolean DEBUG = false;
|
||||
|
||||
|
||||
/**
|
||||
* Threshold for falling back to the default (first) language. May make this
|
||||
* a parameter in the future.
|
||||
*/
|
||||
private static final double DEFAULT_THRESHOLD = 0.5;
|
||||
|
||||
/**
|
||||
* The default language, in case the threshold is not met.
|
||||
*/
|
||||
private final ULocale defaultLanguage;
|
||||
|
||||
/**
|
||||
* Create a new language matcher. The highest-weighted language is the
|
||||
* default. That means that if no other language is matches closer than a given
|
||||
* threshold, that default language is chosen. Typically the default is English,
|
||||
* but it could be different based on additional information, such as the domain
|
||||
* of the page.
|
||||
*
|
||||
* @param languagePriorityList weighted list
|
||||
*/
|
||||
public LocaleMatcher(LocalePriorityList languagePriorityList) {
|
||||
this(languagePriorityList, defaultWritten);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new language matcher from a String form. The highest-weighted
|
||||
* language is the default.
|
||||
*
|
||||
* @param languagePriorityListString String form of LanguagePriorityList
|
||||
*/
|
||||
public LocaleMatcher(String languagePriorityListString) {
|
||||
this(LocalePriorityList.add(languagePriorityListString).build());
|
||||
}
|
||||
|
||||
/**
|
||||
* @internal
|
||||
* @param languagePriorityList
|
||||
* @param matcherData
|
||||
*/
|
||||
public LocaleMatcher(LocalePriorityList languagePriorityList, LanguageMatcherData matcherData) {
|
||||
this.matcherData = matcherData;
|
||||
for (final ULocale language : languagePriorityList) {
|
||||
add(language, languagePriorityList.getWeight(language));
|
||||
}
|
||||
Iterator<ULocale> it = languagePriorityList.iterator();
|
||||
defaultLanguage = it.hasNext() ? it.next() : null;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns a fraction between 0 and 1, where 1 means that the languages are a
|
||||
* perfect match, and 0 means that they are completely different. Note that
|
||||
* the precise values may change over time; no code should be made dependent
|
||||
* on the values remaining constant.
|
||||
* @param a
|
||||
* @param aMax
|
||||
* @param b
|
||||
* @param bMax
|
||||
* @return value between 0 and 1, inclusive.
|
||||
*/
|
||||
public double match(ULocale a, ULocale aMax, ULocale b, ULocale bMax) {
|
||||
return matcherData.match(a, aMax, b, bMax);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Canonicalize a locale (language). Note that for now, it is canonicalizing according to CLDR conventions (he vs iw, etc), since that is what is needed for likelySubtags.
|
||||
* TODO Get the data from CLDR, use Java conventions.
|
||||
* @param languageCode
|
||||
* @return ULocale with remapped subtags.
|
||||
*/
|
||||
public ULocale canonicalize(ULocale languageCode) {
|
||||
String lang = languageCode.getLanguage();
|
||||
String lang2 = canonicalMap.get(lang);
|
||||
String script = languageCode.getScript();
|
||||
String script2 = canonicalMap.get(script);
|
||||
String region = languageCode.getCountry();
|
||||
String region2 = canonicalMap.get(region);
|
||||
if (lang2 != null || script2 != null || region2 != null) {
|
||||
return new ULocale(
|
||||
lang2 == null ? lang : lang2,
|
||||
script2 == null ? script : script2,
|
||||
region2 == null ? region : region2
|
||||
);
|
||||
}
|
||||
return languageCode;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the best match for a LanguagePriorityList
|
||||
*
|
||||
* @param languageList
|
||||
* @return best matching language code
|
||||
*/
|
||||
public ULocale getBestMatch(LocalePriorityList languageList) {
|
||||
double bestWeight = 0;
|
||||
ULocale bestTableMatch = null;
|
||||
for (final ULocale language : languageList) {
|
||||
final Row.R2<ULocale, Double> matchRow = getBestMatchInternal(language);
|
||||
final double weight = matchRow.get1() * languageList.getWeight(language);
|
||||
if (weight > bestWeight) {
|
||||
bestWeight = weight;
|
||||
bestTableMatch = matchRow.get0();
|
||||
}
|
||||
}
|
||||
if (bestWeight < DEFAULT_THRESHOLD) {
|
||||
bestTableMatch = defaultLanguage;
|
||||
}
|
||||
return bestTableMatch;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the best match for a LanguagePriorityList
|
||||
*
|
||||
* @param languageList
|
||||
* @return best matching language code
|
||||
*/
|
||||
public ULocale getBestMatch(String languageList) {
|
||||
return getBestMatch(LocalePriorityList.add(languageList).build());
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the best match for an individual language code.
|
||||
*
|
||||
* @param languageCode
|
||||
* @return best matching language code
|
||||
*/
|
||||
public ULocale getBestMatch(ULocale languageCode) {
|
||||
return getBestMatchInternal(languageCode).get0();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "{" + defaultLanguage + ", "
|
||||
+ maximizedLanguageToWeight + "}";
|
||||
}
|
||||
// ================= Privates =====================
|
||||
|
||||
/**
|
||||
* Get the best match for an individual language code.
|
||||
*
|
||||
* @param languageCode
|
||||
* @return best matching language code and weight (as per
|
||||
* {@link #match(ULocale, ULocale)})
|
||||
*/
|
||||
private Row.R2<ULocale, Double> getBestMatchInternal(ULocale languageCode) {
|
||||
languageCode = canonicalize(languageCode);
|
||||
final ULocale maximized = addLikelySubtags(languageCode);
|
||||
if (DEBUG) {
|
||||
System.out.println("\n" + languageCode + ";\t" + maximized);
|
||||
}
|
||||
double bestWeight = 0;
|
||||
ULocale bestTableMatch = null;
|
||||
for (final ULocale tableKey : maximizedLanguageToWeight.keySet()) {
|
||||
R2<ULocale, Double> row = maximizedLanguageToWeight.get(tableKey);
|
||||
final double match = match(languageCode, maximized, tableKey, row.get0());
|
||||
if (DEBUG) {
|
||||
System.out.println("\t" + tableKey + ";\t" + row.toString() + ";\t" + match);
|
||||
}
|
||||
final double weight = match * row.get1();
|
||||
if (weight > bestWeight) {
|
||||
bestWeight = weight;
|
||||
bestTableMatch = tableKey;
|
||||
}
|
||||
}
|
||||
if (bestWeight < DEFAULT_THRESHOLD) {
|
||||
bestTableMatch = defaultLanguage;
|
||||
}
|
||||
return Row.R2.of(bestTableMatch, bestWeight);
|
||||
}
|
||||
|
||||
private void add(ULocale language, Double weight) {
|
||||
language = canonicalize(language);
|
||||
R2<ULocale, Double> row = Row.of(addLikelySubtags(language), weight);
|
||||
maximizedLanguageToWeight.put(language, row);
|
||||
}
|
||||
|
||||
Map<ULocale,Row.R2<ULocale, Double>> maximizedLanguageToWeight = new LinkedHashMap<ULocale, R2<ULocale, Double>>();
|
||||
|
||||
|
||||
// =============== Special Mapping Information ==============
|
||||
|
||||
/**
|
||||
* We need to add another method to addLikelySubtags that doesn't return
|
||||
* null, but instead substitutes Zzzz and ZZ if unknown. There are also
|
||||
* a few cases where addLikelySubtags needs to have expanded data, to handle
|
||||
* all deprecated codes, and to update to CLDR 1.6.
|
||||
* @param languageCode
|
||||
* @return "fixed" addLikelySubtags
|
||||
*/
|
||||
// TODO(markdavis): update the above when CLDR 1.6 is final.
|
||||
private ULocale addLikelySubtags(ULocale languageCode) {
|
||||
final ULocale result = ULocale.addLikelySubtags(languageCode);
|
||||
// should have method on getLikelySubtags for this
|
||||
if (result == null || result.equals(languageCode)) {
|
||||
final String language = languageCode.getLanguage();
|
||||
final String script = languageCode.getScript();
|
||||
final String region = languageCode.getCountry();
|
||||
return new ULocale((language.length()==0 ? "und"
|
||||
: language)
|
||||
+ "_"
|
||||
+ (script.length()==0 ? "Zzzz" : script)
|
||||
+ "_"
|
||||
+ (region.length()==0 ? "ZZ" : region));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private static class LocalePatternMatcher {
|
||||
// a value of null means a wildcard; matches any.
|
||||
private String lang;
|
||||
private String script;
|
||||
private String region;
|
||||
private Level level;
|
||||
static Pattern pattern = Pattern.compile(
|
||||
"([a-zA-Z]{1,8}|\\*)" +
|
||||
"(?:-([a-zA-Z]{4}|\\*))?" +
|
||||
"(?:-([a-zA-Z]{2}|[0-9]{3}|\\*))?");
|
||||
|
||||
public LocalePatternMatcher(String toMatch) {
|
||||
Matcher matcher = pattern.matcher(toMatch);
|
||||
if (!matcher.matches()) {
|
||||
throw new IllegalArgumentException("Bad pattern: " + toMatch);
|
||||
}
|
||||
lang = matcher.group(1);
|
||||
script = matcher.group(2);
|
||||
region = matcher.group(3);
|
||||
level = region != null ? Level.region : script != null ? Level.script : Level.language;
|
||||
|
||||
if (lang.equals("*")) {
|
||||
lang = null;
|
||||
}
|
||||
if (script != null && script.equals("*")) {
|
||||
script = null;
|
||||
}
|
||||
if (region != null && region.equals("*")) {
|
||||
region = null;
|
||||
}
|
||||
}
|
||||
|
||||
boolean matches(ULocale ulocale) {
|
||||
if (lang != null && !lang.equals(ulocale.getLanguage())) {
|
||||
return false;
|
||||
}
|
||||
if (script != null && !script.equals(ulocale.getScript())) {
|
||||
return false;
|
||||
}
|
||||
if (region != null && !region.equals(ulocale.getCountry())) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
public Level getLevel() {
|
||||
return level;
|
||||
}
|
||||
|
||||
public String getLanguage() {
|
||||
return (lang == null ? "*" : lang);
|
||||
}
|
||||
|
||||
public String getScript() {
|
||||
return (script == null ? "*" : script);
|
||||
}
|
||||
|
||||
public String getRegion() {
|
||||
return (region == null ? "*" : region);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
String result = getLanguage();
|
||||
if (level != Level.language) {
|
||||
result += "-" + getScript();
|
||||
if (level != Level.script) {
|
||||
result += "-" + getRegion();
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
/** Granularity of a locale pattern or score table: language, script or region. */
enum Level {
    language,
    script,
    region
}
|
||||
|
||||
private static class ScoreData {
|
||||
final Set<Row.R3<LocalePatternMatcher,LocalePatternMatcher,Double>> scores = new LinkedHashSet<R3<LocalePatternMatcher, LocalePatternMatcher, Double>>();
|
||||
final double worst;
|
||||
final Level level;
|
||||
|
||||
public ScoreData(Level level) {
|
||||
this.level = level;
|
||||
this.worst = (1-(level == Level.language ? 90 : level == Level.script ? 20 : 4))/100.0;
|
||||
}
|
||||
|
||||
void addDataToScores(String desired, String supported, R3<LocalePatternMatcher,LocalePatternMatcher,Double> data) {
|
||||
// Map<String, Set<R3<LocalePatternMatcher,LocalePatternMatcher,Double>>> lang_result = scores.get(desired);
|
||||
// if (lang_result == null) {
|
||||
// scores.put(desired, lang_result = new HashMap());
|
||||
// }
|
||||
// Set<R3<LocalePatternMatcher,LocalePatternMatcher,Double>> result = lang_result.get(supported);
|
||||
// if (result == null) {
|
||||
// lang_result.put(supported, result = new LinkedHashSet());
|
||||
// }
|
||||
// result.add(data);
|
||||
scores.add(data);
|
||||
}
|
||||
|
||||
double getScore(ULocale desiredLocale, ULocale dMax, String desiredRaw, String desiredMax,
|
||||
ULocale supportedLocale, ULocale sMax, String supportedRaw, String supportedMax) {
|
||||
|
||||
/*
|
||||
* d, dm, s, sm
|
||||
* dc = d != dm
|
||||
* sc = s != sm
|
||||
* if dm != sm
|
||||
* rd = rd(dm,sm) // line 4
|
||||
* if dc != sc
|
||||
* rd *= 0.75 // lines 3,8
|
||||
* ef dc
|
||||
* rd *= 0.5 // lines 7
|
||||
* end
|
||||
* ef dc == sc
|
||||
* rd = 0 // line 6
|
||||
* else
|
||||
* rd = 0.25*StdRDiff // lines 2,5
|
||||
*/
|
||||
|
||||
boolean desiredChange = desiredRaw.equals(desiredMax);
|
||||
boolean supportedChange = supportedRaw.equals(supportedMax);
|
||||
double distance;
|
||||
if (!desiredMax.equals(supportedMax)) {
|
||||
// Map<String, Set<R3<LocalePatternMatcher,LocalePatternMatcher,Double>>> lang_result = scores.get(desiredMax);
|
||||
// if (lang_result == null) {
|
||||
// distance = worst;
|
||||
// } else {
|
||||
// Set<R3<LocalePatternMatcher,LocalePatternMatcher,Double>> result = lang_result.get(supportedMax);
|
||||
// skip:
|
||||
// if (result == null) {
|
||||
// distance = worst;
|
||||
// } else {
|
||||
distance = getRawScore(dMax, sMax);
|
||||
// }
|
||||
if (desiredChange == supportedChange) {
|
||||
distance *= 0.75;
|
||||
} else if (desiredChange) {
|
||||
distance *= 0.5;
|
||||
}
|
||||
} else if (desiredChange == supportedChange) { // maxes are equal, changes are equal
|
||||
distance = 0;
|
||||
} else { // maxes are equal, changes are different
|
||||
distance = 0.25*worst;
|
||||
}
|
||||
return distance;
|
||||
}
|
||||
|
||||
private double getRawScore(ULocale desiredLocale, ULocale supportedLocale) {
|
||||
if (DEBUG) {
|
||||
System.out.println("\t\t\tRaw Score:\t" + desiredLocale + ";\t" + supportedLocale);
|
||||
}
|
||||
for (R3<LocalePatternMatcher,LocalePatternMatcher,Double> datum : scores) { // : result
|
||||
if (datum.get0().matches(desiredLocale)
|
||||
&& datum.get1().matches(supportedLocale)) {
|
||||
if (DEBUG) {
|
||||
System.out.println("\t\t\tFOUND\t" + datum);
|
||||
}
|
||||
return datum.get2();
|
||||
}
|
||||
}
|
||||
if (DEBUG) {
|
||||
System.out.println("\t\t\tNOTFOUND\t" + worst);
|
||||
}
|
||||
return worst;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return level + ", " + scores;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Only for testing and use by tools. Interface may change!!
|
||||
* @internal
|
||||
*/
|
||||
public static class LanguageMatcherData {
|
||||
ScoreData languageScores = new ScoreData(Level.language);
|
||||
ScoreData scriptScores = new ScoreData(Level.script);
|
||||
ScoreData regionScores = new ScoreData(Level.region);
|
||||
|
||||
public Builder start() {
|
||||
return new Builder();
|
||||
}
|
||||
|
||||
public double match(ULocale a, ULocale aMax, ULocale b, ULocale bMax) {
|
||||
double diff = 0;
|
||||
diff += languageScores.getScore(a, aMax, a.getLanguage(), aMax.getLanguage(), b, bMax, b.getLanguage(), bMax.getLanguage());
|
||||
diff += scriptScores.getScore(a, aMax, a.getScript(), aMax.getScript(), b, bMax, b.getScript(), bMax.getScript());
|
||||
diff += regionScores.getScore(a, aMax, a.getCountry(), aMax.getCountry(), b, bMax, b.getCountry(), bMax.getCountry());
|
||||
|
||||
if (!a.getVariant().equals(b.getVariant())) {
|
||||
diff += 1;
|
||||
}
|
||||
if (diff < 0.0d) {
|
||||
diff = 0.0d;
|
||||
} else if (diff > 1.0d) {
|
||||
diff = 1.0d;
|
||||
}
|
||||
return 1.0 - diff;
|
||||
}
|
||||
|
||||
class Builder {
|
||||
public LanguageMatcherData build() {
|
||||
return LanguageMatcherData.this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add an exceptional distance between languages, typically because regional
|
||||
* dialects were given their own language codes. At this point the code is
|
||||
* symmetric. We don't bother producing an equivalence class because there are
|
||||
* so few cases; this function depends on the other permutations being
|
||||
* added specifically.
|
||||
* @param desired
|
||||
* @param supported
|
||||
* @param distance
|
||||
* @param bidirectional TODO
|
||||
* @return
|
||||
*/
|
||||
private Builder addDistance(String desired, String supported, int percent) {
|
||||
return addDistance(desired, supported, percent, false, null);
|
||||
}
|
||||
private Builder addDistance(String desired, String supported, int percent, String comment) {
|
||||
return addDistance(desired, supported, percent, false, comment);
|
||||
}
|
||||
|
||||
private Builder addDistance(String desired, String supported, int percent, boolean oneway) {
|
||||
return addDistance(desired, supported, percent, oneway, null);
|
||||
}
|
||||
|
||||
private Builder addDistance(String desired, String supported, int percent, boolean oneway, String comment) {
|
||||
if (DEBUG) {
|
||||
if (false)
|
||||
System.out.println("\t<distance desired=\"" + desired + "\"" +
|
||||
" supported=\"" + supported + "\"" +
|
||||
" percent=\"" + percent + "\""
|
||||
+ (oneway ? "" : " oneway=\"true\"")
|
||||
+ "/>"
|
||||
+ (comment == null ? "" : "\t<!-- " + comment + " -->"));
|
||||
else // .addDistance("nn", "nb", 4, true)
|
||||
System.out.println(".addDistance(\"" + desired + "\"" +
|
||||
", \"" + supported + "\"" +
|
||||
", " + percent + ""
|
||||
+ (oneway ? "" : ", true")
|
||||
+ (comment == null ? "" : ", \"" + comment + "\"")
|
||||
+ ")"
|
||||
);
|
||||
|
||||
}
|
||||
double score = 1-percent/100.0; // convert from percentage
|
||||
LocalePatternMatcher desiredMatcher = new LocalePatternMatcher(desired);
|
||||
Level desiredLen = desiredMatcher.getLevel();
|
||||
LocalePatternMatcher supportedMatcher = new LocalePatternMatcher(supported);
|
||||
Level supportedLen = supportedMatcher.getLevel();
|
||||
if (desiredLen != supportedLen) {
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
R3<LocalePatternMatcher,LocalePatternMatcher,Double> data = Row.of(desiredMatcher, supportedMatcher, score);
|
||||
R3<LocalePatternMatcher,LocalePatternMatcher,Double> data2 = oneway ? null : Row.of(supportedMatcher, desiredMatcher, score);
|
||||
switch (desiredLen) {
|
||||
case language:
|
||||
String dlanguage = desiredMatcher.getLanguage();
|
||||
String slanguage = supportedMatcher.getLanguage();
|
||||
languageScores.addDataToScores(dlanguage, slanguage, data);
|
||||
if (!oneway) {
|
||||
languageScores.addDataToScores(slanguage, dlanguage, data2);
|
||||
}
|
||||
break;
|
||||
case script:
|
||||
String dscript = desiredMatcher.getScript();
|
||||
String sscript = supportedMatcher.getScript();
|
||||
scriptScores.addDataToScores(dscript, sscript, data);
|
||||
if (!oneway) {
|
||||
scriptScores.addDataToScores(sscript, dscript, data2);
|
||||
}
|
||||
break;
|
||||
case region:
|
||||
String dregion = desiredMatcher.getRegion();
|
||||
String sregion = supportedMatcher.getRegion();
|
||||
regionScores.addDataToScores(dregion, sregion, data);
|
||||
if (!oneway) {
|
||||
regionScores.addDataToScores(sregion, dregion, data2);
|
||||
}
|
||||
break;
|
||||
}
|
||||
return this;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
LanguageMatcherData matcherData;
|
||||
|
||||
private static LanguageMatcherData defaultWritten = new LanguageMatcherData().start()
|
||||
// TODO get data from CLDR
|
||||
.addDistance("no", "nb", 100, "The language no is normally taken as nb in content; we might alias this for lookup.")
|
||||
.addDistance("nn", "nb", 96)
|
||||
.addDistance("nn", "no", 96)
|
||||
.addDistance("da", "no", 90, "Danish and norwegian are reasonably close.")
|
||||
.addDistance("da", "nb", 90)
|
||||
.addDistance("hr", "br", 96, "Serbo-croatian variants are all very close.")
|
||||
.addDistance("sh", "br", 96)
|
||||
.addDistance("sr", "br", 96)
|
||||
.addDistance("sh", "hr", 96)
|
||||
.addDistance("sr", "hr", 96)
|
||||
.addDistance("sh", "sr", 96)
|
||||
.addDistance("sr-Latn", "sr-Cyrl", 90, "Most serbs can read either script.")
|
||||
.addDistance("*-Hans", "*-Hant", 85, true, "Readers of simplified can read traditional much better than reverse.")
|
||||
.addDistance("*-Hant", "*-Hans", 75, true)
|
||||
.addDistance("en-*-US", "en-*-CA", 98, "US is different than others, and Canadian is inbetween.")
|
||||
.addDistance("en-*-US", "en-*-*", 97)
|
||||
.addDistance("en-*-CA", "en-*-*", 98)
|
||||
.addDistance("en-*-*", "en-*-*", 99)
|
||||
.addDistance("es-*-ES", "es-*-ES", 100, "Latin American Spanishes are closer to each other. Approximate by having es-ES be further from everything else.")
|
||||
.addDistance("es-*-ES", "es-*-*", 93)
|
||||
.addDistance("*", "*", 1, "[Default value -- must be at end!] Normally there is no comprehension of different languages.")
|
||||
.addDistance("*-*", "*-*", 20, "[Default value -- must be at end!] Normally there is little comprehension of different scripts.")
|
||||
.addDistance("*-*-*", "*-*-*", 96, "[Default value -- must be at end!] Normally there are small differences across regions.")
|
||||
.build();
|
||||
|
||||
/**
 * Maps deprecated or alias subtags to the form used for matching
 * (consulted by canonicalize for language, script and region alike).
 */
private static final HashMap<String,String> canonicalMap = new HashMap<String, String>();

static {
    // TODO get data from CLDR
    // Diagnostic output is limited to DEBUG runs so that merely loading
    // this class does not write to stdout.
    if (DEBUG) {
        System.out.println(ULocale.addLikelySubtags(new ULocale("iw")));
        System.out.println(ULocale.addLikelySubtags(new ULocale("he")));
    }
    canonicalMap.put("iw", "he");
    canonicalMap.put("mo", "ro");
    canonicalMap.put("tl", "fil");
}
|
||||
}
|
|
@ -0,0 +1,298 @@
|
|||
/*
|
||||
****************************************************************************************
|
||||
* Copyright (C) 2009, Google, Inc.; International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
****************************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.icu.util;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeMap;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import com.ibm.icu.util.ULocale;
|
||||
|
||||
/**
|
||||
* Provides an immutable list of languages (locales) in priority order.
|
||||
* The string format is based on the Accept-Language format
|
||||
* (<a href="http://www.ietf.org/rfc/rfc2616.txt">RFC 2616</a>), such as
|
||||
* "af, en, fr;q=0.9". Syntactically it is slightly
|
||||
* more lenient, in allowing extra whitespace between elements, extra commas,
|
||||
* and more than 3 decimals (on input), and pins between 0 and 1.
|
||||
* <p>In theory, Accept-Language indicates the relative 'quality' of each item,
|
||||
* but in practice, all of the browsers just take an ordered list, like
|
||||
* "en, fr, de", and synthesize arbitrary quality values that put these in the
|
||||
* right order, like: "en, fr;q=0.7, de;q=0.3". The quality values in these de facto
|
||||
* semantics thus have <b>nothing</b> to do with the relative qualities of the
|
||||
* original. Accept-Language also doesn't
|
||||
* specify the interpretation of multiple instances, eg what "en, fr, en;q=.5"
|
||||
* means.
|
||||
* <p>There are various ways to build a LanguagePriorityList, such
|
||||
* as using the following equivalent patterns:
|
||||
*
|
||||
* <pre>
|
||||
* list = LocalePriorityList.add("af, en, fr;q=0.9").build();
|
||||
*
|
||||
* list2 = LocalePriorityList
|
||||
* .add(new ULocale("af"))
|
||||
* .add(ULocale.ENGLISH)
|
||||
* .add(ULocale.FRENCH, 0.9d)
|
||||
* .build();
|
||||
* </pre>
|
||||
* When the list is built, the internal values are sorted in descending order by
|
||||
* weight, and then by input order. That is, if two languages have the same weight, the first one in the original order
|
||||
* comes first. If exactly the same language tag appears multiple times,
|
||||
* the last one wins.
|
||||
*
|
||||
* There are two options when building. If preserveWeights are on, then "de;q=0.3, ja;q=0.3, en, fr;q=0.7, de " would result in the following:
|
||||
* <pre> en;q=1.0
|
||||
* de;q=1.0
|
||||
* fr;q=0.7
|
||||
* ja;q=0.3</pre>
|
||||
* If it is off (the default), then all weights are reset to 1.0 after reordering.
|
||||
* This is to match the effect of the Accept-Language semantics as used in browsers, and results in the following:
|
||||
* <pre> en;q=1.0
|
||||
* de;q=1.0
|
||||
* fr;q=1.0
|
||||
* ja;q=1.0</pre>
|
||||
* @author markdavis@google.com
|
||||
*/
|
||||
public class LocalePriorityList implements Iterable<ULocale> {
|
||||
private static final double D0 = 0.0d;
|
||||
private static final Double D1 = 1.0d;
|
||||
|
||||
private static final Pattern languageSplitter = Pattern.compile("\\s*,\\s*");
|
||||
private static final Pattern weightSplitter = Pattern
|
||||
.compile("\\s*(\\S*)\\s*;\\s*q\\s*=\\s*(\\S*)");
|
||||
private final Map<ULocale, Double> languagesAndWeights;
|
||||
|
||||
/**
|
||||
* Add a language code to the list being built, with weight 1.0.
|
||||
*
|
||||
* @param ULocale
|
||||
* @return internal builder, for chaining
|
||||
*/
|
||||
public static LanguagePriorityListBuilder add(final ULocale languageCode) {
|
||||
return new LanguagePriorityListBuilder().add(languageCode);
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a language code to the list being built, with specified weight.
|
||||
*
|
||||
* @param languageCode
|
||||
* @param weight value from 0.0 to 1.0
|
||||
* @return internal builder, for chaining
|
||||
*/
|
||||
public static LanguagePriorityListBuilder add(
|
||||
final ULocale languageCode, final double weight) {
|
||||
return new LanguagePriorityListBuilder().add(languageCode, weight);
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a language priority list.
|
||||
*
|
||||
* @param languagePriorityList
|
||||
* @return internal builder, for chaining
|
||||
*/
|
||||
public static LanguagePriorityListBuilder add(
|
||||
final LocalePriorityList languagePriorityList) {
|
||||
return new LanguagePriorityListBuilder().add(languagePriorityList);
|
||||
}
|
||||
|
||||
/**
|
||||
* Add language codes to the list being built, using a string in RFC2626
|
||||
* (lenient) format, where each language is a valid {@link ULocale}.
|
||||
*
|
||||
* @param acceptLanguageString
|
||||
* @return internal builder, for chaining
|
||||
*/
|
||||
public static LanguagePriorityListBuilder add(
|
||||
final String acceptLanguageString) {
|
||||
return new LanguagePriorityListBuilder().add(acceptLanguageString);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the weight for a given language, or null if there is none. Note that
|
||||
* the weights may be adjusted from those used to build the list.
|
||||
*
|
||||
* @param language
|
||||
* @return weight
|
||||
*/
|
||||
public Double getWeight(final ULocale language) {
|
||||
return languagesAndWeights.get(language);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
final StringBuilder result = new StringBuilder();
|
||||
for (final ULocale language : languagesAndWeights.keySet()) {
|
||||
if (result.length() != 0) {
|
||||
result.append(", ");
|
||||
}
|
||||
result.append(language);
|
||||
double weight = languagesAndWeights.get(language);
|
||||
if (weight != D1) {
|
||||
result.append(";q=").append(weight);
|
||||
}
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
public Iterator<ULocale> iterator() {
|
||||
return languagesAndWeights.keySet().iterator();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(final Object o) {
|
||||
try {
|
||||
final LocalePriorityList that = (LocalePriorityList) o;
|
||||
return languagesAndWeights.equals(that.languagesAndWeights);
|
||||
} catch (final RuntimeException e) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return languagesAndWeights.hashCode();
|
||||
}
|
||||
|
||||
// ==================== Privates ====================
|
||||
|
||||
|
||||
/**
 * Wraps an already ordered language-to-weight map. The map is stored as
 * passed in, without copying.
 */
private LocalePriorityList(final Map<ULocale, Double> languageToWeight) {
    languagesAndWeights = languageToWeight;
}
|
||||
|
||||
/**
|
||||
* Internal class used for building LanguagePriorityLists
|
||||
*/
|
||||
public static class LanguagePriorityListBuilder {
|
||||
/**
|
||||
* These store the input languages and weights, in chronological order,
|
||||
* where later additions override previous ones.
|
||||
*/
|
||||
private final Map<ULocale, Double> languageToWeight
|
||||
= new LinkedHashMap<ULocale, Double>();
|
||||
|
||||
public LocalePriorityList build() {
|
||||
return build(false);
|
||||
}
|
||||
|
||||
public LocalePriorityList build(boolean preserveWeights) {
|
||||
// Walk through the input list, collecting the items with the same weights.
|
||||
final Map<Double, Set<ULocale>> doubleCheck = new TreeMap<Double, Set<ULocale>>(
|
||||
myDescendingDouble);
|
||||
for (final ULocale lang : languageToWeight.keySet()) {
|
||||
Double weight = languageToWeight.get(lang);
|
||||
Set<ULocale> s = doubleCheck.get(weight);
|
||||
if (s == null) {
|
||||
doubleCheck.put(weight, s = new LinkedHashSet<ULocale>());
|
||||
}
|
||||
s.add(lang);
|
||||
}
|
||||
// We now have a bunch of items sorted by weight, then chronologically.
|
||||
// We can now create a list in the right order
|
||||
final Map<ULocale, Double> temp = new LinkedHashMap<ULocale, Double>();
|
||||
for (final Double weight : doubleCheck.keySet()) {
|
||||
for (final ULocale lang : doubleCheck.get(weight)) {
|
||||
temp.put(lang, preserveWeights ? weight : D1);
|
||||
}
|
||||
}
|
||||
return new LocalePriorityList(Collections.unmodifiableMap(temp));
|
||||
}
|
||||
|
||||
public LanguagePriorityListBuilder add(
|
||||
final LocalePriorityList languagePriorityList) {
|
||||
for (final ULocale language : languagePriorityList.languagesAndWeights
|
||||
.keySet()) {
|
||||
add(language, languagePriorityList.languagesAndWeights.get(language));
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a new language code, with weight = 1.0.
|
||||
*
|
||||
* @param languageCode
|
||||
* @return this, for chaining
|
||||
*/
|
||||
public LanguagePriorityListBuilder add(final ULocale languageCode) {
|
||||
return add(languageCode, D1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds language codes, with each having weight = 1.0.
|
||||
*
|
||||
* @param languageCodes List of language codes.
|
||||
* @return this, for chaining.
|
||||
*/
|
||||
public LanguagePriorityListBuilder add(final ULocale... languageCodes) {
|
||||
for (final ULocale languageCode : languageCodes) {
|
||||
add(languageCode, D1);
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a new supported languageCode, with specified weight. Overrides any
|
||||
* previous weight for the language.
|
||||
*
|
||||
* @param languageCode
|
||||
* @param weight
|
||||
* @return this, for chaining.
|
||||
*/
|
||||
public LanguagePriorityListBuilder add(final ULocale languageCode,
|
||||
double weight) {
|
||||
if (languageToWeight.containsKey(languageCode)) {
|
||||
languageToWeight.remove(languageCode);
|
||||
}
|
||||
if (weight <= D0) {
|
||||
return this; // skip zeros
|
||||
} else if (weight > D1) {
|
||||
weight = D1;
|
||||
}
|
||||
languageToWeight.put(languageCode, weight);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds rfc2616 list.
|
||||
*
|
||||
* @param acceptLanguageList
|
||||
* @return this, for chaining.
|
||||
*/
|
||||
public LanguagePriorityListBuilder add(final String acceptLanguageList) {
|
||||
final String[] items = languageSplitter.split(acceptLanguageList.trim());
|
||||
final Matcher itemMatcher = weightSplitter.matcher("");
|
||||
for (final String item : items) {
|
||||
if (itemMatcher.reset(item).matches()) {
|
||||
final ULocale language = new ULocale(itemMatcher.group(1));
|
||||
final double weight = Double.parseDouble(itemMatcher.group(2));
|
||||
if (!(weight >= D0 && weight <= D1)) { // do ! for NaN
|
||||
throw new IllegalArgumentException("Illegal weight, must be 0..1: "
|
||||
+ weight);
|
||||
}
|
||||
add(language, weight);
|
||||
} else if (item.length() != 0) {
|
||||
add(new ULocale(item));
|
||||
}
|
||||
}
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
||||
private static Comparator<Double> myDescendingDouble = new Comparator<Double>() {
|
||||
public int compare(Double o1, Double o2) {
|
||||
return -o1.compareTo(o2);
|
||||
}
|
||||
};
|
||||
}
|
|
@ -0,0 +1,70 @@
|
|||
/*
|
||||
****************************************************************************************
|
||||
* Copyright (C) 2009, Google, Inc.; International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
****************************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.icu.dev.test.util;
|
||||
|
||||
import com.ibm.icu.dev.test.TestFmwk;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
import com.ibm.icu.util.LocaleMatcher;
|
||||
import com.ibm.icu.util.LocalePriorityList;
|
||||
|
||||
/**
|
||||
* Test the LanguageMatcher.
|
||||
*
|
||||
* @author markdavis
|
||||
*/
|
||||
public class LocaleMatcherTest extends TestFmwk {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
new LocaleMatcherTest().run(args);
|
||||
}
|
||||
|
||||
public void testBasics() {
|
||||
final LocaleMatcher matcher = new LocaleMatcher(LocalePriorityList
|
||||
.add(ULocale.FRENCH).add(ULocale.UK)
|
||||
.add(ULocale.ENGLISH).build());
|
||||
logln(matcher.toString());
|
||||
|
||||
assertEquals(ULocale.UK, matcher.getBestMatch(ULocale.UK));
|
||||
assertEquals(ULocale.ENGLISH, matcher.getBestMatch(ULocale.US));
|
||||
assertEquals(ULocale.FRENCH, matcher.getBestMatch(ULocale.FRANCE));
|
||||
assertEquals(ULocale.FRENCH, matcher.getBestMatch(ULocale.JAPAN));
|
||||
}
|
||||
|
||||
public void testFallback() {
|
||||
// check that script fallbacks are handled right
|
||||
final LocaleMatcher matcher = new LocaleMatcher("zh_CN, zh_TW, iw");
|
||||
assertEquals(new ULocale("zh_TW"), matcher.getBestMatch("zh_Hant"));
|
||||
assertEquals(new ULocale("zh_CN"), matcher.getBestMatch("zh"));
|
||||
assertEquals(new ULocale("zh_CN"), matcher.getBestMatch("zh_Hans_CN"));
|
||||
assertEquals(new ULocale("zh_TW"), matcher.getBestMatch("zh_Hant_HK"));
|
||||
assertEquals(new ULocale("he"), matcher.getBestMatch("iw_IT"));
|
||||
}
|
||||
|
||||
public void testSpecials() {
|
||||
// check that nearby languages are handled
|
||||
final LocaleMatcher matcher = new LocaleMatcher("en, fil, ro, nn");
|
||||
assertEquals(new ULocale("fil"), matcher.getBestMatch("tl"));
|
||||
assertEquals(new ULocale("ro"), matcher.getBestMatch("mo"));
|
||||
assertEquals(new ULocale("nn"), matcher.getBestMatch("nb"));
|
||||
// make sure default works
|
||||
assertEquals(new ULocale("en"), matcher.getBestMatch("ja"));
|
||||
}
|
||||
|
||||
public void testRegionalSpecials() {
|
||||
// verify that en_AU is closer to en_GB than to en (which is en_US)
|
||||
final LocaleMatcher matcher = new LocaleMatcher("en, en_GB, es, es_419");
|
||||
assertEquals("en_AU in {en, en_GB, es, es_419}", new ULocale("en_GB"), matcher.getBestMatch("en_AU"));
|
||||
assertEquals("es_MX in {en, en_GB, es, es_419}", new ULocale("es_419"), matcher.getBestMatch("es_MX"));
|
||||
assertEquals("es_ES in {en, en_GB, es, es_419}", new ULocale("es"), matcher.getBestMatch("es_ES"));
|
||||
}
|
||||
|
||||
private void assertEquals(Object expected, Object string) {
|
||||
assertEquals("", expected, string);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,56 @@
|
|||
/*
|
||||
****************************************************************************************
|
||||
* Copyright (C) 2009, Google, Inc.; International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
****************************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.icu.dev.test.util;
|
||||
|
||||
import com.ibm.icu.dev.test.TestFmwk;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
import com.ibm.icu.util.LocalePriorityList;
|
||||
|
||||
/**
|
||||
* Test the LanguagePriorityList
|
||||
* @author markdavis@google.com
|
||||
*/
|
||||
public class LocalePriorityListTest extends TestFmwk {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
new LocalePriorityListTest().run(args);
|
||||
}
|
||||
|
||||
public void testLanguagePriorityList() {
|
||||
final String expected = "af, en, fr";
|
||||
|
||||
LocalePriorityList list = LocalePriorityList.add("af, en, fr;q=0.9").build();
|
||||
assertEquals(expected, list.toString());
|
||||
|
||||
// check looseness, and that later values win
|
||||
LocalePriorityList list2 = LocalePriorityList.add(
|
||||
", fr ; q = 0.9 , en;q=0.1 , af, en, de;q=0, ").build();
|
||||
assertEquals(expected, list2.toString());
|
||||
assertEquals(list, list2);
|
||||
|
||||
LocalePriorityList list3 = LocalePriorityList
|
||||
.add(new ULocale("af"))
|
||||
.add(ULocale.FRENCH, 0.9d)
|
||||
.add(ULocale.ENGLISH)
|
||||
.build();
|
||||
assertEquals(expected, list3.toString());
|
||||
assertEquals(list, list3);
|
||||
|
||||
LocalePriorityList list4 = LocalePriorityList
|
||||
.add(list).build();
|
||||
assertEquals(expected, list4.toString());
|
||||
assertEquals(list, list4);
|
||||
|
||||
LocalePriorityList list5 = LocalePriorityList.add("af, fr;q=0.9, en").build(true);
|
||||
assertEquals("af, en, fr;q=0.9", list5.toString());
|
||||
}
|
||||
|
||||
private void assertEquals(Object expected, Object string) {
|
||||
assertEquals("", expected, string);
|
||||
}
|
||||
}
|
|
@ -31,6 +31,8 @@ public class TestAll extends TestGroup {
|
|||
"LocaleAliasTest",
|
||||
"DebugUtilitiesTest",
|
||||
"LocaleBuilderTest",
|
||||
"LocaleMatcherTest",
|
||||
"LocalePriorityListTest",
|
||||
},
|
||||
"Test miscellaneous public utilities");
|
||||
}
|
||||
|
|
|
@ -32,15 +32,15 @@ import java.util.regex.Matcher;
|
|||
import java.util.regex.Pattern;
|
||||
|
||||
import com.ibm.icu.dev.test.util.BagFormatter;
|
||||
import com.ibm.icu.dev.test.util.Row;
|
||||
import com.ibm.icu.dev.test.util.TransliteratorUtilities;
|
||||
import com.ibm.icu.dev.test.util.UnicodeMap;
|
||||
import com.ibm.icu.dev.test.util.UnicodeMapIterator;
|
||||
import com.ibm.icu.dev.test.util.Row.R2;
|
||||
import com.ibm.icu.dev.test.util.Tabber.HTMLTabber;
|
||||
import com.ibm.icu.dev.test.util.UnicodeMap.Composer;
|
||||
import com.ibm.icu.dev.test.util.XEquivalenceClass.SetMaker;
|
||||
import com.ibm.icu.impl.Row;
|
||||
import com.ibm.icu.impl.Utility;
|
||||
import com.ibm.icu.impl.Row.R2;
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.lang.UScript;
|
||||
import com.ibm.icu.text.Collator;
|
||||
|
|
Loading…
Add table
Reference in a new issue