ICU-7125 Add API for language matching for data that will be coming from CLDR.

X-SVN-Rev: 26589
2025-04-14 17:24:01 +00:00 · 2009-09-02 23:13:26 +00:00 · 2009-09-02 23:13:26 +00:00 · 42d4b38e15
commit 42d4b38e15
parent beeba02f88
8 changed files with 1020 additions and 9 deletions
--- a/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/util/Row.java
+++ b/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/util/Row.java
@ -6,9 +6,8 @@
 * Author: Mark Davis
 **********************************************************************
 */
-package com.ibm.icu.dev.test.util;
+package com.ibm.icu.impl;

-import com.ibm.icu.impl.Utility;
 import com.ibm.icu.util.Freezable;


--- a/icu4j/main/classes/core/src/com/ibm/icu/util/LocaleData.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/util/LocaleData.java
@ -1,9 +1,9 @@
 /*
- *******************************************************************************
- * Copyright (C) 2004-2009, International Business Machines Corporation and    *
- * others. All Rights Reserved.                                                *
- *******************************************************************************
-*/
+ ****************************************************************************************
+ * Copyright (C) 2009, Google, Inc.; International Business Machines Corporation and    *
+ * others. All Rights Reserved.                                                         *
+ ****************************************************************************************
+ */
 package com.ibm.icu.util;

 import java.util.MissingResourceException;
--- a/icu4j/main/classes/core/src/com/ibm/icu/util/LocaleMatcher.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/util/LocaleMatcher.java
@ -0,0 +1,586 @@
+/*
+ ****************************************************************************************
+ * Copyright (C) 2009, Google, Inc.; International Business Machines Corporation and    *
+ * others. All Rights Reserved.                                                         *
+ ****************************************************************************************
+ */
+package com.ibm.icu.util;
+
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.LinkedHashSet;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import com.ibm.icu.impl.Row;
+import com.ibm.icu.impl.Row.R2;
+import com.ibm.icu.impl.Row.R3;
+import com.ibm.icu.util.ULocale;
+
+/**
+ * Provides a way to match the languages (locales) supported by a product to the
+ * languages (locales) acceptable to a user, and get the best match. For
+ * example:
+ * 
+ * <pre>
+ * LocaleMatcher matcher = new LocaleMatcher(&quot;fr, en-GB, en&quot;);
+ * 
+ * // afterwards:
+ * matcher.getBestMatch(ULocale.US).equals(ULocale.ENGLISH)
+ * </pre>
+ * 
+ * It takes into account when languages are close to one another, such as fil
+ * and tl, and when language regional variants are close, like en-GB and en-AU.
+ * It also handles scripts, like zh-Hant vs zh-TW. For examples, see the test
+ * file.
+ * <p>All classes implementing this interface should be immutable. Often a
+ * product will just need one static instance, built with the languages
+ * that it supports. However, it may want multiple instances with different
+ * default languages based on additional information, such as the domain.
+ * 
+ * @author markdavis@google.com
+ * @draft ICU 4.4
+ */
+public class LocaleMatcher {
+    /** When true, the matching code prints diagnostic traces to System.out. */
+    private static final boolean DEBUG = false;
+
+
+    /**
+     * Threshold for falling back to the default (first) language: a best
+     * weight below this value is replaced by the default. May make this
+     * a parameter in the future.
+     */
+    private static final double DEFAULT_THRESHOLD = 0.5;
+
+    /**
+     * The default language, in case the threshold is not met. Null when the
+     * matcher was built from an empty priority list.
+     */
+    private final ULocale defaultLanguage;
+
+    /**
+     * Create a new language matcher that uses the built-in matching data.
+     * The highest-weighted language is the default. That means that if no other
+     * language matches closer than a given threshold, that default language is
+     * chosen. Typically the default is English, but it could be different based
+     * on additional information, such as the domain of the page.
+     * 
+     * @param languagePriorityList weighted list
+     */
+    public LocaleMatcher(LocalePriorityList languagePriorityList) {
+        this(languagePriorityList, defaultWritten);
+    }
+
+    /**
+     * Create a new language matcher from a String form. The string is parsed
+     * into a LocalePriorityList and handed to
+     * {@link #LocaleMatcher(LocalePriorityList)}. The highest-weighted
+     * language is the default.
+     * 
+     * @param languagePriorityListString String form of LocalePriorityList
+     */
+    public LocaleMatcher(String languagePriorityListString) {
+        this(LocalePriorityList.add(languagePriorityListString).build());
+    }
+
+    /**
+     * Create a new language matcher from a priority list and explicit matching
+     * data. Every language in the list is registered together with its weight,
+     * and the first language returned by the list's iterator becomes the
+     * default (null if the list is empty).
+     *
+     * @internal
+     * @param languagePriorityList weighted list of languages to register
+     * @param matcherData distance data consulted when two locales are compared
+     */
+    public LocaleMatcher(LocalePriorityList languagePriorityList, LanguageMatcherData matcherData) {
+        this.matcherData = matcherData;
+        final Iterator<ULocale> head = languagePriorityList.iterator();
+        this.defaultLanguage = head.hasNext() ? head.next() : null;
+        for (final ULocale supported : languagePriorityList) {
+            final Double weight = languagePriorityList.getWeight(supported);
+            add(supported, weight);
+        }
+    }
+
+
+    /**
+     * Returns a fraction between 0 and 1, where 1 means that the languages are a
+     * perfect match, and 0 means that they are completely different. Note that
+     * the precise values may change over time; no code should be made dependent
+     * on the values remaining constant. The computation is delegated to this
+     * matcher's LanguageMatcherData.
+     * @param a the first (desired) locale
+     * @param aMax the maximized form of a (internally produced by adding likely subtags)
+     * @param b the second (supported) locale
+     * @param bMax the maximized form of b
+     * @return value between 0 and 1, inclusive.
+     */
+    public double match(ULocale a, ULocale aMax, ULocale b, ULocale bMax) {
+        return matcherData.match(a, aMax, b, bMax);
+    }
+
+
+    /**
+     * Canonicalize a locale (language) by remapping its language, script and
+     * region subtags through the canonical map. Note that for now, it is
+     * canonicalizing according to CLDR conventions (he vs iw, etc), since that
+     * is what is needed for likelySubtags.
+     * TODO Get the data from CLDR, use Java conventions.
+     * @param languageCode the locale to canonicalize
+     * @return the same instance if no subtag has a replacement; otherwise a new
+     *         ULocale built from the (possibly remapped) language, script and region.
+     */
+    public ULocale canonicalize(ULocale languageCode) {
+        final String language = languageCode.getLanguage();
+        final String script = languageCode.getScript();
+        final String region = languageCode.getCountry();
+
+        final String mappedLanguage = canonicalMap.get(language);
+        final String mappedScript = canonicalMap.get(script);
+        final String mappedRegion = canonicalMap.get(region);
+
+        // Nothing to remap: hand back the caller's instance untouched.
+        if (mappedLanguage == null && mappedScript == null && mappedRegion == null) {
+            return languageCode;
+        }
+        return new ULocale(
+                mappedLanguage != null ? mappedLanguage : language,
+                mappedScript != null ? mappedScript : script,
+                mappedRegion != null ? mappedRegion : region);
+    }
+
+    /**
+     * Get the best match for a LocalePriorityList. Each desired language is
+     * matched individually and its match score is multiplied by the weight it
+     * has in the list; the supported language with the highest product wins.
+     * If that product is below the threshold, the default language is returned.
+     * 
+     * @param languageList weighted list of desired languages
+     * @return best matching language code
+     */
+    public ULocale getBestMatch(LocalePriorityList languageList) {
+        ULocale winner = null;
+        double winningScore = 0;
+        for (final ULocale desired : languageList) {
+            final Row.R2<ULocale, Double> candidate = getBestMatchInternal(desired);
+            final double score = candidate.get1() * languageList.getWeight(desired);
+            if (score > winningScore) {
+                winner = candidate.get0();
+                winningScore = score;
+            }
+        }
+        return winningScore < DEFAULT_THRESHOLD ? defaultLanguage : winner;
+    }
+
+    /**
+     * Get the best match for the String form of a LocalePriorityList.
+     * 
+     * @param languageList String form of the desired languages
+     * @return best matching language code
+     */
+    public ULocale getBestMatch(String languageList) {
+        final LocalePriorityList desired = LocalePriorityList.add(languageList).build();
+        return getBestMatch(desired);
+    }
+
+    /**
+     * Get the best match for an individual language code.
+     * 
+     * @param languageCode the desired language
+     * @return best matching language code
+     */
+    public ULocale getBestMatch(ULocale languageCode) {
+        final Row.R2<ULocale, Double> best = getBestMatchInternal(languageCode);
+        return best.get0();
+    }
+
+    /**
+     * Renders the default language followed by the table of supported languages.
+     */
+    @Override
+    public String toString() {
+        final StringBuilder text = new StringBuilder("{");
+        text.append(defaultLanguage);
+        text.append(", ");
+        text.append(maximizedLanguageToWeight);
+        text.append("}");
+        return text.toString();
+    }
+    // ================= Privates =====================
+
+    /**
+     * Get the best match for an individual language code. The code is
+     * canonicalized and maximized, then compared against every supported
+     * language; each match value is multiplied by the supported language's
+     * weight and the highest product wins. If that product is below the
+     * threshold, the default language is returned instead (the weight
+     * reported is still the best product found).
+     * 
+     * @param languageCode the desired language
+     * @return best matching language code and weight (as per
+     *         {@link #match(ULocale, ULocale, ULocale, ULocale)})
+     */
+    private Row.R2<ULocale, Double> getBestMatchInternal(ULocale languageCode) {
+        languageCode = canonicalize(languageCode);
+        final ULocale maximized = addLikelySubtags(languageCode);
+        if (DEBUG) {
+            System.out.println("\n" + languageCode + ";\t" + maximized);
+        }
+        double bestWeight = 0;
+        ULocale bestTableMatch = null;
+        // Iterate entries directly instead of keySet() + get() to avoid a second lookup per key.
+        for (final Map.Entry<ULocale, Row.R2<ULocale, Double>> entry : maximizedLanguageToWeight.entrySet()) {
+            final ULocale tableKey = entry.getKey();
+            final R2<ULocale, Double> row = entry.getValue(); // (maximized supported locale, weight)
+            final double match = match(languageCode, maximized, tableKey, row.get0());
+            if (DEBUG) {
+                System.out.println("\t" + tableKey + ";\t" + row.toString() + ";\t" + match);
+            }
+            final double weight = match * row.get1();
+            if (weight > bestWeight) {
+                bestWeight = weight;
+                bestTableMatch = tableKey;
+            }
+        }
+        if (bestWeight < DEFAULT_THRESHOLD) {
+            bestTableMatch = defaultLanguage;
+        }
+        return Row.R2.of(bestTableMatch, bestWeight);
+    }
+
+    /**
+     * Registers a supported language: the canonicalized locale becomes the table
+     * key, and its maximized form is stored with the weight.
+     */
+    private void add(ULocale language, Double weight) {
+        final ULocale canonical = canonicalize(language);
+        final ULocale maximized = addLikelySubtags(canonical);
+        maximizedLanguageToWeight.put(canonical, Row.of(maximized, weight));
+    }
+
+    /**
+     * Supported languages in insertion order. The key is the canonicalized
+     * locale; the value holds its maximized form and its weight.
+     */
+    Map<ULocale,Row.R2<ULocale, Double>> maximizedLanguageToWeight = new LinkedHashMap<ULocale, R2<ULocale, Double>>();
+
+
+    // =============== Special Mapping Information ==============
+
+    /**
+     * Wrapper around {@code ULocale.addLikelySubtags} that always yields a
+     * locale with language, script and region filled in. When the ICU call
+     * returns null or leaves the locale unchanged, the missing fields are
+     * substituted with "und", "Zzzz" and "ZZ" respectively.
+     * We need to add another method to addLikelySubtags that does this
+     * directly. There are also a few cases where addLikelySubtags needs to
+     * have expanded data, to handle all deprecated codes, and to update to
+     * CLDR 1.6.
+     * @param languageCode the locale to maximize
+     * @return "fixed" addLikelySubtags
+     */
+    // TODO(markdavis): update the above when CLDR 1.6 is final.
+    private ULocale addLikelySubtags(ULocale languageCode) {
+        final ULocale likely = ULocale.addLikelySubtags(languageCode);
+        // should have method on getLikelySubtags for this
+        if (likely != null && !likely.equals(languageCode)) {
+            return likely;
+        }
+        final String language = languageCode.getLanguage();
+        final String script = languageCode.getScript();
+        final String region = languageCode.getCountry();
+        final String languagePart = language.length() == 0 ? "und" : language;
+        final String scriptPart = script.length() == 0 ? "Zzzz" : script;
+        final String regionPart = region.length() == 0 ? "ZZ" : region;
+        return new ULocale(languagePart + "_" + scriptPart + "_" + regionPart);
+    }
+
+    /**
+     * Matches locales against a pattern of the form {@code lang},
+     * {@code lang-script} or {@code lang-script-region}, where any field may be
+     * the wildcard {@code *}. Instances are immutable.
+     */
+    private static class LocalePatternMatcher {
+        /** Syntax of a pattern: language, optional script, optional region; each may be "*". */
+        private static final Pattern pattern = Pattern.compile(
+                "([a-zA-Z]{1,8}|\\*)" +
+                "(?:-([a-zA-Z]{4}|\\*))?" +
+        "(?:-([a-zA-Z]{2}|[0-9]{3}|\\*))?");
+
+        // a value of null means a wildcard; matches any.
+        private final String lang;
+        private final String script;
+        private final String region;
+        /** The deepest field that was present in the pattern text (wildcards count as present). */
+        private final Level level;
+
+        /**
+         * Parses a pattern.
+         *
+         * @param toMatch pattern text such as "en", "*-Hans" or "en-*-US"
+         * @throws IllegalArgumentException if the text does not fit the pattern syntax
+         */
+        public LocalePatternMatcher(String toMatch) {
+            Matcher matcher = pattern.matcher(toMatch);
+            if (!matcher.matches()) {
+                throw new IllegalArgumentException("Bad pattern: " + toMatch);
+            }
+            final String rawLang = matcher.group(1);
+            final String rawScript = matcher.group(2);
+            final String rawRegion = matcher.group(3);
+            // The level is decided by which groups are present, before wildcards are turned into null.
+            level = rawRegion != null ? Level.region : rawScript != null ? Level.script : Level.language;
+            lang = wildcardToNull(rawLang);
+            script = wildcardToNull(rawScript);
+            region = wildcardToNull(rawRegion);
+        }
+
+        /** Maps the wildcard "*" to null; any other value (including null) is returned as is. */
+        private static String wildcardToNull(String subtag) {
+            return "*".equals(subtag) ? null : subtag;
+        }
+
+        /**
+         * @param ulocale locale to test
+         * @return true if every non-wildcard field of this pattern equals the
+         *         corresponding field of the locale
+         */
+        boolean matches(ULocale ulocale) {
+            if (lang != null && !lang.equals(ulocale.getLanguage())) {
+                return false;
+            }
+            if (script != null && !script.equals(ulocale.getScript())) {
+                return false;
+            }
+            if (region != null && !region.equals(ulocale.getCountry())) {
+                return false;
+            }
+            return true;
+        }
+
+        public Level getLevel() {
+            return level;
+        }
+
+        /** @return the language field, or "*" for a wildcard */
+        public String getLanguage() {
+            return (lang == null ? "*" : lang);
+        }
+
+        /** @return the script field, or "*" for a wildcard or absent field */
+        public String getScript() {
+            return (script == null ? "*" : script);
+        }
+
+        /** @return the region field, or "*" for a wildcard or absent field */
+        public String getRegion() {
+            return (region == null ? "*" : region);
+        }
+
+        /** Rebuilds the pattern text down to this pattern's level. */
+        @Override
+        public String toString() {
+            String result = getLanguage();
+            if (level != Level.language) {
+                result += "-" + getScript();
+                if (level != Level.script) {
+                    result += "-" + getRegion();
+                }
+            }
+            return result;
+        }
+    }
+
+    /** The locale field a pattern or score table applies to. */
+    enum Level {language, script, region}
+
+    /**
+     * Holds the distance rules for one level (language, script or region) and
+     * computes the distance between a desired and a supported locale at that level.
+     */
+    private static class ScoreData {
+        // Rules as (desired pattern, supported pattern, distance), searched in insertion order.
+        final Set<Row.R3<LocalePatternMatcher,LocalePatternMatcher,Double>> scores = new LinkedHashSet<R3<LocalePatternMatcher, LocalePatternMatcher, Double>>();
+        // Fallback distance used when no rule matches, and as the base for the "maxes equal" penalty.
+        final double worst;
+        final Level level;
+
+        public ScoreData(Level level) {
+            this.level = level;
+            // NOTE(review): as written this evaluates to (1-90)/100.0 = -0.89 for language,
+            // -0.19 for script and -0.03 for region, i.e. negative values.
+            // Confirm whether 1 - x/100.0 (or x/100.0) was intended.
+            this.worst = (1-(level == Level.language ? 90 : level == Level.script ? 20 : 4))/100.0;
+        }
+
+        /**
+         * Appends a rule. The desired and supported strings are currently
+         * unused; only the data row is stored.
+         */
+        void addDataToScores(String desired, String supported, R3<LocalePatternMatcher,LocalePatternMatcher,Double> data) {
+            //            Map<String, Set<R3<LocalePatternMatcher,LocalePatternMatcher,Double>>> lang_result = scores.get(desired);
+            //            if (lang_result == null) {
+            //                scores.put(desired, lang_result = new HashMap());
+            //            }
+            //            Set<R3<LocalePatternMatcher,LocalePatternMatcher,Double>> result = lang_result.get(supported);
+            //            if (result == null) {
+            //                lang_result.put(supported, result = new LinkedHashSet());
+            //            }
+            //            result.add(data);
+            scores.add(data);
+        }
+
+        /**
+         * Computes the distance at this level. The raw/max strings are this
+         * level's field of the original and maximized locales; dMax and sMax are
+         * the maximized locales used for the rule lookup. desiredLocale and
+         * supportedLocale are not read here.
+         */
+        double getScore(ULocale desiredLocale, ULocale dMax, String desiredRaw, String desiredMax, 
+                ULocale supportedLocale, ULocale sMax, String supportedRaw, String supportedMax) {
+
+            /*
+             * d, dm, s, sm
+             * dc = d != dm
+             * sc = s != sm
+             * if dm != sm
+             *   rd = rd(dm,sm) // line 4
+             *   if dc != sc
+             *     rd *= 0.75 // lines 3,8
+             *   ef dc
+             *     rd *= 0.5 // lines 7
+             *   end
+             *  ef dc == sc
+             *   rd = 0 // line 6
+             *  else
+             *   rd = 0.25*StdRDiff // lines 2,5
+             */
+
+            // NOTE(review): these flags are true when raw EQUALS max, i.e. when the field was
+            // NOT changed by maximization -- the opposite of "dc = d != dm" in the pseudocode
+            // above. The 0.75 branch below also tests == where the pseudocode says !=.
+            // Confirm which of the two is intended before changing either.
+            boolean desiredChange = desiredRaw.equals(desiredMax);
+            boolean supportedChange = supportedRaw.equals(supportedMax);
+            double distance;
+            if (!desiredMax.equals(supportedMax)) {
+                //                Map<String, Set<R3<LocalePatternMatcher,LocalePatternMatcher,Double>>> lang_result = scores.get(desiredMax);
+                //                if (lang_result == null) {
+                //                    distance = worst;
+                //                } else {
+                //                    Set<R3<LocalePatternMatcher,LocalePatternMatcher,Double>> result = lang_result.get(supportedMax);
+                //                    skip:
+                //                    if (result == null) {
+                //                        distance = worst;
+                //                    } else {
+                // Maximized fields differ: start from the first matching rule (or worst).
+                distance = getRawScore(dMax, sMax);
+                //                }
+                if (desiredChange == supportedChange) {
+                    distance *= 0.75;
+                } else if (desiredChange) {
+                    distance *= 0.5;
+                }
+            } else if (desiredChange == supportedChange) { // maxes are equal, changes are equal
+                distance = 0;
+            } else { // maxes are equal, changes are different
+                distance = 0.25*worst;
+            }
+            return distance;
+        }
+
+        /**
+         * Returns the distance of the first rule whose desired pattern matches
+         * desiredLocale and whose supported pattern matches supportedLocale, or
+         * worst if no rule matches.
+         */
+        private double getRawScore(ULocale desiredLocale, ULocale supportedLocale) {
+            if (DEBUG) {
+                System.out.println("\t\t\tRaw Score:\t" + desiredLocale + ";\t" + supportedLocale);
+            }
+            for (R3<LocalePatternMatcher,LocalePatternMatcher,Double> datum : scores) { // : result
+                if (datum.get0().matches(desiredLocale) 
+                        && datum.get1().matches(supportedLocale)) {
+                    if (DEBUG) {
+                        System.out.println("\t\t\tFOUND\t" + datum);
+                    }
+                    return datum.get2();
+                }
+            }
+            if (DEBUG) {
+                System.out.println("\t\t\tNOTFOUND\t" + worst);
+            }
+            return worst;
+        }
+
+        // Level followed by the rule set.
+        public String toString() {
+            return level + ", " + scores;
+        }
+    }
+
+    /**
+     * Only for testing and use by tools. Interface may change!!
+     * @internal
+     */
+    public static class LanguageMatcherData {
+        // One rule table per locale field; filled through Builder.addDistance.
+        ScoreData languageScores = new ScoreData(Level.language);
+        ScoreData scriptScores = new ScoreData(Level.script);
+        ScoreData regionScores = new ScoreData(Level.region);
+
+        /**
+         * Starts adding rules to this instance.
+         * @return a new builder bound to this LanguageMatcherData; its build()
+         *         returns this same instance
+         */
+        public Builder start() {
+            return new Builder();
+        }
+
+        /**
+         * Computes how closely two locales match by summing the language, script
+         * and region distances, adding a full unit when the variants differ, and
+         * pinning the total to the range 0..1.
+         *
+         * @param a the first locale
+         * @param aMax the maximized form of a
+         * @param b the second locale
+         * @param bMax the maximized form of b
+         * @return 1 minus the pinned total distance
+         */
+        public double match(ULocale a, ULocale aMax, ULocale b, ULocale bMax) {
+            final double languageDistance = languageScores.getScore(
+                    a, aMax, a.getLanguage(), aMax.getLanguage(),
+                    b, bMax, b.getLanguage(), bMax.getLanguage());
+            final double scriptDistance = scriptScores.getScore(
+                    a, aMax, a.getScript(), aMax.getScript(),
+                    b, bMax, b.getScript(), bMax.getScript());
+            final double regionDistance = regionScores.getScore(
+                    a, aMax, a.getCountry(), aMax.getCountry(),
+                    b, bMax, b.getCountry(), bMax.getCountry());
+
+            double distance = languageDistance + scriptDistance + regionDistance;
+            final boolean sameVariant = a.getVariant().equals(b.getVariant());
+            if (!sameVariant) {
+                distance += 1;
+            }
+
+            // Pin to [0, 1] before converting the distance into a match value.
+            if (distance < 0.0d) {
+                return 1.0;
+            }
+            if (distance > 1.0d) {
+                return 0.0;
+            }
+            return 1.0 - distance;
+        }
+
+        /**
+         * Adds distance rules to the enclosing LanguageMatcherData.
+         */
+        class Builder {
+            /**
+             * @return the enclosing LanguageMatcherData that this builder has been filling
+             */
+            public LanguageMatcherData build() {
+                return LanguageMatcherData.this;
+            }
+
+            /**
+             * Adds a two-way rule without a comment.
+             * See {@link #addDistance(String, String, int, boolean, String)}.
+             */
+            private Builder addDistance(String desired, String supported, int percent) {
+                return addDistance(desired, supported, percent, false, null);
+            }
+
+            /**
+             * Adds a two-way rule with a comment.
+             * See {@link #addDistance(String, String, int, boolean, String)}.
+             */
+            private Builder addDistance(String desired, String supported, int percent, String comment) {
+                return addDistance(desired, supported, percent, false, comment);
+            }
+
+            /**
+             * Adds a rule without a comment.
+             * See {@link #addDistance(String, String, int, boolean, String)}.
+             */
+            private Builder addDistance(String desired, String supported, int percent, boolean oneway) {
+                return addDistance(desired, supported, percent, oneway, null);
+            }
+
+            /**
+             * Add an exceptional distance between languages, typically because regional
+             * dialects were given their own language codes. We don't bother producing an
+             * equivalence class because there are so few cases; this function depends on
+             * the other permutations being added specifically.
+             *
+             * @param desired pattern for the desired locale, e.g. "nn", "*-Hans", "en-*-US"
+             * @param supported pattern for the supported locale; must have the same
+             *        number of fields as desired
+             * @param percent closeness in percent; stored as the distance 1 - percent/100.0
+             * @param oneway if true only desired-to-supported is registered; if false
+             *        the reverse direction is registered too, with the same distance
+             * @param comment free text, only used in the DEBUG dump; may be null
+             * @return this builder
+             * @throws IllegalArgumentException if a pattern is malformed or the two
+             *         patterns are at different levels
+             */
+            private Builder addDistance(String desired, String supported, int percent, boolean oneway, String comment) {
+                if (DEBUG) {
+                    // Flip the constant to dump the rule as XML instead of as Java source.
+                    if (false)
+                        System.out.println("\t<distance desired=\"" + desired + "\"" +
+                                " supported=\"" + supported + "\"" +
+                                " percent=\"" + percent + "\""
+                                // Emit the flag only for one-way rules (was inverted).
+                                + (oneway ? " oneway=\"true\"" : "")
+                                + "/>"
+                                + (comment == null ? "" : "\t<!-- " + comment + " -->"));
+                    else //     .addDistance("nn", "nb", 4, true)
+                        System.out.println(".addDistance(\"" + desired + "\"" +
+                                ", \"" + supported + "\"" +
+                                ", " + percent + ""
+                                // Emit the flag only for one-way rules (was inverted).
+                                + (oneway ? ", true" : "")
+                                + (comment == null ? "" : ", \"" + comment + "\"")
+                                + ")"
+                        );
+
+                }
+                double distance = 1-percent/100.0; // convert from percentage
+                LocalePatternMatcher desiredMatcher = new LocalePatternMatcher(desired);
+                Level desiredLen = desiredMatcher.getLevel();
+                LocalePatternMatcher supportedMatcher = new LocalePatternMatcher(supported);
+                Level supportedLen = supportedMatcher.getLevel();
+                if (desiredLen != supportedLen) {
+                    throw new IllegalArgumentException("Patterns are at different levels: "
+                            + desired + " vs " + supported);
+                }
+                R3<LocalePatternMatcher,LocalePatternMatcher,Double> data = Row.of(desiredMatcher, supportedMatcher, distance);
+                // Reverse-direction rule; only created (and only used) for two-way rules.
+                R3<LocalePatternMatcher,LocalePatternMatcher,Double> data2 = oneway ? null : Row.of(supportedMatcher, desiredMatcher, distance);
+                switch (desiredLen) {
+                case language:
+                    String dlanguage = desiredMatcher.getLanguage();
+                    String slanguage = supportedMatcher.getLanguage();
+                    languageScores.addDataToScores(dlanguage, slanguage, data);
+                    if (!oneway) {
+                        languageScores.addDataToScores(slanguage, dlanguage, data2);
+                    }
+                    break;
+                case script:
+                    String dscript = desiredMatcher.getScript();
+                    String sscript = supportedMatcher.getScript();
+                    scriptScores.addDataToScores(dscript, sscript, data);
+                    if (!oneway) {
+                        scriptScores.addDataToScores(sscript, dscript, data2);
+                    }
+                    break;
+                case region:
+                    String dregion = desiredMatcher.getRegion();
+                    String sregion = supportedMatcher.getRegion();
+                    regionScores.addDataToScores(dregion, sregion, data);
+                    if (!oneway) {
+                        regionScores.addDataToScores(sregion, dregion, data2);
+                    }
+                    break;
+                }
+                return this;
+            }
+        }
+    }
+
+    /** Distance data used by {@link #match(ULocale, ULocale, ULocale, ULocale)}; set in the constructor. */
+    LanguageMatcherData matcherData;
+
+    /**
+     * Built-in matching data, used when no explicit LanguageMatcherData is
+     * supplied. Rules are searched in the order they are added, so the
+     * all-wildcard defaults must stay at the end of each level.
+     */
+    private static LanguageMatcherData defaultWritten = new LanguageMatcherData().start()
+    // TODO get data from CLDR
+    .addDistance("no", "nb", 100, "The language no is normally taken as nb in content; we might alias this for lookup.")
+    .addDistance("nn", "nb", 96)
+    .addDistance("nn", "no", 96)
+    .addDistance("da", "no", 90, "Danish and norwegian are reasonably close.")
+    .addDistance("da", "nb", 90)
+    // Bosnian is "bs"; "br" is Breton, which is not a Serbo-Croatian variant.
+    .addDistance("hr", "bs", 96, "Serbo-croatian variants are all very close.")
+    .addDistance("sh", "bs", 96)
+    .addDistance("sr", "bs", 96)
+    .addDistance("sh", "hr", 96)
+    .addDistance("sr", "hr", 96)
+    .addDistance("sh", "sr", 96)
+    .addDistance("sr-Latn", "sr-Cyrl", 90, "Most serbs can read either script.")
+    .addDistance("*-Hans", "*-Hant", 85, true, "Readers of simplified can read traditional much better than reverse.")
+    .addDistance("*-Hant", "*-Hans", 75, true)
+    .addDistance("en-*-US", "en-*-CA", 98, "US is different than others, and Canadian is inbetween.")
+    .addDistance("en-*-US", "en-*-*", 97)
+    .addDistance("en-*-CA", "en-*-*", 98)
+    .addDistance("en-*-*", "en-*-*", 99)
+    .addDistance("es-*-ES", "es-*-ES", 100, "Latin American Spanishes are closer to each other. Approximate by having es-ES be further from everything else.")
+    .addDistance("es-*-ES", "es-*-*", 93)
+    .addDistance("*", "*", 1, "[Default value -- must be at end!] Normally there is no comprehension of different languages.")
+    .addDistance("*-*", "*-*", 20, "[Default value -- must be at end!] Normally there is little comprehension of different scripts.")
+    .addDistance("*-*-*", "*-*-*", 96, "[Default value -- must be at end!] Normally there are small differences across regions.")
+    .build();
+
+    /**
+     * Subtag replacements applied by {@link #canonicalize(ULocale)}: maps a
+     * subtag to the form used for likely-subtags lookup (for example iw to he).
+     */
+    private static HashMap<String,String> canonicalMap = new HashMap<String, String>();
+
+    static {
+        // TODO get data from CLDR
+        // Diagnostic output only; a library class must not print when it is loaded.
+        if (DEBUG) {
+            System.out.println(ULocale.addLikelySubtags(new ULocale("iw")));
+            System.out.println(ULocale.addLikelySubtags(new ULocale("he")));
+        }
+        canonicalMap.put("iw", "he");
+        canonicalMap.put("mo", "ro");
+        canonicalMap.put("tl", "fil");
+    }
+}
--- a/icu4j/main/classes/core/src/com/ibm/icu/util/LocalePriorityList.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/util/LocalePriorityList.java
@ -0,0 +1,298 @@
+/*
+ ****************************************************************************************
+ * Copyright (C) 2009, Google, Inc.; International Business Machines Corporation and    *
+ * others. All Rights Reserved.                                                         *
+ ****************************************************************************************
+ */
+
+package com.ibm.icu.util;
+
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.LinkedHashSet;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeMap;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import com.ibm.icu.util.ULocale;
+
+/**
+ * Provides an immutable list of languages (locales) in priority order.
+ * The string format is based on the Accept-Language format 
+ * (<a href="http://www.ietf.org/rfc/rfc2616.txt">RFC 2616</a>), such as
+ * "af, en, fr;q=0.9". Syntactically it is slightly
+ * more lenient, in allowing extra whitespace between elements, extra commas,
+ * and more than 3 decimals (on input), and pins between 0 and 1.
+ * <p>In theory, Accept-Language indicates the relative 'quality' of each item,
+ * but in practice, all of the browsers just take an ordered list, like 
+ * "en, fr, de", and synthesize arbitrary quality values that put these in the
+ * right order, like: "en, fr;q=0.7, de;q=0.3". The quality values in these de facto
+ * semantics thus have <b>nothing</b> to do with the relative qualities of the
+ * original. Accept-Language also doesn't
+ * specify the interpretation of multiple instances, eg what "en, fr, en;q=.5"
+ * means.
+ * <p>There are various ways to build a LocalePriorityList, such
+ * as using the following equivalent patterns:
+ * 
+ * <pre>
+ * list = LocalePriorityList.add(&quot;af, en, fr;q=0.9&quot;).build();
+ * 
+ * list2 = LocalePriorityList
+ *  .add(new ULocale(&quot;af&quot;))
+ *  .add(ULocale.ENGLISH)
+ *  .add(ULocale.FRENCH, 0.9d)
+ *  .build();
+ * </pre>
+ * When the list is built, the internal values are sorted in descending order by
+ * weight, and then by input order. That is, if two languages have the same weight, the first one in the original order
+ * comes first. If exactly the same language tag appears multiple times,
+ * the last one wins. 
+ * 
+ * There are two options when building. If preserveWeights is on, then "de;q=0.3, ja;q=0.3, en, fr;q=0.7, de " would result in the following:
+ * <pre> en;q=1.0
+ * de;q=1.0
+ * fr;q=0.7
+ * ja;q=0.3</pre>
+ * If it is off (the default), then all weights are reset to 1.0 after reordering. 
+ * This is to match the effect of the Accept-Language semantics as used in browsers, and results in the following:
+ * <pre> en;q=1.0
+ * de;q=1.0
+ * fr;q=1.0
+ * ja;q=1.0</pre>
+ * @author markdavis@google.com
+ */
+public class LocalePriorityList implements Iterable<ULocale> {
+    private static final double D0 = 0.0d; // weights at or below this are dropped by the builder
+    private static final Double D1 = 1.0d; // upper bound for weights; also the default weight
+
+    private static final Pattern languageSplitter = Pattern.compile("\\s*,\\s*"); // separates list items at commas
+    private static final Pattern weightSplitter = Pattern
+    .compile("\\s*(\\S*)\\s*;\\s*q\\s*=\\s*(\\S*)"); // "lang;q=weight": group 1 = lang, group 2 = weight
+    private final Map<ULocale, Double> languagesAndWeights; // iteration order is the priority order
+
+    /**
+     * Starts a new builder that holds the given language with weight 1.0.
+     * @param languageCode language to add
+     * @return internal builder, for chaining
+     */
+    public static LanguagePriorityListBuilder add(final ULocale languageCode) {
+        final LanguagePriorityListBuilder builder = new LanguagePriorityListBuilder();
+        return builder.add(languageCode);
+    }
+
+    /**
+     * Starts a new builder that holds the given language with the given weight.
+     * 
+     * @param languageCode language to add
+     * @param weight value from 0.0 to 1.0
+     * @return internal builder, for chaining
+     */
+    public static LanguagePriorityListBuilder add(final ULocale languageCode, final double weight) {
+        final LanguagePriorityListBuilder builder = new LanguagePriorityListBuilder();
+        return builder.add(languageCode, weight);
+    }
+
+    /**
+     * Starts a new builder that holds every language of an existing list, with its weight there.
+     * 
+     * @param languagePriorityList list whose entries are added to the builder
+     * @return internal builder, for chaining
+     */
+    public static LanguagePriorityListBuilder add(final LocalePriorityList languagePriorityList) {
+        final LanguagePriorityListBuilder builder = new LanguagePriorityListBuilder();
+        return builder.add(languagePriorityList);
+    }
+
+    /**
+     * Starts a new builder from a string in RFC 2616 Accept-Language (lenient)
+     * format, where each language is a valid {@link ULocale}.
+     * 
+     * @param acceptLanguageString comma-separated languages, each optionally followed by ";q=weight"
+     * @return internal builder, for chaining
+     */
+    public static LanguagePriorityListBuilder add(final String acceptLanguageString) {
+        final LanguagePriorityListBuilder builder = new LanguagePriorityListBuilder();
+        return builder.add(acceptLanguageString);
+    }
+
+    /**
+     * Looks up the weight recorded for a language. Note that the weights may be
+     * adjusted from those used to build the list.
+     * @param language language to look up
+     * @return the weight, or null if the language is not in the list
+     */
+    public Double getWeight(final ULocale language) {
+        final Double weight = languagesAndWeights.get(language);
+        return weight;
+    }
+
+    /**
+     * Renders the entries in priority order, e.g. "af, en, fr;q=0.9"; a weight of 1.0 is not shown.
+     */
+    @Override
+    public String toString() {
+        final StringBuilder text = new StringBuilder();
+        for (final Map.Entry<ULocale, Double> entry : languagesAndWeights.entrySet()) {
+            text.append(text.length() == 0 ? "" : ", ").append(entry.getKey());
+            final double weight = entry.getValue();
+            if (weight != D1) {
+                text.append(";q=").append(weight);
+            }
+        }
+        return text.toString();
+    }
+
+    public Iterator<ULocale> iterator() {
+        return languagesAndWeights.keySet().iterator(); // visits the languages in the map's (priority) order
+    }
+
+    /** Equal only to another LocalePriorityList holding an equal language-to-weight map. */
+    @Override
+    public boolean equals(final Object o) {
+        if (!(o instanceof LocalePriorityList)) {
+            return false; // also covers null, with no exception used as control flow
+        }
+        final LocalePriorityList that = (LocalePriorityList) o;
+        return languagesAndWeights.equals(that.languagesAndWeights);
+    }
+
+    @Override
+    public int hashCode() {
+        return languagesAndWeights.hashCode(); // hash of the same map that equals() compares
+    }
+
+    // ==================== Privates ====================
+
+    /** Stores the given map as-is (no copy); its iteration order is the priority order. */
+    private LocalePriorityList(final Map<ULocale, Double> languageToWeight) {
+        this.languagesAndWeights = languageToWeight;
+    }
+
+    /**
+     * Internal class used for building LanguagePriorityLists
+     */
+    public static class LanguagePriorityListBuilder {
+        /**
+         * Input languages and their weights, in insertion order. Adding a language
+         * again removes its earlier entry, so later additions override previous ones.
+         */
+        private final Map<ULocale, Double> languageToWeight 
+        = new LinkedHashMap<ULocale, Double>();
+
+        public LocalePriorityList build() {
+            return build(false); // default: weights are not preserved, every entry ends up with 1.0
+        }
+
+        /** Builds the immutable list, sorted by descending weight and then by input order. */
+        public LocalePriorityList build(boolean preserveWeights) {
+            // Group the languages by weight; the comparator keeps the weights in descending order.
+            final Map<Double, Set<ULocale>> byWeight = new TreeMap<Double, Set<ULocale>>(
+                    myDescendingDouble);
+            for (final Map.Entry<ULocale, Double> entry : languageToWeight.entrySet()) {
+                Set<ULocale> sameWeight = byWeight.get(entry.getValue());
+                if (sameWeight == null) {
+                    sameWeight = new LinkedHashSet<ULocale>();
+                    byWeight.put(entry.getValue(), sameWeight);
+                }
+                sameWeight.add(entry.getKey());
+            }
+            // Flatten the groups in order; weights become 1.0 unless preserveWeights is set.
+            final Map<ULocale, Double> ordered = new LinkedHashMap<ULocale, Double>();
+            for (final Map.Entry<Double, Set<ULocale>> group : byWeight.entrySet()) {
+                for (final ULocale lang : group.getValue()) {
+                    ordered.put(lang, preserveWeights ? group.getKey() : D1);
+                }
+            }
+            return new LocalePriorityList(Collections.unmodifiableMap(ordered));
+        }
+
+        /** Adds every language of an existing list, each with the weight it has in that list. */
+        public LanguagePriorityListBuilder add(final LocalePriorityList languagePriorityList) {
+            for (final Map.Entry<ULocale, Double> entry
+                    : languagePriorityList.languagesAndWeights.entrySet()) {
+                add(entry.getKey(), entry.getValue());
+            }
+            return this;
+        }
+
+        /**
+         * Adds a single language code with weight = 1.0.
+         * @param languageCode language to add
+         * @return this, for chaining
+         */
+        public LanguagePriorityListBuilder add(final ULocale languageCode) {
+            add(languageCode, D1);
+            return this;
+        }
+
+        /**
+         * Adds each of the given language codes with weight = 1.0, in argument order.
+         * 
+         * @param languageCodes language codes to add
+         * @return this, for chaining.
+         */
+        public LanguagePriorityListBuilder add(final ULocale... languageCodes) {
+            for (int i = 0; i < languageCodes.length; ++i) {
+                add(languageCodes[i], D1);
+            }
+            return this;
+        }
+
+        /**
+         * Adds a language with the given weight, replacing any earlier entry for it and
+         * placing it last in the input order. A weight above 1.0 is pinned to 1.0; a
+         * weight that is zero, negative or NaN leaves the language out of the list.
+         * 
+         * @param languageCode language to add
+         * @param weight desired weight, nominally from 0.0 to 1.0
+         * @return this, for chaining.
+         */
+        public LanguagePriorityListBuilder add(final ULocale languageCode, double weight) {
+            // Remove first: putting an existing key back would keep its old position.
+            languageToWeight.remove(languageCode);
+            if (!(weight > D0)) { // negated so that NaN is skipped along with zeros
+                return this;
+            }
+            if (weight > D1) {
+                weight = D1;
+            }
+            languageToWeight.put(languageCode, weight);
+            return this;
+        }
+
+        /**
+         * Adds every entry of an rfc2616 list; a weight outside 0..1 raises IllegalArgumentException.
+         * @param acceptLanguageList comma-separated languages, each optionally followed by ";q=weight"
+         * @return this, for chaining.
+         */
+        public LanguagePriorityListBuilder add(final String acceptLanguageList) {
+            final Matcher weighted = weightSplitter.matcher("");
+            for (final String entry : languageSplitter.split(acceptLanguageList.trim())) {
+                if (!weighted.reset(entry).matches()) {
+                    if (entry.length() != 0) {
+                        add(new ULocale(entry));
+                    }
+                    continue;
+                }
+                final ULocale language = new ULocale(weighted.group(1));
+                final double weight = Double.parseDouble(weighted.group(2));
+                if (weight >= D0 && weight <= D1) {
+                    add(language, weight);
+                } else { // also taken for NaN
+                    throw new IllegalArgumentException("Illegal weight, must be 0..1: " + weight);
+                }
+            }
+            return this;
+        }
+    }
+
+    private static final Comparator<Double> myDescendingDouble = new Comparator<Double>() {
+        public int compare(Double o1, Double o2) {
+            return o2.compareTo(o1); // swapped operands: larger weights sort first
+        }
+    };
+}
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/LocaleMatcherTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/LocaleMatcherTest.java
@ -0,0 +1,70 @@
+/*
+ ****************************************************************************************
+ * Copyright (C) 2009, Google, Inc.; International Business Machines Corporation and    *
+ * others. All Rights Reserved.                                                         *
+ ****************************************************************************************
+ */
+
+package com.ibm.icu.dev.test.util;
+
+import com.ibm.icu.dev.test.TestFmwk;
+import com.ibm.icu.util.ULocale;
+import com.ibm.icu.util.LocaleMatcher;
+import com.ibm.icu.util.LocalePriorityList;
+
+/**
+ * Test the LocaleMatcher.
+ *
+ * @author markdavis
+ */
+public class LocaleMatcherTest extends TestFmwk {
+
+    public static void main(String[] args) throws Exception {
+        new LocaleMatcherTest().run(args);
+    }
+
+    public void testBasics() {
+        // supported locales, in the order they are added: FRENCH, UK, ENGLISH
+        final LocalePriorityList supported = LocalePriorityList
+            .add(ULocale.FRENCH).add(ULocale.UK).add(ULocale.ENGLISH).build();
+        final LocaleMatcher matcher = new LocaleMatcher(supported);
+        logln(matcher.toString());
+        assertEquals(ULocale.UK, matcher.getBestMatch(ULocale.UK));
+        assertEquals(ULocale.ENGLISH, matcher.getBestMatch(ULocale.US));
+        assertEquals(ULocale.FRENCH, matcher.getBestMatch(ULocale.FRANCE));
+        assertEquals(ULocale.FRENCH, matcher.getBestMatch(ULocale.JAPAN));
+    }
+
+    public void testFallback() {
+        // script fallbacks must be resolved correctly
+        final LocaleMatcher matcher = new LocaleMatcher("zh_CN, zh_TW, iw");
+        assertEquals(new ULocale("zh_TW"), matcher.getBestMatch("zh_Hant"));
+        assertEquals(new ULocale("zh_CN"), matcher.getBestMatch("zh"));
+        assertEquals(new ULocale("zh_CN"), matcher.getBestMatch("zh_Hans_CN"));
+        assertEquals(new ULocale("zh_TW"), matcher.getBestMatch("zh_Hant_HK"));
+        assertEquals(new ULocale("he"), matcher.getBestMatch("iw_IT"));
+    }
+
+    public void testSpecials() {
+        // nearby languages must be matched to each other
+        final LocaleMatcher matcher = new LocaleMatcher("en, fil, ro, nn");
+        assertEquals(new ULocale("fil"), matcher.getBestMatch("tl"));
+        assertEquals(new ULocale("ro"), matcher.getBestMatch("mo"));
+        assertEquals(new ULocale("nn"), matcher.getBestMatch("nb"));
+        // anything else must fall back to the default
+        assertEquals(new ULocale("en"), matcher.getBestMatch("ja"));
+    }
+
+    public void testRegionalSpecials() {
+        // verify that en_AU is closer to en_GB than to en (which is en_US)
+        final LocaleMatcher matcher = new LocaleMatcher("en, en_GB, es, es_419");
+        assertEquals("en_AU in {en, en_GB, es, es_419}", new ULocale("en_GB"), matcher.getBestMatch("en_AU"));
+        assertEquals("es_MX in {en, en_GB, es, es_419}", new ULocale("es_419"), matcher.getBestMatch("es_MX"));
+        assertEquals("es_ES in {en, en_GB, es, es_419}", new ULocale("es"), matcher.getBestMatch("es_ES"));
+    }
+
+    private void assertEquals(Object expected, Object actual) {
+        assertEquals("", expected, actual);
+    }
+
+}
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/LocalePriorityListTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/LocalePriorityListTest.java
@ -0,0 +1,56 @@
+/*
+ ****************************************************************************************
+ * Copyright (C) 2009, Google, Inc.; International Business Machines Corporation and    *
+ * others. All Rights Reserved.                                                         *
+ ****************************************************************************************
+ */
+
+package com.ibm.icu.dev.test.util;
+
+import com.ibm.icu.dev.test.TestFmwk;
+import com.ibm.icu.util.ULocale;
+import com.ibm.icu.util.LocalePriorityList;
+
+/**
+ * Test the LocalePriorityList
+ * @author markdavis@google.com
+ */
+public class LocalePriorityListTest extends TestFmwk {
+
+    public static void main(String[] args) throws Exception {
+        new LocalePriorityListTest().run(args);
+    }
+
+    public void testLanguagePriorityList() {
+        final String expected = "af, en, fr";
+
+        LocalePriorityList parsed = LocalePriorityList.add("af, en, fr;q=0.9").build();
+        assertEquals(expected, parsed.toString());
+
+        // check looseness, and that later values win
+        LocalePriorityList lenient = LocalePriorityList.add(
+            ", fr ; q = 0.9 ,   en;q=0.1 , af, en, de;q=0, ").build();
+        assertEquals(expected, lenient.toString());
+        assertEquals(parsed, lenient);
+
+        // the same entries, assembled from ULocale objects
+        LocalePriorityList chained = LocalePriorityList
+            .add(new ULocale("af")).add(ULocale.FRENCH, 0.9d)
+            .add(ULocale.ENGLISH)
+            .build();
+        assertEquals(expected, chained.toString());
+        assertEquals(parsed, chained);
+
+        // a list built from an existing list
+        LocalePriorityList copied = LocalePriorityList.add(parsed).build();
+        assertEquals(expected, copied.toString());
+        assertEquals(parsed, copied);
+
+        LocalePriorityList weighted = LocalePriorityList.add("af, fr;q=0.9, en").build(true);
+        assertEquals("af, en, fr;q=0.9", weighted.toString());
+    }
+
+    private void assertEquals(Object expected, Object actual) {
+        assertEquals("", expected, actual);
+    }
+}
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/TestAll.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/TestAll.java
@ -31,6 +31,8 @@ public class TestAll extends TestGroup {
            "LocaleAliasTest",
            "DebugUtilitiesTest",
            "LocaleBuilderTest",
+            "LocaleMatcherTest",
+            "LocalePriorityListTest",
        },
              "Test miscellaneous public utilities");
    }
--- a/icu4j/tools/misc/src/com/ibm/icu/dev/tool/cldr/CheckSystemFonts.java
+++ b/icu4j/tools/misc/src/com/ibm/icu/dev/tool/cldr/CheckSystemFonts.java
@ -32,15 +32,15 @@ import java.util.regex.Matcher;
 import java.util.regex.Pattern;

 import com.ibm.icu.dev.test.util.BagFormatter;
-import com.ibm.icu.dev.test.util.Row;
 import com.ibm.icu.dev.test.util.TransliteratorUtilities;
 import com.ibm.icu.dev.test.util.UnicodeMap;
 import com.ibm.icu.dev.test.util.UnicodeMapIterator;
-import com.ibm.icu.dev.test.util.Row.R2;
 import com.ibm.icu.dev.test.util.Tabber.HTMLTabber;
 import com.ibm.icu.dev.test.util.UnicodeMap.Composer;
 import com.ibm.icu.dev.test.util.XEquivalenceClass.SetMaker;
+import com.ibm.icu.impl.Row;
 import com.ibm.icu.impl.Utility;
+import com.ibm.icu.impl.Row.R2;
 import com.ibm.icu.lang.UCharacter;
 import com.ibm.icu.lang.UScript;
 import com.ibm.icu.text.Collator;