ICU-20916 LocaleMatcher distinguish between equivalent locales

- equivalent but originally unequal
- locale distance shifted left for additional fraction bits with micro distance
- Java more verbose matcher debug output
See #949
This commit is contained in:
Markus Scherer 2019-12-20 00:09:10 +00:00
parent 2f72a932ac
commit ad638c274e
14 changed files with 392 additions and 206 deletions

View file

@ -22,7 +22,7 @@
#include "uhash.h"
#include "uvector.h"
#define UND_LSR LSR("und", "", "")
#define UND_LSR LSR("und", "", "", LSR::EXPLICIT_LSR)
/**
* Indicator for the lifetime of desired-locale objects passed into the LocaleMatcher.
@ -393,26 +393,27 @@ LocaleMatcher::LocaleMatcher(const Builder &builder, UErrorCode &errorCode) :
// 3. Remaining locales in builder order.
// In Java, we use a LinkedHashMap for both map & ordered lists.
// In C++, we use separate structures.
// We over-allocate arrays of LSRs and indexes for simplicity.
// We reserve slots at the array starts for the default and paradigm locales,
// plus enough for all supported locales.
// If there are few paradigm locales and few duplicate supported LSRs,
// then the amount of wasted space is small.
//
// We allocate arrays of LSRs and indexes,
// with as many slots as supported locales, for simplicity.
// We write the default and paradigm LSRs starting from the front of the arrays,
// and others starting from the back.
// At the end we reverse the non-paradigm LSRs.
// We end up wasting as many array slots as there are duplicate supported LSRs,
// but the amount of wasted space is small as long as there are few duplicates.
supportedLsrToIndex = uhash_openSize(hashLSR, compareLSRs, uhash_compareLong,
supportedLocalesLength, &errorCode);
if (U_FAILURE(errorCode)) { return; }
int32_t paradigmLimit = 1 + localeDistance.getParadigmLSRsLength();
int32_t suppLSRsCapacity = paradigmLimit + supportedLocalesLength;
supportedLSRs = static_cast<const LSR **>(
uprv_malloc(suppLSRsCapacity * sizeof(const LSR *)));
uprv_malloc(supportedLocalesLength * sizeof(const LSR *)));
supportedIndexes = static_cast<int32_t *>(
uprv_malloc(suppLSRsCapacity * sizeof(int32_t)));
uprv_malloc(supportedLocalesLength * sizeof(int32_t)));
if (supportedLSRs == nullptr || supportedIndexes == nullptr) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return;
}
int32_t paradigmIndex = 0;
int32_t otherIndex = paradigmLimit;
int32_t otherIndex = supportedLocalesLength;
if (idef >= 0) {
uhash_puti(supportedLsrToIndex, const_cast<LSR *>(defLSR), idef + 1, &errorCode);
supportedLSRs[0] = defLSR;
@ -446,21 +447,32 @@ LocaleMatcher::LocaleMatcher(const Builder &builder, UErrorCode &errorCode) :
supportedLSRs[paradigmIndex] = &lsr;
supportedIndexes[paradigmIndex++] = i;
} else {
supportedLSRs[otherIndex] = &lsr;
supportedIndexes[otherIndex++] = i;
supportedLSRs[--otherIndex] = &lsr;
supportedIndexes[otherIndex] = i;
}
}
}
if (U_FAILURE(errorCode)) { return; }
}
// Squeeze out unused array slots.
if (paradigmIndex < paradigmLimit && paradigmLimit < otherIndex) {
uprv_memmove(supportedLSRs + paradigmIndex, supportedLSRs + paradigmLimit,
(otherIndex - paradigmLimit) * sizeof(const LSR *));
uprv_memmove(supportedIndexes + paradigmIndex, supportedIndexes + paradigmLimit,
(otherIndex - paradigmLimit) * sizeof(int32_t));
// Reverse the non-paradigm LSRs to be in order, right after the paradigm LSRs.
// First fill the unused slots between paradigm LSRs and other LSRs.
// This gap is as large as the number of locales with duplicate LSRs.
int32_t i = paradigmIndex;
int32_t j = supportedLocalesLength - 1;
while (i < otherIndex && otherIndex <= j) {
supportedLSRs[i] = supportedLSRs[j];
supportedIndexes[i++] = supportedIndexes[j--];
}
supportedLSRsLength = otherIndex - (paradigmLimit - paradigmIndex);
// Swap remaining non-paradigm LSRs in place.
while (i < j) {
const LSR *tempLSR = supportedLSRs[i];
supportedLSRs[i] = supportedLSRs[j];
supportedLSRs[j] = tempLSR;
int32_t tempIndex = supportedIndexes[i];
supportedIndexes[i++] = supportedIndexes[j];
supportedIndexes[j--] = tempIndex;
}
supportedLSRsLength = supportedLocalesLength - (otherIndex - paradigmIndex);
}
if (def != nullptr && (idef < 0 || def != supportedLocales[idef])) {
@ -662,7 +674,7 @@ int32_t LocaleMatcher::getBestSuppIndex(LSR desiredLSR, LocaleLsrIterator *remai
if (U_FAILURE(errorCode)) { return -1; }
int32_t desiredIndex = 0;
int32_t bestSupportedLsrIndex = -1;
for (int32_t bestDistance = thresholdDistance;;) {
for (int32_t bestShiftedDistance = LocaleDistance::shiftDistance(thresholdDistance);;) {
// Quick check for exact maximized LSR.
// Returns suppIndex+1 where 0 means not found.
if (supportedLsrToIndex != nullptr) {
@ -677,16 +689,17 @@ int32_t LocaleMatcher::getBestSuppIndex(LSR desiredLSR, LocaleLsrIterator *remai
}
}
int32_t bestIndexAndDistance = localeDistance.getBestIndexAndDistance(
desiredLSR, supportedLSRs, supportedLSRsLength, bestDistance, favorSubtag);
desiredLSR, supportedLSRs, supportedLSRsLength, bestShiftedDistance, favorSubtag);
if (bestIndexAndDistance >= 0) {
bestDistance = bestIndexAndDistance & 0xff;
bestShiftedDistance = LocaleDistance::getShiftedDistance(bestIndexAndDistance);
if (remainingIter != nullptr) {
remainingIter->rememberCurrent(desiredIndex, errorCode);
if (U_FAILURE(errorCode)) { return -1; }
}
bestSupportedLsrIndex = bestIndexAndDistance >= 0 ? bestIndexAndDistance >> 8 : -1;
bestSupportedLsrIndex = bestIndexAndDistance >= 0 ?
LocaleDistance::getIndex(bestIndexAndDistance) : -1;
}
if ((bestDistance -= demotionPerDesiredLocale) <= 0) {
if ((bestShiftedDistance -= LocaleDistance::shiftDistance(demotionPerDesiredLocale)) <= 0) {
break;
}
if (remainingIter == nullptr || !remainingIter->hasNext()) {
@ -708,11 +721,12 @@ double LocaleMatcher::internalMatch(const Locale &desired, const Locale &support
LSR suppLSR = getMaximalLsrOrUnd(likelySubtags, supported, errorCode);
if (U_FAILURE(errorCode)) { return 0; }
const LSR *pSuppLSR = &suppLSR;
int32_t distance = localeDistance.getBestIndexAndDistance(
int32_t indexAndDistance = localeDistance.getBestIndexAndDistance(
getMaximalLsrOrUnd(likelySubtags, desired, errorCode),
&pSuppLSR, 1,
thresholdDistance, favorSubtag) & 0xff;
return (100 - distance) / 100.0;
LocaleDistance::shiftDistance(thresholdDistance), favorSubtag);
double distance = LocaleDistance::getDistanceDouble(indexAndDistance);
return (100.0 - distance) / 100.0;
}
U_NAMESPACE_END

View file

@ -97,17 +97,23 @@ LocaleDistance::LocaleDistance(const LocaleDistanceData &data) :
// a mere region difference for one desired locale
// is as good as a perfect match for the next following desired locale.
// As of CLDR 36, we have <languageMatch desired="en_*_*" supported="en_*_*" distance="5"/>.
LSR en("en", "Latn", "US");
LSR enGB("en", "Latn", "GB");
LSR en("en", "Latn", "US", LSR::EXPLICIT_LSR);
LSR enGB("en", "Latn", "GB", LSR::EXPLICIT_LSR);
const LSR *p_enGB = &enGB;
defaultDemotionPerDesiredLocale = getBestIndexAndDistance(en, &p_enGB, 1,
50, ULOCMATCH_FAVOR_LANGUAGE) & 0xff;
int32_t indexAndDistance = getBestIndexAndDistance(en, &p_enGB, 1,
shiftDistance(50), ULOCMATCH_FAVOR_LANGUAGE);
defaultDemotionPerDesiredLocale = getDistanceFloor(indexAndDistance);
}
int32_t LocaleDistance::getBestIndexAndDistance(
const LSR &desired,
const LSR **supportedLSRs, int32_t supportedLSRsLength,
int32_t threshold, ULocMatchFavorSubtag favorSubtag) const {
int32_t shiftedThreshold, ULocMatchFavorSubtag favorSubtag) const {
// Round up the shifted threshold (if fraction bits are not 0)
// for comparison with un-shifted distances until we need fraction bits.
// (If we simply shifted non-zero fraction bits away, then we might ignore a language
// when it's really still a micro distance below the threshold.)
int32_t roundedThreshold = (shiftedThreshold + DISTANCE_FRACTION_MASK) >> DISTANCE_SHIFT;
BytesTrie iter(trie);
// Look up the desired language only once for all supported LSRs.
// Its "distance" is either a match point value of 0, or a non-match negative value.
@ -153,7 +159,7 @@ int32_t LocaleDistance::getBestIndexAndDistance(
if (favorSubtag == ULOCMATCH_FAVOR_SCRIPT) {
distance >>= 2;
}
if (distance >= threshold) {
if (distance >= roundedThreshold) {
continue;
}
@ -171,7 +177,7 @@ int32_t LocaleDistance::getBestIndexAndDistance(
scriptDistance &= ~DISTANCE_IS_FINAL;
}
distance += scriptDistance;
if (distance >= threshold) {
if (distance >= roundedThreshold) {
continue;
}
@ -180,7 +186,7 @@ int32_t LocaleDistance::getBestIndexAndDistance(
} else if (star || (flags & DISTANCE_IS_FINAL) != 0) {
distance += defaultRegionDistance;
} else {
int32_t remainingThreshold = threshold - distance;
int32_t remainingThreshold = roundedThreshold - distance;
if (minRegionDistance >= remainingThreshold) {
continue;
}
@ -196,15 +202,23 @@ int32_t LocaleDistance::getBestIndexAndDistance(
partitionsForRegion(supported),
remainingThreshold);
}
if (distance < threshold) {
if (distance == 0) {
return slIndex << 8;
int32_t shiftedDistance = shiftDistance(distance);
if (shiftedDistance == 0) {
// Distinguish between equivalent but originally unequal locales via an
// additional micro distance.
shiftedDistance |= (desired.flags ^ supported.flags);
}
if (shiftedDistance < shiftedThreshold) {
if (shiftedDistance == 0) {
return slIndex << INDEX_SHIFT;
}
bestIndex = slIndex;
threshold = distance;
shiftedThreshold = shiftedDistance;
}
}
return bestIndex >= 0 ? (bestIndex << 8) | threshold : 0xffffff00 | ABOVE_THRESHOLD;
return bestIndex >= 0 ?
(bestIndex << INDEX_SHIFT) | shiftedThreshold :
INDEX_NEG_1 | shiftDistance(ABOVE_THRESHOLD);
}
int32_t LocaleDistance::getDesSuppScriptDistance(
@ -352,11 +366,14 @@ int32_t LocaleDistance::trieNext(BytesTrie &iter, const char *s, bool wantValue)
}
UBool LocaleDistance::isParadigmLSR(const LSR &lsr) const {
// Linear search for a very short list (length 6 as of 2019).
// If there are many paradigm LSRs we should use a hash set.
// Linear search for a very short list (length 6 as of 2019),
// because we look for equivalence not equality, and
// because it's easy.
// If there are many paradigm LSRs we should use a hash set
// with custom comparator and hasher.
U_ASSERT(paradigmLSRsLength <= 15);
for (int32_t i = 0; i < paradigmLSRsLength; ++i) {
if (lsr == paradigmLSRs[i]) { return true; }
if (lsr.isEquivalentTo(paradigmLSRs[i])) { return true; }
}
return false;
}

View file

@ -26,19 +26,36 @@ class LocaleDistance final : public UMemory {
public:
static const LocaleDistance *getSingleton(UErrorCode &errorCode);
/**
 * Shifts an integer distance left by DISTANCE_SHIFT bits,
 * which makes room for fraction bits below the integer part.
 */
static int32_t shiftDistance(int32_t distance) {
    const int32_t shifted = distance << DISTANCE_SHIFT;
    return shifted;
}
/**
 * Extracts the shifted distance (the bits selected by DISTANCE_MASK)
 * from a combined index-and-distance value.
 */
static int32_t getShiftedDistance(int32_t indexAndDistance) {
    const int32_t lowBits = indexAndDistance & DISTANCE_MASK;
    return lowBits;
}
static double getDistanceDouble(int32_t indexAndDistance) {
double shiftedDistance = getShiftedDistance(indexAndDistance);
return shiftedDistance / (1 << DISTANCE_SHIFT);
}
/**
 * Extracts the supported-LSR index from the high bits
 * of a combined index-and-distance value.
 */
static int32_t getIndex(int32_t indexAndDistance) {
    // assert indexAndDistance >= 0;
    const int32_t index = indexAndDistance >> INDEX_SHIFT;
    return index;
}
/**
* Finds the supported LSR with the smallest distance from the desired one.
* Equivalent LSR subtags must be normalized into a canonical form.
*
* <p>Returns the index of the lowest-distance supported LSR in bits 31..8
* <p>Returns the index of the lowest-distance supported LSR in the high bits
* (negative if none has a distance below the threshold),
* and its distance (0..ABOVE_THRESHOLD) in bits 7..0.
* and its shifted distance (0..shiftDistance(ABOVE_THRESHOLD)) in the low bits.
*/
int32_t getBestIndexAndDistance(const LSR &desired,
const LSR **supportedLSRs, int32_t supportedLSRsLength,
int32_t threshold, ULocMatchFavorSubtag favorSubtag) const;
int32_t getParadigmLSRsLength() const { return paradigmLSRsLength; }
int32_t shiftedThreshold,
ULocMatchFavorSubtag favorSubtag) const;
UBool isParadigmLSR(const LSR &lsr) const;
@ -51,6 +68,20 @@ public:
}
private:
// The distance is shifted left to gain some fraction bits.
static constexpr int32_t DISTANCE_SHIFT = 3;
static constexpr int32_t DISTANCE_FRACTION_MASK = 7;
// 7 bits for 0..100
static constexpr int32_t DISTANCE_INT_SHIFT = 7;
static constexpr int32_t INDEX_SHIFT = DISTANCE_INT_SHIFT + DISTANCE_SHIFT;
static constexpr int32_t DISTANCE_MASK = 0x3ff;
// static constexpr int32_t MAX_INDEX = 0x1fffff; // avoids sign bit
static constexpr int32_t INDEX_NEG_1 = 0xfffffc00;
/**
 * Returns the integer part of the distance stored in a combined
 * index-and-distance value, discarding the fraction bits.
 */
static int32_t getDistanceFloor(int32_t indexAndDistance) {
    const int32_t shiftedDistance = indexAndDistance & DISTANCE_MASK;
    return shiftedDistance >> DISTANCE_SHIFT;
}
LocaleDistance(const LocaleDistanceData &data);
LocaleDistance(const LocaleDistance &other) = delete;
LocaleDistance &operator=(const LocaleDistance &other) = delete;

View file

@ -250,7 +250,8 @@ struct XLikelySubtagsData {
for (int32_t i = 0, j = 0; i < lsrSubtagsLength; i += 3, ++j) {
lsrs[j] = LSR(strings.get(lsrSubtagIndexes[i]),
strings.get(lsrSubtagIndexes[i + 1]),
strings.get(lsrSubtagIndexes[i + 2]));
strings.get(lsrSubtagIndexes[i + 2]),
LSR::IMPLICIT_LSR);
}
if (partitionsLength > 0) {
@ -275,7 +276,8 @@ struct XLikelySubtagsData {
for (int32_t i = 0, j = 0; i < paradigmSubtagsLength; i += 3, ++j) {
paradigms[j] = LSR(strings.get(paradigmSubtagIndexes[i]),
strings.get(paradigmSubtagIndexes[i + 1]),
strings.get(paradigmSubtagIndexes[i + 2]));
strings.get(paradigmSubtagIndexes[i + 2]),
LSR::DONT_CARE_FLAGS);
}
distanceData.paradigms = paradigms;
}
@ -383,7 +385,7 @@ LSR XLikelySubtags::makeMaximizedLsrFrom(const Locale &locale, UErrorCode &error
const char *name = locale.getName();
if (uprv_isAtSign(name[0]) && name[1] == 'x' && name[2] == '=') { // name.startsWith("@x=")
// Private use language tag x-subtag-subtag...
return LSR(name, "", "");
return LSR(name, "", "", LSR::EXPLICIT_LSR);
}
return makeMaximizedLsr(locale.getLanguage(), locale.getScript(), locale.getCountry(),
locale.getVariant(), errorCode);
@ -407,26 +409,31 @@ LSR XLikelySubtags::makeMaximizedLsr(const char *language, const char *script, c
if (region[0] == 'X' && (c1 = region[1]) != 0 && region[2] == 0) {
switch (c1) {
case 'A':
return LSR(PSEUDO_ACCENTS_PREFIX, language, script, region, errorCode);
return LSR(PSEUDO_ACCENTS_PREFIX, language, script, region,
LSR::EXPLICIT_LSR, errorCode);
case 'B':
return LSR(PSEUDO_BIDI_PREFIX, language, script, region, errorCode);
return LSR(PSEUDO_BIDI_PREFIX, language, script, region,
LSR::EXPLICIT_LSR, errorCode);
case 'C':
return LSR(PSEUDO_CRACKED_PREFIX, language, script, region, errorCode);
return LSR(PSEUDO_CRACKED_PREFIX, language, script, region,
LSR::EXPLICIT_LSR, errorCode);
default: // normal locale
break;
}
}
if (variant[0] == 'P' && variant[1] == 'S') {
int32_t lsrFlags = *region == 0 ?
LSR::EXPLICIT_LANGUAGE | LSR::EXPLICIT_SCRIPT : LSR::EXPLICIT_LSR;
if (uprv_strcmp(variant, "PSACCENT") == 0) {
return LSR(PSEUDO_ACCENTS_PREFIX, language, script,
*region == 0 ? "XA" : region, errorCode);
*region == 0 ? "XA" : region, lsrFlags, errorCode);
} else if (uprv_strcmp(variant, "PSBIDI") == 0) {
return LSR(PSEUDO_BIDI_PREFIX, language, script,
*region == 0 ? "XB" : region, errorCode);
*region == 0 ? "XB" : region, lsrFlags, errorCode);
} else if (uprv_strcmp(variant, "PSCRACK") == 0) {
return LSR(PSEUDO_CRACKED_PREFIX, language, script,
*region == 0 ? "XC" : region, errorCode);
*region == 0 ? "XC" : region, lsrFlags, errorCode);
}
// else normal locale
}
@ -448,7 +455,7 @@ LSR XLikelySubtags::maximize(const char *language, const char *script, const cha
region = "";
}
if (*script != 0 && *region != 0 && *language != 0) {
return LSR(language, script, region); // already maximized
return LSR(language, script, region, LSR::EXPLICIT_LSR); // already maximized
}
uint32_t retainOldMask = 0;
@ -535,7 +542,7 @@ LSR XLikelySubtags::maximize(const char *language, const char *script, const cha
if (retainOldMask == 0) {
// Quickly return a copy of the lookup-result LSR
// without new allocation of the subtags.
return LSR(result.language, result.script, result.region);
return LSR(result.language, result.script, result.region, result.flags);
}
if ((retainOldMask & 4) == 0) {
language = result.language;
@ -546,7 +553,8 @@ LSR XLikelySubtags::maximize(const char *language, const char *script, const cha
if ((retainOldMask & 1) == 0) {
region = result.region;
}
return LSR(language, script, region);
// retainOldMask flags = LSR explicit-subtag flags
return LSR(language, script, region, retainOldMask);
}
int32_t XLikelySubtags::trieNext(BytesTrie &iter, const char *s, int32_t i) {
@ -615,9 +623,9 @@ LSR XLikelySubtags::minimizeSubtags(const char *languageIn, const char *scriptIn
boolean favorRegionOk = false;
if (result.script.equals(value00.script)) { //script is default
if (result.region.equals(value00.region)) {
return new LSR(result.language, "", "");
return new LSR(result.language, "", "", LSR.DONT_CARE_FLAGS);
} else if (fieldToFavor == ULocale.Minimize.FAVOR_REGION) {
return new LSR(result.language, "", result.region);
return new LSR(result.language, "", result.region, LSR.DONT_CARE_FLAGS);
} else {
favorRegionOk = true;
}
@ -627,9 +635,9 @@ LSR XLikelySubtags::minimizeSubtags(const char *languageIn, const char *scriptIn
// Maybe do later, but for now use the straightforward code.
LSR result2 = maximize(languageIn, scriptIn, "");
if (result2.equals(result)) {
return new LSR(result.language, result.script, "");
return new LSR(result.language, result.script, "", LSR.DONT_CARE_FLAGS);
} else if (favorRegionOk) {
return new LSR(result.language, "", result.region);
return new LSR(result.language, "", result.region, LSR.DONT_CARE_FLAGS);
}
return result;
}

View file

@ -14,9 +14,10 @@
U_NAMESPACE_BEGIN
LSR::LSR(char prefix, const char *lang, const char *scr, const char *r, UErrorCode &errorCode) :
LSR::LSR(char prefix, const char *lang, const char *scr, const char *r, int32_t f,
UErrorCode &errorCode) :
language(nullptr), script(nullptr), region(r),
regionIndex(indexForRegion(region)) {
regionIndex(indexForRegion(region)), flags(f) {
if (U_SUCCESS(errorCode)) {
CharString langScript;
langScript.append(prefix, errorCode).append(lang, errorCode).append('\0', errorCode);
@ -32,7 +33,8 @@ LSR::LSR(char prefix, const char *lang, const char *scr, const char *r, UErrorCo
LSR::LSR(LSR &&other) U_NOEXCEPT :
language(other.language), script(other.script), region(other.region), owned(other.owned),
regionIndex(other.regionIndex), hashCode(other.hashCode) {
regionIndex(other.regionIndex), flags(other.flags),
hashCode(other.hashCode) {
if (owned != nullptr) {
other.language = other.script = "";
other.owned = nullptr;
@ -50,6 +52,7 @@ LSR &LSR::operator=(LSR &&other) U_NOEXCEPT {
script = other.script;
region = other.region;
regionIndex = other.regionIndex;
flags = other.flags;
owned = other.owned;
hashCode = other.hashCode;
if (owned != nullptr) {
@ -60,7 +63,7 @@ LSR &LSR::operator=(LSR &&other) U_NOEXCEPT {
return *this;
}
UBool LSR::operator==(const LSR &other) const {
UBool LSR::isEquivalentTo(const LSR &other) const {
return
uprv_strcmp(language, other.language) == 0 &&
uprv_strcmp(script, other.script) == 0 &&
@ -69,6 +72,16 @@ UBool LSR::operator==(const LSR &other) const {
(regionIndex > 0 || uprv_strcmp(region, other.region) == 0);
}
/**
 * Full equality: the language, script and region must match,
 * and so must the flags.
 */
UBool LSR::operator==(const LSR &other) const {
    // Cheap integer comparisons first.
    if (flags != other.flags || regionIndex != other.regionIndex) {
        return false;
    }
    if (uprv_strcmp(language, other.language) != 0) {
        return false;
    }
    if (uprv_strcmp(script, other.script) != 0) {
        return false;
    }
    // Compare regions if both are ill-formed (and their indexes are 0).
    return regionIndex > 0 || uprv_strcmp(region, other.region) == 0;
}
int32_t LSR::indexForRegion(const char *region) {
int32_t c = region[0];
int32_t a = c - '0';
@ -90,10 +103,10 @@ int32_t LSR::indexForRegion(const char *region) {
LSR &LSR::setHashCode() {
if (hashCode == 0) {
hashCode =
(ustr_hashCharsN(language, static_cast<int32_t>(uprv_strlen(language))) * 37 +
ustr_hashCharsN(script, static_cast<int32_t>(uprv_strlen(script)))) * 37 +
regionIndex;
int32_t h = ustr_hashCharsN(language, static_cast<int32_t>(uprv_strlen(language)));
h = h * 37 + ustr_hashCharsN(script, static_cast<int32_t>(uprv_strlen(script)));
h = h * 37 + regionIndex;
hashCode = h * 37 + flags;
}
return *this;
}

View file

@ -16,26 +16,35 @@ U_NAMESPACE_BEGIN
struct LSR final : public UMemory {
static constexpr int32_t REGION_INDEX_LIMIT = 1001 + 26 * 26;
static constexpr int32_t EXPLICIT_LSR = 7;
static constexpr int32_t EXPLICIT_LANGUAGE = 4;
static constexpr int32_t EXPLICIT_SCRIPT = 2;
static constexpr int32_t EXPLICIT_REGION = 1;
static constexpr int32_t IMPLICIT_LSR = 0;
static constexpr int32_t DONT_CARE_FLAGS = 0;
const char *language;
const char *script;
const char *region;
char *owned = nullptr;
/** Index for region, 0 if ill-formed. @see indexForRegion */
int32_t regionIndex = 0;
int32_t flags = 0;
/** Only set for LSRs that will be used in a hash table. */
int32_t hashCode = 0;
LSR() : language("und"), script(""), region("") {}
/** Constructor which aliases all subtag pointers. */
LSR(const char *lang, const char *scr, const char *r) :
LSR(const char *lang, const char *scr, const char *r, int32_t f) :
language(lang), script(scr), region(r),
regionIndex(indexForRegion(region)) {}
regionIndex(indexForRegion(region)), flags(f) {}
/**
* Constructor which prepends the prefix to the language and script,
* copies those into owned memory, and aliases the region.
*/
LSR(char prefix, const char *lang, const char *scr, const char *r, UErrorCode &errorCode);
LSR(char prefix, const char *lang, const char *scr, const char *r, int32_t f,
UErrorCode &errorCode);
LSR(LSR &&other) U_NOEXCEPT;
LSR(const LSR &other) = delete;
inline ~LSR() {
@ -55,6 +64,7 @@ struct LSR final : public UMemory {
*/
static int32_t indexForRegion(const char *region);
UBool isEquivalentTo(const LSR &other) const;
UBool operator==(const LSR &other) const;
inline UBool operator!=(const LSR &other) const {

View file

@ -1052,9 +1052,9 @@ en >> en-DE
ar-EG >> ar-SY
pt-BR >> pt
ar-XB >> ar-XB
ar-PSBIDI >> ar-XB # These are equivalent.
ar-PSBIDI >> ar-PSBIDI
en-XA >> en-XA
en-PSACCENT >> en-XA # These are equivalent.
en-PSACCENT >> en-PSACCENT
ar-PSCRACK >> ar-PSCRACK
@favor=script
@ -1063,9 +1063,9 @@ en >> en-DE
ar-EG >> ar-SY
pt-BR >> pt
ar-XB >> ar-XB
ar-PSBIDI >> ar-XB # These are equivalent.
ar-PSBIDI >> ar-PSBIDI
en-XA >> en-XA
en-PSACCENT >> en-XA # These are equivalent.
en-PSACCENT >> en-PSACCENT
ar-PSCRACK >> ar-PSCRACK
** test: BestMatchForTraditionalChinese
@ -1544,50 +1544,44 @@ zh-TW, en >> en-US
zh-Hant-CN, en >> en-US
zh-Hans, en >> zh-Hans-CN
** test: return first among likely-subtags equivalent locales
# Was: more specific script should win in case regions are identical
# with some different results.
** test: return most originally similar among likely-subtags equivalent locales
@supported=af, af-Latn, af-Arab
af >> af
af-ZA >> af
af-Latn-ZA >> af
af-Latn >> af
af-Latn-ZA >> af-Latn
af-Latn >> af-Latn
@favor=script
af >> af
af-ZA >> af
af-Latn-ZA >> af
af-Latn >> af
af-Latn-ZA >> af-Latn
af-Latn >> af-Latn
# Was: more specific region should win
# with some different results.
@supported=nl, nl-NL, nl-BE
@favor=
nl >> nl
nl-Latn >> nl
nl-Latn-NL >> nl
nl-NL >> nl
nl-Latn-NL >> nl-NL
nl-NL >> nl-NL
@favor=script
nl >> nl
nl-Latn >> nl
nl-Latn-NL >> nl
nl-NL >> nl
nl-Latn-NL >> nl-NL
nl-NL >> nl-NL
# Was: more specific region wins over more specific script
# with some different results.
@supported=nl, nl-Latn, nl-NL, nl-BE
@favor=
nl >> nl
nl-Latn >> nl
nl-NL >> nl
nl-Latn-NL >> nl
nl-Latn >> nl-Latn
nl-NL >> nl-NL
nl-Latn-NL >> nl-Latn
@favor=script
nl >> nl
nl-Latn >> nl
nl-NL >> nl
nl-Latn-NL >> nl
nl-Latn >> nl-Latn
nl-NL >> nl-NL
nl-Latn-NL >> nl-Latn
** test: region may replace matched if matched is enclosing
@supported=es-419, es
@ -1670,22 +1664,22 @@ ja-Jpan-JP, en-GB >> ja
** test: pick best maximized tag
@supported=ja, ja-Jpan-US, ja-JP, en, ru
ja-Jpan, ru >> ja
ja-JP, ru >> ja
ja-JP, ru >> ja-JP
ja-US, ru >> ja-Jpan-US
@favor=script
ja-Jpan, ru >> ja
ja-JP, ru >> ja
ja-JP, ru >> ja-JP
ja-US, ru >> ja-Jpan-US
** test: termination: pick best maximized match
@supported=ja, ja-Jpan, ja-JP, en, ru
ja-Jpan-JP, ru >> ja
ja-Jpan, ru >> ja
ja-Jpan-JP, ru >> ja-Jpan
ja-Jpan, ru >> ja-Jpan
@favor=script
ja-Jpan-JP, ru >> ja
ja-Jpan, ru >> ja
ja-Jpan-JP, ru >> ja-Jpan
ja-Jpan, ru >> ja-Jpan
** test: same language over exact, but distinguish when user is explicit
@supported=fr, en-GB, ja, es-ES, es-MX
@ -1900,7 +1894,7 @@ zh-TW >> zh
** test: testGetBestMatchWithMinMatchScore
@supported=fr-FR, fr, fr-CA, en
@default=und
fr >> fr-FR # First likely-subtags equivalent match is chosen.
fr >> fr
@supported=en, fr, fr-CA
fr-FR >> fr # Parent match is chosen.
@supported=en, fr-CA
@ -1930,7 +1924,7 @@ ru >> und
@favor=script
@supported=fr-FR, fr, fr-CA, en
fr >> fr-FR
fr >> fr
@supported=en, fr, fr-CA
fr-FR >> fr
@supported=en, fr-CA

View file

@ -7,6 +7,13 @@ import java.util.Objects;
public final class LSR {
public static final int REGION_INDEX_LIMIT = 1001 + 26 * 26;
public static final int EXPLICIT_LSR = 7;
public static final int EXPLICIT_LANGUAGE = 4;
public static final int EXPLICIT_SCRIPT = 2;
public static final int EXPLICIT_REGION = 1;
public static final int IMPLICIT_LSR = 0;
public static final int DONT_CARE_FLAGS = 0;
public static final boolean DEBUG_OUTPUT = false;
public final String language;
@ -14,12 +21,14 @@ public final class LSR {
public final String region;
/** Index for region, negative if ill-formed. @see indexForRegion */
final int regionIndex;
public final int flags;
public LSR(String language, String script, String region) {
public LSR(String language, String script, String region, int flags) {
this.language = language;
this.script = script;
this.region = region;
regionIndex = indexForRegion(region);
this.flags = flags;
}
/**
@ -57,6 +66,13 @@ public final class LSR {
}
return result.toString();
}
/**
 * Returns true if the other LSR has the same language, script and region.
 * The flags are not compared.
 */
public boolean isEquivalentTo(LSR other) {
    if (!language.equals(other.language)) {
        return false;
    }
    if (!script.equals(other.script)) {
        return false;
    }
    return region.equals(other.region);
}
@Override
public boolean equals(Object obj) {
LSR other;
@ -65,10 +81,12 @@ public final class LSR {
&& obj.getClass() == this.getClass()
&& language.equals((other = (LSR) obj).language)
&& script.equals(other.script)
&& region.equals(other.region));
&& region.equals(other.region)
&& flags == other.flags);
}
@Override
public int hashCode() {
return Objects.hash(language, script, region);
return Objects.hash(language, script, region, flags);
}
}

View file

@ -34,6 +34,17 @@ public class LocaleDistance {
private static final int DISTANCE_IS_FINAL = 0x100;
private static final int DISTANCE_IS_FINAL_OR_SKIP_SCRIPT =
DISTANCE_IS_FINAL | DISTANCE_SKIP_SCRIPT;
// The distance is shifted left to gain some fraction bits.
private static final int DISTANCE_SHIFT = 3;
private static final int DISTANCE_FRACTION_MASK = 7;
// 7 bits for 0..100
private static final int DISTANCE_INT_SHIFT = 7;
private static final int INDEX_SHIFT = DISTANCE_INT_SHIFT + DISTANCE_SHIFT;
private static final int DISTANCE_MASK = 0x3ff;
// private static final int MAX_INDEX = 0x1fffff; // avoids sign bit
private static final int INDEX_NEG_1 = 0xfffffc00;
// Indexes into array of distances.
public static final int IX_DEF_LANG_DISTANCE = 0;
public static final int IX_DEF_SCRIPT_DISTANCE = 1;
@ -67,6 +78,28 @@ public class LocaleDistance {
private final int minRegionDistance;
private final int defaultDemotionPerDesiredLocale;
/**
 * Shifts an integer distance left by DISTANCE_SHIFT bits,
 * which makes room for fraction bits below the integer part.
 */
public static final int shiftDistance(int distance) {
    final int shifted = distance << DISTANCE_SHIFT;
    return shifted;
}
/**
 * Extracts the shifted distance (the bits selected by DISTANCE_MASK)
 * from a combined index-and-distance value.
 */
public static final int getShiftedDistance(int indexAndDistance) {
    final int lowBits = indexAndDistance & DISTANCE_MASK;
    return lowBits;
}
/**
 * Returns the distance stored in a combined index-and-distance value
 * as a double, with the fraction bits turned into a fractional part.
 */
public static final double getDistanceDouble(int indexAndDistance) {
    // One integer distance unit corresponds to 2^DISTANCE_SHIFT shifted units.
    final double unitsPerDistance = 1 << DISTANCE_SHIFT;
    return getShiftedDistance(indexAndDistance) / unitsPerDistance;
}
/**
 * Returns the integer part of the distance stored in a combined
 * index-and-distance value, discarding the fraction bits.
 */
private static final int getDistanceFloor(int indexAndDistance) {
    final int shiftedDistance = indexAndDistance & DISTANCE_MASK;
    return shiftedDistance >> DISTANCE_SHIFT;
}
/**
 * Extracts the supported-LSR index from the high bits
 * of a combined index-and-distance value.
 */
public static final int getIndex(int indexAndDistance) {
    assert indexAndDistance >= 0;
    final int index = indexAndDistance >> INDEX_SHIFT;
    return index;
}
// VisibleForTesting
public static final class Data {
public byte[] trie;
@ -121,7 +154,8 @@ public class LocaleDistance {
String[] paradigms = value.getStringArray();
paradigmLSRs = new HashSet<>(paradigms.length / 3);
for (int i = 0; i < paradigms.length; i += 3) {
paradigmLSRs.add(new LSR(paradigms[i], paradigms[i + 1], paradigms[i + 2]));
paradigmLSRs.add(new LSR(paradigms[i], paradigms[i + 1], paradigms[i + 2],
LSR.DONT_CARE_FLAGS));
}
} else {
paradigmLSRs = Collections.emptySet();
@ -168,10 +202,11 @@ public class LocaleDistance {
// a mere region difference for one desired locale
// is as good as a perfect match for the next following desired locale.
// As of CLDR 36, we have <languageMatch desired="en_*_*" supported="en_*_*" distance="5"/>.
LSR en = new LSR("en", "Latn", "US");
LSR enGB = new LSR("en", "Latn", "GB");
defaultDemotionPerDesiredLocale = getBestIndexAndDistance(en, new LSR[] { enGB },
50, FavorSubtag.LANGUAGE) & 0xff;
LSR en = new LSR("en", "Latn", "US", LSR.EXPLICIT_LSR);
LSR enGB = new LSR("en", "Latn", "GB", LSR.EXPLICIT_LSR);
int indexAndDistance = getBestIndexAndDistance(en, new LSR[] { enGB },
shiftDistance(50), FavorSubtag.LANGUAGE);
defaultDemotionPerDesiredLocale = getDistanceFloor(indexAndDistance);
if (DEBUG_OUTPUT) {
System.out.println("*** locale distance");
@ -187,20 +222,26 @@ public class LocaleDistance {
int threshold, FavorSubtag favorSubtag) {
LSR supportedLSR = XLikelySubtags.INSTANCE.makeMaximizedLsrFrom(supported);
LSR desiredLSR = XLikelySubtags.INSTANCE.makeMaximizedLsrFrom(desired);
return getBestIndexAndDistance(desiredLSR, new LSR[] { supportedLSR },
threshold, favorSubtag) & 0xff;
int indexAndDistance = getBestIndexAndDistance(desiredLSR, new LSR[] { supportedLSR },
shiftDistance(threshold), favorSubtag);
return getDistanceFloor(indexAndDistance);
}
/**
* Finds the supported LSR with the smallest distance from the desired one.
* Equivalent LSR subtags must be normalized into a canonical form.
*
* <p>Returns the index of the lowest-distance supported LSR in bits 31..8
* <p>Returns the index of the lowest-distance supported LSR in the high bits
* (negative if none has a distance below the threshold),
* and its distance (0..ABOVE_THRESHOLD) in bits 7..0.
* and its shifted distance (0..shiftDistance(ABOVE_THRESHOLD)) in the low bits.
*/
public int getBestIndexAndDistance(LSR desired, LSR[] supportedLSRs,
int threshold, FavorSubtag favorSubtag) {
int shiftedThreshold, FavorSubtag favorSubtag) {
// Round up the shifted threshold (if fraction bits are not 0)
// for comparison with un-shifted distances until we need fraction bits.
// (If we simply shifted non-zero fraction bits away, then we might ignore a language
// when it's really still a micro distance below the threshold.)
int roundedThreshold = (shiftedThreshold + DISTANCE_FRACTION_MASK) >> DISTANCE_SHIFT;
BytesTrie iter = new BytesTrie(trie);
// Look up the desired language only once for all supported LSRs.
// Its "distance" is either a match point value of 0, or a non-match negative value.
@ -246,7 +287,7 @@ public class LocaleDistance {
if (favorSubtag == FavorSubtag.SCRIPT) {
distance >>= 2;
}
if (distance >= threshold) {
if (distance >= roundedThreshold) {
continue;
}
@ -264,7 +305,7 @@ public class LocaleDistance {
scriptDistance &= ~DISTANCE_IS_FINAL;
}
distance += scriptDistance;
if (distance >= threshold) {
if (distance >= roundedThreshold) {
continue;
}
@ -273,7 +314,7 @@ public class LocaleDistance {
} else if (star || (flags & DISTANCE_IS_FINAL) != 0) {
distance += defaultRegionDistance;
} else {
int remainingThreshold = threshold - distance;
int remainingThreshold = roundedThreshold - distance;
if (minRegionDistance >= remainingThreshold) {
continue;
}
@ -289,15 +330,23 @@ public class LocaleDistance {
partitionsForRegion(supported),
remainingThreshold);
}
if (distance < threshold) {
if (distance == 0) {
return slIndex << 8;
int shiftedDistance = shiftDistance(distance);
if (shiftedDistance == 0) {
// Distinguish between equivalent but originally unequal locales via an
// additional micro distance.
shiftedDistance |= (desired.flags ^ supported.flags);
}
if (shiftedDistance < shiftedThreshold) {
if (shiftedDistance == 0) {
return slIndex << INDEX_SHIFT;
}
bestIndex = slIndex;
threshold = distance;
shiftedThreshold = shiftedDistance;
}
}
return bestIndex >= 0 ? (bestIndex << 8) | threshold : 0xffffff00 | ABOVE_THRESHOLD;
return bestIndex >= 0 ?
(bestIndex << INDEX_SHIFT) | shiftedThreshold :
INDEX_NEG_1 | shiftDistance(ABOVE_THRESHOLD);
}
private static final int getDesSuppScriptDistance(BytesTrie iter, long startState,
@ -439,7 +488,17 @@ public class LocaleDistance {
}
/**
 * Tells whether the given LSR counts as one of the paradigm LSRs.
 *
 * <p>Membership is decided with {@code LSR.isEquivalentTo()} rather than
 * {@code equals()}, which is why a plain {@code paradigmLSRs.contains(lsr)}
 * lookup is not used here.
 *
 * @param lsr the LSR to look for
 * @return true if {@code lsr} is equivalent to at least one element of
 *         {@code paradigmLSRs}, otherwise false
 */
public boolean isParadigmLSR(LSR lsr) {
    // Linear search for a very short list (length 6 as of 2019),
    // because we look for equivalence not equality, and
    // HashSet does not support customizing equality.
    // If there are many paradigm LSRs we should revisit this.
    assert paradigmLSRs.size() <= 15;
    for (LSR plsr : paradigmLSRs) {
        if (lsr.isEquivalentTo(plsr)) {
            return true;
        }
    }
    return false;
}
// VisibleForTesting
@ -455,9 +514,6 @@ public class LocaleDistance {
return defaultDemotionPerDesiredLocale;
}
// TODO: When we build data offline,
// write test code to compare the loaded table with the builder output.
// Fail if different, with instructions for how to update the data file.
// VisibleForTesting
public Map<String, Integer> testOnlyGetDistanceTable() {
Map<String, Integer> map = new TreeMap<>();

View file

@ -87,7 +87,8 @@ public final class XLikelySubtags {
String[] lsrSubtags = getValue(likelyTable, "lsrs", value).getStringArray();
LSR[] lsrs = new LSR[lsrSubtags.length / 3];
for (int i = 0, j = 0; i < lsrSubtags.length; i += 3, ++j) {
lsrs[j] = new LSR(lsrSubtags[i], lsrSubtags[i + 1], lsrSubtags[i + 2]);
lsrs[j] = new LSR(lsrSubtags[i], lsrSubtags[i + 1], lsrSubtags[i + 2],
LSR.IMPLICIT_LSR);
}
return new Data(languageAliases, regionAliases, trie, lsrs);
@ -185,7 +186,7 @@ public final class XLikelySubtags {
String tag = locale.toLanguageTag();
assert tag.startsWith("x-");
// Private use language tag x-subtag-subtag...
return new LSR(tag, "", "");
return new LSR(tag, "", "", LSR.EXPLICIT_LSR);
}
return makeMaximizedLsr(locale.getLanguage(), locale.getScript(), locale.getCountry(),
locale.getVariant());
@ -195,7 +196,7 @@ public final class XLikelySubtags {
String tag = locale.toLanguageTag();
if (tag.startsWith("x-")) {
// Private use language tag x-subtag-subtag...
return new LSR(tag, "", "");
return new LSR(tag, "", "", LSR.EXPLICIT_LSR);
}
return makeMaximizedLsr(locale.getLanguage(), locale.getScript(), locale.getCountry(),
locale.getVariant());
@ -209,29 +210,34 @@ public final class XLikelySubtags {
switch (region.charAt(1)) {
case 'A':
return new LSR(PSEUDO_ACCENTS_PREFIX + language,
PSEUDO_ACCENTS_PREFIX + script, region);
PSEUDO_ACCENTS_PREFIX + script, region, LSR.EXPLICIT_LSR);
case 'B':
return new LSR(PSEUDO_BIDI_PREFIX + language,
PSEUDO_BIDI_PREFIX + script, region);
PSEUDO_BIDI_PREFIX + script, region, LSR.EXPLICIT_LSR);
case 'C':
return new LSR(PSEUDO_CRACKED_PREFIX + language,
PSEUDO_CRACKED_PREFIX + script, region);
PSEUDO_CRACKED_PREFIX + script, region, LSR.EXPLICIT_LSR);
default: // normal locale
break;
}
}
if (variant.startsWith("PS")) {
int lsrFlags = region.isEmpty() ?
LSR.EXPLICIT_LANGUAGE | LSR.EXPLICIT_SCRIPT : LSR.EXPLICIT_LSR;
switch (variant) {
case "PSACCENT":
return new LSR(PSEUDO_ACCENTS_PREFIX + language,
PSEUDO_ACCENTS_PREFIX + script, region.isEmpty() ? "XA" : region);
PSEUDO_ACCENTS_PREFIX + script,
region.isEmpty() ? "XA" : region, lsrFlags);
case "PSBIDI":
return new LSR(PSEUDO_BIDI_PREFIX + language,
PSEUDO_BIDI_PREFIX + script, region.isEmpty() ? "XB" : region);
PSEUDO_BIDI_PREFIX + script,
region.isEmpty() ? "XB" : region, lsrFlags);
case "PSCRACK":
return new LSR(PSEUDO_CRACKED_PREFIX + language,
PSEUDO_CRACKED_PREFIX + script, region.isEmpty() ? "XC" : region);
PSEUDO_CRACKED_PREFIX + script,
region.isEmpty() ? "XC" : region, lsrFlags);
default: // normal locale
break;
}
@ -257,7 +263,7 @@ public final class XLikelySubtags {
region = "";
}
if (!script.isEmpty() && !region.isEmpty() && !language.isEmpty()) {
return new LSR(language, script, region); // already maximized
return new LSR(language, script, region, LSR.EXPLICIT_LSR); // already maximized
}
int retainOldMask = 0;
@ -340,6 +346,7 @@ public final class XLikelySubtags {
}
if (retainOldMask == 0) {
assert result.flags == LSR.IMPLICIT_LSR;
return result;
}
if ((retainOldMask & 4) == 0) {
@ -351,7 +358,8 @@ public final class XLikelySubtags {
if ((retainOldMask & 1) == 0) {
region = result.region;
}
return new LSR(language, script, region);
// retainOldMask flags = LSR explicit-subtag flags
return new LSR(language, script, region, retainOldMask);
}
private static final int trieNext(BytesTrie iter, String s, int i) {
@ -411,9 +419,9 @@ public final class XLikelySubtags {
boolean favorRegionOk = false;
if (result.script.equals(value00.script)) { //script is default
if (result.region.equals(value00.region)) {
return new LSR(result.language, "", "");
return new LSR(result.language, "", "", LSR.DONT_CARE_FLAGS);
} else if (fieldToFavor == ULocale.Minimize.FAVOR_REGION) {
return new LSR(result.language, "", result.region);
return new LSR(result.language, "", result.region, LSR.DONT_CARE_FLAGS);
} else {
favorRegionOk = true;
}
@ -423,9 +431,9 @@ public final class XLikelySubtags {
// Maybe do later, but for now use the straightforward code.
LSR result2 = maximize(languageIn, scriptIn, "");
if (result2.equals(result)) {
return new LSR(result.language, result.script, "");
return new LSR(result.language, result.script, "", LSR.DONT_CARE_FLAGS);
} else if (favorRegionOk) {
return new LSR(result.language, "", result.region);
return new LSR(result.language, "", result.region, LSR.DONT_CARE_FLAGS);
}
return result;
}

View file

@ -64,7 +64,7 @@ import com.ibm.icu.impl.locale.XLikelySubtags;
* @stable ICU 4.4
*/
public final class LocaleMatcher {
// LSR for the "und" language tag with empty script and region.
// Built with the four-argument constructor and the LSR.EXPLICIT_LSR flags.
private static final LSR UND_LSR = new LSR("und","","", LSR.EXPLICIT_LSR);
// In ULocale, "und" and "" make the same object.
private static final ULocale UND_ULOCALE = new ULocale("und");
// In Locale, "und" and "" make different objects.
@ -680,6 +680,9 @@ public final class LocaleMatcher {
builder.demotion == Demotion.NONE ? 0 :
LocaleDistance.INSTANCE.getDefaultDemotionPerDesiredLocale(); // null or REGION
favorSubtag = builder.favor;
if (TRACE_MATCHER) {
System.err.printf("new LocaleMatcher: %s\n", toString());
}
}
private static final void putIfAbsent(Map<LSR, Integer> lsrToIndex, LSR lsr, int i) {
@ -938,26 +941,34 @@ public final class LocaleMatcher {
private int getBestSuppIndex(LSR desiredLSR, LsrIterator remainingIter) {
int desiredIndex = 0;
int bestSupportedLsrIndex = -1;
for (int bestDistance = thresholdDistance;;) {
StringBuilder sb = null;
if (TRACE_MATCHER) {
sb = new StringBuilder("LocaleMatcher desired:");
}
for (int bestShiftedDistance = LocaleDistance.shiftDistance(thresholdDistance);;) {
if (TRACE_MATCHER) {
sb.append(' ').append(desiredLSR);
}
// Quick check for exact maximized LSR.
Integer index = supportedLsrToIndex.get(desiredLSR);
if (index != null) {
int suppIndex = index;
if (TRACE_MATCHER) {
System.err.printf("Returning %s: desiredLSR=supportedLSR\n",
supportedULocales[suppIndex]);
System.err.printf("%s --> best=%s: desiredLSR=supportedLSR\n",
sb, supportedULocales[suppIndex]);
}
if (remainingIter != null) { remainingIter.rememberCurrent(desiredIndex); }
return suppIndex;
}
int bestIndexAndDistance = LocaleDistance.INSTANCE.getBestIndexAndDistance(
desiredLSR, supportedLSRs, bestDistance, favorSubtag);
desiredLSR, supportedLSRs, bestShiftedDistance, favorSubtag);
if (bestIndexAndDistance >= 0) {
bestDistance = bestIndexAndDistance & 0xff;
bestShiftedDistance = LocaleDistance.getShiftedDistance(bestIndexAndDistance);
if (remainingIter != null) { remainingIter.rememberCurrent(desiredIndex); }
bestSupportedLsrIndex = bestIndexAndDistance >> 8;
bestSupportedLsrIndex = LocaleDistance.getIndex(bestIndexAndDistance);
}
if ((bestDistance -= demotionPerDesiredLocale) <= 0) {
if ((bestShiftedDistance -= LocaleDistance.shiftDistance(demotionPerDesiredLocale))
<= 0) {
break;
}
if (remainingIter == null || !remainingIter.hasNext()) {
@ -968,14 +979,14 @@ public final class LocaleMatcher {
}
if (bestSupportedLsrIndex < 0) {
if (TRACE_MATCHER) {
System.err.printf("Returning default %s: no good match\n", defaultULocale);
System.err.printf("%s --> best=default %s: no good match\n", sb, defaultULocale);
}
return -1;
}
int suppIndex = supportedIndexes[bestSupportedLsrIndex];
if (TRACE_MATCHER) {
System.err.printf("Returning %s: best matching supported locale\n",
supportedULocales[suppIndex]);
System.err.printf("%s --> best=%s: best matching supported locale\n",
sb, supportedULocales[suppIndex]);
}
return suppIndex;
}
@ -1000,11 +1011,16 @@ public final class LocaleMatcher {
@Deprecated
public double match(ULocale desired, ULocale desiredMax, ULocale supported, ULocale supportedMax) {
// Returns the inverse of the distance: That is, 1-distance(desired, supported).
int distance = LocaleDistance.INSTANCE.getBestIndexAndDistance(
int indexAndDistance = LocaleDistance.INSTANCE.getBestIndexAndDistance(
getMaximalLsrOrUnd(desired),
new LSR[] { getMaximalLsrOrUnd(supported) },
thresholdDistance, favorSubtag) & 0xff;
return (100 - distance) / 100.0;
LocaleDistance.shiftDistance(thresholdDistance), favorSubtag);
double distance = LocaleDistance.getDistanceDouble(indexAndDistance);
if (TRACE_MATCHER) {
System.err.printf("LocaleMatcher distance(desired=%s, supported=%s)=%g\n",
Objects.toString(desired), Objects.toString(supported), distance);
}
return (100.0 - distance) / 100.0;
}
/**
@ -1032,16 +1048,17 @@ public final class LocaleMatcher {
@Override
public String toString() {
StringBuilder s = new StringBuilder().append("{LocaleMatcher");
if (supportedULocales.length > 0) {
s.append(" supported={").append(supportedULocales[0].toString());
for (int i = 1; i < supportedULocales.length; ++i) {
s.append(", ").append(supportedULocales[i].toString());
// Supported languages in the order that we try to match them.
if (supportedLSRs.length > 0) {
s.append(" supportedLSRs={").append(supportedLSRs[0].toString());
for (int i = 1; i < supportedLSRs.length; ++i) {
s.append(", ").append(supportedLSRs[i].toString());
}
s.append('}');
}
s.append(" default=").append(Objects.toString(defaultULocale));
if (favorSubtag != null) {
s.append(" distance=").append(favorSubtag.toString());
s.append(" favor=").append(favorSubtag.toString());
}
if (thresholdDistance >= 0) {
s.append(String.format(" threshold=%d", thresholdDistance));

View file

@ -1052,9 +1052,9 @@ en >> en-DE
ar-EG >> ar-SY
pt-BR >> pt
ar-XB >> ar-XB
ar-PSBIDI >> ar-XB # These are equivalent.
ar-PSBIDI >> ar-PSBIDI
en-XA >> en-XA
en-PSACCENT >> en-XA # These are equivalent.
en-PSACCENT >> en-PSACCENT
ar-PSCRACK >> ar-PSCRACK
@favor=script
@ -1063,9 +1063,9 @@ en >> en-DE
ar-EG >> ar-SY
pt-BR >> pt
ar-XB >> ar-XB
ar-PSBIDI >> ar-XB # These are equivalent.
ar-PSBIDI >> ar-PSBIDI
en-XA >> en-XA
en-PSACCENT >> en-XA # These are equivalent.
en-PSACCENT >> en-PSACCENT
ar-PSCRACK >> ar-PSCRACK
** test: BestMatchForTraditionalChinese
@ -1544,50 +1544,44 @@ zh-TW, en >> en-US
zh-Hant-CN, en >> en-US
zh-Hans, en >> zh-Hans-CN
** test: return first among likely-subtags equivalent locales
# Was: more specific script should win in case regions are identical
# with some different results.
** test: return most originally similar among likely-subtags equivalent locales
@supported=af, af-Latn, af-Arab
af >> af
af-ZA >> af
af-Latn-ZA >> af
af-Latn >> af
af-Latn-ZA >> af-Latn
af-Latn >> af-Latn
@favor=script
af >> af
af-ZA >> af
af-Latn-ZA >> af
af-Latn >> af
af-Latn-ZA >> af-Latn
af-Latn >> af-Latn
# Was: more specific region should win
# with some different results.
@supported=nl, nl-NL, nl-BE
@favor=
nl >> nl
nl-Latn >> nl
nl-Latn-NL >> nl
nl-NL >> nl
nl-Latn-NL >> nl-NL
nl-NL >> nl-NL
@favor=script
nl >> nl
nl-Latn >> nl
nl-Latn-NL >> nl
nl-NL >> nl
nl-Latn-NL >> nl-NL
nl-NL >> nl-NL
# Was: more specific region wins over more specific script
# with some different results.
@supported=nl, nl-Latn, nl-NL, nl-BE
@favor=
nl >> nl
nl-Latn >> nl
nl-NL >> nl
nl-Latn-NL >> nl
nl-Latn >> nl-Latn
nl-NL >> nl-NL
nl-Latn-NL >> nl-Latn
@favor=script
nl >> nl
nl-Latn >> nl
nl-NL >> nl
nl-Latn-NL >> nl
nl-Latn >> nl-Latn
nl-NL >> nl-NL
nl-Latn-NL >> nl-Latn
** test: region may replace matched if matched is enclosing
@supported=es-419, es
@ -1670,22 +1664,22 @@ ja-Jpan-JP, en-GB >> ja
** test: pick best maximized tag
@supported=ja, ja-Jpan-US, ja-JP, en, ru
ja-Jpan, ru >> ja
ja-JP, ru >> ja
ja-JP, ru >> ja-JP
ja-US, ru >> ja-Jpan-US
@favor=script
ja-Jpan, ru >> ja
ja-JP, ru >> ja
ja-JP, ru >> ja-JP
ja-US, ru >> ja-Jpan-US
** test: termination: pick best maximized match
@supported=ja, ja-Jpan, ja-JP, en, ru
ja-Jpan-JP, ru >> ja
ja-Jpan, ru >> ja
ja-Jpan-JP, ru >> ja-Jpan
ja-Jpan, ru >> ja-Jpan
@favor=script
ja-Jpan-JP, ru >> ja
ja-Jpan, ru >> ja
ja-Jpan-JP, ru >> ja-Jpan
ja-Jpan, ru >> ja-Jpan
** test: same language over exact, but distinguish when user is explicit
@supported=fr, en-GB, ja, es-ES, es-MX
@ -1900,7 +1894,7 @@ zh-TW >> zh
** test: testGetBestMatchWithMinMatchScore
@supported=fr-FR, fr, fr-CA, en
@default=und
fr >> fr-FR # First likely-subtags equivalent match is chosen.
fr >> fr
@supported=en, fr, fr-CA
fr-FR >> fr # Parent match is chosen.
@supported=en, fr-CA
@ -1930,7 +1924,7 @@ ru >> und
@favor=script
@supported=fr-FR, fr, fr-CA, en
fr >> fr-FR
fr >> fr
@supported=en, fr, fr-CA
fr-FR >> fr
@supported=en, fr-CA

View file

@ -139,10 +139,11 @@ public class LikelySubtagsBuilder {
Map<LSR, Integer> lsrIndexes = new LinkedHashMap<>();
// Reserve index 0 as "no value":
// The runtime lookup returns 0 for an intermediate match with no value.
lsrIndexes.put(new LSR("", "", ""), 0); // arbitrary LSR
lsrIndexes.put(new LSR("", "", "", LSR.DONT_CARE_FLAGS), 0); // arbitrary LSR
// Reserve index 1 for SKIP_SCRIPT:
// The runtime lookup returns 1 for an intermediate match with a value.
lsrIndexes.put(new LSR("skip", "script", ""), 1); // looks good when printing the data
// This LSR looks good when printing the data.
lsrIndexes.put(new LSR("skip", "script", "", LSR.DONT_CARE_FLAGS), 1);
// We could prefill the lsrList with common locales to give them small indexes,
// and see if that improves performance a little.
for (Map.Entry<String, Map<String, Map<String, LSR>>> ls : langTable.entrySet()) {
@ -251,7 +252,7 @@ public class LikelySubtagsBuilder {
}
}
// hack
set(result, "und", "Latn", "", new LSR("en", "Latn", "US"));
set(result, "und", "Latn", "", new LSR("en", "Latn", "US", LSR.DONT_CARE_FLAGS));
// hack, ensure that if und-YY => und-Xxxx-YY, then we add Xxxx=>YY to the table
// <likelySubtag from="und_GH" to="ak_Latn_GH"/>
@ -294,7 +295,9 @@ public class LikelySubtagsBuilder {
String lang = parts[0];
String p2 = parts.length < 2 ? "" : parts[1];
String p3 = parts.length < 3 ? "" : parts[2];
return p2.length() < 4 ? new LSR(lang, "", p2) : new LSR(lang, p2, p3);
return p2.length() < 4 ?
new LSR(lang, "", p2, LSR.DONT_CARE_FLAGS) :
new LSR(lang, p2, p3, LSR.DONT_CARE_FLAGS);
}
private static void set(Map<String, Map<String, Map<String, LSR>>> langTable,

View file

@ -487,7 +487,10 @@ public final class LocaleDistanceBuilder {
Set<LSR> paradigmLSRs = new HashSet<>(); // could be TreeSet if LSR were Comparable
for (String paradigm : paradigms) {
ULocale pl = new ULocale(paradigm);
paradigmLSRs.add(XLikelySubtags.INSTANCE.makeMaximizedLsrFrom(pl));
LSR max = XLikelySubtags.INSTANCE.makeMaximizedLsrFrom(pl);
// Clear the LSR flags to make the data equality test in
// LocaleDistanceTest happy.
paradigmLSRs.add(new LSR(max.language, max.script, max.region, LSR.DONT_CARE_FLAGS));
}
TerritoryContainment tc = new TerritoryContainment(supplementalData);