From ad638c274e954268779ea72e6e7869c75ccf7a79 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Fri, 20 Dec 2019 00:09:10 +0000 Subject: [PATCH] ICU-20916 LocaleMatcher distinguish between equivalent locales - equivalent but originally unequal - locale distance shifted left for additional fraction bits with micro distance - Java more verbose matcher debug output See #949 --- icu4c/source/common/localematcher.cpp | 70 +++++++----- icu4c/source/common/locdistance.cpp | 49 ++++++--- icu4c/source/common/locdistance.h | 41 ++++++- icu4c/source/common/loclikelysubtags.cpp | 40 ++++--- icu4c/source/common/lsr.cpp | 29 +++-- icu4c/source/common/lsr.h | 16 ++- .../test/testdata/localeMatcherTest.txt | 60 +++++------ .../core/src/com/ibm/icu/impl/locale/LSR.java | 24 ++++- .../ibm/icu/impl/locale/LocaleDistance.java | 100 ++++++++++++++---- .../ibm/icu/impl/locale/XLikelySubtags.java | 38 ++++--- .../src/com/ibm/icu/util/LocaleMatcher.java | 55 ++++++---- .../dev/test/util/data/localeMatcherTest.txt | 60 +++++------ .../dev/tool/locale/LikelySubtagsBuilder.java | 11 +- .../tool/locale/LocaleDistanceBuilder.java | 5 +- 14 files changed, 392 insertions(+), 206 deletions(-) diff --git a/icu4c/source/common/localematcher.cpp b/icu4c/source/common/localematcher.cpp index d975fe759b4..0723bc1d459 100644 --- a/icu4c/source/common/localematcher.cpp +++ b/icu4c/source/common/localematcher.cpp @@ -22,7 +22,7 @@ #include "uhash.h" #include "uvector.h" -#define UND_LSR LSR("und", "", "") +#define UND_LSR LSR("und", "", "", LSR::EXPLICIT_LSR) /** * Indicator for the lifetime of desired-locale objects passed into the LocaleMatcher. @@ -393,26 +393,27 @@ LocaleMatcher::LocaleMatcher(const Builder &builder, UErrorCode &errorCode) : // 3. Remaining locales in builder order. // In Java, we use a LinkedHashMap for both map & ordered lists. // In C++, we use separate structures. - // We over-allocate arrays of LSRs and indexes for simplicity. - // We reserve slots at the array starts for the default and paradigm locales, - // plus enough for all supported locales. - // If there are few paradigm locales and few duplicate supported LSRs, - // then the amount of wasted space is small. + // + // We allocate arrays of LSRs and indexes, + // with as many slots as supported locales, for simplicity. + // We write the default and paradigm LSRs starting from the front of the arrays, + // and others starting from the back. + // At the end we reverse the non-paradigm LSRs. + // We end up wasting as many array slots as there are duplicate supported LSRs, + // but the amount of wasted space is small as long as there are few duplicates. supportedLsrToIndex = uhash_openSize(hashLSR, compareLSRs, uhash_compareLong, supportedLocalesLength, &errorCode); if (U_FAILURE(errorCode)) { return; } - int32_t paradigmLimit = 1 + localeDistance.getParadigmLSRsLength(); - int32_t suppLSRsCapacity = paradigmLimit + supportedLocalesLength; supportedLSRs = static_cast( - uprv_malloc(suppLSRsCapacity * sizeof(const LSR *))); + uprv_malloc(supportedLocalesLength * sizeof(const LSR *))); supportedIndexes = static_cast( - uprv_malloc(suppLSRsCapacity * sizeof(int32_t))); + uprv_malloc(supportedLocalesLength * sizeof(int32_t))); if (supportedLSRs == nullptr || supportedIndexes == nullptr) { errorCode = U_MEMORY_ALLOCATION_ERROR; return; } int32_t paradigmIndex = 0; - int32_t otherIndex = paradigmLimit; + int32_t otherIndex = supportedLocalesLength; if (idef >= 0) { uhash_puti(supportedLsrToIndex, const_cast(defLSR), idef + 1, &errorCode); supportedLSRs[0] = defLSR; @@ -446,21 +447,32 @@ LocaleMatcher::LocaleMatcher(const Builder &builder, UErrorCode &errorCode) : supportedLSRs[paradigmIndex] = &lsr; supportedIndexes[paradigmIndex++] = i; } else { - supportedLSRs[otherIndex] = &lsr; - supportedIndexes[otherIndex++] = i; + supportedLSRs[--otherIndex] = &lsr; + supportedIndexes[otherIndex] = i; } } } if (U_FAILURE(errorCode)) { return; } } - // Squeeze out unused array slots. - if (paradigmIndex < paradigmLimit && paradigmLimit < otherIndex) { - uprv_memmove(supportedLSRs + paradigmIndex, supportedLSRs + paradigmLimit, - (otherIndex - paradigmLimit) * sizeof(const LSR *)); - uprv_memmove(supportedIndexes + paradigmIndex, supportedIndexes + paradigmLimit, - (otherIndex - paradigmLimit) * sizeof(int32_t)); + // Reverse the non-paradigm LSRs to be in order, right after the paradigm LSRs. + // First fill the unused slots between paradigm LSRs and other LSRs. + // This gap is as large as the number of locales with duplicate LSRs. + int32_t i = paradigmIndex; + int32_t j = supportedLocalesLength - 1; + while (i < otherIndex && otherIndex <= j) { + supportedLSRs[i] = supportedLSRs[j]; + supportedIndexes[i++] = supportedIndexes[j--]; } - supportedLSRsLength = otherIndex - (paradigmLimit - paradigmIndex); + // Swap remaining non-paradigm LSRs in place. + while (i < j) { + const LSR *tempLSR = supportedLSRs[i]; + supportedLSRs[i] = supportedLSRs[j]; + supportedLSRs[j] = tempLSR; + int32_t tempIndex = supportedIndexes[i]; + supportedIndexes[i++] = supportedIndexes[j]; + supportedIndexes[j--] = tempIndex; + } + supportedLSRsLength = supportedLocalesLength - (otherIndex - paradigmIndex); } if (def != nullptr && (idef < 0 || def != supportedLocales[idef])) { @@ -662,7 +674,7 @@ int32_t LocaleMatcher::getBestSuppIndex(LSR desiredLSR, LocaleLsrIterator *remai if (U_FAILURE(errorCode)) { return -1; } int32_t desiredIndex = 0; int32_t bestSupportedLsrIndex = -1; - for (int32_t bestDistance = thresholdDistance;;) { + for (int32_t bestShiftedDistance = LocaleDistance::shiftDistance(thresholdDistance);;) { // Quick check for exact maximized LSR. // Returns suppIndex+1 where 0 means not found. if (supportedLsrToIndex != nullptr) { @@ -677,16 +689,17 @@ int32_t LocaleMatcher::getBestSuppIndex(LSR desiredLSR, LocaleLsrIterator *remai } } int32_t bestIndexAndDistance = localeDistance.getBestIndexAndDistance( - desiredLSR, supportedLSRs, supportedLSRsLength, bestDistance, favorSubtag); + desiredLSR, supportedLSRs, supportedLSRsLength, bestShiftedDistance, favorSubtag); if (bestIndexAndDistance >= 0) { - bestDistance = bestIndexAndDistance & 0xff; + bestShiftedDistance = LocaleDistance::getShiftedDistance(bestIndexAndDistance); if (remainingIter != nullptr) { remainingIter->rememberCurrent(desiredIndex, errorCode); if (U_FAILURE(errorCode)) { return -1; } } - bestSupportedLsrIndex = bestIndexAndDistance >= 0 ? bestIndexAndDistance >> 8 : -1; + bestSupportedLsrIndex = bestIndexAndDistance >= 0 ? + LocaleDistance::getIndex(bestIndexAndDistance) : -1; } - if ((bestDistance -= demotionPerDesiredLocale) <= 0) { + if ((bestShiftedDistance -= LocaleDistance::shiftDistance(demotionPerDesiredLocale)) <= 0) { break; } if (remainingIter == nullptr || !remainingIter->hasNext()) { @@ -708,11 +721,12 @@ double LocaleMatcher::internalMatch(const Locale &desired, const Locale &support LSR suppLSR = getMaximalLsrOrUnd(likelySubtags, supported, errorCode); if (U_FAILURE(errorCode)) { return 0; } const LSR *pSuppLSR = &suppLSR; - int32_t distance = localeDistance.getBestIndexAndDistance( + int32_t indexAndDistance = localeDistance.getBestIndexAndDistance( getMaximalLsrOrUnd(likelySubtags, desired, errorCode), &pSuppLSR, 1, - thresholdDistance, favorSubtag) & 0xff; - return (100 - distance) / 100.0; + LocaleDistance::shiftDistance(thresholdDistance), favorSubtag); + double distance = LocaleDistance::getDistanceDouble(indexAndDistance); + return (100.0 - distance) / 100.0; } U_NAMESPACE_END diff --git a/icu4c/source/common/locdistance.cpp b/icu4c/source/common/locdistance.cpp index 800d0eacf2b..50633cc8289 100644 --- a/icu4c/source/common/locdistance.cpp +++ b/icu4c/source/common/locdistance.cpp @@ -97,17 +97,23 @@ LocaleDistance::LocaleDistance(const LocaleDistanceData &data) : // a mere region difference for one desired locale // is as good as a perfect match for the next following desired locale. // As of CLDR 36, we have . - LSR en("en", "Latn", "US"); - LSR enGB("en", "Latn", "GB"); + LSR en("en", "Latn", "US", LSR::EXPLICIT_LSR); + LSR enGB("en", "Latn", "GB", LSR::EXPLICIT_LSR); const LSR *p_enGB = &enGB; - defaultDemotionPerDesiredLocale = getBestIndexAndDistance(en, &p_enGB, 1, - 50, ULOCMATCH_FAVOR_LANGUAGE) & 0xff; + int32_t indexAndDistance = getBestIndexAndDistance(en, &p_enGB, 1, + shiftDistance(50), ULOCMATCH_FAVOR_LANGUAGE); + defaultDemotionPerDesiredLocale = getDistanceFloor(indexAndDistance); } int32_t LocaleDistance::getBestIndexAndDistance( const LSR &desired, const LSR **supportedLSRs, int32_t supportedLSRsLength, - int32_t threshold, ULocMatchFavorSubtag favorSubtag) const { + int32_t shiftedThreshold, ULocMatchFavorSubtag favorSubtag) const { + // Round up the shifted threshold (if fraction bits are not 0) + // for comparison with un-shifted distances until we need fraction bits. + // (If we simply shifted non-zero fraction bits away, then we might ignore a language + // when it's really still a micro distance below the threshold.) + int32_t roundedThreshold = (shiftedThreshold + DISTANCE_FRACTION_MASK) >> DISTANCE_SHIFT; BytesTrie iter(trie); // Look up the desired language only once for all supported LSRs. // Its "distance" is either a match point value of 0, or a non-match negative value. @@ -153,7 +159,7 @@ int32_t LocaleDistance::getBestIndexAndDistance( if (favorSubtag == ULOCMATCH_FAVOR_SCRIPT) { distance >>= 2; } - if (distance >= threshold) { + if (distance >= roundedThreshold) { continue; } @@ -171,7 +177,7 @@ int32_t LocaleDistance::getBestIndexAndDistance( scriptDistance &= ~DISTANCE_IS_FINAL; } distance += scriptDistance; - if (distance >= threshold) { + if (distance >= roundedThreshold) { continue; } @@ -180,7 +186,7 @@ int32_t LocaleDistance::getBestIndexAndDistance( } else if (star || (flags & DISTANCE_IS_FINAL) != 0) { distance += defaultRegionDistance; } else { - int32_t remainingThreshold = threshold - distance; + int32_t remainingThreshold = roundedThreshold - distance; if (minRegionDistance >= remainingThreshold) { continue; } @@ -196,15 +202,23 @@ int32_t LocaleDistance::getBestIndexAndDistance( partitionsForRegion(supported), remainingThreshold); } - if (distance < threshold) { - if (distance == 0) { - return slIndex << 8; + int32_t shiftedDistance = shiftDistance(distance); + if (shiftedDistance == 0) { + // Distinguish between equivalent but originally unequal locales via an + // additional micro distance. + shiftedDistance |= (desired.flags ^ supported.flags); + } + if (shiftedDistance < shiftedThreshold) { + if (shiftedDistance == 0) { + return slIndex << INDEX_SHIFT; } bestIndex = slIndex; - threshold = distance; + shiftedThreshold = shiftedDistance; } } - return bestIndex >= 0 ? (bestIndex << 8) | threshold : 0xffffff00 | ABOVE_THRESHOLD; + return bestIndex >= 0 ? + (bestIndex << INDEX_SHIFT) | shiftedThreshold : + INDEX_NEG_1 | shiftDistance(ABOVE_THRESHOLD); } int32_t LocaleDistance::getDesSuppScriptDistance( @@ -352,11 +366,14 @@ int32_t LocaleDistance::trieNext(BytesTrie &iter, const char *s, bool wantValue) } UBool LocaleDistance::isParadigmLSR(const LSR &lsr) const { - // Linear search for a very short list (length 6 as of 2019). - // If there are many paradigm LSRs we should use a hash set. + // Linear search for a very short list (length 6 as of 2019), + // because we look for equivalence not equality, and + // because it's easy. + // If there are many paradigm LSRs we should use a hash set + // with custom comparator and hasher. U_ASSERT(paradigmLSRsLength <= 15); for (int32_t i = 0; i < paradigmLSRsLength; ++i) { - if (lsr == paradigmLSRs[i]) { return true; } + if (lsr.isEquivalentTo(paradigmLSRs[i])) { return true; } } return false; } diff --git a/icu4c/source/common/locdistance.h b/icu4c/source/common/locdistance.h index 7439f51c56b..0ee3d0e63e9 100644 --- a/icu4c/source/common/locdistance.h +++ b/icu4c/source/common/locdistance.h @@ -26,19 +26,36 @@ class LocaleDistance final : public UMemory { public: static const LocaleDistance *getSingleton(UErrorCode &errorCode); + static int32_t shiftDistance(int32_t distance) { + return distance << DISTANCE_SHIFT; + } + + static int32_t getShiftedDistance(int32_t indexAndDistance) { + return indexAndDistance & DISTANCE_MASK; + } + + static double getDistanceDouble(int32_t indexAndDistance) { + double shiftedDistance = getShiftedDistance(indexAndDistance); + return shiftedDistance / (1 << DISTANCE_SHIFT); + } + + static int32_t getIndex(int32_t indexAndDistance) { + // assert indexAndDistance >= 0; + return indexAndDistance >> INDEX_SHIFT; + } + /** * Finds the supported LSR with the smallest distance from the desired one. * Equivalent LSR subtags must be normalized into a canonical form. * - *

Returns the index of the lowest-distance supported LSR in bits 31..8 + *

Returns the index of the lowest-distance supported LSR in the high bits * (negative if none has a distance below the threshold), - * and its distance (0..ABOVE_THRESHOLD) in bits 7..0. + * and its distance (0..ABOVE_THRESHOLD) in the low bits. */ int32_t getBestIndexAndDistance(const LSR &desired, const LSR **supportedLSRs, int32_t supportedLSRsLength, - int32_t threshold, ULocMatchFavorSubtag favorSubtag) const; - - int32_t getParadigmLSRsLength() const { return paradigmLSRsLength; } + int32_t shiftedThreshold, + ULocMatchFavorSubtag favorSubtag) const; UBool isParadigmLSR(const LSR &lsr) const; @@ -51,6 +68,20 @@ public: } private: + // The distance is shifted left to gain some fraction bits. + static constexpr int32_t DISTANCE_SHIFT = 3; + static constexpr int32_t DISTANCE_FRACTION_MASK = 7; + // 7 bits for 0..100 + static constexpr int32_t DISTANCE_INT_SHIFT = 7; + static constexpr int32_t INDEX_SHIFT = DISTANCE_INT_SHIFT + DISTANCE_SHIFT; + static constexpr int32_t DISTANCE_MASK = 0x3ff; + // tic constexpr int32_t MAX_INDEX = 0x1fffff; // avoids sign bit + static constexpr int32_t INDEX_NEG_1 = 0xfffffc00; + + static int32_t getDistanceFloor(int32_t indexAndDistance) { + return (indexAndDistance & DISTANCE_MASK) >> DISTANCE_SHIFT; + } + LocaleDistance(const LocaleDistanceData &data); LocaleDistance(const LocaleDistance &other) = delete; LocaleDistance &operator=(const LocaleDistance &other) = delete; diff --git a/icu4c/source/common/loclikelysubtags.cpp b/icu4c/source/common/loclikelysubtags.cpp index d7f5e124c2c..27f10b3fb92 100644 --- a/icu4c/source/common/loclikelysubtags.cpp +++ b/icu4c/source/common/loclikelysubtags.cpp @@ -250,7 +250,8 @@ struct XLikelySubtagsData { for (int32_t i = 0, j = 0; i < lsrSubtagsLength; i += 3, ++j) { lsrs[j] = LSR(strings.get(lsrSubtagIndexes[i]), strings.get(lsrSubtagIndexes[i + 1]), - strings.get(lsrSubtagIndexes[i + 2])); + strings.get(lsrSubtagIndexes[i + 2]), + LSR::IMPLICIT_LSR); } if (partitionsLength > 0) { @@ -275,7 +276,8 @@ struct XLikelySubtagsData { for (int32_t i = 0, j = 0; i < paradigmSubtagsLength; i += 3, ++j) { paradigms[j] = LSR(strings.get(paradigmSubtagIndexes[i]), strings.get(paradigmSubtagIndexes[i + 1]), - strings.get(paradigmSubtagIndexes[i + 2])); + strings.get(paradigmSubtagIndexes[i + 2]), + LSR::DONT_CARE_FLAGS); } distanceData.paradigms = paradigms; } @@ -383,7 +385,7 @@ LSR XLikelySubtags::makeMaximizedLsrFrom(const Locale &locale, UErrorCode &error const char *name = locale.getName(); if (uprv_isAtSign(name[0]) && name[1] == 'x' && name[2] == '=') { // name.startsWith("@x=") // Private use language tag x-subtag-subtag... - return LSR(name, "", ""); + return LSR(name, "", "", LSR::EXPLICIT_LSR); } return makeMaximizedLsr(locale.getLanguage(), locale.getScript(), locale.getCountry(), locale.getVariant(), errorCode); @@ -407,26 +409,31 @@ LSR XLikelySubtags::makeMaximizedLsr(const char *language, const char *script, c if (region[0] == 'X' && (c1 = region[1]) != 0 && region[2] == 0) { switch (c1) { case 'A': - return LSR(PSEUDO_ACCENTS_PREFIX, language, script, region, errorCode); + return LSR(PSEUDO_ACCENTS_PREFIX, language, script, region, + LSR::EXPLICIT_LSR, errorCode); case 'B': - return LSR(PSEUDO_BIDI_PREFIX, language, script, region, errorCode); + return LSR(PSEUDO_BIDI_PREFIX, language, script, region, + LSR::EXPLICIT_LSR, errorCode); case 'C': - return LSR(PSEUDO_CRACKED_PREFIX, language, script, region, errorCode); + return LSR(PSEUDO_CRACKED_PREFIX, language, script, region, + LSR::EXPLICIT_LSR, errorCode); default: // normal locale break; } } if (variant[0] == 'P' && variant[1] == 'S') { + int32_t lsrFlags = *region == 0 ? + LSR::EXPLICIT_LANGUAGE | LSR::EXPLICIT_SCRIPT : LSR::EXPLICIT_LSR; if (uprv_strcmp(variant, "PSACCENT") == 0) { return LSR(PSEUDO_ACCENTS_PREFIX, language, script, - *region == 0 ? "XA" : region, errorCode); + *region == 0 ? "XA" : region, lsrFlags, errorCode); } else if (uprv_strcmp(variant, "PSBIDI") == 0) { return LSR(PSEUDO_BIDI_PREFIX, language, script, - *region == 0 ? "XB" : region, errorCode); + *region == 0 ? "XB" : region, lsrFlags, errorCode); } else if (uprv_strcmp(variant, "PSCRACK") == 0) { return LSR(PSEUDO_CRACKED_PREFIX, language, script, - *region == 0 ? "XC" : region, errorCode); + *region == 0 ? "XC" : region, lsrFlags, errorCode); } // else normal locale } @@ -448,7 +455,7 @@ LSR XLikelySubtags::maximize(const char *language, const char *script, const cha region = ""; } if (*script != 0 && *region != 0 && *language != 0) { - return LSR(language, script, region); // already maximized + return LSR(language, script, region, LSR::EXPLICIT_LSR); // already maximized } uint32_t retainOldMask = 0; @@ -535,7 +542,7 @@ LSR XLikelySubtags::maximize(const char *language, const char *script, const cha if (retainOldMask == 0) { // Quickly return a copy of the lookup-result LSR // without new allocation of the subtags. - return LSR(result.language, result.script, result.region); + return LSR(result.language, result.script, result.region, result.flags); } if ((retainOldMask & 4) == 0) { language = result.language; @@ -546,7 +553,8 @@ LSR XLikelySubtags::maximize(const char *language, const char *script, const cha if ((retainOldMask & 1) == 0) { region = result.region; } - return LSR(language, script, region); + // retainOldMask flags = LSR explicit-subtag flags + return LSR(language, script, region, retainOldMask); } int32_t XLikelySubtags::trieNext(BytesTrie &iter, const char *s, int32_t i) { @@ -615,9 +623,9 @@ LSR XLikelySubtags::minimizeSubtags(const char *languageIn, const char *scriptIn boolean favorRegionOk = false; if (result.script.equals(value00.script)) { //script is default if (result.region.equals(value00.region)) { - return new LSR(result.language, "", ""); + return new LSR(result.language, "", "", LSR.DONT_CARE_FLAGS); } else if (fieldToFavor == ULocale.Minimize.FAVOR_REGION) { - return new LSR(result.language, "", result.region); + return new LSR(result.language, "", result.region, LSR.DONT_CARE_FLAGS); } else { favorRegionOk = true; } @@ -627,9 +635,9 @@ LSR XLikelySubtags::minimizeSubtags(const char *languageIn, const char *scriptIn // Maybe do later, but for now use the straightforward code. LSR result2 = maximize(languageIn, scriptIn, ""); if (result2.equals(result)) { - return new LSR(result.language, result.script, ""); + return new LSR(result.language, result.script, "", LSR.DONT_CARE_FLAGS); } else if (favorRegionOk) { - return new LSR(result.language, "", result.region); + return new LSR(result.language, "", result.region, LSR.DONT_CARE_FLAGS); } return result; } diff --git a/icu4c/source/common/lsr.cpp b/icu4c/source/common/lsr.cpp index 0c28eeda1bc..a5e10ef1767 100644 --- a/icu4c/source/common/lsr.cpp +++ b/icu4c/source/common/lsr.cpp @@ -14,9 +14,10 @@ U_NAMESPACE_BEGIN -LSR::LSR(char prefix, const char *lang, const char *scr, const char *r, UErrorCode &errorCode) : +LSR::LSR(char prefix, const char *lang, const char *scr, const char *r, int32_t f, + UErrorCode &errorCode) : language(nullptr), script(nullptr), region(r), - regionIndex(indexForRegion(region)) { + regionIndex(indexForRegion(region)), flags(f) { if (U_SUCCESS(errorCode)) { CharString langScript; langScript.append(prefix, errorCode).append(lang, errorCode).append('\0', errorCode); @@ -32,7 +33,8 @@ LSR::LSR(char prefix, const char *lang, const char *scr, const char *r, UErrorCo LSR::LSR(LSR &&other) U_NOEXCEPT : language(other.language), script(other.script), region(other.region), owned(other.owned), - regionIndex(other.regionIndex), hashCode(other.hashCode) { + regionIndex(other.regionIndex), flags(other.flags), + hashCode(other.hashCode) { if (owned != nullptr) { other.language = other.script = ""; other.owned = nullptr; @@ -50,6 +52,7 @@ LSR &LSR::operator=(LSR &&other) U_NOEXCEPT { script = other.script; region = other.region; regionIndex = other.regionIndex; + flags = other.flags; owned = other.owned; hashCode = other.hashCode; if (owned != nullptr) { @@ -60,7 +63,7 @@ LSR &LSR::operator=(LSR &&other) U_NOEXCEPT { return *this; } -UBool LSR::operator==(const LSR &other) const { +UBool LSR::isEquivalentTo(const LSR &other) const { return uprv_strcmp(language, other.language) == 0 && uprv_strcmp(script, other.script) == 0 && @@ -69,6 +72,16 @@ UBool LSR::operator==(const LSR &other) const { (regionIndex > 0 || uprv_strcmp(region, other.region) == 0); } +UBool LSR::operator==(const LSR &other) const { + return + uprv_strcmp(language, other.language) == 0 && + uprv_strcmp(script, other.script) == 0 && + regionIndex == other.regionIndex && + // Compare regions if both are ill-formed (and their indexes are 0). + (regionIndex > 0 || uprv_strcmp(region, other.region) == 0) && + flags == other.flags; +} + int32_t LSR::indexForRegion(const char *region) { int32_t c = region[0]; int32_t a = c - '0'; @@ -90,10 +103,10 @@ int32_t LSR::indexForRegion(const char *region) { LSR &LSR::setHashCode() { if (hashCode == 0) { - hashCode = - (ustr_hashCharsN(language, static_cast(uprv_strlen(language))) * 37 + - ustr_hashCharsN(script, static_cast(uprv_strlen(script)))) * 37 + - regionIndex; + int32_t h = ustr_hashCharsN(language, static_cast(uprv_strlen(language))); + h = h * 37 + ustr_hashCharsN(script, static_cast(uprv_strlen(script))); + h = h * 37 + regionIndex; + hashCode = h * 37 + flags; } return *this; } diff --git a/icu4c/source/common/lsr.h b/icu4c/source/common/lsr.h index db6cf938f47..d535e5b0376 100644 --- a/icu4c/source/common/lsr.h +++ b/icu4c/source/common/lsr.h @@ -16,26 +16,35 @@ U_NAMESPACE_BEGIN struct LSR final : public UMemory { static constexpr int32_t REGION_INDEX_LIMIT = 1001 + 26 * 26; + static constexpr int32_t EXPLICIT_LSR = 7; + static constexpr int32_t EXPLICIT_LANGUAGE = 4; + static constexpr int32_t EXPLICIT_SCRIPT = 2; + static constexpr int32_t EXPLICIT_REGION = 1; + static constexpr int32_t IMPLICIT_LSR = 0; + static constexpr int32_t DONT_CARE_FLAGS = 0; + const char *language; const char *script; const char *region; char *owned = nullptr; /** Index for region, 0 if ill-formed. @see indexForRegion */ int32_t regionIndex = 0; + int32_t flags = 0; /** Only set for LSRs that will be used in a hash table. */ int32_t hashCode = 0; LSR() : language("und"), script(""), region("") {} /** Constructor which aliases all subtag pointers. */ - LSR(const char *lang, const char *scr, const char *r) : + LSR(const char *lang, const char *scr, const char *r, int32_t f) : language(lang), script(scr), region(r), - regionIndex(indexForRegion(region)) {} + regionIndex(indexForRegion(region)), flags(f) {} /** * Constructor which prepends the prefix to the language and script, * copies those into owned memory, and aliases the region. */ - LSR(char prefix, const char *lang, const char *scr, const char *r, UErrorCode &errorCode); + LSR(char prefix, const char *lang, const char *scr, const char *r, int32_t f, + UErrorCode &errorCode); LSR(LSR &&other) U_NOEXCEPT; LSR(const LSR &other) = delete; inline ~LSR() { @@ -55,6 +64,7 @@ struct LSR final : public UMemory { */ static int32_t indexForRegion(const char *region); + UBool isEquivalentTo(const LSR &other) const; UBool operator==(const LSR &other) const; inline UBool operator!=(const LSR &other) const { diff --git a/icu4c/source/test/testdata/localeMatcherTest.txt b/icu4c/source/test/testdata/localeMatcherTest.txt index 21c9b601410..649c95baea5 100644 --- a/icu4c/source/test/testdata/localeMatcherTest.txt +++ b/icu4c/source/test/testdata/localeMatcherTest.txt @@ -1052,9 +1052,9 @@ en >> en-DE ar-EG >> ar-SY pt-BR >> pt ar-XB >> ar-XB -ar-PSBIDI >> ar-XB # These are equivalent. +ar-PSBIDI >> ar-PSBIDI en-XA >> en-XA -en-PSACCENT >> en-XA # These are equivalent. +en-PSACCENT >> en-PSACCENT ar-PSCRACK >> ar-PSCRACK @favor=script @@ -1063,9 +1063,9 @@ en >> en-DE ar-EG >> ar-SY pt-BR >> pt ar-XB >> ar-XB -ar-PSBIDI >> ar-XB # These are equivalent. +ar-PSBIDI >> ar-PSBIDI en-XA >> en-XA -en-PSACCENT >> en-XA # These are equivalent. +en-PSACCENT >> en-PSACCENT ar-PSCRACK >> ar-PSCRACK ** test: BestMatchForTraditionalChinese @@ -1544,50 +1544,44 @@ zh-TW, en >> en-US zh-Hant-CN, en >> en-US zh-Hans, en >> zh-Hans-CN -** test: return first among likely-subtags equivalent locales -# Was: more specific script should win in case regions are identical -# with some different results. +** test: return most originally similar among likely-subtags equivalent locales @supported=af, af-Latn, af-Arab af >> af af-ZA >> af -af-Latn-ZA >> af -af-Latn >> af +af-Latn-ZA >> af-Latn +af-Latn >> af-Latn @favor=script af >> af af-ZA >> af -af-Latn-ZA >> af -af-Latn >> af +af-Latn-ZA >> af-Latn +af-Latn >> af-Latn -# Was: more specific region should win -# with some different results. @supported=nl, nl-NL, nl-BE @favor= nl >> nl nl-Latn >> nl -nl-Latn-NL >> nl -nl-NL >> nl +nl-Latn-NL >> nl-NL +nl-NL >> nl-NL @favor=script nl >> nl nl-Latn >> nl -nl-Latn-NL >> nl -nl-NL >> nl +nl-Latn-NL >> nl-NL +nl-NL >> nl-NL -# Was: more specific region wins over more specific script -# with some different results. @supported=nl, nl-Latn, nl-NL, nl-BE @favor= nl >> nl -nl-Latn >> nl -nl-NL >> nl -nl-Latn-NL >> nl +nl-Latn >> nl-Latn +nl-NL >> nl-NL +nl-Latn-NL >> nl-Latn @favor=script nl >> nl -nl-Latn >> nl -nl-NL >> nl -nl-Latn-NL >> nl +nl-Latn >> nl-Latn +nl-NL >> nl-NL +nl-Latn-NL >> nl-Latn ** test: region may replace matched if matched is enclosing @supported=es-419, es @@ -1670,22 +1664,22 @@ ja-Jpan-JP, en-GB >> ja ** test: pick best maximized tag @supported=ja, ja-Jpan-US, ja-JP, en, ru ja-Jpan, ru >> ja -ja-JP, ru >> ja +ja-JP, ru >> ja-JP ja-US, ru >> ja-Jpan-US @favor=script ja-Jpan, ru >> ja -ja-JP, ru >> ja +ja-JP, ru >> ja-JP ja-US, ru >> ja-Jpan-US ** test: termination: pick best maximized match @supported=ja, ja-Jpan, ja-JP, en, ru -ja-Jpan-JP, ru >> ja -ja-Jpan, ru >> ja +ja-Jpan-JP, ru >> ja-Jpan +ja-Jpan, ru >> ja-Jpan @favor=script -ja-Jpan-JP, ru >> ja -ja-Jpan, ru >> ja +ja-Jpan-JP, ru >> ja-Jpan +ja-Jpan, ru >> ja-Jpan ** test: same language over exact, but distinguish when user is explicit @supported=fr, en-GB, ja, es-ES, es-MX @@ -1900,7 +1894,7 @@ zh-TW >> zh ** test: testGetBestMatchWithMinMatchScore @supported=fr-FR, fr, fr-CA, en @default=und -fr >> fr-FR # First likely-subtags equivalent match is chosen. +fr >> fr @supported=en, fr, fr-CA fr-FR >> fr # Parent match is chosen. @supported=en, fr-CA @@ -1930,7 +1924,7 @@ ru >> und @favor=script @supported=fr-FR, fr, fr-CA, en -fr >> fr-FR +fr >> fr @supported=en, fr, fr-CA fr-FR >> fr @supported=en, fr-CA diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LSR.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LSR.java index d1dc775d183..95c289814f2 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LSR.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LSR.java @@ -7,6 +7,13 @@ import java.util.Objects; public final class LSR { public static final int REGION_INDEX_LIMIT = 1001 + 26 * 26; + public static final int EXPLICIT_LSR = 7; + public static final int EXPLICIT_LANGUAGE = 4; + public static final int EXPLICIT_SCRIPT = 2; + public static final int EXPLICIT_REGION = 1; + public static final int IMPLICIT_LSR = 0; + public static final int DONT_CARE_FLAGS = 0; + public static final boolean DEBUG_OUTPUT = false; public final String language; @@ -14,12 +21,14 @@ public final class LSR { public final String region; /** Index for region, negative if ill-formed. @see indexForRegion */ final int regionIndex; + public final int flags; - public LSR(String language, String script, String region) { + public LSR(String language, String script, String region, int flags) { this.language = language; this.script = script; this.region = region; regionIndex = indexForRegion(region); + this.flags = flags; } /** @@ -57,6 +66,13 @@ public final class LSR { } return result.toString(); } + + public boolean isEquivalentTo(LSR other) { + return language.equals(other.language) + && script.equals(other.script) + && region.equals(other.region); + } + @Override public boolean equals(Object obj) { LSR other; @@ -65,10 +81,12 @@ public final class LSR { && obj.getClass() == this.getClass() && language.equals((other = (LSR) obj).language) && script.equals(other.script) - && region.equals(other.region)); + && region.equals(other.region) + && flags == other.flags); } + @Override public int hashCode() { - return Objects.hash(language, script, region); + return Objects.hash(language, script, region, flags); } } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LocaleDistance.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LocaleDistance.java index 79fe285bcec..028cd8a2bad 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LocaleDistance.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LocaleDistance.java @@ -34,6 +34,17 @@ public class LocaleDistance { private static final int DISTANCE_IS_FINAL = 0x100; private static final int DISTANCE_IS_FINAL_OR_SKIP_SCRIPT = DISTANCE_IS_FINAL | DISTANCE_SKIP_SCRIPT; + + // The distance is shifted left to gain some fraction bits. + private static final int DISTANCE_SHIFT = 3; + private static final int DISTANCE_FRACTION_MASK = 7; + // 7 bits for 0..100 + private static final int DISTANCE_INT_SHIFT = 7; + private static final int INDEX_SHIFT = DISTANCE_INT_SHIFT + DISTANCE_SHIFT; + private static final int DISTANCE_MASK = 0x3ff; + // vate static final int MAX_INDEX = 0x1fffff; // avoids sign bit + private static final int INDEX_NEG_1 = 0xfffffc00; + // Indexes into array of distances. public static final int IX_DEF_LANG_DISTANCE = 0; public static final int IX_DEF_SCRIPT_DISTANCE = 1; @@ -67,6 +78,28 @@ public class LocaleDistance { private final int minRegionDistance; private final int defaultDemotionPerDesiredLocale; + public static final int shiftDistance(int distance) { + return distance << DISTANCE_SHIFT; + } + + public static final int getShiftedDistance(int indexAndDistance) { + return indexAndDistance & DISTANCE_MASK; + } + + public static final double getDistanceDouble(int indexAndDistance) { + double shiftedDistance = getShiftedDistance(indexAndDistance); + return shiftedDistance / (1 << DISTANCE_SHIFT); + } + + private static final int getDistanceFloor(int indexAndDistance) { + return (indexAndDistance & DISTANCE_MASK) >> DISTANCE_SHIFT; + } + + public static final int getIndex(int indexAndDistance) { + assert indexAndDistance >= 0; + return indexAndDistance >> INDEX_SHIFT; + } + // VisibleForTesting public static final class Data { public byte[] trie; @@ -121,7 +154,8 @@ public class LocaleDistance { String[] paradigms = value.getStringArray(); paradigmLSRs = new HashSet<>(paradigms.length / 3); for (int i = 0; i < paradigms.length; i += 3) { - paradigmLSRs.add(new LSR(paradigms[i], paradigms[i + 1], paradigms[i + 2])); + paradigmLSRs.add(new LSR(paradigms[i], paradigms[i + 1], paradigms[i + 2], + LSR.DONT_CARE_FLAGS)); } } else { paradigmLSRs = Collections.emptySet(); @@ -168,10 +202,11 @@ public class LocaleDistance { // a mere region difference for one desired locale // is as good as a perfect match for the next following desired locale. // As of CLDR 36, we have . - LSR en = new LSR("en", "Latn", "US"); - LSR enGB = new LSR("en", "Latn", "GB"); - defaultDemotionPerDesiredLocale = getBestIndexAndDistance(en, new LSR[] { enGB }, - 50, FavorSubtag.LANGUAGE) & 0xff; + LSR en = new LSR("en", "Latn", "US", LSR.EXPLICIT_LSR); + LSR enGB = new LSR("en", "Latn", "GB", LSR.EXPLICIT_LSR); + int indexAndDistance = getBestIndexAndDistance(en, new LSR[] { enGB }, + shiftDistance(50), FavorSubtag.LANGUAGE); + defaultDemotionPerDesiredLocale = getDistanceFloor(indexAndDistance); if (DEBUG_OUTPUT) { System.out.println("*** locale distance"); @@ -187,20 +222,26 @@ public class LocaleDistance { int threshold, FavorSubtag favorSubtag) { LSR supportedLSR = XLikelySubtags.INSTANCE.makeMaximizedLsrFrom(supported); LSR desiredLSR = XLikelySubtags.INSTANCE.makeMaximizedLsrFrom(desired); - return getBestIndexAndDistance(desiredLSR, new LSR[] { supportedLSR }, - threshold, favorSubtag) & 0xff; + int indexAndDistance = getBestIndexAndDistance(desiredLSR, new LSR[] { supportedLSR }, + shiftDistance(threshold), favorSubtag); + return getDistanceFloor(indexAndDistance); } /** * Finds the supported LSR with the smallest distance from the desired one. * Equivalent LSR subtags must be normalized into a canonical form. * - *

Returns the index of the lowest-distance supported LSR in bits 31..8 + *

Returns the index of the lowest-distance supported LSR in the high bits * (negative if none has a distance below the threshold), - * and its distance (0..ABOVE_THRESHOLD) in bits 7..0. + * and its distance (0..ABOVE_THRESHOLD) in the low bits. */ public int getBestIndexAndDistance(LSR desired, LSR[] supportedLSRs, - int threshold, FavorSubtag favorSubtag) { + int shiftedThreshold, FavorSubtag favorSubtag) { + // Round up the shifted threshold (if fraction bits are not 0) + // for comparison with un-shifted distances until we need fraction bits. + // (If we simply shifted non-zero fraction bits away, then we might ignore a language + // when it's really still a micro distance below the threshold.) + int roundedThreshold = (shiftedThreshold + DISTANCE_FRACTION_MASK) >> DISTANCE_SHIFT; BytesTrie iter = new BytesTrie(trie); // Look up the desired language only once for all supported LSRs. // Its "distance" is either a match point value of 0, or a non-match negative value. @@ -246,7 +287,7 @@ public class LocaleDistance { if (favorSubtag == FavorSubtag.SCRIPT) { distance >>= 2; } - if (distance >= threshold) { + if (distance >= roundedThreshold) { continue; } @@ -264,7 +305,7 @@ public class LocaleDistance { scriptDistance &= ~DISTANCE_IS_FINAL; } distance += scriptDistance; - if (distance >= threshold) { + if (distance >= roundedThreshold) { continue; } @@ -273,7 +314,7 @@ public class LocaleDistance { } else if (star || (flags & DISTANCE_IS_FINAL) != 0) { distance += defaultRegionDistance; } else { - int remainingThreshold = threshold - distance; + int remainingThreshold = roundedThreshold - distance; if (minRegionDistance >= remainingThreshold) { continue; } @@ -289,15 +330,23 @@ public class LocaleDistance { partitionsForRegion(supported), remainingThreshold); } - if (distance < threshold) { - if (distance == 0) { - return slIndex << 8; + int shiftedDistance = shiftDistance(distance); + if (shiftedDistance == 0) { + // Distinguish between equivalent but originally unequal locales via an + // additional micro distance. + shiftedDistance |= (desired.flags ^ supported.flags); + } + if (shiftedDistance < shiftedThreshold) { + if (shiftedDistance == 0) { + return slIndex << INDEX_SHIFT; } bestIndex = slIndex; - threshold = distance; + shiftedThreshold = shiftedDistance; } } - return bestIndex >= 0 ? (bestIndex << 8) | threshold : 0xffffff00 | ABOVE_THRESHOLD; + return bestIndex >= 0 ? + (bestIndex << INDEX_SHIFT) | shiftedThreshold : + INDEX_NEG_1 | shiftDistance(ABOVE_THRESHOLD); } private static final int getDesSuppScriptDistance(BytesTrie iter, long startState, @@ -439,7 +488,17 @@ public class LocaleDistance { } public boolean isParadigmLSR(LSR lsr) { - return paradigmLSRs.contains(lsr); + // Linear search for a very short list (length 6 as of 2019), + // because we look for equivalence not equality, and + // HashSet does not support customizing equality. + // If there are many paradigm LSRs we should revisit this. + assert paradigmLSRs.size() <= 15; + for (LSR plsr : paradigmLSRs) { + if (lsr.isEquivalentTo(plsr)) { + return true; + } + } + return false; } // VisibleForTesting @@ -455,9 +514,6 @@ public class LocaleDistance { return defaultDemotionPerDesiredLocale; } - // TODO: When we build data offline, - // write test code to compare the loaded table with the builder output. - // Fail if different, with instructions for how to update the data file. // VisibleForTesting public Map testOnlyGetDistanceTable() { Map map = new TreeMap<>(); diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLikelySubtags.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLikelySubtags.java index de42587b317..1938170e74e 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLikelySubtags.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLikelySubtags.java @@ -87,7 +87,8 @@ public final class XLikelySubtags { String[] lsrSubtags = getValue(likelyTable, "lsrs", value).getStringArray(); LSR[] lsrs = new LSR[lsrSubtags.length / 3]; for (int i = 0, j = 0; i < lsrSubtags.length; i += 3, ++j) { - lsrs[j] = new LSR(lsrSubtags[i], lsrSubtags[i + 1], lsrSubtags[i + 2]); + lsrs[j] = new LSR(lsrSubtags[i], lsrSubtags[i + 1], lsrSubtags[i + 2], + LSR.IMPLICIT_LSR); } return new Data(languageAliases, regionAliases, trie, lsrs); @@ -185,7 +186,7 @@ public final class XLikelySubtags { String tag = locale.toLanguageTag(); assert tag.startsWith("x-"); // Private use language tag x-subtag-subtag... - return new LSR(tag, "", ""); + return new LSR(tag, "", "", LSR.EXPLICIT_LSR); } return makeMaximizedLsr(locale.getLanguage(), locale.getScript(), locale.getCountry(), locale.getVariant()); @@ -195,7 +196,7 @@ public final class XLikelySubtags { String tag = locale.toLanguageTag(); if (tag.startsWith("x-")) { // Private use language tag x-subtag-subtag... - return new LSR(tag, "", ""); + return new LSR(tag, "", "", LSR.EXPLICIT_LSR); } return makeMaximizedLsr(locale.getLanguage(), locale.getScript(), locale.getCountry(), locale.getVariant()); @@ -209,29 +210,34 @@ public final class XLikelySubtags { switch (region.charAt(1)) { case 'A': return new LSR(PSEUDO_ACCENTS_PREFIX + language, - PSEUDO_ACCENTS_PREFIX + script, region); + PSEUDO_ACCENTS_PREFIX + script, region, LSR.EXPLICIT_LSR); case 'B': return new LSR(PSEUDO_BIDI_PREFIX + language, - PSEUDO_BIDI_PREFIX + script, region); + PSEUDO_BIDI_PREFIX + script, region, LSR.EXPLICIT_LSR); case 'C': return new LSR(PSEUDO_CRACKED_PREFIX + language, - PSEUDO_CRACKED_PREFIX + script, region); + PSEUDO_CRACKED_PREFIX + script, region, LSR.EXPLICIT_LSR); default: // normal locale break; } } if (variant.startsWith("PS")) { + int lsrFlags = region.isEmpty() ? + LSR.EXPLICIT_LANGUAGE | LSR.EXPLICIT_SCRIPT : LSR.EXPLICIT_LSR; switch (variant) { case "PSACCENT": return new LSR(PSEUDO_ACCENTS_PREFIX + language, - PSEUDO_ACCENTS_PREFIX + script, region.isEmpty() ? "XA" : region); + PSEUDO_ACCENTS_PREFIX + script, + region.isEmpty() ? "XA" : region, lsrFlags); case "PSBIDI": return new LSR(PSEUDO_BIDI_PREFIX + language, - PSEUDO_BIDI_PREFIX + script, region.isEmpty() ? "XB" : region); + PSEUDO_BIDI_PREFIX + script, + region.isEmpty() ? "XB" : region, lsrFlags); case "PSCRACK": return new LSR(PSEUDO_CRACKED_PREFIX + language, - PSEUDO_CRACKED_PREFIX + script, region.isEmpty() ? "XC" : region); + PSEUDO_CRACKED_PREFIX + script, + region.isEmpty() ? "XC" : region, lsrFlags); default: // normal locale break; } @@ -257,7 +263,7 @@ public final class XLikelySubtags { region = ""; } if (!script.isEmpty() && !region.isEmpty() && !language.isEmpty()) { - return new LSR(language, script, region); // already maximized + return new LSR(language, script, region, LSR.EXPLICIT_LSR); // already maximized } int retainOldMask = 0; @@ -340,6 +346,7 @@ public final class XLikelySubtags { } if (retainOldMask == 0) { + assert result.flags == LSR.IMPLICIT_LSR; return result; } if ((retainOldMask & 4) == 0) { @@ -351,7 +358,8 @@ public final class XLikelySubtags { if ((retainOldMask & 1) == 0) { region = result.region; } - return new LSR(language, script, region); + // retainOldMask flags = LSR explicit-subtag flags + return new LSR(language, script, region, retainOldMask); } private static final int trieNext(BytesTrie iter, String s, int i) { @@ -411,9 +419,9 @@ public final class XLikelySubtags { boolean favorRegionOk = false; if (result.script.equals(value00.script)) { //script is default if (result.region.equals(value00.region)) { - return new LSR(result.language, "", ""); + return new LSR(result.language, "", "", LSR.DONT_CARE_FLAGS); } else if (fieldToFavor == ULocale.Minimize.FAVOR_REGION) { - return new LSR(result.language, "", result.region); + return new LSR(result.language, "", result.region, LSR.DONT_CARE_FLAGS); } else { favorRegionOk = true; } @@ -423,9 +431,9 @@ public final class XLikelySubtags { // Maybe do later, but for now use the straightforward code. LSR result2 = maximize(languageIn, scriptIn, ""); if (result2.equals(result)) { - return new LSR(result.language, result.script, ""); + return new LSR(result.language, result.script, "", LSR.DONT_CARE_FLAGS); } else if (favorRegionOk) { - return new LSR(result.language, "", result.region); + return new LSR(result.language, "", result.region, LSR.DONT_CARE_FLAGS); } return result; } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/util/LocaleMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/util/LocaleMatcher.java index e0fd56909d2..a333355c4c0 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/util/LocaleMatcher.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/util/LocaleMatcher.java @@ -64,7 +64,7 @@ import com.ibm.icu.impl.locale.XLikelySubtags; * @stable ICU 4.4 */ public final class LocaleMatcher { - private static final LSR UND_LSR = new LSR("und","",""); + private static final LSR UND_LSR = new LSR("und","","", LSR.EXPLICIT_LSR); // In ULocale, "und" and "" make the same object. private static final ULocale UND_ULOCALE = new ULocale("und"); // In Locale, "und" and "" make different objects. @@ -680,6 +680,9 @@ public final class LocaleMatcher { builder.demotion == Demotion.NONE ? 0 : LocaleDistance.INSTANCE.getDefaultDemotionPerDesiredLocale(); // null or REGION favorSubtag = builder.favor; + if (TRACE_MATCHER) { + System.err.printf("new LocaleMatcher: %s\n", toString()); + } } private static final void putIfAbsent(Map lsrToIndex, LSR lsr, int i) { @@ -938,26 +941,34 @@ public final class LocaleMatcher { private int getBestSuppIndex(LSR desiredLSR, LsrIterator remainingIter) { int desiredIndex = 0; int bestSupportedLsrIndex = -1; - for (int bestDistance = thresholdDistance;;) { + StringBuilder sb = null; + if (TRACE_MATCHER) { + sb = new StringBuilder("LocaleMatcher desired:"); + } + for (int bestShiftedDistance = LocaleDistance.shiftDistance(thresholdDistance);;) { + if (TRACE_MATCHER) { + sb.append(' ').append(desiredLSR); + } // Quick check for exact maximized LSR. Integer index = supportedLsrToIndex.get(desiredLSR); if (index != null) { int suppIndex = index; if (TRACE_MATCHER) { - System.err.printf("Returning %s: desiredLSR=supportedLSR\n", - supportedULocales[suppIndex]); + System.err.printf("%s --> best=%s: desiredLSR=supportedLSR\n", + sb, supportedULocales[suppIndex]); } if (remainingIter != null) { remainingIter.rememberCurrent(desiredIndex); } return suppIndex; } int bestIndexAndDistance = LocaleDistance.INSTANCE.getBestIndexAndDistance( - desiredLSR, supportedLSRs, bestDistance, favorSubtag); + desiredLSR, supportedLSRs, bestShiftedDistance, favorSubtag); if (bestIndexAndDistance >= 0) { - bestDistance = bestIndexAndDistance & 0xff; + bestShiftedDistance = LocaleDistance.getShiftedDistance(bestIndexAndDistance); if (remainingIter != null) { remainingIter.rememberCurrent(desiredIndex); } - bestSupportedLsrIndex = bestIndexAndDistance >> 8; + bestSupportedLsrIndex = LocaleDistance.getIndex(bestIndexAndDistance); } - if ((bestDistance -= demotionPerDesiredLocale) <= 0) { + if ((bestShiftedDistance -= LocaleDistance.shiftDistance(demotionPerDesiredLocale)) + <= 0) { break; } if (remainingIter == null || !remainingIter.hasNext()) { @@ -968,14 +979,14 @@ public final class LocaleMatcher { } if (bestSupportedLsrIndex < 0) { if (TRACE_MATCHER) { - System.err.printf("Returning default %s: no good match\n", defaultULocale); + System.err.printf("%s --> best=default %s: no good match\n", sb, defaultULocale); } return -1; } int suppIndex = supportedIndexes[bestSupportedLsrIndex]; if (TRACE_MATCHER) { - System.err.printf("Returning %s: best matching supported locale\n", - supportedULocales[suppIndex]); + System.err.printf("%s --> best=%s: best matching supported locale\n", + sb, supportedULocales[suppIndex]); } return suppIndex; } @@ -1000,11 +1011,16 @@ public final class LocaleMatcher { @Deprecated public double match(ULocale desired, ULocale desiredMax, ULocale supported, ULocale supportedMax) { // Returns the inverse of the distance: That is, 1-distance(desired, supported). - int distance = LocaleDistance.INSTANCE.getBestIndexAndDistance( + int indexAndDistance = LocaleDistance.INSTANCE.getBestIndexAndDistance( getMaximalLsrOrUnd(desired), new LSR[] { getMaximalLsrOrUnd(supported) }, - thresholdDistance, favorSubtag) & 0xff; - return (100 - distance) / 100.0; + LocaleDistance.shiftDistance(thresholdDistance), favorSubtag); + double distance = LocaleDistance.getDistanceDouble(indexAndDistance); + if (TRACE_MATCHER) { + System.err.printf("LocaleMatcher distance(desired=%s, supported=%s)=%g\n", + Objects.toString(desired), Objects.toString(supported), distance); + } + return (100.0 - distance) / 100.0; } /** @@ -1032,16 +1048,17 @@ public final class LocaleMatcher { @Override public String toString() { StringBuilder s = new StringBuilder().append("{LocaleMatcher"); - if (supportedULocales.length > 0) { - s.append(" supported={").append(supportedULocales[0].toString()); - for (int i = 1; i < supportedULocales.length; ++i) { - s.append(", ").append(supportedULocales[i].toString()); + // Supported languages in the order that we try to match them. + if (supportedLSRs.length > 0) { + s.append(" supportedLSRs={").append(supportedLSRs[0].toString()); + for (int i = 1; i < supportedLSRs.length; ++i) { + s.append(", ").append(supportedLSRs[i].toString()); } s.append('}'); } s.append(" default=").append(Objects.toString(defaultULocale)); if (favorSubtag != null) { - s.append(" distance=").append(favorSubtag.toString()); + s.append(" favor=").append(favorSubtag.toString()); } if (thresholdDistance >= 0) { s.append(String.format(" threshold=%d", thresholdDistance)); diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/data/localeMatcherTest.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/data/localeMatcherTest.txt index 21c9b601410..649c95baea5 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/data/localeMatcherTest.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/data/localeMatcherTest.txt @@ -1052,9 +1052,9 @@ en >> en-DE ar-EG >> ar-SY pt-BR >> pt ar-XB >> ar-XB -ar-PSBIDI >> ar-XB # These are equivalent. +ar-PSBIDI >> ar-PSBIDI en-XA >> en-XA -en-PSACCENT >> en-XA # These are equivalent. +en-PSACCENT >> en-PSACCENT ar-PSCRACK >> ar-PSCRACK @favor=script @@ -1063,9 +1063,9 @@ en >> en-DE ar-EG >> ar-SY pt-BR >> pt ar-XB >> ar-XB -ar-PSBIDI >> ar-XB # These are equivalent. +ar-PSBIDI >> ar-PSBIDI en-XA >> en-XA -en-PSACCENT >> en-XA # These are equivalent. +en-PSACCENT >> en-PSACCENT ar-PSCRACK >> ar-PSCRACK ** test: BestMatchForTraditionalChinese @@ -1544,50 +1544,44 @@ zh-TW, en >> en-US zh-Hant-CN, en >> en-US zh-Hans, en >> zh-Hans-CN -** test: return first among likely-subtags equivalent locales -# Was: more specific script should win in case regions are identical -# with some different results. +** test: return most originally similar among likely-subtags equivalent locales @supported=af, af-Latn, af-Arab af >> af af-ZA >> af -af-Latn-ZA >> af -af-Latn >> af +af-Latn-ZA >> af-Latn +af-Latn >> af-Latn @favor=script af >> af af-ZA >> af -af-Latn-ZA >> af -af-Latn >> af +af-Latn-ZA >> af-Latn +af-Latn >> af-Latn -# Was: more specific region should win -# with some different results. @supported=nl, nl-NL, nl-BE @favor= nl >> nl nl-Latn >> nl -nl-Latn-NL >> nl -nl-NL >> nl +nl-Latn-NL >> nl-NL +nl-NL >> nl-NL @favor=script nl >> nl nl-Latn >> nl -nl-Latn-NL >> nl -nl-NL >> nl +nl-Latn-NL >> nl-NL +nl-NL >> nl-NL -# Was: more specific region wins over more specific script -# with some different results. @supported=nl, nl-Latn, nl-NL, nl-BE @favor= nl >> nl -nl-Latn >> nl -nl-NL >> nl -nl-Latn-NL >> nl +nl-Latn >> nl-Latn +nl-NL >> nl-NL +nl-Latn-NL >> nl-Latn @favor=script nl >> nl -nl-Latn >> nl -nl-NL >> nl -nl-Latn-NL >> nl +nl-Latn >> nl-Latn +nl-NL >> nl-NL +nl-Latn-NL >> nl-Latn ** test: region may replace matched if matched is enclosing @supported=es-419, es @@ -1670,22 +1664,22 @@ ja-Jpan-JP, en-GB >> ja ** test: pick best maximized tag @supported=ja, ja-Jpan-US, ja-JP, en, ru ja-Jpan, ru >> ja -ja-JP, ru >> ja +ja-JP, ru >> ja-JP ja-US, ru >> ja-Jpan-US @favor=script ja-Jpan, ru >> ja -ja-JP, ru >> ja +ja-JP, ru >> ja-JP ja-US, ru >> ja-Jpan-US ** test: termination: pick best maximized match @supported=ja, ja-Jpan, ja-JP, en, ru -ja-Jpan-JP, ru >> ja -ja-Jpan, ru >> ja +ja-Jpan-JP, ru >> ja-Jpan +ja-Jpan, ru >> ja-Jpan @favor=script -ja-Jpan-JP, ru >> ja -ja-Jpan, ru >> ja +ja-Jpan-JP, ru >> ja-Jpan +ja-Jpan, ru >> ja-Jpan ** test: same language over exact, but distinguish when user is explicit @supported=fr, en-GB, ja, es-ES, es-MX @@ -1900,7 +1894,7 @@ zh-TW >> zh ** test: testGetBestMatchWithMinMatchScore @supported=fr-FR, fr, fr-CA, en @default=und -fr >> fr-FR # First likely-subtags equivalent match is chosen. +fr >> fr @supported=en, fr, fr-CA fr-FR >> fr # Parent match is chosen. @supported=en, fr-CA @@ -1930,7 +1924,7 @@ ru >> und @favor=script @supported=fr-FR, fr, fr-CA, en -fr >> fr-FR +fr >> fr @supported=en, fr, fr-CA fr-FR >> fr @supported=en, fr-CA diff --git a/icu4j/tools/misc/src/com/ibm/icu/dev/tool/locale/LikelySubtagsBuilder.java b/icu4j/tools/misc/src/com/ibm/icu/dev/tool/locale/LikelySubtagsBuilder.java index 813d6f8f81b..6a7f6b8c640 100644 --- a/icu4j/tools/misc/src/com/ibm/icu/dev/tool/locale/LikelySubtagsBuilder.java +++ b/icu4j/tools/misc/src/com/ibm/icu/dev/tool/locale/LikelySubtagsBuilder.java @@ -139,10 +139,11 @@ public class LikelySubtagsBuilder { Map lsrIndexes = new LinkedHashMap<>(); // Reserve index 0 as "no value": // The runtime lookup returns 0 for an intermediate match with no value. - lsrIndexes.put(new LSR("", "", ""), 0); // arbitrary LSR + lsrIndexes.put(new LSR("", "", "", LSR.DONT_CARE_FLAGS), 0); // arbitrary LSR // Reserve index 1 for SKIP_SCRIPT: // The runtime lookup returns 1 for an intermediate match with a value. - lsrIndexes.put(new LSR("skip", "script", ""), 1); // looks good when printing the data + // This LSR looks good when printing the data. + lsrIndexes.put(new LSR("skip", "script", "", LSR.DONT_CARE_FLAGS), 1); // We could prefill the lsrList with common locales to give them small indexes, // and see if that improves performance a little. for (Map.Entry>> ls : langTable.entrySet()) { @@ -251,7 +252,7 @@ public class LikelySubtagsBuilder { } } // hack - set(result, "und", "Latn", "", new LSR("en", "Latn", "US")); + set(result, "und", "Latn", "", new LSR("en", "Latn", "US", LSR.DONT_CARE_FLAGS)); // hack, ensure that if und-YY => und-Xxxx-YY, then we add Xxxx=>YY to the table // @@ -294,7 +295,9 @@ public class LikelySubtagsBuilder { String lang = parts[0]; String p2 = parts.length < 2 ? "" : parts[1]; String p3 = parts.length < 3 ? "" : parts[2]; - return p2.length() < 4 ? new LSR(lang, "", p2) : new LSR(lang, p2, p3); + return p2.length() < 4 ? + new LSR(lang, "", p2, LSR.DONT_CARE_FLAGS) : + new LSR(lang, p2, p3, LSR.DONT_CARE_FLAGS); } private static void set(Map>> langTable, diff --git a/icu4j/tools/misc/src/com/ibm/icu/dev/tool/locale/LocaleDistanceBuilder.java b/icu4j/tools/misc/src/com/ibm/icu/dev/tool/locale/LocaleDistanceBuilder.java index a104c35ef07..43b3cf856bc 100644 --- a/icu4j/tools/misc/src/com/ibm/icu/dev/tool/locale/LocaleDistanceBuilder.java +++ b/icu4j/tools/misc/src/com/ibm/icu/dev/tool/locale/LocaleDistanceBuilder.java @@ -487,7 +487,10 @@ public final class LocaleDistanceBuilder { Set paradigmLSRs = new HashSet<>(); // could be TreeSet if LSR were Comparable for (String paradigm : paradigms) { ULocale pl = new ULocale(paradigm); - paradigmLSRs.add(XLikelySubtags.INSTANCE.makeMaximizedLsrFrom(pl)); + LSR max = XLikelySubtags.INSTANCE.makeMaximizedLsrFrom(pl); + // Clear the LSR flags to make the data equality test in + // LocaleDistanceTest happy. + paradigmLSRs.add(new LSR(max.language, max.script, max.region, LSR.DONT_CARE_FLAGS)); } TerritoryContainment tc = new TerritoryContainment(supplementalData);