ICU-20777 Merge the likelySubtags implementation

Change testdata/likelySubtags.txt to consider FAIL line

ICU-20777 Fix Java Tests

ICU-20777 Fix all issues

ICU-20777 In case of timeout

ICU-20777

ICU-20777 Skip Data Driven test
This commit is contained in:
Frank Tang 2023-08-15 20:23:28 -07:00 committed by Frank Yung-Fong Tang
parent 27181e36a6
commit ffc449de62
24 changed files with 3958 additions and 1493 deletions

View file

@ -603,7 +603,9 @@ cc_library(
"locbased.cpp",
"locid.cpp",
"loclikely.cpp",
"loclikelysubtags.cpp",
"locmap.cpp",
"lsr.cpp",
"resbund.cpp",
"resource.cpp",
"uloc.cpp",

View file

@ -307,7 +307,7 @@ LSR getMaximalLsrOrUnd(const XLikelySubtags &likelySubtags, const Locale &locale
if (U_FAILURE(errorCode) || locale.isBogus() || *locale.getName() == 0 /* "und" */) {
return UND_LSR;
} else {
return likelySubtags.makeMaximizedLsrFrom(locale, errorCode);
return likelySubtags.makeMaximizedLsrFrom(locale, false, errorCode);
}
}

View file

@ -2080,6 +2080,10 @@ Locale::addLikelySubtags(UErrorCode& status) {
// Convenience overload: minimizes this locale's subtags by forwarding to the
// two-argument overload with favorScript fixed to false.
void
Locale::minimizeSubtags(UErrorCode& status) {
Locale::minimizeSubtags(false, status);
}
void
Locale::minimizeSubtags(bool favorScript, UErrorCode& status) {
if (U_FAILURE(status)) {
return;
}
@ -2087,7 +2091,7 @@ Locale::minimizeSubtags(UErrorCode& status) {
CharString minimizedLocaleID;
{
CharStringByteSink sink(&minimizedLocaleID);
ulocimp_minimizeSubtags(fullName, sink, &status);
ulocimp_minimizeSubtags(fullName, sink, favorScript, &status);
}
if (U_FAILURE(status)) {

View file

@ -31,82 +31,10 @@
#include "charstr.h"
#include "cmemory.h"
#include "cstring.h"
#include "loclikelysubtags.h"
#include "ulocimp.h"
#include "ustr_imp.h"
/**
* These are the canonical strings for unknown languages, scripts and regions.
**/
static const char* const unknownLanguage = "und";
static const char* const unknownScript = "Zzzz";
static const char* const unknownRegion = "ZZ";
/**
 * Looks up a locale ID in the "likelySubtags" resource bundle.
 *
 * An empty localeID is looked up as "und"; a localeID that starts with '_'
 * is looked up with "und" prepended. When the stored value itself starts
 * with "und" (alone or followed by '_'), that leading "und" is removed from
 * the copy placed in the buffer.
 *
 * @param localeID     The tag to find.
 * @param buffer       Receives the matching entry as a NUL-terminated string.
 * @param bufferLength The capacity of buffer, in bytes.
 * @param err          In/out error code. A missing entry is not reported as
 *                     an error; an entry that does not fit into buffer is
 *                     reported as U_INTERNAL_PROGRAM_ERROR.
 * @return A pointer to "buffer" if found, or a null pointer if not.
 */
static const char* U_CALLCONV
findLikelySubtags(const char* localeID,
                  char* buffer,
                  int32_t bufferLength,
                  UErrorCode* err) {
    if (U_FAILURE(*err)) {
        return nullptr;
    }

    /* Resource errors are collected separately so that a merely missing
       entry does not leak into the caller's error code. */
    UErrorCode lookupStatus = U_ZERO_ERROR;
    icu::LocalUResourceBundlePointer bundle(
        ures_openDirect(nullptr, "likelySubtags", &lookupStatus));
    if (U_FAILURE(lookupStatus)) {
        *err = lookupStatus;
        return nullptr;
    }

    /* Normalize the key: "" -> "und", "_Xxxx..." -> "und_Xxxx...". */
    icu::CharString prefixedKey;
    const char* key = localeID;
    if (key != nullptr) {
        if (*key == '\0') {
            key = unknownLanguage;
        } else if (*key == '_') {
            prefixedKey.append(unknownLanguage, *err);
            prefixedKey.append(localeID, *err);
            if (U_FAILURE(*err)) {
                return nullptr;
            }
            key = prefixedKey.data();
        }
    }

    int32_t valueLength = 0;
    const char16_t* value =
        ures_getStringByKey(bundle.getAlias(), key, &valueLength, &lookupStatus);
    if (U_FAILURE(lookupStatus)) {
        /* No data for this particular locale ID is an ordinary outcome;
           only other failures are passed on. */
        if (lookupStatus != U_MISSING_RESOURCE_ERROR) {
            *err = lookupStatus;
        }
        return nullptr;
    }
    if (valueLength >= bufferLength) {
        /* The buffer should never overflow. */
        *err = U_INTERNAL_PROGRAM_ERROR;
        return nullptr;
    }

    /* Copy including the terminating NUL. */
    u_UCharsToChars(value, buffer, valueLength + 1);

    const UBool startsWithUnd =
        valueLength >= 3 &&
        uprv_strnicmp(buffer, unknownLanguage, 3) == 0 &&
        (valueLength == 3 || buffer[3] == '_');
    if (startsWithUnd) {
        /* Shift the remainder (and its NUL) over the "und" prefix. */
        uprv_memmove(buffer, buffer + 3, valueLength - 3 + 1);
    }
    return buffer;
}
/**
* Append a tag to a buffer, adding the separator if necessary. The buffer
* must be large enough to contain the resulting tag plus any separator
@ -360,57 +288,6 @@ error:
}
}
/**
 * Writes a tag built from language, script, region and trailing data to a
 * sink, by forwarding to createTagStringWithAlternates() without an
 * alternate tag.
 *
 * Any of lang, script and region may be a null pointer, in which case the
 * matching length must be less than or equal to 0. An empty language is
 * written as the default value for an unknown language.
 *
 * If the new string is longer than the output buffer can hold, as many bytes
 * as fit are copied and U_BUFFER_OVERFLOW_ERROR is reported. An illegal
 * argument is reported as U_ILLEGAL_ARGUMENT_ERROR.
 *
 * @param lang The language tag to use.
 * @param langLength The length of the language tag.
 * @param script The script tag to use.
 * @param scriptLength The length of the script tag.
 * @param region The region tag to use.
 * @param regionLength The length of the region tag.
 * @param trailing Any trailing data to append to the new tag.
 * @param trailingLength The length of the trailing data.
 * @param sink The output sink receiving the tag string.
 * @param err A pointer to a UErrorCode for error reporting.
 **/
static void U_CALLCONV
createTagString(
    const char* lang,
    int32_t langLength,
    const char* script,
    int32_t scriptLength,
    const char* region,
    int32_t regionLength,
    const char* trailing,
    int32_t trailingLength,
    icu::ByteSink& sink,
    UErrorCode* err)
{
    createTagStringWithAlternates(
        lang, langLength,
        script, scriptLength,
        region, regionLength,
        trailing, trailingLength,
        nullptr, /* no alternate tag supplied */
        sink,
        err);
}
/**
* Parse the language, script, and region subtags from a tag string, and copy the
* results into the corresponding output parameters. The buffers are null-terminated,
@ -494,13 +371,6 @@ parseTagString(
*scriptLength = subtagLength;
if (*scriptLength > 0) {
if (uprv_strnicmp(script, unknownScript, *scriptLength) == 0) {
/**
* If the script part is the "unknown" script, then don't return it.
**/
*scriptLength = 0;
}
/*
* Move past any separator.
*/
@ -517,14 +387,7 @@ parseTagString(
*regionLength = subtagLength;
if (*regionLength > 0) {
if (uprv_strnicmp(region, unknownRegion, *regionLength) == 0) {
/**
* If the region part is the "unknown" region, then don't return it.
**/
*regionLength = 0;
}
} else if (*position != 0 && *position != '@') {
if (*regionLength <= 0 && *position != 0 && *position != '@') {
/* back up over consumed trailing separator */
--position;
}
@ -546,264 +409,6 @@ error:
goto exit;
}
/**
 * Performs one likelySubtags lookup on behalf of createLikelySubtagsString().
 *
 * The lookup key is built from lang plus keyScript/keyRegion. On a hit, the
 * result tag is written to sink from the matched likely-subtags string,
 * with fillScript/fillRegion and the variants passed along to
 * createTagStringWithAlternates().
 *
 * @param lang             The language subtag of the lookup key.
 * @param langLength       The length of lang.
 * @param keyScript        The script subtag of the lookup key, or nullptr.
 * @param keyScriptLength  The length of keyScript (0 when nullptr).
 * @param keyRegion        The region subtag of the lookup key, or nullptr.
 * @param keyRegionLength  The length of keyRegion (0 when nullptr).
 * @param fillScript       The script passed to the output tag, or nullptr.
 * @param fillScriptLength The length of fillScript (0 when nullptr).
 * @param fillRegion       The region passed to the output tag, or nullptr.
 * @param fillRegionLength The length of fillRegion (0 when nullptr).
 * @param variants         Trailing data appended to the output tag.
 * @param variantsLength   The length of variants.
 * @param sink             The output sink receiving the tag on a hit.
 * @param err              In/out error code.
 * @return true if an entry was found and the tag was written; false if no
 *         entry exists or *err reports a failure (callers tell the two
 *         apart by checking *err).
 */
static UBool U_CALLCONV
tryLikelySubtagsLookup(
    const char* lang,
    int32_t langLength,
    const char* keyScript,
    int32_t keyScriptLength,
    const char* keyRegion,
    int32_t keyRegionLength,
    const char* fillScript,
    int32_t fillScriptLength,
    const char* fillRegion,
    int32_t fillRegionLength,
    const char* variants,
    int32_t variantsLength,
    icu::ByteSink& sink,
    UErrorCode* err) {
    /**
     * ULOC_FULLNAME_CAPACITY will provide enough capacity
     * that we can build a string that contains the language,
     * script and region code without worrying about overrunning
     * the user-supplied buffer.
     **/
    char likelySubtagsBuffer[ULOC_FULLNAME_CAPACITY];

    icu::CharString tagBuffer;
    {
        /* Distinct name: must not shadow the caller-visible output sink. */
        icu::CharStringByteSink keySink(&tagBuffer);
        createTagString(
            lang,
            langLength,
            keyScript,
            keyScriptLength,
            keyRegion,
            keyRegionLength,
            nullptr,
            0,
            keySink,
            err);
    }
    if (U_FAILURE(*err)) {
        return false;
    }

    const char* likelySubtags =
        findLikelySubtags(
            tagBuffer.data(),
            likelySubtagsBuffer,
            sizeof(likelySubtagsBuffer),
            err);
    if (U_FAILURE(*err) || likelySubtags == nullptr) {
        return false;
    }

    /* Always use the language tag from the
       maximal string, since it may be more
       specific than the one provided. */
    createTagStringWithAlternates(
        nullptr,
        0,
        fillScript,
        fillScriptLength,
        fillRegion,
        fillRegionLength,
        variants,
        variantsLength,
        likelySubtags,
        sink,
        err);
    return true;
}

/**
 * Writes the likely-subtags expansion of a language/script/region triple to
 * a sink.
 *
 * Lookups are attempted from most to least specific, and the first hit wins:
 *   1. language + script + region (only if both script and region are given)
 *   2. language + script          (only if a script is given)
 *   3. language + region          (only if a region is given)
 *   4. language alone
 *
 * @param lang           The language subtag.
 * @param langLength     The length of lang.
 * @param script         The script subtag.
 * @param scriptLength   The length of script; <= 0 when absent.
 * @param region         The region subtag.
 * @param regionLength   The length of region; <= 0 when absent.
 * @param variants       Trailing data appended to the result.
 * @param variantsLength The length of variants.
 * @param sink           The output sink receiving the resulting tag.
 * @param err            In/out error code.
 * @return true if a likely-subtags entry was found and a tag was written;
 *         false if nothing matched or *err reports a failure.
 */
static UBool U_CALLCONV
createLikelySubtagsString(
    const char* lang,
    int32_t langLength,
    const char* script,
    int32_t scriptLength,
    const char* region,
    int32_t regionLength,
    const char* variants,
    int32_t variantsLength,
    icu::ByteSink& sink,
    UErrorCode* err) {
    if (U_FAILURE(*err)) {
        return false;
    }

    /**
     * Try the language with the script and region first.
     **/
    if (scriptLength > 0 && regionLength > 0) {
        if (tryLikelySubtagsLookup(
                lang, langLength,
                script, scriptLength,
                region, regionLength,
                nullptr, 0,
                nullptr, 0,
                variants, variantsLength,
                sink, err)) {
            return true;
        }
        if (U_FAILURE(*err)) {
            return false;
        }
    }

    /**
     * Try the language with just the script.
     **/
    if (scriptLength > 0) {
        if (tryLikelySubtagsLookup(
                lang, langLength,
                script, scriptLength,
                nullptr, 0,
                nullptr, 0,
                region, regionLength,
                variants, variantsLength,
                sink, err)) {
            return true;
        }
        if (U_FAILURE(*err)) {
            return false;
        }
    }

    /**
     * Try the language with just the region.
     **/
    if (regionLength > 0) {
        if (tryLikelySubtagsLookup(
                lang, langLength,
                nullptr, 0,
                region, regionLength,
                script, scriptLength,
                nullptr, 0,
                variants, variantsLength,
                sink, err)) {
            return true;
        }
        if (U_FAILURE(*err)) {
            return false;
        }
    }

    /**
     * Finally, try just the language. Whether this misses or fails, the
     * answer is false; a failure is visible to the caller through *err.
     **/
    return tryLikelySubtagsLookup(
        lang, langLength,
        nullptr, 0,
        nullptr, 0,
        script, scriptLength,
        region, regionLength,
        variants, variantsLength,
        sink, err);
}
#define CHECK_TRAILING_VARIANT_SIZE(trailing, trailingLength) UPRV_BLOCK_MACRO_BEGIN { \
int32_t count = 0; \
int32_t i; \
@ -836,7 +441,6 @@ _uloc_addLikelySubtags(const char* localeID,
const char* trailing = "";
int32_t trailingLength = 0;
int32_t trailingIndex = 0;
UBool success = false;
if(U_FAILURE(*err)) {
goto error;
@ -862,6 +466,9 @@ _uloc_addLikelySubtags(const char* localeID,
goto error;
}
if (langLength > 3) {
goto error;
}
/* Find the length of the trailing portion. */
while (_isIDSeparator(localeID[trailingIndex])) {
@ -871,30 +478,33 @@ _uloc_addLikelySubtags(const char* localeID,
trailingLength = (int32_t)uprv_strlen(trailing);
CHECK_TRAILING_VARIANT_SIZE(trailing, trailingLength);
success =
createLikelySubtagsString(
lang,
langLength,
script,
scriptLength,
region,
regionLength,
{
const icu::XLikelySubtags* likelySubtags = icu::XLikelySubtags::getSingleton(*err);
if(U_FAILURE(*err)) {
goto error;
}
icu::LSR lsr = likelySubtags->makeMaximizedLsrFrom(icu::Locale::createFromName(localeID), true, *err);
const char* language = lsr.language;
if (uprv_strcmp(language, "und") == 0) {
language = "";
}
createTagStringWithAlternates(
language,
(int32_t)uprv_strlen(language),
lsr.script,
(int32_t)uprv_strlen(lsr.script),
lsr.region,
(int32_t)uprv_strlen(lsr.region),
trailing,
trailingLength,
nullptr,
sink,
err);
if (!success) {
const int32_t localIDLength = (int32_t)uprv_strlen(localeID);
/*
* If we get here, we need to return localeID.
*/
sink.Append(localeID, localIDLength);
if(U_FAILURE(*err)) {
goto error;
}
}
return success;
return true;
error:
@ -913,6 +523,7 @@ static UBool _ulocimp_addLikelySubtags(const char*, icu::ByteSink&, UErrorCode*)
static void
_uloc_minimizeSubtags(const char* localeID,
icu::ByteSink& sink,
bool favorScript,
UErrorCode* err) {
icu::CharString maximizedTagBuffer;
@ -925,7 +536,6 @@ _uloc_minimizeSubtags(const char* localeID,
const char* trailing = "";
int32_t trailingLength = 0;
int32_t trailingIndex = 0;
UBool successGetMax = false;
if(U_FAILURE(*err)) {
goto error;
@ -964,213 +574,38 @@ _uloc_minimizeSubtags(const char* localeID,
CHECK_TRAILING_VARIANT_SIZE(trailing, trailingLength);
{
icu::CharString base;
{
icu::CharStringByteSink baseSink(&base);
createTagString(
lang,
langLength,
script,
scriptLength,
region,
regionLength,
nullptr,
0,
baseSink,
err);
}
/**
* First, we need to first get the maximization
* from AddLikelySubtags.
**/
{
icu::CharStringByteSink maxSink(&maximizedTagBuffer);
successGetMax = _ulocimp_addLikelySubtags(base.data(), maxSink, err);
}
}
if(U_FAILURE(*err)) {
goto error;
}
if (!successGetMax) {
/**
* If we got here, return the locale ID parameter unchanged.
**/
const int32_t localeIDLength = (int32_t)uprv_strlen(localeID);
sink.Append(localeID, localeIDLength);
return;
}
// In the following, the lang, script, region are referring to those in
// the maximizedTagBuffer, not the one in the localeID.
langLength = sizeof(lang);
scriptLength = sizeof(script);
regionLength = sizeof(region);
parseTagString(
maximizedTagBuffer.data(),
lang,
&langLength,
script,
&scriptLength,
region,
&regionLength,
err);
if(U_FAILURE(*err)) {
goto error;
}
/**
* Start first with just the language.
**/
{
icu::CharString tagBuffer;
{
icu::CharStringByteSink tagSink(&tagBuffer);
createLikelySubtagsString(
lang,
langLength,
nullptr,
0,
nullptr,
0,
nullptr,
0,
tagSink,
err);
}
const icu::XLikelySubtags* likelySubtags = icu::XLikelySubtags::getSingleton(*err);
if(U_FAILURE(*err)) {
goto error;
}
else if (!tagBuffer.isEmpty() &&
uprv_strnicmp(
maximizedTagBuffer.data(),
tagBuffer.data(),
tagBuffer.length()) == 0) {
createTagString(
lang,
langLength,
nullptr,
0,
nullptr,
0,
trailing,
trailingLength,
sink,
err);
return;
}
}
/**
* Next, try the language and region.
**/
if (regionLength > 0) {
icu::CharString tagBuffer;
{
icu::CharStringByteSink tagSink(&tagBuffer);
createLikelySubtagsString(
lang,
langLength,
nullptr,
0,
region,
regionLength,
nullptr,
0,
tagSink,
err);
}
icu::LSR lsr = likelySubtags->minimizeSubtags(
{lang, langLength},
{script, scriptLength},
{region, regionLength},
favorScript,
*err);
if(U_FAILURE(*err)) {
goto error;
}
else if (!tagBuffer.isEmpty() &&
uprv_strnicmp(
maximizedTagBuffer.data(),
tagBuffer.data(),
tagBuffer.length()) == 0) {
createTagString(
lang,
langLength,
nullptr,
0,
region,
regionLength,
trailing,
trailingLength,
sink,
err);
return;
const char* language = lsr.language;
if (uprv_strcmp(language, "und") == 0) {
language = "";
}
}
/**
* Finally, try the language and script. This is our last chance,
* since trying with all three subtags would only yield the
* maximal version that we already have.
**/
if (scriptLength > 0) {
icu::CharString tagBuffer;
{
icu::CharStringByteSink tagSink(&tagBuffer);
createLikelySubtagsString(
lang,
langLength,
script,
scriptLength,
nullptr,
0,
nullptr,
0,
tagSink,
err);
}
createTagStringWithAlternates(
language,
(int32_t)uprv_strlen(language),
lsr.script,
(int32_t)uprv_strlen(lsr.script),
lsr.region,
(int32_t)uprv_strlen(lsr.region),
trailing,
trailingLength,
nullptr,
sink,
err);
if(U_FAILURE(*err)) {
goto error;
}
else if (!tagBuffer.isEmpty() &&
uprv_strnicmp(
maximizedTagBuffer.data(),
tagBuffer.data(),
tagBuffer.length()) == 0) {
createTagString(
lang,
langLength,
script,
scriptLength,
nullptr,
0,
trailing,
trailingLength,
sink,
err);
return;
}
}
{
/**
* If we got here, return the max + trail.
**/
createTagString(
lang,
langLength,
script,
scriptLength,
region,
regionLength,
trailing,
trailingLength,
sink,
err);
return;
}
@ -1271,7 +706,7 @@ uloc_minimizeSubtags(const char* localeID,
icu::CheckedArrayByteSink sink(
minimizedLocaleID, minimizedLocaleIDCapacity);
ulocimp_minimizeSubtags(localeID, sink, status);
ulocimp_minimizeSubtags(localeID, sink, false, status);
int32_t reslen = sink.NumberOfBytesAppended();
if (U_FAILURE(*status)) {
@ -1291,6 +726,7 @@ uloc_minimizeSubtags(const char* localeID,
U_CAPI void U_EXPORT2
ulocimp_minimizeSubtags(const char* localeID,
icu::ByteSink& sink,
bool favorScript,
UErrorCode* status) {
PreflightingLocaleIDBuffer localeBuffer;
do {
@ -1298,7 +734,7 @@ ulocimp_minimizeSubtags(const char* localeID,
localeBuffer.getCapacity(), status);
} while (localeBuffer.needToTryAgain(status));
_uloc_minimizeSubtags(localeBuffer.getBuffer(), sink, status);
_uloc_minimizeSubtags(localeBuffer.getBuffer(), sink, favorScript, status);
}
// Pairs of (language subtag, + or -) for finding out fast if common languages

View file

@ -24,6 +24,7 @@
#include "uniquecharstr.h"
#include "uresdata.h"
#include "uresimp.h"
#include "uvector.h"
U_NAMESPACE_BEGIN
@ -304,7 +305,7 @@ private:
encoded %= 27 * 27;
if (encoded < 27) {
// Selected M49 code index, find the code from "m49" resource.
return m49IndexToCode(m49Array, value, 2, errorCode);
return m49IndexToCode(m49Array, value, encoded, errorCode);
}
char region[2];
region[0] = 'A' + ((encoded % 27) - 1);
@ -339,15 +340,52 @@ private:
namespace {
XLikelySubtags *gLikelySubtags = nullptr;
UVector *gMacroregions = nullptr;
UInitOnce gInitOnce {};
// Cleanup callback for this file's globals (passed to
// ucln_common_registerCleanup in initLikelySubtags). Deletes the
// XLikelySubtags singleton and the macroregion list, nulls both pointers so
// nothing dangles, and resets gInitOnce. Always returns true.
UBool U_CALLCONV cleanup() {
delete gLikelySubtags;
gLikelySubtags = nullptr;
delete gMacroregions;
gMacroregions = nullptr;
// NOTE(review): presumably lets initialization run again after cleanup — confirm.
gInitOnce.reset();
return true;
}
static const char16_t RANGE_MARKER = 0x7E; /* '~' */
/**
 * Reads the macroregion codes stored under
 * supplementalData/idValidity/region/macroregion and returns them as a
 * newly allocated UVector of UnicodeString (elements owned by the vector).
 *
 * An entry containing RANGE_MARKER ('~') after its first character is a
 * range: the character before the marker is stepped up to and including the
 * character after the marker, and every resulting code is added. All other
 * entries are added as they are.
 *
 * @param status In/out error code. A range entry with nothing after the
 *               marker is reported as U_INVALID_FORMAT_ERROR.
 * @return The new vector, owned by the caller, or nullptr if status reports
 *         a failure.
 */
UVector* loadMacroregions(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return nullptr;
    }
    LocalPointer<UVector> newMacroRegions(new UVector(uprv_deleteUObject, uhash_compareUnicodeString, status), status);

    LocalUResourceBundlePointer supplementalData(ures_openDirect(nullptr,"supplementalData",&status));
    LocalUResourceBundlePointer idValidity(ures_getByKey(supplementalData.getAlias(),"idValidity",nullptr,&status));
    LocalUResourceBundlePointer regionList(ures_getByKey(idValidity.getAlias(),"region",nullptr,&status));
    LocalUResourceBundlePointer regionMacro(ures_getByKey(regionList.getAlias(),"macroregion",nullptr,&status));

    if (U_FAILURE(status)) {
        return nullptr;
    }

    while (ures_hasNext(regionMacro.getAlias())) {
        UnicodeString regionName = ures_getNextUnicodeString(regionMacro.getAlias(),nullptr,&status);
        int32_t rangeMarkerLocation = regionName.indexOf(RANGE_MARKER);
        char16_t buf[6];
        regionName.extract(buf,6,status);
        // Stop before touching buf: after a failed read or an overflowing
        // extract, buf is not valid and rangeMarkerLocation may lie outside it.
        if (U_FAILURE(status)) {
            return nullptr;
        }
        if ( rangeMarkerLocation > 0 ) {
            // A marker without an end character would make charAt() return
            // 0xFFFF and the expansion below would never terminate.
            if (rangeMarkerLocation + 1 >= regionName.length()) {
                status = U_INVALID_FORMAT_ERROR;
                return nullptr;
            }
            char16_t endRange = regionName.charAt(rangeMarkerLocation+1);
            buf[rangeMarkerLocation] = 0;
            char16_t &lastChar = buf[rangeMarkerLocation-1];
            // Count in int32_t so that the loop variable cannot wrap around.
            for (int32_t c = lastChar; c <= endRange; ++c) {
                lastChar = static_cast<char16_t>(c);
                LocalPointer<UnicodeString> newRegion(new UnicodeString(buf), status);
                newMacroRegions->adoptElement(newRegion.orphan(),status);
                if (U_FAILURE(status)) {
                    return nullptr;
                }
            }
        } else {
            LocalPointer<UnicodeString> newRegion(new UnicodeString(regionName), status);
            newMacroRegions->adoptElement(newRegion.orphan(),status);
            if (U_FAILURE(status)) {
                return nullptr;
            }
        }
    }
    return newMacroRegions.orphan();
}
} // namespace
void U_CALLCONV XLikelySubtags::initLikelySubtags(UErrorCode &errorCode) {
@ -357,10 +395,14 @@ void U_CALLCONV XLikelySubtags::initLikelySubtags(UErrorCode &errorCode) {
data.load(errorCode);
if (U_FAILURE(errorCode)) { return; }
gLikelySubtags = new XLikelySubtags(data);
if (gLikelySubtags == nullptr) {
gMacroregions = loadMacroregions(errorCode);
if (U_FAILURE(errorCode) || gLikelySubtags == nullptr || gMacroregions == nullptr) {
delete gLikelySubtags;
delete gMacroregions;
errorCode = U_MEMORY_ALLOCATION_ERROR;
return;
}
ucln_common_registerCleanup(UCLN_COMMON_LIKELY_SUBTAGS, cleanup);
}
@ -411,15 +453,28 @@ XLikelySubtags::~XLikelySubtags() {
delete[] lsrs;
}
LSR XLikelySubtags::makeMaximizedLsrFrom(const Locale &locale, UErrorCode &errorCode) const {
LSR XLikelySubtags::makeMaximizedLsrFrom(const Locale &locale,
bool returnInputIfUnmatch,
UErrorCode &errorCode) const {
const char *name = locale.getName();
if (uprv_isAtSign(name[0]) && name[1] == 'x' && name[2] == '=') { // name.startsWith("@x=")
// Private use language tag x-subtag-subtag... which CLDR changes to
// und-x-subtag-subtag...
return LSR(name, "", "", LSR::EXPLICIT_LSR);
}
return makeMaximizedLsr(locale.getLanguage(), locale.getScript(), locale.getCountry(),
locale.getVariant(), errorCode);
LSR max = makeMaximizedLsr(locale.getLanguage(), locale.getScript(), locale.getCountry(),
locale.getVariant(), returnInputIfUnmatch, errorCode);
if (uprv_strlen(max.language) == 0 &&
uprv_strlen(max.script) == 0 &&
uprv_strlen(max.region) == 0) {
// No match. The ICU API contract requires:
// "If the provided ULocale instance is already in the maximal form, or
// there is no data available for maximization, it will be
// returned."
return LSR(locale.getLanguage(), locale.getScript(), locale.getCountry(), LSR::EXPLICIT_LSR, errorCode);
}
return max;
}
namespace {
@ -432,7 +487,9 @@ const char *getCanonical(const CharStringMap &aliases, const char *alias) {
} // namespace
LSR XLikelySubtags::makeMaximizedLsr(const char *language, const char *script, const char *region,
const char *variant, UErrorCode &errorCode) const {
const char *variant,
bool returnInputIfUnmatch,
UErrorCode &errorCode) const {
// Handle pseudolocales like en-XA, ar-XB, fr-PSCRACK.
// They should match only themselves,
// not other locales with what looks like the same language and script subtags.
@ -472,64 +529,91 @@ LSR XLikelySubtags::makeMaximizedLsr(const char *language, const char *script, c
language = getCanonical(languageAliases, language);
// (We have no script mappings.)
region = getCanonical(regionAliases, region);
return maximize(language, script, region);
return maximize(language, script, region, returnInputIfUnmatch, errorCode);
}
LSR XLikelySubtags::maximize(const char *language, const char *script, const char *region) const {
if (uprv_strcmp(language, "und") == 0) {
// NUL-terminated-string front end for maximize(): wraps each subtag in a
// StringPiece of its strlen() length and defers to the StringPiece overload.
LSR XLikelySubtags::maximize(const char *language, const char *script, const char *region,
                             bool returnInputIfUnmatch,
                             UErrorCode &errorCode) const {
    const StringPiece languagePiece(language, static_cast<int32_t>(uprv_strlen(language)));
    const StringPiece scriptPiece(script, static_cast<int32_t>(uprv_strlen(script)));
    const StringPiece regionPiece(region, static_cast<int32_t>(uprv_strlen(region)));
    return maximize(languagePiece, scriptPiece, regionPiece,
                    returnInputIfUnmatch, errorCode);
}
// Reports whether `region` is one of the codes held in gMacroregions.
// Returns false when errorCode already reports a failure or when the
// one-time initialization fails.
bool XLikelySubtags::isMacroregion(StringPiece& region, UErrorCode& errorCode) const {
    // In Java, we use the Region class. In C++, since Region is under i18n,
    // we read the same data used by Region into gMacroregions to avoid a
    // dependency from common on i18n/region.cpp.
    if (U_SUCCESS(errorCode)) {
        umtx_initOnce(gInitOnce, &XLikelySubtags::initLikelySubtags, errorCode);
    }
    if (U_FAILURE(errorCode)) {
        return false;
    }
    // The vector stores UnicodeString elements, so look up with one.
    UnicodeString key = UnicodeString::fromUTF8(region);
    return gMacroregions->contains(static_cast<void *>(&key));
}
LSR XLikelySubtags::maximize(StringPiece language, StringPiece script, StringPiece region,
bool returnInputIfUnmatch,
UErrorCode &errorCode) const {
if (U_FAILURE(errorCode)) {
return LSR(language, script, region, LSR::EXPLICIT_LSR, errorCode);
}
if (language.compare("und") == 0) {
language = "";
}
if (uprv_strcmp(script, "Zzzz") == 0) {
if (script.compare("Zzzz") == 0) {
script = "";
}
if (uprv_strcmp(region, "ZZ") == 0) {
if (region.compare("ZZ") == 0) {
region = "";
}
if (*script != 0 && *region != 0 && *language != 0) {
return LSR(language, script, region, LSR::EXPLICIT_LSR); // already maximized
if (!script.empty() && !region.empty() && !language.empty()) {
return LSR(language, script, region, LSR::EXPLICIT_LSR, errorCode); // already maximized
}
bool retainLanguage = false;
bool retainScript = false;
bool retainRegion = false;
uint32_t retainOldMask = 0;
BytesTrie iter(trie);
uint64_t state;
int32_t value;
// Small optimization: Array lookup for first language letter.
int32_t c0;
if (0 <= (c0 = uprv_lowerOrdinal(language[0])) && c0 <= 25 &&
language[1] != 0 && // language.length() >= 2
if (0 <= (c0 = uprv_lowerOrdinal(language.data()[0])) && c0 <= 25 &&
language.length() >= 2 &&
(state = trieFirstLetterStates[c0]) != 0) {
value = trieNext(iter.resetToState64(state), language, 1);
} else {
value = trieNext(iter, language, 0);
}
bool matchLanguage = (value >= 0);
bool matchScript = false;
if (value >= 0) {
if (*language != 0) {
retainOldMask |= 4;
}
retainLanguage = !language.empty();
state = iter.getState64();
} else {
retainOldMask |= 4;
retainLanguage = true;
iter.resetToState64(trieUndState); // "und" ("*")
state = 0;
}
if (value >= 0 && !script.empty()) {
matchScript = true;
}
if (value > 0) {
// Intermediate or final value from just language.
if (value == SKIP_SCRIPT) {
value = 0;
}
if (*script != 0) {
retainOldMask |= 2;
}
retainScript = !script.empty();
} else {
value = trieNext(iter, script, 0);
if (value >= 0) {
if (*script != 0) {
retainOldMask |= 2;
}
retainScript = !script.empty();
state = iter.getState64();
} else {
retainOldMask |= 2;
retainScript = true;
if (state == 0) {
iter.resetToState64(trieUndZzzzState); // "und-Zzzz" ("**")
} else {
@ -541,19 +625,19 @@ LSR XLikelySubtags::maximize(const char *language, const char *script, const cha
}
}
bool matchRegion = false;
if (value > 0) {
// Final value from just language or language+script.
if (*region != 0) {
retainOldMask |= 1;
}
retainRegion = !region.empty();
} else {
value = trieNext(iter, region, 0);
if (value >= 0) {
if (*region != 0) {
retainOldMask |= 1;
if (!region.empty() && !isMacroregion(region, errorCode)) {
retainRegion = true;
matchRegion = true;
}
} else {
retainOldMask |= 1;
retainRegion = true;
if (state == 0) {
value = defaultLsrIndex;
} else {
@ -564,28 +648,33 @@ LSR XLikelySubtags::maximize(const char *language, const char *script, const cha
}
}
U_ASSERT(value < lsrsLength);
const LSR &result = lsrs[value];
const LSR &matched = lsrs[value];
if (*language == 0) {
language = "und";
if (returnInputIfUnmatch &&
(!(matchLanguage || matchScript || (matchRegion && language.empty())))) {
return LSR("", "", "", LSR::EXPLICIT_LSR, errorCode); // no matching.
}
if (language.empty()) {
language = StringPiece("und");
}
if (retainOldMask == 0) {
if (!(retainLanguage || retainScript || retainRegion)) {
// Quickly return a copy of the lookup-result LSR
// without new allocation of the subtags.
return LSR(result.language, result.script, result.region, result.flags);
return LSR(matched.language, matched.script, matched.region, matched.flags);
}
if ((retainOldMask & 4) == 0) {
language = result.language;
if (!retainLanguage) {
language = matched.language;
}
if ((retainOldMask & 2) == 0) {
script = result.script;
if (!retainScript) {
script = matched.script;
}
if ((retainOldMask & 1) == 0) {
region = result.region;
if (!retainRegion) {
region = matched.region;
}
int32_t retainMask = (retainLanguage ? 4 : 0) + (retainScript ? 2 : 0) + (retainRegion ? 1 : 0);
// retainOldMask flags = LSR explicit-subtag flags
return LSR(language, script, region, retainOldMask);
return LSR(language, script, region, retainMask, errorCode);
}
int32_t XLikelySubtags::compareLikely(const LSR &lsr, const LSR &other, int32_t likelyInfo) const {
@ -721,57 +810,97 @@ int32_t XLikelySubtags::trieNext(BytesTrie &iter, const char *s, int32_t i) {
default: return -1;
}
}
// TODO(ICU-20777): Switch Locale/uloc_ likely-subtags API from the old code
// in loclikely.cpp to this new code, including activating this
// minimizeSubtags() function. The LocaleMatcher does not minimize.
#if 0
LSR XLikelySubtags::minimizeSubtags(const char *languageIn, const char *scriptIn,
const char *regionIn, ULocale.Minimize fieldToFavor,
UErrorCode &errorCode) const {
LSR result = maximize(languageIn, scriptIn, regionIn);
// We could try just a series of checks, like:
// LSR result2 = addLikelySubtags(languageIn, "", "");
// if result.equals(result2) return result2;
// However, we can optimize 2 of the cases:
// (languageIn, "", "")
// (languageIn, "", regionIn)
// value00 = lookup(result.language, "", "")
BytesTrie iter = new BytesTrie(trie);
int value = trieNext(iter, result.language, 0);
U_ASSERT(value >= 0);
if (value == 0) {
value = trieNext(iter, "", 0);
U_ASSERT(value >= 0);
if (value == 0) {
value = trieNext(iter, "", 0);
int32_t XLikelySubtags::trieNext(BytesTrie &iter, StringPiece s, int32_t i) {
UStringTrieResult result;
uint8_t c;
if (s.length() == i) {
result = iter.next(u'*');
} else {
c = s.data()[i];
for (;;) {
c = uprv_invCharToAscii(c);
// EBCDIC: If s[i] is not an invariant character,
// then c is now 0 and will simply not match anything, which is harmless.
if (i+1 != s.length()) {
if (!USTRINGTRIE_HAS_NEXT(iter.next(c))) {
return -1;
}
c = s.data()[++i];
} else {
// last character of this subtag
result = iter.next(c | 0x80);
break;
}
}
}
U_ASSERT(value > 0);
LSR value00 = lsrs[value];
boolean favorRegionOk = false;
if (result.script.equals(value00.script)) { //script is default
if (result.region.equals(value00.region)) {
return new LSR(result.language, "", "", LSR.DONT_CARE_FLAGS);
} else if (fieldToFavor == ULocale.Minimize.FAVOR_REGION) {
return new LSR(result.language, "", result.region, LSR.DONT_CARE_FLAGS);
} else {
favorRegionOk = true;
}
switch (result) {
case USTRINGTRIE_NO_MATCH: return -1;
case USTRINGTRIE_NO_VALUE: return 0;
case USTRINGTRIE_INTERMEDIATE_VALUE:
U_ASSERT(iter.getValue() == SKIP_SCRIPT);
return SKIP_SCRIPT;
case USTRINGTRIE_FINAL_VALUE: return iter.getValue();
default: return -1;
}
// The last case is not as easy to optimize.
// Maybe do later, but for now use the straightforward code.
LSR result2 = maximize(languageIn, scriptIn, "");
if (result2.equals(result)) {
return new LSR(result.language, result.script, "", LSR.DONT_CARE_FLAGS);
} else if (favorRegionOk) {
return new LSR(result.language, "", result.region, LSR.DONT_CARE_FLAGS);
}
return result;
}
#endif
// Computes the minimized form of a language/script/region triple.
//
// The input is maximized first. Shorter candidates are then maximized in
// turn, and the first one whose maximization is equivalent to the maximized
// input is returned:
//   1. language only
//   2. language + region   (language + script when favorScript is true)
//   3. language + script   (language + region when favorScript is true)
// If none qualifies, the full maximized triple is returned. If errorCode
// reports a failure at any step, the maximized input is returned as is.
LSR XLikelySubtags::minimizeSubtags(StringPiece language, StringPiece script,
                                    StringPiece region,
                                    bool favorScript,
                                    UErrorCode &errorCode) const {
    LSR maximized = maximize(language, script, region, true, errorCode);
    if (U_FAILURE(errorCode)) {
        return maximized;
    }

    const bool nothingMatched = *maximized.language == 0 &&
                                *maximized.script == 0 &&
                                *maximized.region == 0;
    if (nothingMatched) {
        // No match. The ICU API contract requires:
        // "If this Locale is already in the minimal form, or not valid, or
        // there is no data available for minimization, the Locale will be
        // unchanged."
        return LSR(language, script, region, LSR::EXPLICIT_LSR, errorCode);
    }

    // Script/region pair to combine with the maximized language.
    struct Candidate {
        const char *script;
        const char *region;
    };
    const Candidate languageOnly = {"", ""};
    const Candidate languageAndRegion = {"", maximized.region};
    const Candidate languageAndScript = {maximized.script, ""};
    const Candidate attempts[] = {
        languageOnly,
        favorScript ? languageAndScript : languageAndRegion,
        favorScript ? languageAndRegion : languageAndScript,
    };

    for (const Candidate &attempt : attempts) {
        LSR probe = maximize(maximized.language, attempt.script, attempt.region,
                             true, errorCode);
        if (U_FAILURE(errorCode)) {
            return maximized;
        }
        if (probe.isEquivalentTo(maximized)) {
            return LSR(maximized.language, attempt.script, attempt.region,
                       LSR::DONT_CARE_FLAGS, errorCode);
        }
    }

    // Nothing shorter maximizes back to the same result.
    return LSR(maximized.language, maximized.script, maximized.region,
               LSR::DONT_CARE_FLAGS, errorCode);
}
U_NAMESPACE_END

View file

@ -11,6 +11,7 @@
#include "unicode/utypes.h"
#include "unicode/bytestrie.h"
#include "unicode/locid.h"
#include "unicode/stringpiece.h"
#include "unicode/uobject.h"
#include "unicode/ures.h"
#include "charstrmap.h"
@ -47,7 +48,9 @@ public:
static const XLikelySubtags *getSingleton(UErrorCode &errorCode);
// VisibleForTesting
LSR makeMaximizedLsrFrom(const Locale &locale, UErrorCode &errorCode) const;
LSR makeMaximizedLsrFrom(const Locale &locale,
bool returnInputIfUnmatch,
UErrorCode &errorCode) const;
/**
* Tests whether lsr is "more likely" than other.
@ -61,13 +64,9 @@ public:
*/
int32_t compareLikely(const LSR &lsr, const LSR &other, int32_t likelyInfo) const;
// TODO(ICU-20777): Switch Locale/uloc_ likely-subtags API from the old code
// in loclikely.cpp to this new code, including activating this
// minimizeSubtags() function. The LocaleMatcher does not minimize.
#if 0
LSR minimizeSubtags(const char *languageIn, const char *scriptIn, const char *regionIn,
ULocale.Minimize fieldToFavor, UErrorCode &errorCode) const;
#endif
LSR minimizeSubtags(StringPiece language, StringPiece script, StringPiece region,
bool favorScript,
UErrorCode &errorCode) const;
// visible for LocaleDistance
const LocaleDistanceData &getDistanceData() const { return distanceData; }
@ -80,16 +79,25 @@ private:
static void initLikelySubtags(UErrorCode &errorCode);
LSR makeMaximizedLsr(const char *language, const char *script, const char *region,
const char *variant, UErrorCode &errorCode) const;
const char *variant,
bool returnInputIfUnmatch,
UErrorCode &errorCode) const;
/**
* Raw access to addLikelySubtags. Input must be in canonical format, eg "en", not "eng" or "EN".
*/
LSR maximize(const char *language, const char *script, const char *region) const;
LSR maximize(const char *language, const char *script, const char *region,
bool returnInputIfUnmatch,
UErrorCode &errorCode) const;
LSR maximize(StringPiece language, StringPiece script, StringPiece region,
bool returnInputIfUnmatch,
UErrorCode &errorCode) const;
int32_t getLikelyIndex(const char *language, const char *script) const;
bool isMacroregion(StringPiece& region, UErrorCode &errorCode) const;
static int32_t trieNext(BytesTrie &iter, const char *s, int32_t i);
static int32_t trieNext(BytesTrie &iter, StringPiece s, int32_t i);
UResourceBundle *langInfoBundle;
// We could store the strings by value, except that if there were few enough strings,

View file

@ -31,6 +31,26 @@ LSR::LSR(char prefix, const char *lang, const char *scr, const char *r, int32_t
}
}
/**
 * Builds an LSR that owns private copies of its three subtags.
 *
 * The subtags are copied into one heap buffer laid out as
 * "lang\0script\0region"; language, script and region point into that buffer.
 *
 * @param lang      language subtag; need not be NUL-terminated
 * @param scr       script subtag; need not be NUL-terminated
 * @param r         region subtag; need not be NUL-terminated
 * @param f         value stored in flags
 * @param errorCode in/out ICU error code. If it already indicates a failure,
 *                  or copying fails, language/script/region stay nullptr and
 *                  regionIndex stays 0.
 */
LSR::LSR(StringPiece lang, StringPiece scr, StringPiece r, int32_t f,
         UErrorCode &errorCode) :
        language(nullptr), script(nullptr), region(nullptr),
        regionIndex(0), flags(f) {
    if (U_FAILURE(errorCode)) {
        return;
    }
    CharString data;
    data.append(lang, errorCode).append('\0', errorCode);
    int32_t scriptOffset = data.length();
    data.append(scr, errorCode).append('\0', errorCode);
    int32_t regionOffset = data.length();
    data.append(r, errorCode);
    owned = data.cloneData(errorCode);
    if (U_SUCCESS(errorCode)) {
        language = owned;
        script = owned + scriptOffset;
        region = owned + regionOffset;
        // indexForRegion() takes a C string. A StringPiece carries an explicit
        // length and its data() is not guaranteed to be NUL-terminated (or even
        // non-null), so derive the index from the owned copy rather than from
        // r.data().
        // NOTE(review): relies on cloneData() NUL-terminating the copy, which
        // the use of `region` as a C string already assumes - confirm.
        regionIndex = indexForRegion(region);
    }
}
LSR::LSR(LSR &&other) noexcept :
language(other.language), script(other.script), region(other.region), owned(other.owned),
regionIndex(other.regionIndex), flags(other.flags),

View file

@ -7,6 +7,7 @@
#ifndef __LSR_H__
#define __LSR_H__
#include "unicode/stringpiece.h"
#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "cstring.h"
@ -45,6 +46,8 @@ struct LSR final : public UMemory {
*/
LSR(char prefix, const char *lang, const char *scr, const char *r, int32_t f,
UErrorCode &errorCode);
/**
* Constructs an LSR that owns copies of the given language, script and
* region subtags; f is stored as the flags value.
* If errorCode indicates a failure on entry, or copying the subtags fails,
* the language, script and region pointers are left as nullptr.
*/
LSR(StringPiece lang, StringPiece scr, StringPiece r, int32_t f,
UErrorCode &errorCode);
LSR(LSR &&other) noexcept;
LSR(const LSR &other) = delete;
inline ~LSR() {

View file

@ -237,6 +237,7 @@ ulocimp_addLikelySubtags(const char* localeID,
*
* @param localeID The locale to minimize
* @param sink The output sink receiving the minimized locale
* @param favorScript If true, prefer keeping the script subtag; if false, prefer keeping the region subtag.
* @param err Error information if minimizing the locale failed. If the length
* of the localeID and the null-terminator is greater than the maximum allowed size,
* or the localeId is not well-formed, the error code is U_ILLEGAL_ARGUMENT_ERROR.
@ -245,6 +246,7 @@ ulocimp_addLikelySubtags(const char* localeID,
U_CAPI void U_EXPORT2
ulocimp_minimizeSubtags(const char* localeID,
icu::ByteSink& sink,
bool favorScript,
UErrorCode* err);
U_CAPI const char * U_EXPORT2

View file

@ -1113,6 +1113,15 @@ protected: /* only protected for testing purposes. DO NOT USE. */
* @internal
*/
void setFromPOSIXID(const char *posixID);
/**
* Minimize the subtags for this Locale, with a choice of which subtag to
* prefer keeping when the script and the region cannot both be dropped.
* minimizeSubtags(UErrorCode&) forwards to this overload with
* favorScript set to false.
* @param favorScript favor keeping the script if true, the region if false.
* @param status error information if minimizing this Locale failed.
* If this Locale is not well-formed, the error code is
* U_ILLEGAL_ARGUMENT_ERROR.
* @internal
*/
void minimizeSubtags(bool favorScript, UErrorCode& status);
#endif /* U_HIDE_INTERNAL_API */
private:

View file

@ -4605,8 +4605,8 @@ const char* const full_data[][3] = {
"am"
}, {
"und_Ethi_ER",
"am_Ethi_ER",
"am_ER"
"ti_Ethi_ER",
"ti_ER"
}, {
"und_FI",
"fi_Latn_FI",
@ -5381,8 +5381,8 @@ const char* const full_data[][3] = {
"trv"
}, {
"und_Latn_HK",
"zh_Latn_HK",
"zh_Latn_HK"
"en_Latn_HK",
"en_HK"
}, {
"und_Latn_AQ",
"_Latn_AQ",

View file

@ -655,17 +655,19 @@ group: resourcebundle
localebuilder.o
ulocale.o
ulocbuilder.o
loclikelysubtags.o
deps
udata ucol_swp
sort stringenumeration uhash uvector
uscript_props propname
bytesinkutil
errorcode
lsr
group: localematcher
localematcher.o
deps
resourcebundle localeprioritylist loclikelysubtags locdistance lsr
resourcebundle localeprioritylist locdistance
group: localeprioritylist
localeprioritylist.o
@ -675,12 +677,7 @@ group: localeprioritylist
group: locdistance
locdistance.o
deps
loclikelysubtags
group: loclikelysubtags
loclikelysubtags.o
deps
lsr resourcebundle
resourcebundle
group: lsr
lsr.o

View file

@ -11,6 +11,7 @@
#include <iterator>
#include <set>
#include <utility>
#include <cctype>
#include "loctest.h"
#include "unicode/localebuilder.h"
@ -234,6 +235,7 @@ void LocaleTest::runIndexedTest( int32_t index, UBool exec, const char* &name, c
TESTCASE_AUTO(TestAddLikelySubtags);
TESTCASE_AUTO(TestMinimizeSubtags);
TESTCASE_AUTO(TestAddLikelyAndMinimizeSubtags);
TESTCASE_AUTO(TestDataDrivenLikelySubtags);
TESTCASE_AUTO(TestKeywordVariants);
TESTCASE_AUTO(TestCreateUnicodeKeywords);
TESTCASE_AUTO(TestKeywordVariantParsing);
@ -1711,6 +1713,11 @@ LocaleTest::TestAddLikelyAndMinimizeSubtags() {
const char* const add;
const char* const remove;
} full_data[] = {
{
"und",
"en_Latn_US",
"en"
},
{
"und_AQ",
"_Latn_AQ",
@ -2517,8 +2524,8 @@ LocaleTest::TestAddLikelyAndMinimizeSubtags() {
"am"
}, {
"und_Ethi_ER",
"am_Ethi_ER",
"am_ER"
"ti_Ethi_ER",
"ti_ER"
}, {
"und_FI",
"fi_Latn_FI",
@ -3293,8 +3300,8 @@ LocaleTest::TestAddLikelyAndMinimizeSubtags() {
"trv"
}, {
"und_Latn_HK",
"zh_Latn_HK",
"zh_Latn_HK"
"en_Latn_HK",
"en_HK"
}, {
"und_Latn_AQ",
"_Latn_AQ",
@ -3865,7 +3872,6 @@ LocaleTest::TestAddLikelyAndMinimizeSubtags() {
}
}
void
LocaleTest::TestKeywordVariants() {
static const struct {
@ -5546,6 +5552,184 @@ void LocaleTest::TestLocaleCanonicalizationFromFile()
}
}
// Returns a copy of s with leading and trailing whitespace (as classified by
// std::isspace) removed. An empty or all-whitespace input yields an empty
// string.
std::string trim(const std::string &s) {
    // std::isspace requires a value representable as unsigned char (or EOF);
    // passing a negative plain char is undefined behaviour, hence the cast.
    auto isSpace = [](char c) {
        return std::isspace(static_cast<unsigned char>(c)) != 0;
    };
    auto start = s.begin();
    auto end = s.end();
    while (start != end && isSpace(*start)) {
        ++start;
    }
    // Walk back from the end without ever stepping before `start`, so an
    // empty input never decrements begin().
    while (end != start && isSpace(*(end - 1))) {
        --end;
    }
    return std::string(start, end);
}
// Test-only Locale wrapper whose minimizeSubtags(UErrorCode&) asks for the
// script to be favored.
class FavorScriptLocale : public Locale {
public:
    FavorScriptLocale(const Locale &other) : Locale(other) {}

    // Hides Locale::minimizeSubtags(UErrorCode&) and forwards to the
    // two-argument overload with favorScript switched on.
    void minimizeSubtags(UErrorCode &status) {
        const bool favorScript = true;
        Locale::minimizeSubtags(favorScript, status);
    }
};
// Returns true when `s` is one of the source tags whose mismatches the
// data-driven likely-subtags test reports as known issue ICU-20777:
// a fixed list of tags, plus every tag that starts with "und-Latn-".
bool isKnownSourceFor20777(const std::string& s) {
    static const char* const kExactSources[] = {
        "und-001", "und-AQ", "und-CC", "und-SL", "und-SS", "und-ZM"
    };
    for (const char* known : kExactSources) {
        if (s == known) {
            return true;
        }
    }
    static const char kLatnPrefix[] = "und-Latn-";
    // Compare only the leading characters; a shorter s can never match.
    return s.compare(0, sizeof(kLatnPrefix) - 1, kLatnPrefix) == 0;
}
// Per-line callback passed to u_parseDelimitedFile() by
// TestDataDrivenLikelySubtags().
//
// The four columns read from each line are:
//   0: source language tag
//   1: expected addLikelySubtags() result, or "FAIL"
//   2: expected minimizeSubtags() result when favoring script
//      (empty means "same as column 1")
//   3: expected minimizeSubtags() result when favoring region
//      (empty means "same as column 2")
// "FAIL" means the operation is expected to leave the locale unchanged.
// Mismatches for sources accepted by isKnownSourceFor20777() are logged as
// known issues instead of errors.
//
// context is the LocaleTest instance; pErrorCode is the in/out ICU error code.
void U_CALLCONV
testLikelySubtagsLineFn(void *context,
                        char *fields[][2], int32_t fieldCount,
                        UErrorCode *pErrorCode) {
    (void)fieldCount;
    LocaleTest *test = static_cast<LocaleTest *>(context);

    // Each field is a [start, limit) pair of pointers into the line.
    auto column = [fields](int32_t index) {
        return trim(std::string(fields[index][0], fields[index][1] - fields[index][0]));
    };
    const std::string source = column(0);
    const std::string addLikely = column(1);
    std::string removeFavorScript = column(2);
    if (removeFavorScript.empty()) {
        removeFavorScript = addLikely;
    }
    std::string removeFavorRegion = column(3);
    if (removeFavorRegion.empty()) {
        removeFavorRegion = removeFavorScript;
    }

    const Locale locale = Locale::forLanguageTag(source, *pErrorCode);
    if (U_FAILURE(*pErrorCode)) {
        test->errln("forLanguageTag(%s) return error %x %s", source.c_str(),
                    *pErrorCode, u_errorName(*pErrorCode));
        *pErrorCode = U_ZERO_ERROR;
        return;
    }

    // --- addLikelySubtags ---
    Locale maximized(locale);
    maximized.addLikelySubtags(*pErrorCode);
    if (addLikely == "FAIL") {
        if (uprv_strcmp(locale.getName(), maximized.getName()) != 0) {
            test->errln("addLikelySubtags('%s') return should return the same but return '%s'",
                        locale.getName(), maximized.getName());
        }
    } else {
        const std::string maxTag = maximized.toLanguageTag<std::string>(*pErrorCode);
        if (U_FAILURE(*pErrorCode)) {
            test->errln("toLanguageTag(%s) return error %x %s", maximized.getName(),
                        *pErrorCode, u_errorName(*pErrorCode));
            *pErrorCode = U_ZERO_ERROR;
        } else if (maxTag != addLikely) {
            if (isKnownSourceFor20777(source)) {
                test->logKnownIssue(
                    "ICU-20777", "addLikelySubtags('%s') should return '%s' but got '%s'",
                    source.c_str(), addLikely.c_str(), maxTag.c_str());
            } else {
                test->errln("addLikelySubtags('%s') should return '%s' but got '%s'",
                            source.c_str(), addLikely.c_str(), maxTag.c_str());
            }
        }
    }

    // --- minimizeSubtags, plain Locale (checked against the favor-region column) ---
    Locale minimized(locale);
    minimized.minimizeSubtags(*pErrorCode);
    if (removeFavorRegion == "FAIL") {
        if (uprv_strcmp(locale.getName(), minimized.getName()) != 0) {
            test->errln("minimizeSubtags('%s') return should return the same but return '%s'",
                        locale.getName(), minimized.getName());
        }
    } else {
        const std::string minTag = minimized.toLanguageTag<std::string>(*pErrorCode);
        if (U_FAILURE(*pErrorCode)) {
            test->errln("toLanguageTag(%s) return error %x %s", minimized.getName(),
                        *pErrorCode, u_errorName(*pErrorCode));
            *pErrorCode = U_ZERO_ERROR;
        } else if (minTag != removeFavorRegion) {
            if (isKnownSourceFor20777(source)) {
                test->logKnownIssue(
                    "ICU-20777", "minimizeSubtags('%s') should return '%s' but got '%s'",
                    source.c_str(), removeFavorRegion.c_str(), minTag.c_str());
            } else {
                test->errln("minimizeSubtags('%s') should return '%s' but got '%s'",
                            source.c_str(), removeFavorRegion.c_str(), minTag.c_str());
            }
        }
    }

    // --- minimizeSubtags, favoring script ---
    FavorScriptLocale minimizedFavorScript(locale);
    minimizedFavorScript.minimizeSubtags(*pErrorCode);
    if (removeFavorScript == "FAIL") {
        if (uprv_strcmp(locale.getName(), minimizedFavorScript.getName()) != 0) {
            test->errln("minimizeSubtags('%s') return should return the same but return '%s'",
                        locale.getName(), minimizedFavorScript.getName());
        }
    } else {
        const std::string minTag =
            minimizedFavorScript.toLanguageTag<std::string>(*pErrorCode);
        if (U_FAILURE(*pErrorCode)) {
            test->errln("toLanguageTag(%s) favor script return error %x %s",
                        minimizedFavorScript.getName(),
                        *pErrorCode, u_errorName(*pErrorCode));
            *pErrorCode = U_ZERO_ERROR;
        } else if (minTag != removeFavorScript) {
            if (isKnownSourceFor20777(source)) {
                test->logKnownIssue(
                    "ICU-20777",
                    "minimizeSubtags('%s') favor script should return '%s' but got '%s'",
                    source.c_str(), removeFavorScript.c_str(), minTag.c_str());
            } else {
                test->errln("minimizeSubtags('%s') favor script should return '%s' but got '%s'",
                            source.c_str(), removeFavorScript.c_str(), minTag.c_str());
            }
        }
    }
}
void
LocaleTest::TestDataDrivenLikelySubtags() {
if (quick) {
// This test is too slow to run. Only run in -e mode.
return;
}
IcuTestErrorCode errorCode(*this, "TestDataDrivenLikelySubtags()");
const char* name = "likelySubtags.txt";
const char *sourceTestDataPath = getSourceTestData(errorCode);
if (errorCode.errIfFailureAndReset("unable to find the source/test/testdata "
"folder (getSourceTestData())")) {
return;
}
CharString path(sourceTestDataPath, errorCode);
path.appendPathPart(name, errorCode);
LocalStdioFilePointer testFile(fopen(path.data(), "r"));
if (testFile.isNull()) {
errln("unable to open %s", path.data());
return;
}
// Columns (c1, c2,...) are separated by semicolons.
// Leading and trailing spaces and tabs in each column are ignored.
// Comments are indicated with hash marks.
const int32_t kNumFields = 4;
char *fields[kNumFields][2];
u_parseDelimitedFile(path.data(), ';', fields, kNumFields, testLikelySubtagsLineFn,
this, errorCode);
if (errorCode.errIfFailureAndReset("error parsing %s", name)) {
return;
}
}
void LocaleTest::TestKnownCanonicalizedListCorrect()
{
IcuTestErrorCode status(*this, "TestKnownCanonicalizedListCorrect");

View file

@ -131,6 +131,7 @@ public:
void TestAddLikelySubtags();
void TestMinimizeSubtags();
void TestAddLikelyAndMinimizeSubtags();
void TestDataDrivenLikelySubtags();
void TestForLanguageTag();
void TestForLanguageTagLegacyTagBug21676();

File diff suppressed because it is too large Load diff

View file

@ -225,8 +225,8 @@ public class LocaleDistance {
// VisibleForTesting
public int testOnlyDistance(ULocale desired, ULocale supported,
int threshold, FavorSubtag favorSubtag) {
LSR supportedLSR = XLikelySubtags.INSTANCE.makeMaximizedLsrFrom(supported);
LSR desiredLSR = XLikelySubtags.INSTANCE.makeMaximizedLsrFrom(desired);
LSR supportedLSR = XLikelySubtags.INSTANCE.makeMaximizedLsrFrom(supported, false);
LSR desiredLSR = XLikelySubtags.INSTANCE.makeMaximizedLsrFrom(desired, false);
int indexAndDistance = getBestIndexAndDistance(desiredLSR, new LSR[] { supportedLSR }, 1,
shiftDistance(threshold), favorSubtag, LocaleMatcher.Direction.WITH_ONE_WAY);
return getDistanceFloor(indexAndDistance);

View file

@ -15,6 +15,7 @@ import com.ibm.icu.impl.ICUData;
import com.ibm.icu.impl.ICUResourceBundle;
import com.ibm.icu.impl.UResource;
import com.ibm.icu.util.BytesTrie;
import com.ibm.icu.util.Region;
import com.ibm.icu.util.ULocale;
public final class XLikelySubtags {
@ -180,7 +181,7 @@ public final class XLikelySubtags {
}
// VisibleForTesting
public LSR makeMaximizedLsrFrom(ULocale locale) {
public LSR makeMaximizedLsrFrom(ULocale locale, boolean returnInputIfUnmatch) {
String name = locale.getName(); // Faster than .toLanguageTag().
if (name.startsWith("@x=")) {
String tag = locale.toLanguageTag();
@ -189,8 +190,12 @@ public final class XLikelySubtags {
// und-x-subtag-subtag...
return new LSR(tag, "", "", LSR.EXPLICIT_LSR);
}
return makeMaximizedLsr(locale.getLanguage(), locale.getScript(), locale.getCountry(),
locale.getVariant());
LSR max = makeMaximizedLsr(locale.getLanguage(), locale.getScript(), locale.getCountry(),
locale.getVariant(), returnInputIfUnmatch);
if (max.language.isEmpty() && max.script.isEmpty() && max.region.isEmpty()) {
return new LSR(locale.getLanguage(), locale.getScript(), locale.getCountry(), LSR.EXPLICIT_LSR);
}
return max;
}
public LSR makeMaximizedLsrFrom(Locale locale) {
@ -201,10 +206,10 @@ public final class XLikelySubtags {
return new LSR(tag, "", "", LSR.EXPLICIT_LSR);
}
return makeMaximizedLsr(locale.getLanguage(), locale.getScript(), locale.getCountry(),
locale.getVariant());
locale.getVariant(), false);
}
private LSR makeMaximizedLsr(String language, String script, String region, String variant) {
private LSR makeMaximizedLsr(String language, String script, String region, String variant, boolean returnInputIfUnmatch) {
// Handle pseudolocales like en-XA, ar-XB, fr-PSCRACK.
// They should match only themselves,
// not other locales with what looks like the same language and script subtags.
@ -248,13 +253,23 @@ public final class XLikelySubtags {
language = getCanonical(languageAliases, language);
// (We have no script mappings.)
region = getCanonical(regionAliases, region);
return maximize(language, script, region);
return maximize(language, script, region, returnInputIfUnmatch);
}
/**
* Helper method to find out a region is a macroregion
*/
private boolean isMacroregion(String region) {
Region.RegionType type = Region.getInstance(region).getType();
return type == Region.RegionType.WORLD ||
type == Region.RegionType.CONTINENT ||
type == Region.RegionType.SUBCONTINENT ;
}
/**
* Raw access to addLikelySubtags. Input must be in canonical format, eg "en", not "eng" or "EN".
*/
private LSR maximize(String language, String script, String region) {
private LSR maximize(String language, String script, String region, boolean returnInputIfUnmatch) {
if (language.equals("und")) {
language = "";
}
@ -268,7 +283,9 @@ public final class XLikelySubtags {
return new LSR(language, script, region, LSR.EXPLICIT_LSR); // already maximized
}
int retainOldMask = 0;
boolean retainLanguage = false;
boolean retainScript = false;
boolean retainRegion = false;
BytesTrie iter = new BytesTrie(trie);
long state;
int value;
@ -280,34 +297,33 @@ public final class XLikelySubtags {
} else {
value = trieNext(iter, language, 0);
}
boolean matchLanguage = (value >= 0);
boolean matchScript = false;
if (value >= 0) {
if (!language.isEmpty()) {
retainOldMask |= 4;
}
retainLanguage = ! language.isEmpty();
state = iter.getState64();
} else {
retainOldMask |= 4;
retainLanguage = true;
iter.resetToState64(trieUndState); // "und" ("*")
state = 0;
}
if (value >= 0 && !script.isEmpty()) {
matchScript = true;
}
if (value > 0) {
// Intermediate or final value from just language.
if (value == SKIP_SCRIPT) {
value = 0;
}
if (!script.isEmpty()) {
retainOldMask |= 2;
}
retainScript = ! script.isEmpty();
} else {
value = trieNext(iter, script, 0);
if (value >= 0) {
if (!script.isEmpty()) {
retainOldMask |= 2;
}
retainScript = ! script.isEmpty();
state = iter.getState64();
} else {
retainOldMask |= 2;
retainScript = true;
if (state == 0) {
iter.resetToState64(trieUndZzzzState); // "und-Zzzz" ("**")
} else {
@ -319,19 +335,19 @@ public final class XLikelySubtags {
}
}
boolean matchRegion = false;
if (value > 0) {
// Final value from just language or language+script.
if (!region.isEmpty()) {
retainOldMask |= 1;
}
retainRegion = ! region.isEmpty();
} else {
value = trieNext(iter, region, 0);
if (value >= 0) {
if (!region.isEmpty()) {
retainOldMask |= 1;
if (!region.isEmpty() && !isMacroregion(region)) {
retainRegion = true;
matchRegion = true;
}
} else {
retainOldMask |= 1;
retainRegion = true;
if (state == 0) {
value = defaultLsrIndex;
} else {
@ -343,25 +359,30 @@ public final class XLikelySubtags {
}
LSR result = lsrs[value];
if (returnInputIfUnmatch &&
(!(matchLanguage || matchScript || (matchRegion && language.isEmpty())))) {
return new LSR("", "", "", LSR.EXPLICIT_LSR); // no matching.
}
if (language.isEmpty()) {
language = "und";
}
if (retainOldMask == 0) {
if (! (retainLanguage || retainScript || retainRegion)) {
assert result.flags == LSR.IMPLICIT_LSR;
return result;
}
if ((retainOldMask & 4) == 0) {
if (!retainLanguage) {
language = result.language;
}
if ((retainOldMask & 2) == 0) {
if (!retainScript) {
script = result.script;
}
if ((retainOldMask & 1) == 0) {
if (!retainRegion) {
region = result.region;
}
int retainMask = (retainLanguage ? 4 : 0) + (retainScript ? 2 : 0) + (retainRegion ? 1 : 0);
// retainOldMask flags = LSR explicit-subtag flags
return new LSR(language, script, region, retainOldMask);
return new LSR(language, script, region, retainMask);
}
/**
@ -502,50 +523,37 @@ public final class XLikelySubtags {
}
}
LSR minimizeSubtags(String languageIn, String scriptIn, String regionIn,
public LSR minimizeSubtags(String languageIn, String scriptIn, String regionIn,
ULocale.Minimize fieldToFavor) {
LSR result = maximize(languageIn, scriptIn, regionIn);
// We could try just a series of checks, like:
// LSR result2 = addLikelySubtags(languageIn, "", "");
// if result.equals(result2) return result2;
// However, we can optimize 2 of the cases:
// (languageIn, "", "")
// (languageIn, "", regionIn)
// value00 = lookup(result.language, "", "")
BytesTrie iter = new BytesTrie(trie);
int value = trieNext(iter, result.language, 0);
assert value >= 0;
if (value == 0) {
value = trieNext(iter, "", 0);
assert value >= 0;
if (value == 0) {
value = trieNext(iter, "", 0);
LSR max = maximize(languageIn, scriptIn, regionIn, true);
if (max.language.isEmpty() && max.region.isEmpty() && max.script.isEmpty()) {
// Cannot match, return as is
return new LSR(languageIn, scriptIn, regionIn, LSR.EXPLICIT_LSR);
}
LSR test = maximize(max.language, "", "", true);
if (test.isEquivalentTo(max)) {
return new LSR(max.language, "", "", LSR.DONT_CARE_FLAGS);
}
if (ULocale.Minimize.FAVOR_REGION == fieldToFavor) {
test = maximize(max.language, "", max.region, true);
if (test.isEquivalentTo(max)) {
return new LSR(max.language, "", max.region, LSR.DONT_CARE_FLAGS);
}
test = maximize(max.language, max.script, "", true);
if (test.isEquivalentTo(max)) {
return new LSR(max.language, max.script, "", LSR.DONT_CARE_FLAGS);
}
} else {
test = maximize(max.language, max.script, "", true);
if (test.isEquivalentTo(max)) {
return new LSR(max.language, max.script, "", LSR.DONT_CARE_FLAGS);
}
test = maximize(max.language, "", max.region, true);
if (test.isEquivalentTo(max)) {
return new LSR(max.language, "", max.region, LSR.DONT_CARE_FLAGS);
}
}
assert value > 0;
LSR value00 = lsrs[value];
boolean favorRegionOk = false;
if (result.script.equals(value00.script)) { //script is default
if (result.region.equals(value00.region)) {
return new LSR(result.language, "", "", LSR.DONT_CARE_FLAGS);
} else if (fieldToFavor == ULocale.Minimize.FAVOR_REGION) {
return new LSR(result.language, "", result.region, LSR.DONT_CARE_FLAGS);
} else {
favorRegionOk = true;
}
}
// The last case is not as easy to optimize.
// Maybe do later, but for now use the straightforward code.
LSR result2 = maximize(languageIn, scriptIn, "");
if (result2.equals(result)) {
return new LSR(result.language, result.script, "", LSR.DONT_CARE_FLAGS);
} else if (favorRegionOk) {
return new LSR(result.language, "", result.region, LSR.DONT_CARE_FLAGS);
}
return result;
return new LSR(max.language, max.script, max.region, LSR.DONT_CARE_FLAGS);
}
private Map<String, LSR> getTable() {

View file

@ -796,7 +796,7 @@ public final class LocaleMatcher {
if (locale.equals(UND_ULOCALE)) {
return UND_LSR;
} else {
return XLikelySubtags.INSTANCE.makeMaximizedLsrFrom(locale);
return XLikelySubtags.INSTANCE.makeMaximizedLsrFrom(locale, false);
}
}

View file

@ -42,15 +42,16 @@ import com.ibm.icu.impl.locale.BaseLocale;
import com.ibm.icu.impl.locale.Extension;
import com.ibm.icu.impl.locale.InternalLocaleBuilder;
import com.ibm.icu.impl.locale.KeyTypeData;
import com.ibm.icu.impl.locale.LSR;
import com.ibm.icu.impl.locale.LanguageTag;
import com.ibm.icu.impl.locale.LocaleExtensions;
import com.ibm.icu.impl.locale.LocaleSyntaxException;
import com.ibm.icu.impl.locale.ParseStatus;
import com.ibm.icu.impl.locale.UnicodeLocaleExtension;
import com.ibm.icu.impl.locale.XLikelySubtags;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.LocaleDisplayNames;
import com.ibm.icu.text.LocaleDisplayNames.DialectHandling;
/**
* {@icuenhanced java.util.Locale}.{@icu _usage_}
*
@ -2722,12 +2723,10 @@ public final class ULocale implements Serializable, Comparable<ULocale> {
trailing = loc.localeID.substring(trailingIndex);
}
String newLocaleID =
createLikelySubtagsString(
tags[0],
tags[1],
tags[2],
trailing);
LSR max = XLikelySubtags.INSTANCE.makeMaximizedLsrFrom(
new ULocale(loc.getLanguage(), loc.getScript(), loc.getCountry()), true);
String newLocaleID = createTagString(max.language, max.script, max.region,
trailing);
return newLocaleID == null ? loc : new ULocale(newLocaleID);
}
@ -2819,148 +2818,22 @@ public final class ULocale implements Serializable, Comparable<ULocale> {
@Deprecated
public static ULocale minimizeSubtags(ULocale loc, Minimize fieldToFavor) {
String[] tags = new String[3];
String trailing = null;
int trailingIndex = parseTagString(
loc.localeID,
tags);
String originalLang = tags[0];
String originalScript = tags[1];
String originalRegion = tags[2];
String originalTrailing = null;
if (trailingIndex < loc.localeID.length()) {
/*
* Create a String that contains everything
* after the language, script, and region.
*/
originalTrailing = loc.localeID.substring(trailingIndex);
trailing = loc.localeID.substring(trailingIndex);
}
/**
* First, we need to first get the maximization
* by adding any likely subtags.
**/
String maximizedLocaleID =
createLikelySubtagsString(
originalLang,
originalScript,
originalRegion,
null);
LSR lsr = XLikelySubtags.INSTANCE.minimizeSubtags(
loc.getLanguage(), loc.getScript(), loc.getCountry(), fieldToFavor);
String newLocaleID = createTagString(lsr.language, lsr.script, lsr.region,
trailing);
/**
* If maximization fails, there's nothing
* we can do.
**/
if (isEmptyString(maximizedLocaleID)) {
return loc;
}
else {
/**
* Start first with just the language.
**/
String tag =
createLikelySubtagsString(
originalLang,
null,
null,
null);
if (tag.equals(maximizedLocaleID)) {
String newLocaleID =
createTagString(
originalLang,
null,
null,
originalTrailing);
return new ULocale(newLocaleID);
}
}
/**
* Next, try the language and region.
**/
if (fieldToFavor == Minimize.FAVOR_REGION) {
if (originalRegion.length() != 0) {
String tag =
createLikelySubtagsString(
originalLang,
null,
originalRegion,
null);
if (tag.equals(maximizedLocaleID)) {
String newLocaleID =
createTagString(
originalLang,
null,
originalRegion,
originalTrailing);
return new ULocale(newLocaleID);
}
}
if (originalScript.length() != 0){
String tag =
createLikelySubtagsString(
originalLang,
originalScript,
null,
null);
if (tag.equals(maximizedLocaleID)) {
String newLocaleID =
createTagString(
originalLang,
originalScript,
null,
originalTrailing);
return new ULocale(newLocaleID);
}
}
} else { // FAVOR_SCRIPT, so
if (originalScript.length() != 0){
String tag =
createLikelySubtagsString(
originalLang,
originalScript,
null,
null);
if (tag.equals(maximizedLocaleID)) {
String newLocaleID =
createTagString(
originalLang,
originalScript,
null,
originalTrailing);
return new ULocale(newLocaleID);
}
}
if (originalRegion.length() != 0) {
String tag =
createLikelySubtagsString(
originalLang,
null,
originalRegion,
null);
if (tag.equals(maximizedLocaleID)) {
String newLocaleID =
createTagString(
originalLang,
null,
originalRegion,
originalTrailing);
return new ULocale(newLocaleID);
}
}
}
return loc;
return newLocaleID == null ? loc : new ULocale(newLocaleID);
}
/**
@ -3007,10 +2880,9 @@ public final class ULocale implements Serializable, Comparable<ULocale> {
* @return The new tag string.
**/
private static String createTagString(String lang, String script, String region,
String trailing, String alternateTags) {
String trailing) {
LocaleIDParser parser = null;
boolean regionAppended = false;
StringBuilder tag = new StringBuilder();
@ -3018,8 +2890,7 @@ public final class ULocale implements Serializable, Comparable<ULocale> {
appendTag(
lang,
tag);
}
else if (isEmptyString(alternateTags)) {
} else {
/*
* Append the value for an unknown language, if
* we found no language.
@ -3028,66 +2899,17 @@ public final class ULocale implements Serializable, Comparable<ULocale> {
UNDEFINED_LANGUAGE,
tag);
}
else {
parser = new LocaleIDParser(alternateTags);
String alternateLang = parser.getLanguage();
/*
* Append the value for an unknown language, if
* we found no language.
*/
appendTag(
!isEmptyString(alternateLang) ? alternateLang : UNDEFINED_LANGUAGE,
tag);
}
if (!isEmptyString(script)) {
appendTag(
script,
tag);
}
else if (!isEmptyString(alternateTags)) {
/*
* Parse the alternateTags string for the script.
*/
if (parser == null) {
parser = new LocaleIDParser(alternateTags);
}
String alternateScript = parser.getScript();
if (!isEmptyString(alternateScript)) {
appendTag(
alternateScript,
tag);
}
}
if (!isEmptyString(region)) {
appendTag(
region,
tag);
regionAppended = true;
}
else if (!isEmptyString(alternateTags)) {
/*
* Parse the alternateTags string for the region.
*/
if (parser == null) {
parser = new LocaleIDParser(alternateTags);
}
String alternateRegion = parser.getCountry();
if (!isEmptyString(alternateRegion)) {
appendTag(
alternateRegion,
tag);
regionAppended = true;
}
}
if (trailing != null && trailing.length() > 1) {
@ -3107,7 +2929,7 @@ public final class ULocale implements Serializable, Comparable<ULocale> {
separators = 1;
}
if (regionAppended) {
if (!isEmptyString(region)) {
/*
* If we appended a region, we may need to strip
* the extra separator from the variant portion.
@ -3134,21 +2956,6 @@ public final class ULocale implements Serializable, Comparable<ULocale> {
return tag.toString();
}
/**
* Create a tag string from the supplied parameters. The lang, script and region
* parameters may be null references.If the lang parameter is an empty string, the
* default value for an unknown language is written to the output buffer.
*
* @param lang The language tag to use.
* @param script The script tag to use.
* @param region The region tag to use.
* @param trailing Any trailing data to append to the new tag.
* @return The new String.
**/
static String createTagString(String lang, String script, String region, String trailing) {
return createTagString(lang, script, region, trailing, null);
}
/**
* Parse the language, script, and region subtags from a tag string, and return the results.
*
@ -3214,144 +3021,6 @@ public final class ULocale implements Serializable, Comparable<ULocale> {
}
}
private static String lookupLikelySubtags(String localeId) {
UResourceBundle bundle =
UResourceBundle.getBundleInstance(
ICUData.ICU_BASE_NAME, "likelySubtags");
try {
return bundle.getString(localeId);
}
catch(MissingResourceException e) {
return null;
}
}
private static String createLikelySubtagsString(String lang, String script, String region,
String variants) {
/**
* Try the language with the script and region first.
*/
if (!isEmptyString(script) && !isEmptyString(region)) {
String searchTag =
createTagString(
lang,
script,
region,
null);
String likelySubtags = lookupLikelySubtags(searchTag);
/*
if (likelySubtags == null) {
if (likelySubtags2 != null) {
System.err.println("Tag mismatch: \"(null)\" \"" + likelySubtags2 + "\"");
}
}
else if (likelySubtags2 == null) {
System.err.println("Tag mismatch: \"" + likelySubtags + "\" \"(null)\"");
}
else if (!likelySubtags.equals(likelySubtags2)) {
System.err.println("Tag mismatch: \"" + likelySubtags + "\" \"" + likelySubtags2
+ "\"");
}
*/
if (likelySubtags != null) {
// Always use the language tag from the
// maximal string, since it may be more
// specific than the one provided.
return createTagString(
null,
null,
null,
variants,
likelySubtags);
}
}
/**
* Try the language with just the script.
**/
if (!isEmptyString(script)) {
String searchTag =
createTagString(
lang,
script,
null,
null);
String likelySubtags = lookupLikelySubtags(searchTag);
if (likelySubtags != null) {
// Always use the language tag from the
// maximal string, since it may be more
// specific than the one provided.
return createTagString(
null,
null,
region,
variants,
likelySubtags);
}
}
/**
* Try the language with just the region.
**/
if (!isEmptyString(region)) {
String searchTag =
createTagString(
lang,
null,
region,
null);
String likelySubtags = lookupLikelySubtags(searchTag);
if (likelySubtags != null) {
// Always use the language tag from the
// maximal string, since it may be more
// specific than the one provided.
return createTagString(
null,
script,
null,
variants,
likelySubtags);
}
}
/**
* Finally, try just the language.
**/
{
String searchTag =
createTagString(
lang,
null,
null,
null);
String likelySubtags = lookupLikelySubtags(searchTag);
if (likelySubtags != null) {
// Always use the language tag from the
// maximal string, since it may be more
// specific than the one provided.
return createTagString(
null,
script,
region,
variants,
likelySubtags);
}
}
return null;
}
// --------------------------------
// BCP47/OpenJDK APIs
// --------------------------------

File diff suppressed because it is too large Load diff

View file

@ -24,7 +24,6 @@ import org.junit.Test;
import org.junit.runner.RunWith;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.dev.tool.locale.LikelySubtagsBuilder;
import com.ibm.icu.impl.locale.XCldrStub.FileUtilities;
import com.ibm.icu.impl.locale.XLikelySubtags;
import com.ibm.icu.util.LocaleMatcher;
@ -869,19 +868,12 @@ public class LocaleMatcherTest extends TestFmwk {
long start = System.nanoTime();
for (int i = iterations; i > 0; --i) {
for (ULocale locale : list) {
XLikelySubtags.INSTANCE.makeMaximizedLsrFrom(locale);
XLikelySubtags.INSTANCE.makeMaximizedLsrFrom(locale, false);
}
}
return System.nanoTime() - start;
}
@Test
public void testLikelySubtagsLoadedDataSameAsBuiltFromScratch() {
XLikelySubtags.Data built = LikelySubtagsBuilder.build();
XLikelySubtags.Data loaded = XLikelySubtags.Data.load();
assertEquals("run LocaleDistanceBuilder and update ICU4C langInfo.txt", built, loaded);
}
private static final class TestCase implements Cloneable {
private static final String ENDL = System.getProperties().getProperty("line.separator");

View file

@ -16,11 +16,13 @@ import java.io.BufferedReader;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
@ -54,7 +56,10 @@ import com.ibm.icu.util.ULocale.Minimize;
import com.ibm.icu.util.UResourceBundle;
import com.ibm.icu.util.VersionInfo;
@RunWith(JUnit4.class)
import junitparams.JUnitParamsRunner;
import junitparams.Parameters;
@RunWith(JUnitParamsRunner.class)
public class ULocaleTest extends TestFmwk {
// Ticket #8078 and #11674
@ -1947,7 +1952,7 @@ public class ULocaleTest extends TestFmwk {
"de__POSIX_1901"
}, {
"und",
""
"en"
}
};
@ -2760,8 +2765,8 @@ public class ULocaleTest extends TestFmwk {
"am"
}, {
"und_Ethi_ER",
"am_Ethi_ER",
"am_ER"
"ti_Ethi_ER",
"ti_ER"
}, {
"und_FI",
"fi_Latn_FI",
@ -3536,8 +3541,8 @@ public class ULocaleTest extends TestFmwk {
"trv"
}, {
"und_Latn_HK",
"zh_Latn_HK",
"zh_Latn_HK"
"en_Latn_HK",
"en_HK"
}, {
"und_Latn_AQ",
"_Latn_AQ",
@ -5417,4 +5422,103 @@ public class ULocaleTest extends TestFmwk {
}
}
/**
 * Reports whether a source language tag is one the data-driven likely-subtags
 * test handles through its ICU-20777 known-issue path.
 *
 * @param s the source language tag
 * @return true for any tag starting with "und-Latn-" and for the fixed list of
 *         "und-" region tags below; false otherwise
 */
boolean isKnownSourceFor20777(String s) {
    if (s.startsWith("und-Latn-")) {
        return true;
    }
    switch (s) {
        case "und-001":
        case "und-AQ":
        case "und-CC":
        case "und-SL":
        case "und-SS":
        case "und-ZM":
            return true;
        default:
            return false;
    }
}
/**
 * One row of likely-subtags test data: a source tag and the three expected
 * results. All fields default to the empty string.
 */
private static final class TestCase implements Cloneable {
    // Platform line separator; not referenced inside this class.
    private static final String ENDL = System.getProperties().getProperty("line.separator");

    int lineNr = 0;
    String source = "";
    String addLikely = "";
    String removeFavorScript = "";
    String removeFavorRegion = "";

    /** Returns a field-by-field copy made by Object.clone(). */
    @Override
    public TestCase clone() throws CloneNotSupportedException {
        Object copy = super.clone();
        return (TestCase) copy;
    }

    /** Renders the row as "source;addLikely;removeFavorScript;removeFavorRegion". */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder(source);
        String[] expectations = {addLikely, removeFavorScript, removeFavorRegion};
        for (String expected : expectations) {
            text.append(";").append(expected);
        }
        return text.toString();
    }
}
/**
 * Reads the data-driven test cases from the "likelySubtags.txt" test data file.
 *
 * Each data line has up to four ';'-separated fields:
 * source ; addLikely ; removeFavorScript ; removeFavorRegion.
 * Lines starting with '#' and lines with fewer than two fields are skipped.
 * A missing or empty third field defaults to addLikely, and a missing or empty
 * fourth field defaults to removeFavorScript.
 *
 * @return the parsed test cases, in file order
 * @throws Exception if the data file cannot be opened or read
 */
static List<TestCase> readLikelySubtagsTestCases() throws Exception {
    List<TestCase> tests = new ArrayList<>();
    // try-with-resources closes the reader on every exit path and keeps the
    // primary exception if close() fails as well.
    try (BufferedReader testFile = TestUtil.getDataReader("likelySubtags.txt")) {
        int lineNr = 0;
        String line;
        while ((line = testFile.readLine()) != null) {
            ++lineNr;
            // Tolerate indentation and any amount of whitespace around ';' so
            // stray blanks never end up inside a tag.
            line = line.trim();
            if (line.startsWith("#")) continue;
            String[] fields = line.split("\\s*;\\s*");
            if (fields.length < 2) continue;
            // A fresh record per row; nothing is shared between rows.
            TestCase test = new TestCase();
            test.lineNr = lineNr;
            test.source = fields[0];
            test.addLikely = fields[1];
            test.removeFavorScript = (fields.length < 3) || fields[2].isEmpty() ? test.addLikely : fields[2];
            test.removeFavorRegion = (fields.length < 4) || fields[3].isEmpty() ? test.removeFavorScript : fields[3];
            tests.add(test);
        }
    }
    return tests;
}
/**
 * Data-driven check of addLikelySubtags and minimizeSubtags (default and
 * FAVOR_SCRIPT) for one row supplied by readLikelySubtagsTestCases.
 *
 * An expected value of "FAIL" means the operation must return a locale equal
 * to the input. Sources accepted by isKnownSourceFor20777 are not asserted;
 * they only go through logKnownIssue.
 */
@Test
@Parameters(method = "readLikelySubtagsTestCases")
public void likelySubtagsDataDriven(TestCase test) {
    final String source = test.source;
    final ULocale l = ULocale.forLanguageTag(source);
    final String addLabel = "addLikelySubtags(" + source + ")";
    final String minLabel = "minimizeSubtags(" + source + ")";
    final String scriptLabel = "minimizeSubtags(" + source + ") - FAVOR_SCRIPT";

    if (isKnownSourceFor20777(source)) {
        // NOTE(review): the known issue is logged when the actual result EQUALS
        // the expected value; confirm that this condition is intended.
        String maximized = ULocale.addLikelySubtags(l).toLanguageTag();
        if (test.addLikely.equals(maximized)) {
            logKnownIssue("ICU-20777", addLabel);
        }
        String minimized = ULocale.minimizeSubtags(l).toLanguageTag();
        if (test.removeFavorRegion.equals(minimized)) {
            logKnownIssue("ICU-20777", minLabel);
        }
        String minimizedFavorScript =
                ULocale.minimizeSubtags(l, ULocale.Minimize.FAVOR_SCRIPT).toLanguageTag();
        if (test.removeFavorScript.equals(minimizedFavorScript)) {
            logKnownIssue("ICU-20777", scriptLabel);
        }
        return;
    }

    // addLikelySubtags
    if (test.addLikely.equals("FAIL")) {
        ULocale actual = ULocale.addLikelySubtags(l);
        assertEquals(addLabel + " should be unchanged", l, actual);
    } else {
        String actual = ULocale.addLikelySubtags(l).toLanguageTag();
        assertEquals(addLabel, test.addLikely, actual);
    }

    // minimizeSubtags with the default behavior, checked against removeFavorRegion
    if (test.removeFavorRegion.equals("FAIL")) {
        ULocale actual = ULocale.minimizeSubtags(l);
        assertEquals(minLabel + " should be unchanged", l, actual);
    } else {
        String actual = ULocale.minimizeSubtags(l).toLanguageTag();
        assertEquals(minLabel, test.removeFavorRegion, actual);
    }

    // minimizeSubtags with FAVOR_SCRIPT
    if (test.removeFavorScript.equals("FAIL")) {
        ULocale actual = ULocale.minimizeSubtags(l, ULocale.Minimize.FAVOR_SCRIPT);
        assertEquals(scriptLabel + " should be unchanged", l, actual);
    } else {
        String actual = ULocale.minimizeSubtags(l, ULocale.Minimize.FAVOR_SCRIPT).toLanguageTag();
        assertEquals(scriptLabel, test.removeFavorScript, actual);
    }
}
}

View file

@ -1,317 +0,0 @@
// © 2017 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package com.ibm.icu.dev.tool.locale;
import java.nio.ByteBuffer;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import com.ibm.icu.impl.ICUData;
import com.ibm.icu.impl.ICUResourceBundle;
import com.ibm.icu.impl.UResource;
import com.ibm.icu.impl.locale.LSR;
import com.ibm.icu.impl.locale.XCldrStub.HashMultimap;
import com.ibm.icu.impl.locale.XCldrStub.Multimap;
import com.ibm.icu.impl.locale.XCldrStub.Multimaps;
import com.ibm.icu.impl.locale.XLikelySubtags;
import com.ibm.icu.util.BytesTrieBuilder;
import com.ibm.icu.util.ICUException;
/**
* Builds data for XLikelySubtags.
* Reads source data from ICU resource bundles.
*/
public class LikelySubtagsBuilder {
private static final boolean DEBUG_OUTPUT = LSR.DEBUG_OUTPUT;
/**
 * Opens the named resource bundle under ICUData.ICU_BASE_NAME using the ICU
 * data class loader and OpenType.DIRECT.
 *
 * @param name the bundle name, for example "metadata" or "likelySubtags"
 * @return the bundle instance
 */
private static ICUResourceBundle getSupplementalDataBundle(String name) {
    final ICUResourceBundle bundle = ICUResourceBundle.getBundleInstance(
            ICUData.ICU_BASE_NAME,
            name,
            ICUResourceBundle.ICU_DATA_CLASS_LOADER,
            ICUResourceBundle.OpenType.DIRECT);
    return bundle;
}
/**
 * Collects simple (single-subtag) aliases of one type from the "metadata"
 * bundle, as a map from alias to canonical value and as the inverse multimap.
 */
private static final class AliasesBuilder {
// alias -> canonical replacement
final Map<String, String> toCanonical = new HashMap<>();
// canonical -> aliases; the inverse of toCanonical, built at the end of the constructor
final Multimap<String, String> toAliases;
/**
 * Returns the aliases recorded for a canonical value.
 *
 * @param canonical the canonical subtag
 * @return the set from toAliases, or a singleton containing only
 *         {@code canonical} when toAliases has no entry for it
 */
public Set<String> getAliases(String canonical) {
Set<String> aliases = toAliases.get(canonical);
return aliases == null ? Collections.singleton(canonical) : aliases;
}
/**
 * Reads the "alias/" + type table of the "metadata" bundle.
 *
 * @param type the alias table name; this file passes "language" and "territory"
 */
public AliasesBuilder(String type) {
ICUResourceBundle metadata = getSupplementalDataBundle("metadata");
UResource.Value value = metadata.getValueWithFallback("alias/" + type);
UResource.Table aliases = value.getTable();
UResource.Key key = new UResource.Key();
// NOTE(review): the same `value` object appears to be refilled by
// getKeyAndValue() and findValue(), so statement order matters below.
for (int i = 0; aliases.getKeyAndValue(i, key, value); ++i) {
String aliasFrom = key.toString();
if (aliasFrom.contains("_") || aliasFrom.contains("-")) {
continue; // only simple aliasing
}
// Take the entry's table before findValue() is called with `value`.
UResource.Table table = value.getTable();
// Skip aliases whose reason is "overlong".
if (table.findValue("reason", value) && value.getString().equals("overlong")) {
continue;
}
// Skip aliases that have no replacement.
if (!table.findValue("replacement", value)) {
continue;
}
String aliasTo = value.getString();
// A replacement may hold several space-separated values; only the first is used.
int spacePos = aliasTo.indexOf(' ');
String aliasFirst = spacePos < 0 ? aliasTo : aliasTo.substring(0, spacePos);
if (aliasFirst.contains("_")) {
continue; // only simple aliasing
}
toCanonical.put(aliasFrom, aliasFirst);
}
if (type.equals("language")) {
toCanonical.put("mo", "ro"); // special case
}
toAliases = Multimaps.invertFrom(toCanonical, HashMultimap.<String, String>create());
if (DEBUG_OUTPUT) {
System.out.println("*** " + type + " aliases");
// A TreeMap copy prints the mappings sorted by alias.
for (Map.Entry<String, String> mapping : new TreeMap<>(toCanonical).entrySet()) {
System.out.println(mapping);
}
}
}
}
/**
 * Accumulates a key in a fixed 24-byte buffer and passes (key, value) pairs to
 * a BytesTrieBuilder. Callers rewind the key by assigning {@code length}.
 */
private static final class TrieBuilder {
    byte[] bytes = new byte[24];
    int length = 0;
    BytesTrieBuilder tb = new BytesTrieBuilder();

    /** Adds the current key prefix (the first {@code length} bytes) with the given value. */
    void addValue(int value) {
        assert value >= 0;
        tb.add(bytes, length, value);
    }

    /** Appends the '*' byte to the key. */
    void addStar() {
        bytes[length++] = '*';
    }

    /**
     * Appends an ASCII subtag to the key, setting the high bit on its final
     * character.
     */
    void addSubtag(String s) {
        assert !s.isEmpty();
        assert !s.equals("*");
        final int end = s.length() - 1;
        int i = 0;
        while (true) {
            final char c = s.charAt(i);
            assert c <= 0x7f;
            final boolean isLast = i >= end;
            // Mark the last character as a terminator to avoid overlap matches.
            bytes[length++] = (byte) (isLast ? (c | 0x80) : c);
            if (isLast) {
                return;
            }
            ++i;
        }
    }

    /** Builds the trie with Option.SMALL and returns it as an exactly-sized array. */
    byte[] build() {
        ByteBuffer serialized = tb.buildByteBuffer(BytesTrieBuilder.Option.SMALL);
        // Copy into an array of exactly the needed capacity so that a larger
        // backing array is not retained for a long time.
        byte[] trieBytes = new byte[serialized.remaining()];
        serialized.get(trieBytes);
        if (DEBUG_OUTPUT) {
            System.out.println("likely subtags trie size: " + trieBytes.length + " bytes");
        }
        return trieBytes;
    }
}
// VisibleForTesting
/**
 * Builds the XLikelySubtags.Data: the language and territory alias maps, a
 * byte trie keyed by language, script and region, and the array of LSRs that
 * the trie values index into.
 *
 * Trie keys are made of the language subtag ('*' for "und"), then the script
 * ('*' for ""), then the region ('*' for ""); levels that hold only the ""
 * entry are pruned as described in the comments below.
 *
 * @return the assembled data
 */
public static XLikelySubtags.Data build() {
AliasesBuilder languageAliasesBuilder = new AliasesBuilder("language");
AliasesBuilder regionAliasesBuilder = new AliasesBuilder("territory");
Map<String, Map<String, Map<String, LSR>>> langTable =
makeTable(languageAliasesBuilder, regionAliasesBuilder);
TrieBuilder trieBuilder = new TrieBuilder();
// Insertion-ordered, so the position of each key in keySet() equals its assigned index.
Map<LSR, Integer> lsrIndexes = new LinkedHashMap<>();
// Reserve index 0 as "no value":
// The runtime lookup returns 0 for an intermediate match with no value.
lsrIndexes.put(new LSR("", "", "", LSR.DONT_CARE_FLAGS), 0); // arbitrary LSR
// Reserve index 1 for SKIP_SCRIPT:
// The runtime lookup returns 1 for an intermediate match with a value.
// This LSR looks good when printing the data.
lsrIndexes.put(new LSR("skip", "script", "", LSR.DONT_CARE_FLAGS), 1);
// We could prefill the lsrList with common locales to give them small indexes,
// and see if that improves performance a little.
for (Map.Entry<String, Map<String, Map<String, LSR>>> ls : langTable.entrySet()) {
// Start a new key for this language.
trieBuilder.length = 0;
String lang = ls.getKey();
if (lang.equals("und")) {
trieBuilder.addStar();
} else {
trieBuilder.addSubtag(lang);
}
Map<String, Map<String, LSR>> scriptTable = ls.getValue();
boolean skipScript = false;
// makeTable() guarantees every script table has the "" entry, so a
// single-entry table holds only "".
if (scriptTable.size() == 1) {
Map<String, LSR> regionTable = scriptTable.get("");
if (regionTable.size() == 1) {
// Prune the script and region levels from language with
// only * for scripts and regions.
int i = uniqueIdForLsr(lsrIndexes, regionTable.get(""));
trieBuilder.addValue(i);
continue;
} else {
// Prune the script level from language with only * for scripts
// but with real regions.
// Set an intermediate value as a signal to the lookup code.
trieBuilder.addValue(XLikelySubtags.SKIP_SCRIPT);
skipScript = true;
}
}
// Remember where the language prefix ends so each script restarts from it.
int scriptStartLength = trieBuilder.length;
for (Map.Entry<String, Map<String, LSR>> sr : scriptTable.entrySet()) {
trieBuilder.length = scriptStartLength;
if (!skipScript) {
String script = sr.getKey();
if (script.isEmpty()) {
trieBuilder.addStar();
} else {
trieBuilder.addSubtag(script);
}
}
Map<String, LSR> regionTable = sr.getValue();
if (regionTable.size() == 1) {
// Prune the region level from language+script with only * for regions.
int i = uniqueIdForLsr(lsrIndexes, regionTable.get(""));
trieBuilder.addValue(i);
continue;
}
// Remember where the language+script prefix ends so each region restarts from it.
int regionStartLength = trieBuilder.length;
for (Map.Entry<String, LSR> r2lsr : regionTable.entrySet()) {
trieBuilder.length = regionStartLength;
String region = r2lsr.getKey();
// Map the whole lang+script+region to a unique, dense index of the LSR.
if (region.isEmpty()) {
trieBuilder.addStar();
} else {
trieBuilder.addSubtag(region);
}
int i = uniqueIdForLsr(lsrIndexes, r2lsr.getValue());
trieBuilder.addValue(i);
}
}
}
byte[] trie = trieBuilder.build();
// Array position == index stored in the trie (see lsrIndexes above).
LSR[] lsrs = lsrIndexes.keySet().toArray(new LSR[lsrIndexes.size()]);
return new XLikelySubtags.Data(
languageAliasesBuilder.toCanonical, regionAliasesBuilder.toCanonical, trie, lsrs);
}
/**
 * Returns the index already assigned to {@code lsr}. If it has none, assigns
 * the next free index (the current map size), records it and returns it.
 *
 * @param lsrIndexes map from LSR to its assigned index; updated when lsr is new
 * @param lsr the LSR to look up
 * @return the index of lsr
 */
private static int uniqueIdForLsr(Map<LSR, Integer> lsrIndexes, LSR lsr) {
    final Integer known = lsrIndexes.get(lsr);
    if (known == null) {
        final int next = lsrIndexes.size();
        lsrIndexes.put(lsr, next);
        return next;
    }
    return known.intValue();
}
/**
 * Builds the nested language -> script -> region -> LSR table from the
 * "likelySubtags" bundle, adds entries for language and region aliases, applies
 * two hard-coded adjustments, and validates that every level has a "" entry.
 *
 * @param languageAliasesBuilder source of language aliases
 * @param regionAliasesBuilder source of region aliases
 * @return the table, with every level held in a TreeMap
 * @throws IllegalArgumentException if a language has no "" script entry or a
 *         language+script has no "" region entry
 */
private static Map<String, Map<String, Map<String, LSR>>> makeTable(
AliasesBuilder languageAliasesBuilder, AliasesBuilder regionAliasesBuilder) {
Map<String, Map<String, Map<String, LSR>>> result = new TreeMap<>();
// set the base data
ICUResourceBundle likelySubtags = getSupplementalDataBundle("likelySubtags");
UResource.Value value = likelySubtags.getValueWithFallback("");
UResource.Table table = value.getTable();
UResource.Key key = new UResource.Key();
for (int i = 0; table.getKeyAndValue(i, key, value); ++i) {
LSR ltp = lsrFromLocaleID(key.toString()); // source
final String language = ltp.language;
final String script = ltp.script;
final String region = ltp.region;
// From here on ltp is the mapping target; the source parts were saved above.
ltp = lsrFromLocaleID(value.getString()); // target
set(result, language, script, region, ltp);
// now add aliases
Collection<String> languageAliases = languageAliasesBuilder.getAliases(language);
Collection<String> regionAliases = regionAliasesBuilder.getAliases(region);
for (String languageAlias : languageAliases) {
for (String regionAlias : regionAliases) {
// The unaliased combination was already stored above.
if (languageAlias.equals(language) && regionAlias.equals(region)) {
continue;
}
set(result, languageAlias, script, regionAlias, ltp);
}
}
}
// hack
set(result, "und", "Latn", "", new LSR("en", "Latn", "US", LSR.DONT_CARE_FLAGS));
// hack, ensure that if und-YY => und-Xxxx-YY, then we add Xxxx=>YY to the table
// <likelySubtag from="und_GH" to="ak_Latn_GH"/>
// so und-Latn-GH => ak-Latn-GH
Map<String, Map<String, LSR>> undScriptMap = result.get("und");
// NOTE(review): if "und" had no "" script entry, the loop below would fail with a
// NullPointerException before the validation further down could report it.
Map<String, LSR> undEmptyRegionMap = undScriptMap.get("");
// NOTE(review): set() writes to result["und"][lsr.script]; this presumably never
// targets the "" map being iterated (would require an empty lsr.script) - confirm.
for (Map.Entry<String, LSR> regionEntry : undEmptyRegionMap.entrySet()) {
final LSR lsr = regionEntry.getValue();
set(result, "und", lsr.script, lsr.region, lsr);
}
//
// check that every level has "" (or "und")
// NOTE(review): "und" is always present here because of the set(result, "und", "Latn", ...)
// call above, and result.get("und") was already dereferenced.
if (!result.containsKey("und")) {
throw new IllegalArgumentException("failure: base");
}
for (Map.Entry<String, Map<String, Map<String, LSR>>> langEntry : result.entrySet()) {
String lang = langEntry.getKey();
final Map<String, Map<String, LSR>> scriptMap = langEntry.getValue();
if (!scriptMap.containsKey("")) {
throw new IllegalArgumentException("failure: " + lang);
}
for (Map.Entry<String, Map<String, LSR>> scriptEntry : scriptMap.entrySet()) {
String script = scriptEntry.getKey();
final Map<String, LSR> regionMap = scriptEntry.getValue();
if (!regionMap.containsKey("")) {
throw new IllegalArgumentException("failure: " + lang + "-" + script);
}
}
}
return result;
}
// Parses locale IDs in the likelySubtags data, not arbitrary language tags.
/**
 * Splits a locale ID on '-' or '_' into language, script and region.
 *
 * A second part shorter than four characters is taken as the region and the
 * script is left empty; otherwise the second part is the script and the third
 * part, if any, is the region. When the second part is a region, a third part
 * is ignored.
 *
 * @param languageIdentifier a locale ID with one to three subtags
 * @return the LSR for the identifier, created with LSR.DONT_CARE_FLAGS
 * @throws ICUException if the identifier has no subtags or more than three
 */
private static LSR lsrFromLocaleID(String languageIdentifier) {
    String[] parts = languageIdentifier.split("[-_]");
    // String.split() drops trailing empty strings, so an identifier made up
    // only of separators yields zero parts. Report that separately rather
    // than as "too many subtags".
    if (parts.length < 1) {
        throw new ICUException("no subtags in \"" + languageIdentifier + "\"");
    }
    if (parts.length > 3) {
        throw new ICUException("too many subtags in \"" + languageIdentifier + "\"");
    }
    String lang = parts[0];
    String p2 = parts.length < 2 ? "" : parts[1];
    String p3 = parts.length < 3 ? "" : parts[2];
    return p2.length() < 4 ?
            new LSR(lang, "", p2, LSR.DONT_CARE_FLAGS) :
            new LSR(lang, p2, p3, LSR.DONT_CARE_FLAGS);
}
/**
 * Stores {@code newValue} at langTable[language][script][region], creating the
 * intermediate script and region tables on demand.
 */
private static void set(Map<String, Map<String, Map<String, LSR>>> langTable,
        final String language, final String script, final String region, LSR newValue) {
    getSubtable(getSubtable(langTable, language), script).put(region, newValue);
}
/**
 * Returns the sub-table stored under {@code subtag}. If the key is absent or
 * maps to null, creates a new TreeMap, stores it under the key and returns it.
 *
 * @param table the parent table; updated when a sub-table is created
 * @param subtag the key of the wanted sub-table
 * @return the existing or newly created sub-table
 */
private static <K, V, T> Map<V, T> getSubtable(Map<K, Map<V, T>> table, final K subtag) {
    final Map<V, T> existing = table.get(subtag);
    if (existing != null) {
        return existing;
    }
    final Map<V, T> created = new TreeMap<>();
    table.put(subtag, created);
    return created;
}
}

View file

@ -489,7 +489,7 @@ public final class LocaleDistanceBuilder {
Set<LSR> paradigmLSRs = new LinkedHashSet<>(); // could be TreeSet if LSR were Comparable
for (String paradigm : paradigms) {
ULocale pl = new ULocale(paradigm);
LSR max = XLikelySubtags.INSTANCE.makeMaximizedLsrFrom(pl);
LSR max = XLikelySubtags.INSTANCE.makeMaximizedLsrFrom(pl, false);
// Clear the LSR flags to make the data equality test in
// LocaleDistanceTest happy.
paradigmLSRs.add(new LSR(max.language, max.script, max.region, LSR.DONT_CARE_FLAGS));
@ -887,7 +887,7 @@ public final class LocaleDistanceBuilder {
}
public static final void main(String[] args) throws IOException {
XLikelySubtags.Data likelyData = LikelySubtagsBuilder.build();
XLikelySubtags.Data likelyData = XLikelySubtags.Data.load();
LocaleDistance.Data distanceData = build();
System.out.println("Writing LocaleDistance.Data to " + TXT_PATH + '/' + TXT_FILE_NAME);
try (PrintWriter out = openWriter()) {