ICU-8260 support all collation-related keywords in Collator::createInstance()

X-SVN-Rev: 35762
This commit is contained in:
Markus Scherer 2014-05-27 18:50:10 +00:00
parent c283beec97
commit da533923a2
10 changed files with 305 additions and 51 deletions

View file

@ -59,6 +59,8 @@
#include "unicode/uscript.h"
#include "uresimp.h"
#include "ucln_in.h"
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
static icu::Locale* availableLocaleList = NULL;
static int32_t availableLocaleListCount;
static icu::ICULocaleService* gService = NULL;
@ -256,6 +258,166 @@ static UBool isAvailableLocaleListInitialized(UErrorCode &status) {
// Collator public methods -----------------------------------------------
namespace {
static const struct {
const char *name;
UColAttribute attr;
} collAttributes[] = {
{ "colStrength", UCOL_STRENGTH },
{ "colBackwards", UCOL_FRENCH_COLLATION },
{ "colCaseLevel", UCOL_CASE_LEVEL },
{ "colCaseFirst", UCOL_CASE_FIRST },
{ "colAlternate", UCOL_ALTERNATE_HANDLING },
{ "colNormalization", UCOL_NORMALIZATION_MODE },
{ "colNumeric", UCOL_NUMERIC_COLLATION }
};
static const struct {
const char *name;
UColAttributeValue value;
} collAttributeValues[] = {
{ "primary", UCOL_PRIMARY },
{ "secondary", UCOL_SECONDARY },
{ "tertiary", UCOL_TERTIARY },
{ "quaternary", UCOL_QUATERNARY },
// Note: Not supporting typo "quarternary" because it was never supported in locale IDs.
{ "identical", UCOL_IDENTICAL },
{ "no", UCOL_OFF },
{ "yes", UCOL_ON },
{ "shifted", UCOL_SHIFTED },
{ "non-ignorable", UCOL_NON_IGNORABLE },
{ "lower", UCOL_LOWER_FIRST },
{ "upper", UCOL_UPPER_FIRST }
};
// Names of the special reorder groups.
// getReorderCode() maps the name at index i to UCOL_REORDER_CODE_FIRST + i,
// so the order of the names here is significant.
static const char *collReorderCodes[UCOL_REORDER_CODE_LIMIT - UCOL_REORDER_CODE_FIRST] = {
    "space",
    "punct",
    "symbol",
    "currency",
    "digit"
};
/**
 * Looks up the name of a special reorder group in collReorderCodes,
 * comparing with uprv_stricmp().
 *
 * @param s the NUL-terminated name to look up
 * @return UCOL_REORDER_CODE_FIRST plus the index of the matching table entry,
 *         or -1 if no entry matches
 */
int32_t getReorderCode(const char *s) {
    const int32_t count = LENGTHOF(collReorderCodes);
    int32_t index = 0;
    while (index < count && uprv_stricmp(s, collReorderCodes[index]) != 0) {
        ++index;
    }
    if (index == count) {
        return -1;
    }
    return UCOL_REORDER_CODE_FIRST + index;
}
/**
 * Sets collation attributes according to locale keywords. See
 * http://www.unicode.org/reports/tr35/tr35-collation.html#Collation_Settings
 *
 * Using "alias" keywords and values where defined:
 * http://www.unicode.org/reports/tr35/tr35.html#Old_Locale_Extension_Syntax
 * http://unicode.org/repos/cldr/trunk/common/bcp47/collation.xml
 *
 * Processing order:
 * 1. "colHiraganaQuaternary" and "variableTop" are rejected with U_UNSUPPORTED_ERROR
 *    if they are present with a non-empty value.
 * 2. Each keyword listed in collAttributes is looked up; its value must match
 *    an entry of collAttributeValues and is passed to coll.setAttribute().
 * 3. "colReorder" is parsed as a '-'-separated list of 4-letter script codes and
 *    special reorder group names, and passed to coll.setReorderCodes().
 * 4. "kv" is parsed as a special reorder group name and passed to coll.setMaxVariable().
 *
 * Keywords that are not handled here are ignored.
 *
 * @param loc       the locale whose keywords are applied
 * @param coll      the collator to modify
 * @param errorCode in/out error code. Nothing is done if it already indicates a failure.
 *                  Set to U_UNSUPPORTED_ERROR for the keywords in step 1.
 *                  Every other failure (unreadable or unknown keyword value, too many
 *                  reorder codes, failure reported by a collator setter) is reported
 *                  as U_ILLEGAL_ARGUMENT_ERROR.
 */
void setAttributesFromKeywords(const Locale &loc, Collator &coll, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return;
    }
    if (uprv_strcmp(loc.getName(), loc.getBaseName()) == 0) {
        // No keywords.
        return;
    }
    char value[1024];  // The reordering value could be long.
    // Check for collation keywords that were already deprecated
    // before any were supported in createInstance() (except for "collation").
    int32_t length = loc.getKeywordValue("colHiraganaQuaternary", value, LENGTHOF(value), errorCode);
    if (U_FAILURE(errorCode)) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    if (length != 0) {
        errorCode = U_UNSUPPORTED_ERROR;
        return;
    }
    length = loc.getKeywordValue("variableTop", value, LENGTHOF(value), errorCode);
    if (U_FAILURE(errorCode)) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    if (length != 0) {
        errorCode = U_UNSUPPORTED_ERROR;
        return;
    }
    // Parse known collation keywords, ignore others.
    // Clear a leftover warning so that the strict checks below
    // only react to the keyword that they are looking at.
    if (errorCode == U_STRING_NOT_TERMINATED_WARNING) {
        errorCode = U_ZERO_ERROR;
    }
    for (int32_t i = 0; i < LENGTHOF(collAttributes); ++i) {
        // This check also catches a failure from setAttribute() in the previous iteration.
        length = loc.getKeywordValue(collAttributes[i].name, value, LENGTHOF(value), errorCode);
        if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING) {
            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
            return;
        }
        if (length == 0) { continue; }
        for (int32_t j = 0;; ++j) {
            if (j == LENGTHOF(collAttributeValues)) {
                // The value is not in the table of known values.
                errorCode = U_ILLEGAL_ARGUMENT_ERROR;
                return;
            }
            if (uprv_stricmp(value, collAttributeValues[j].name) == 0) {
                coll.setAttribute(collAttributes[i].attr, collAttributeValues[j].value, errorCode);
                break;
            }
        }
    }
    length = loc.getKeywordValue("colReorder", value, LENGTHOF(value), errorCode);
    if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    if (length != 0) {
        // Room for every script code plus every special reorder group.
        // Without the USCRIPT_CODE_LIMIT term the buffer would hold only as many codes
        // as there are special groups, and longer reorderings would be rejected below.
        int32_t codes[USCRIPT_CODE_LIMIT + UCOL_REORDER_CODE_LIMIT - UCOL_REORDER_CODE_FIRST];
        int32_t codesLength = 0;
        char *scriptName = value;
        for (;;) {
            if (codesLength == LENGTHOF(codes)) {
                errorCode = U_ILLEGAL_ARGUMENT_ERROR;
                return;
            }
            // Find the end of the current '-'-separated item and NUL-terminate it in place.
            char *limit = scriptName;
            char c;
            while ((c = *limit) != 0 && c != '-') { ++limit; }
            *limit = 0;
            int32_t code;
            if ((limit - scriptName) == 4) {
                // Strict parsing, accept only 4-letter script codes, not long names.
                code = u_getPropertyValueEnum(UCHAR_SCRIPT, scriptName);
            } else {
                code = getReorderCode(scriptName);
            }
            if (code < 0) {
                errorCode = U_ILLEGAL_ARGUMENT_ERROR;
                return;
            }
            codes[codesLength++] = code;
            // c is the delimiter that was overwritten: NUL means this was the last item.
            if (c == 0) { break; }
            scriptName = limit + 1;
        }
        coll.setReorderCodes(codes, codesLength, errorCode);
    }
    length = loc.getKeywordValue("kv", value, LENGTHOF(value), errorCode);
    if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    if (length != 0) {
        int32_t code = getReorderCode(value);
        if (code < 0) {
            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
            return;
        }
        coll.setMaxVariable((UColReorderCode)code, errorCode);
    }
    // Report any failure from the collator setters as a bad argument.
    if (U_FAILURE(errorCode)) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
    }
}
} // namespace
Collator* U_EXPORT2 Collator::createInstance(UErrorCode& success)
{
return createInstance(Locale::getDefault(), success);
@ -266,14 +428,28 @@ Collator* U_EXPORT2 Collator::createInstance(const Locale& desiredLocale,
{
if (U_FAILURE(status))
return 0;
if (desiredLocale.isBogus()) {
// Locale constructed from malformed locale ID or language tag.
status = U_ILLEGAL_ARGUMENT_ERROR;
return NULL;
}
Collator* coll;
#if !UCONFIG_NO_SERVICE
if (hasService()) {
Locale actualLoc;
return (Collator*)gService->get(desiredLocale, &actualLoc, status);
}
coll = (Collator*)gService->get(desiredLocale, &actualLoc, status);
} else
#endif
return makeInstance(desiredLocale, status);
{
coll = makeInstance(desiredLocale, status);
}
setAttributesFromKeywords(desiredLocale, *coll, status);
if (U_FAILURE(status)) {
delete coll;
return NULL;
}
return coll;
}

View file

@ -175,35 +175,16 @@ RuleBasedCollator::internalBuildTailoring(const UnicodeString &rules,
}
return;
}
const CollationSettings &ts = *t->settings;
uint16_t fastLatinPrimaries[CollationFastLatin::LATIN_LIMIT];
int32_t fastLatinOptions = CollationFastLatin::getOptions(
t->data, ts, fastLatinPrimaries, LENGTHOF(fastLatinPrimaries));
if((strength != UCOL_DEFAULT && strength != ts.getStrength()) ||
(decompositionMode != UCOL_DEFAULT &&
decompositionMode != ts.getFlag(CollationSettings::CHECK_FCD)) ||
fastLatinOptions != ts.fastLatinOptions ||
(fastLatinOptions >= 0 &&
uprv_memcmp(fastLatinPrimaries, ts.fastLatinPrimaries,
sizeof(fastLatinPrimaries)) != 0)) {
CollationSettings *ownedSettings = SharedObject::copyOnWrite(t->settings);
if(ownedSettings == NULL) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return;
}
if(strength != UCOL_DEFAULT) {
ownedSettings->setStrength(strength, 0, errorCode);
}
if(decompositionMode != UCOL_DEFAULT) {
ownedSettings->setFlag(CollationSettings::CHECK_FCD, decompositionMode, 0, errorCode);
}
ownedSettings->fastLatinOptions = CollationFastLatin::getOptions(
t->data, *ownedSettings,
ownedSettings->fastLatinPrimaries, LENGTHOF(ownedSettings->fastLatinPrimaries));
}
if(U_FAILURE(errorCode)) { return; }
t->actualLocale.setToBogus();
adoptTailoring(t.orphan());
// Set attributes after building the collator,
// to keep the default settings consistent with the rule string.
if(strength != UCOL_DEFAULT) {
setAttribute(UCOL_STRENGTH, (UColAttributeValue)strength, errorCode);
}
if(decompositionMode != UCOL_DEFAULT) {
setAttribute(UCOL_NORMALIZATION_MODE, decompositionMode, errorCode);
}
}
// CollationBuilder implementation ----------------------------------------- ***
@ -266,8 +247,8 @@ CollationBuilder::parseAndBuild(const UnicodeString &ruleString,
variableTop = base->settings->variableTop;
parser.setSink(this);
parser.setImporter(importer);
parser.parse(ruleString, *SharedObject::copyOnWrite(tailoring->settings),
outParseError, errorCode);
CollationSettings &ownedSettings = *SharedObject::copyOnWrite(tailoring->settings);
parser.parse(ruleString, ownedSettings, outParseError, errorCode);
errorReason = parser.getErrorReason();
if(U_FAILURE(errorCode)) { return NULL; }
if(dataBuilder->hasMappings()) {
@ -291,6 +272,9 @@ CollationBuilder::parseAndBuild(const UnicodeString &ruleString,
tailoring->data = baseData;
}
if(U_FAILURE(errorCode)) { return NULL; }
ownedSettings.fastLatinOptions = CollationFastLatin::getOptions(
tailoring->data, ownedSettings,
ownedSettings.fastLatinPrimaries, LENGTHOF(ownedSettings.fastLatinPrimaries));
tailoring->rules = ruleString;
tailoring->rules.getTerminatedBuffer(); // ensure NUL-termination
tailoring->setVersion(base->version, rulesVersion);

View file

@ -176,6 +176,8 @@ CollationLoader::loadTailoring(const Locale &locale, Locale &validLocale, UError
}
if(typeLength == 0 || uprv_strcmp(type, "default") == 0) {
uprv_strcpy(type, defaultType);
} else {
T_CString_toLowerCase(type);
}
// Load the collations/type tailoring, with type fallback.

View file

@ -292,10 +292,19 @@ public:
static Collator* U_EXPORT2 createInstance(UErrorCode& err);
/**
* Gets the table-based collation object for the desired locale. The
* Gets the collation object for the desired locale. The
* resource of the desired locale will be loaded.
*
* Locale::getRoot() is the base collation table and all other languages are
* built on top of it with additional language-specific modifications.
*
* For some languages, multiple collation types are available;
* for example, "de@collation=phonebook".
* Starting with ICU 54, collation attributes can be specified via locale keywords as well,
* in the old locale extension syntax ("el@colCaseFirst=upper")
* or in language tag syntax ("el-u-kf-upper").
* See <a href="http://userguide.icu-project.org/collation/api">User Guide: Collation API</a>.
*
* The UErrorCode& err parameter is used to return status information to the user.
* To check whether the construction succeeded or not, you should check
* the value of U_SUCCESS(err). If you wish more detailed information, you
@ -305,6 +314,7 @@ public:
* used. U_USING_DEFAULT_ERROR indicates that the default locale data was
* used; neither the requested locale nor any of its fall back locales
* could be found.
*
* The caller owns the returned object and is responsible for deleting it.
* @param loc The locale ID for which to open a collator.
* @param err the error code status.

View file

@ -115,7 +115,6 @@ public:
* description for more details on the collation rule syntax.
* @param rules the collation rules to build the collation table from.
* @param status reporting a success or an error.
* @see Locale
* @stable ICU 2.0
*/
RuleBasedCollator(const UnicodeString& rules, UErrorCode& status);
@ -125,9 +124,8 @@ public:
* collation table out of them. Please see RuleBasedCollator class
* description for more details on the collation rule syntax.
* @param rules the collation rules to build the collation table from.
* @param collationStrength default strength for comparison
* @param collationStrength strength for comparison
* @param status reporting a success or an error.
* @see Locale
* @stable ICU 2.0
*/
RuleBasedCollator(const UnicodeString& rules,
@ -141,7 +139,6 @@ public:
* @param rules the collation rules to build the collation table from.
* @param decompositionMode the normalisation mode
* @param status reporting a success or an error.
* @see Locale
* @stable ICU 2.0
*/
RuleBasedCollator(const UnicodeString& rules,
@ -153,10 +150,9 @@ public:
* collation table out of them. Please see RuleBasedCollator class
* description for more details on the collation rule syntax.
* @param rules the collation rules to build the collation table from.
* @param collationStrength default strength for comparison
* @param collationStrength strength for comparison
* @param decompositionMode the normalisation mode
* @param status reporting a success or an error.
* @see Locale
* @stable ICU 2.0
*/
RuleBasedCollator(const UnicodeString& rules,
@ -177,7 +173,6 @@ public:
/**
* Copy constructor.
* @param other the RuleBasedCollator object to be copied
* @see Locale
* @stable ICU 2.0
*/
RuleBasedCollator(const RuleBasedCollator& other);

View file

@ -362,6 +362,14 @@ typedef enum {
/**
* Open a UCollator for comparing strings.
*
* For some languages, multiple collation types are available;
* for example, "de@collation=phonebook".
* Starting with ICU 54, collation attributes can be specified via locale keywords as well,
* in the old locale extension syntax ("el@colCaseFirst=upper")
* or in language tag syntax ("el-u-kf-upper").
* See <a href="http://userguide.icu-project.org/collation/api">User Guide: Collation API</a>.
*
* The UCollator pointer is used in all the calls to the Collation
* service. When finished, the collator must be disposed of by calling
* {@link #ucol_close }.

View file

@ -2411,6 +2411,45 @@ void CollationAPITest::TestIterNumeric() {
assertEquals("40<72", (int32_t)UCOL_LESS, (int32_t)result);
}
void CollationAPITest::TestBadKeywords() {
// Test locale IDs with errors.
// Valid locale IDs are tested via data-driven tests.
UErrorCode errorCode = U_ZERO_ERROR;
Locale bogusLocale(Locale::getRoot());
bogusLocale.setToBogus();
LocalPointer<Collator> coll(Collator::createInstance(bogusLocale, errorCode));
if(errorCode != U_ILLEGAL_ARGUMENT_ERROR) {
errln("Collator::createInstance(bogus locale) did not fail as expected - %s",
u_errorName(errorCode));
}
// Unknown value.
const char *localeID = "it-u-ks-xyz";
errorCode = U_ZERO_ERROR;
coll.adoptInstead(Collator::createInstance(localeID, errorCode));
if(errorCode != U_ILLEGAL_ARGUMENT_ERROR) {
errln("Collator::createInstance(%s) did not fail as expected - %s",
localeID, u_errorName(errorCode));
}
// Unsupported attributes.
localeID = "it@colHiraganaQuaternary=true";
errorCode = U_ZERO_ERROR;
coll.adoptInstead(Collator::createInstance(localeID, errorCode));
if(errorCode != U_UNSUPPORTED_ERROR) {
errln("Collator::createInstance(%s) did not fail as expected - %s",
localeID, u_errorName(errorCode));
}
localeID = "it-u-vt-u24";
errorCode = U_ZERO_ERROR;
coll.adoptInstead(Collator::createInstance(localeID, errorCode));
if(errorCode != U_UNSUPPORTED_ERROR) {
errln("Collator::createInstance(%s) did not fail as expected - %s",
localeID, u_errorName(errorCode));
}
}
void CollationAPITest::dump(UnicodeString msg, RuleBasedCollator* c, UErrorCode& status) {
const char* bigone = "One";
const char* littleone = "one";
@ -2451,6 +2490,7 @@ void CollationAPITest::runIndexedTest( int32_t index, UBool exec, const char* &n
TESTCASE_AUTO(TestClone);
TESTCASE_AUTO(TestCloneBinary);
TESTCASE_AUTO(TestIterNumeric);
TESTCASE_AUTO(TestBadKeywords);
TESTCASE_AUTO_END;
}

View file

@ -169,6 +169,7 @@ public:
void TestClone();
void TestCloneBinary();
void TestIterNumeric();
void TestBadKeywords();
private:
// If this is too small for the test data, just increase it.

View file

@ -1234,16 +1234,17 @@ void CollationTest::setRootCollator(IcuTestErrorCode &errorCode) {
void CollationTest::setLocaleCollator(IcuTestErrorCode &errorCode) {
if(errorCode.isFailure()) { return; }
CharString langTag;
langTag.appendInvariantChars(fileLine.tempSubString(9), errorCode);
char localeID[ULOC_FULLNAME_CAPACITY];
int32_t parsedLength;
(void)uloc_forLanguageTag(
langTag.data(), localeID, LENGTHOF(localeID), &parsedLength, errorCode);
Locale locale(localeID);
if(fileLine.length() == 9 ||
errorCode.isFailure() || errorCode.get() == U_STRING_NOT_TERMINATED_WARNING ||
parsedLength != langTag.length() || locale.isBogus()) {
int32_t at = fileLine.indexOf((UChar)0x40, 9); // @ is not invariant
if(at >= 0) {
fileLine.setCharAt(at, (UChar)0x2a); // *
}
CharString localeID;
localeID.appendInvariantChars(fileLine.tempSubString(9), errorCode);
if(at >= 0) {
localeID.data()[at - 9] = '@';
}
Locale locale(localeID.data());
if(fileLine.length() == 9 || errorCode.isFailure() || locale.isBogus()) {
errln("invalid language tag on line %d", (int)fileLineNumber);
infoln(fileLine);
if(errorCode.isSuccess()) { errorCode.set(U_PARSE_ERROR); }

View file

@ -12,6 +12,7 @@
# A collator can be set with "@ root" or "@ locale language-tag",
# for example "@ locale de-u-co-phonebk".
# An old-style locale ID can also be used, for example "@ locale de@collation=phonebook".
# A collator can be built with "@ rules".
# An "@ rules" line is followed by one or more lines with the tailoring rules.
@ -2366,3 +2367,39 @@
<2 \u0027
<2 c
<1 r
# ICU ticket #8260 "Support all collation-related keywords in Collator.getInstance()"
** test: locale -u- with collation keywords, ICU ticket 8260
@ locale de-u-kv-sPace-ka-shifTed-kn-kk-falsE-kf-Upper-kc-tRue-ks-leVel4
* compare
<4 \u0020 # space is shifted, strength=quaternary
<1 ! # punctuation is regular
<1 2
<1 12 # numeric sorting
<1 B
<c b # uppercase first on case level
<1 x\u0301\u0308
<2 x\u0308\u0301 # normalization off
** test: locale @ with collation keywords, ICU ticket 8260
@ locale fr@colbAckwards=yes;ColStrength=Quaternary;kv=currencY;colalternate=shifted
* compare
<4 $ # currency symbols are shifted, strength=quaternary
<1 àla
<2 alà # backwards secondary level
** test: locale -u- with script reordering, ICU ticket 8260
@ locale el-u-kr-kana-SYMBOL-Grek
* compare
<1 \u0020
<1 あ
<1 ☂
<1 Ω
<1 L
** test: locale @collation=type should be case-insensitive
@ locale de@coLLation=PhoneBook
* compare
<1 ae
<2 ä
<3 Ä