diff --git a/icu4c/source/i18n/coll.cpp b/icu4c/source/i18n/coll.cpp
index 54a1301a42c..49a4860e08d 100644
--- a/icu4c/source/i18n/coll.cpp
+++ b/icu4c/source/i18n/coll.cpp
@@ -59,6 +59,8 @@
 #include "uresimp.h"
 #include "ucln_in.h"
 
+#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
+
 static icu::Locale* availableLocaleList = NULL;
 static int32_t availableLocaleListCount;
 static icu::ICULocaleService* gService = NULL;
@@ -256,6 +258,182 @@ static UBool isAvailableLocaleListInitialized(UErrorCode &status) {
 
 // Collator public methods -----------------------------------------------
 
+namespace {
+
+/** Maps old-style collation locale keywords to the collator attributes they set. */
+static const struct {
+    const char *name;
+    UColAttribute attr;
+} collAttributes[] = {
+    { "colStrength", UCOL_STRENGTH },
+    { "colBackwards", UCOL_FRENCH_COLLATION },
+    { "colCaseLevel", UCOL_CASE_LEVEL },
+    { "colCaseFirst", UCOL_CASE_FIRST },
+    { "colAlternate", UCOL_ALTERNATE_HANDLING },
+    { "colNormalization", UCOL_NORMALIZATION_MODE },
+    { "colNumeric", UCOL_NUMERIC_COLLATION }
+};
+
+/** Maps keyword value names (matched with uprv_stricmp()) to attribute values. */
+static const struct {
+    const char *name;
+    UColAttributeValue value;
+} collAttributeValues[] = {
+    { "primary", UCOL_PRIMARY },
+    { "secondary", UCOL_SECONDARY },
+    { "tertiary", UCOL_TERTIARY },
+    { "quaternary", UCOL_QUATERNARY },
+    // Note: Not supporting typo "quarternary" because it was never supported in locale IDs.
+    { "identical", UCOL_IDENTICAL },
+    { "no", UCOL_OFF },
+    { "yes", UCOL_ON },
+    { "shifted", UCOL_SHIFTED },
+    { "non-ignorable", UCOL_NON_IGNORABLE },
+    { "lower", UCOL_LOWER_FIRST },
+    { "upper", UCOL_UPPER_FIRST }
+};
+
+/** Names of the special reorder groups, indexed by (code - UCOL_REORDER_CODE_FIRST). */
+static const char *collReorderCodes[UCOL_REORDER_CODE_LIMIT - UCOL_REORDER_CODE_FIRST] = {
+    "space", "punct", "symbol", "currency", "digit"
+};
+
+/**
+ * Looks up a special reorder group by name.
+ * @param s NUL-terminated group name, compared with uprv_stricmp()
+ * @return UCOL_REORDER_CODE_FIRST plus the index of s in collReorderCodes, or -1 if not found
+ */
+int32_t getReorderCode(const char *s) {
+    for (int32_t i = 0; i < LENGTHOF(collReorderCodes); ++i) {
+        if (uprv_stricmp(s, collReorderCodes[i]) == 0) {
+            return UCOL_REORDER_CODE_FIRST + i;
+        }
+    }
+    return -1;
+}
+
+/**
+ * Sets collation attributes according to locale keywords. See
+ * http://www.unicode.org/reports/tr35/tr35-collation.html#Collation_Settings
+ *
+ * Using "alias" keywords and values where defined:
+ * http://www.unicode.org/reports/tr35/tr35.html#Old_Locale_Extension_Syntax
+ * http://unicode.org/repos/cldr/trunk/common/bcp47/collation.xml
+ *
+ * Returns immediately if errorCode already indicates a failure or the locale has no keywords.
+ * @param loc the locale whose keywords are read
+ * @param coll the collator to configure
+ * @param errorCode set to U_UNSUPPORTED_ERROR for the "colHiraganaQuaternary" and
+ *        "variableTop" keywords, and to U_ILLEGAL_ARGUMENT_ERROR for any other failure
+ */
+void setAttributesFromKeywords(const Locale &loc, Collator &coll, UErrorCode &errorCode) {
+    if (U_FAILURE(errorCode)) {
+        return;
+    }
+    if (uprv_strcmp(loc.getName(), loc.getBaseName()) == 0) {
+        // No keywords.
+        return;
+    }
+    char value[1024];  // The reordering value could be long.
+    // Check for collation keywords that were already deprecated
+    // before any were supported in createInstance() (except for "collation").
+    int32_t length = loc.getKeywordValue("colHiraganaQuaternary", value, LENGTHOF(value), errorCode);
+    if (U_FAILURE(errorCode)) {
+        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+    if (length != 0) {
+        errorCode = U_UNSUPPORTED_ERROR;
+        return;
+    }
+    length = loc.getKeywordValue("variableTop", value, LENGTHOF(value), errorCode);
+    if (U_FAILURE(errorCode)) {
+        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+    if (length != 0) {
+        errorCode = U_UNSUPPORTED_ERROR;
+        return;
+    }
+    // Parse known collation keywords, ignore others.
+    if (errorCode == U_STRING_NOT_TERMINATED_WARNING) {
+        errorCode = U_ZERO_ERROR;
+    }
+    for (int32_t i = 0; i < LENGTHOF(collAttributes); ++i) {
+        length = loc.getKeywordValue(collAttributes[i].name, value, LENGTHOF(value), errorCode);
+        if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING) {
+            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+            return;
+        }
+        if (length == 0) { continue; }
+        for (int32_t j = 0;; ++j) {
+            if (j == LENGTHOF(collAttributeValues)) {
+                errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+                return;
+            }
+            if (uprv_stricmp(value, collAttributeValues[j].name) == 0) {
+                coll.setAttribute(collAttributes[i].attr, collAttributeValues[j].value, errorCode);
+                break;
+            }
+        }
+    }
+    length = loc.getKeywordValue("colReorder", value, LENGTHOF(value), errorCode);
+    if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING) {
+        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+    if (length != 0) {
+        // Room for every script code plus every special reorder code;
+        // a longer list is rejected below with U_ILLEGAL_ARGUMENT_ERROR.
+        int32_t codes[USCRIPT_CODE_LIMIT + UCOL_REORDER_CODE_LIMIT - UCOL_REORDER_CODE_FIRST];
+        int32_t codesLength = 0;
+        char *scriptName = value;
+        for (;;) {
+            if (codesLength == LENGTHOF(codes)) {
+                errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+                return;
+            }
+            char *limit = scriptName;
+            char c;
+            while ((c = *limit) != 0 && c != '-') { ++limit; }
+            *limit = 0;
+            int32_t code;
+            if ((limit - scriptName) == 4) {
+                // Strict parsing, accept only 4-letter script codes, not long names.
+                code = u_getPropertyValueEnum(UCHAR_SCRIPT, scriptName);
+            } else {
+                code = getReorderCode(scriptName);
+            }
+            if (code < 0) {
+                errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+                return;
+            }
+            codes[codesLength++] = code;
+            if (c == 0) { break; }
+            scriptName = limit + 1;
+        }
+        coll.setReorderCodes(codes, codesLength, errorCode);
+    }
+    length = loc.getKeywordValue("kv", value, LENGTHOF(value), errorCode);
+    if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING) {
+        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+    if (length != 0) {
+        int32_t code = getReorderCode(value);
+        if (code < 0) {
+            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+            return;
+        }
+        coll.setMaxVariable((UColReorderCode)code, errorCode);
+    }
+    if (U_FAILURE(errorCode)) {
+        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+    }
+}
+
+}  // namespace
+
 Collator* U_EXPORT2 Collator::createInstance(UErrorCode& success)
 {
     return createInstance(Locale::getDefault(), success);
@@ -266,14 +444,35 @@ Collator* U_EXPORT2 Collator::createInstance(const Locale& desiredLocale,
 {
     if (U_FAILURE(status))
         return 0;
-    
+    if (desiredLocale.isBogus()) {
+        // Locale constructed from malformed locale ID or language tag.
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+        return NULL;
+    }
+
+    Collator* coll;
 #if !UCONFIG_NO_SERVICE
     if (hasService()) {
         Locale actualLoc;
-        return (Collator*)gService->get(desiredLocale, &actualLoc, status);
-    }
+        coll = (Collator*)gService->get(desiredLocale, &actualLoc, status);
+    } else
 #endif
-    return makeInstance(desiredLocale, status);
+    {
+        coll = makeInstance(desiredLocale, status);
+    }
+    // If creation failed, coll may be NULL. Stop here rather than binding
+    // *coll to a reference, which is undefined for a null pointer even
+    // though setAttributesFromKeywords() returns early on a failure status.
+    if (U_FAILURE(status) || coll == NULL) {
+        delete coll;
+        return NULL;
+    }
+    setAttributesFromKeywords(desiredLocale, *coll, status);
+    if (U_FAILURE(status)) {
+        delete coll;
+        return NULL;
+    }
+    return coll;
 }
 
 
diff --git a/icu4c/source/i18n/collationbuilder.cpp b/icu4c/source/i18n/collationbuilder.cpp
index 337bcddd084..acf57387951 100644
--- a/icu4c/source/i18n/collationbuilder.cpp
+++ b/icu4c/source/i18n/collationbuilder.cpp
@@ -175,35 +175,16 @@ RuleBasedCollator::internalBuildTailoring(const UnicodeString &rules,
         }
         return;
     }
-    const CollationSettings &ts = *t->settings;
-    uint16_t fastLatinPrimaries[CollationFastLatin::LATIN_LIMIT];
-    int32_t fastLatinOptions = CollationFastLatin::getOptions(
-            t->data, ts, fastLatinPrimaries, LENGTHOF(fastLatinPrimaries));
-    if((strength != UCOL_DEFAULT && strength != ts.getStrength()) ||
-            (decompositionMode != UCOL_DEFAULT &&
-                decompositionMode != ts.getFlag(CollationSettings::CHECK_FCD)) ||
-            fastLatinOptions != ts.fastLatinOptions ||
-            (fastLatinOptions >= 0 &&
-                uprv_memcmp(fastLatinPrimaries, ts.fastLatinPrimaries,
-                            sizeof(fastLatinPrimaries)) != 0)) {
-        CollationSettings *ownedSettings = SharedObject::copyOnWrite(t->settings);
-        if(ownedSettings == NULL) {
-            errorCode = U_MEMORY_ALLOCATION_ERROR;
-            return;
-        }
-        if(strength != UCOL_DEFAULT) {
-            ownedSettings->setStrength(strength, 0, errorCode);
-        }
-        if(decompositionMode != UCOL_DEFAULT) {
-            ownedSettings->setFlag(CollationSettings::CHECK_FCD, decompositionMode, 0, errorCode);
-        }
-        ownedSettings->fastLatinOptions = CollationFastLatin::getOptions(
-            t->data, *ownedSettings,
-            ownedSettings->fastLatinPrimaries, LENGTHOF(ownedSettings->fastLatinPrimaries));
-    }
-    if(U_FAILURE(errorCode)) { return; }
     t->actualLocale.setToBogus();
     adoptTailoring(t.orphan());
+    // Set attributes after building the collator,
+    // to keep the default settings consistent with the rule string.
+    if(strength != UCOL_DEFAULT) {
+        setAttribute(UCOL_STRENGTH, (UColAttributeValue)strength, errorCode);
+    }
+    if(decompositionMode != UCOL_DEFAULT) {
+        setAttribute(UCOL_NORMALIZATION_MODE, decompositionMode, errorCode);
+    }
 }
 
 // CollationBuilder implementation ----------------------------------------- ***
@@ -266,8 +247,8 @@ CollationBuilder::parseAndBuild(const UnicodeString &ruleString,
     variableTop = base->settings->variableTop;
     parser.setSink(this);
     parser.setImporter(importer);
-    parser.parse(ruleString, *SharedObject::copyOnWrite(tailoring->settings),
-                 outParseError, errorCode);
+    CollationSettings &ownedSettings = *SharedObject::copyOnWrite(tailoring->settings);
+    parser.parse(ruleString, ownedSettings, outParseError, errorCode);
     errorReason = parser.getErrorReason();
     if(U_FAILURE(errorCode)) { return NULL; }
     if(dataBuilder->hasMappings()) {
@@ -291,6 +272,9 @@ CollationBuilder::parseAndBuild(const UnicodeString &ruleString,
         tailoring->data = baseData;
     }
     if(U_FAILURE(errorCode)) { return NULL; }
+    ownedSettings.fastLatinOptions = CollationFastLatin::getOptions(
+        tailoring->data, ownedSettings,
+        ownedSettings.fastLatinPrimaries, LENGTHOF(ownedSettings.fastLatinPrimaries));
     tailoring->rules = ruleString;
     tailoring->rules.getTerminatedBuffer();  // ensure NUL-termination
     tailoring->setVersion(base->version, rulesVersion);
diff --git a/icu4c/source/i18n/ucol_res.cpp b/icu4c/source/i18n/ucol_res.cpp
index 15f0d6d901a..13295fb68a7 100644
--- a/icu4c/source/i18n/ucol_res.cpp
+++ b/icu4c/source/i18n/ucol_res.cpp
@@ -176,6 +176,8 @@ CollationLoader::loadTailoring(const Locale &locale, Locale &validLocale, UError
     }
     if(typeLength == 0 || uprv_strcmp(type, "default") == 0) {
         uprv_strcpy(type, defaultType);
+    } else {
+        T_CString_toLowerCase(type);
     }
 
     // Load the collations/type tailoring, with type fallback.
diff --git a/icu4c/source/i18n/unicode/coll.h b/icu4c/source/i18n/unicode/coll.h
index e5039106d2a..a7932759de4 100644
--- a/icu4c/source/i18n/unicode/coll.h
+++ b/icu4c/source/i18n/unicode/coll.h
@@ -292,10 +292,19 @@ public:
     static Collator* U_EXPORT2 createInstance(UErrorCode& err);
 
     /**
-     * Gets the table-based collation object for the desired locale. The
+     * Gets the collation object for the desired locale. The
      * resource of the desired locale will be loaded.
+     *
      * Locale::getRoot() is the base collation table and all other languages are
      * built on top of it with additional language-specific modifications.
+     *
+     * For some languages, multiple collation types are available;
+     * for example, "de@collation=phonebook".
+     * Starting with ICU 54, collation attributes can be specified via locale keywords as well,
+     * in the old locale extension syntax ("el@colCaseFirst=upper")
+     * or in language tag syntax ("el-u-kf-upper").
+     * See User Guide: Collation API.
+     *
      * The UErrorCode& err parameter is used to return status information to the user.
      * To check whether the construction succeeded or not, you should check
      * the value of U_SUCCESS(err). If you wish more detailed information, you
@@ -305,6 +314,7 @@ public:
      * used. U_USING_DEFAULT_ERROR indicates that the default locale data was
      * used; neither the requested locale nor any of its fall back locales
      * could be found.
+     *
      * The caller owns the returned object and is responsible for deleting it.
      * @param loc The locale ID for which to open a collator.
      * @param err the error code status.
diff --git a/icu4c/source/i18n/unicode/tblcoll.h b/icu4c/source/i18n/unicode/tblcoll.h
index 00ab863bcc0..cca4a4e53cc 100644
--- a/icu4c/source/i18n/unicode/tblcoll.h
+++ b/icu4c/source/i18n/unicode/tblcoll.h
@@ -115,7 +115,6 @@ public:
      * description for more details on the collation rule syntax.
      * @param rules the collation rules to build the collation table from.
      * @param status reporting a success or an error.
-     * @see Locale
      * @stable ICU 2.0
      */
     RuleBasedCollator(const UnicodeString& rules, UErrorCode& status);
@@ -125,9 +124,8 @@ public:
      * collation table out of them. Please see RuleBasedCollator class
      * description for more details on the collation rule syntax.
      * @param rules the collation rules to build the collation table from.
-     * @param collationStrength default strength for comparison
+     * @param collationStrength strength for comparison
      * @param status reporting a success or an error.
-     * @see Locale
      * @stable ICU 2.0
      */
     RuleBasedCollator(const UnicodeString& rules,
@@ -141,7 +139,6 @@ public:
      * @param rules the collation rules to build the collation table from.
      * @param decompositionMode the normalisation mode
      * @param status reporting a success or an error.
-     * @see Locale
      * @stable ICU 2.0
      */
     RuleBasedCollator(const UnicodeString& rules,
@@ -153,10 +150,9 @@ public:
      * collation table out of them. Please see RuleBasedCollator class
      * description for more details on the collation rule syntax.
      * @param rules the collation rules to build the collation table from.
-     * @param collationStrength default strength for comparison
+     * @param collationStrength strength for comparison
      * @param decompositionMode the normalisation mode
      * @param status reporting a success or an error.
-     * @see Locale
      * @stable ICU 2.0
      */
     RuleBasedCollator(const UnicodeString& rules,
@@ -177,7 +173,6 @@ public:
     /**
      * Copy constructor.
      * @param other the RuleBasedCollator object to be copied
-     * @see Locale
      * @stable ICU 2.0
      */
     RuleBasedCollator(const RuleBasedCollator& other);
diff --git a/icu4c/source/i18n/unicode/ucol.h b/icu4c/source/i18n/unicode/ucol.h
index bd6ff050cae..9cbc962eb46 100644
--- a/icu4c/source/i18n/unicode/ucol.h
+++ b/icu4c/source/i18n/unicode/ucol.h
@@ -362,6 +362,14 @@ typedef enum {
 
 /**
  * Open a UCollator for comparing strings.
+ *
+ * For some languages, multiple collation types are available;
+ * for example, "de@collation=phonebook".
+ * Starting with ICU 54, collation attributes can be specified via locale keywords as well,
+ * in the old locale extension syntax ("el@colCaseFirst=upper")
+ * or in language tag syntax ("el-u-kf-upper").
+ * See User Guide: Collation API.
+ *
  * The UCollator pointer is used in all the calls to the Collation
  * service. After finished, collator must be disposed of by calling
  * {@link #ucol_close }.
diff --git a/icu4c/source/test/intltest/apicoll.cpp b/icu4c/source/test/intltest/apicoll.cpp
index 4b3dc850acc..7270c784381 100644
--- a/icu4c/source/test/intltest/apicoll.cpp
+++ b/icu4c/source/test/intltest/apicoll.cpp
@@ -2411,6 +2411,45 @@ void CollationAPITest::TestIterNumeric() {
     assertEquals("40<72", (int32_t)UCOL_LESS, (int32_t)result);
 }
 
+void CollationAPITest::TestBadKeywords() {
+    // Test locale IDs with errors.
+    // Valid locale IDs are tested via data-driven tests.
+    UErrorCode errorCode = U_ZERO_ERROR;
+    Locale bogusLocale(Locale::getRoot());
+    bogusLocale.setToBogus();
+    LocalPointer<Collator> coll(Collator::createInstance(bogusLocale, errorCode));
+    if(errorCode != U_ILLEGAL_ARGUMENT_ERROR) {
+        errln("Collator::createInstance(bogus locale) did not fail as expected - %s",
+              u_errorName(errorCode));
+    }
+
+    // Unknown value.
+    const char *localeID = "it-u-ks-xyz";
+    errorCode = U_ZERO_ERROR;
+    coll.adoptInstead(Collator::createInstance(localeID, errorCode));
+    if(errorCode != U_ILLEGAL_ARGUMENT_ERROR) {
+        errln("Collator::createInstance(%s) did not fail as expected - %s",
+              localeID, u_errorName(errorCode));
+    }
+
+    // Unsupported attributes.
+    localeID = "it@colHiraganaQuaternary=true";
+    errorCode = U_ZERO_ERROR;
+    coll.adoptInstead(Collator::createInstance(localeID, errorCode));
+    if(errorCode != U_UNSUPPORTED_ERROR) {
+        errln("Collator::createInstance(%s) did not fail as expected - %s",
+              localeID, u_errorName(errorCode));
+    }
+
+    localeID = "it-u-vt-u24";
+    errorCode = U_ZERO_ERROR;
+    coll.adoptInstead(Collator::createInstance(localeID, errorCode));
+    if(errorCode != U_UNSUPPORTED_ERROR) {
+        errln("Collator::createInstance(%s) did not fail as expected - %s",
+              localeID, u_errorName(errorCode));
+    }
+}
+
 void CollationAPITest::dump(UnicodeString msg, RuleBasedCollator* c, UErrorCode& status) {
     const char* bigone = "One";
     const char* littleone = "one";
@@ -2451,6 +2490,7 @@ void CollationAPITest::runIndexedTest( int32_t index, UBool exec, const char* &n
     TESTCASE_AUTO(TestClone);
     TESTCASE_AUTO(TestCloneBinary);
     TESTCASE_AUTO(TestIterNumeric);
+    TESTCASE_AUTO(TestBadKeywords);
     TESTCASE_AUTO_END;
 }
 
diff --git a/icu4c/source/test/intltest/apicoll.h b/icu4c/source/test/intltest/apicoll.h
index 16d5634c63c..0a134b7628f 100644
--- a/icu4c/source/test/intltest/apicoll.h
+++ b/icu4c/source/test/intltest/apicoll.h
@@ -169,6 +169,7 @@ public:
     void TestClone();
     void TestCloneBinary();
     void TestIterNumeric();
+    void TestBadKeywords();
 
 private:
     // If this is too small for the test data, just increase it.
diff --git a/icu4c/source/test/intltest/collationtest.cpp b/icu4c/source/test/intltest/collationtest.cpp index 4b69bbaa052..907428cad6b 100644 --- a/icu4c/source/test/intltest/collationtest.cpp +++ b/icu4c/source/test/intltest/collationtest.cpp @@ -1234,16 +1234,17 @@ void CollationTest::setRootCollator(IcuTestErrorCode &errorCode) { void CollationTest::setLocaleCollator(IcuTestErrorCode &errorCode) { if(errorCode.isFailure()) { return; } - CharString langTag; - langTag.appendInvariantChars(fileLine.tempSubString(9), errorCode); - char localeID[ULOC_FULLNAME_CAPACITY]; - int32_t parsedLength; - (void)uloc_forLanguageTag( - langTag.data(), localeID, LENGTHOF(localeID), &parsedLength, errorCode); - Locale locale(localeID); - if(fileLine.length() == 9 || - errorCode.isFailure() || errorCode.get() == U_STRING_NOT_TERMINATED_WARNING || - parsedLength != langTag.length() || locale.isBogus()) { + int32_t at = fileLine.indexOf((UChar)0x40, 9); // @ is not invariant + if(at >= 0) { + fileLine.setCharAt(at, (UChar)0x2a); // * + } + CharString localeID; + localeID.appendInvariantChars(fileLine.tempSubString(9), errorCode); + if(at >= 0) { + localeID.data()[at - 9] = '@'; + } + Locale locale(localeID.data()); + if(fileLine.length() == 9 || errorCode.isFailure() || locale.isBogus()) { errln("invalid language tag on line %d", (int)fileLineNumber); infoln(fileLine); if(errorCode.isSuccess()) { errorCode.set(U_PARSE_ERROR); } diff --git a/icu4c/source/test/testdata/collationtest.txt b/icu4c/source/test/testdata/collationtest.txt index d91ba24dbb8..777ed15234e 100644 --- a/icu4c/source/test/testdata/collationtest.txt +++ b/icu4c/source/test/testdata/collationtest.txt @@ -12,6 +12,7 @@ # A collator can be set with "@ root" or "@ locale language-tag", # for example "@ locale de-u-co-phonebk". +# An old-style locale ID can also be used, for example "@ locale de@collation=phonebook". # A collator can be built with "@ rules". 
# An "@ rules" line is followed by one or more lines with the tailoring rules. @@ -2366,3 +2367,39 @@ <2 \u0027 <2 c <1 r + +# ICU ticket #8260 "Support all collation-related keywords in Collator.getInstance()" +** test: locale -u- with collation keywords, ICU ticket 8260 +@ locale de-u-kv-sPace-ka-shifTed-kn-kk-falsE-kf-Upper-kc-tRue-ks-leVel4 +* compare +<4 \u0020 # space is shifted, strength=quaternary +<1 ! # punctuation is regular +<1 2 +<1 12 # numeric sorting +<1 B +