diff --git a/icu4c/source/i18n/tblcoll.cpp b/icu4c/source/i18n/tblcoll.cpp index 7a8f8b19f83..2dffe5f9a66 100644 --- a/icu4c/source/i18n/tblcoll.cpp +++ b/icu4c/source/i18n/tblcoll.cpp @@ -321,21 +321,13 @@ void RuleBasedCollator::getRules(UColRuleOption delta, UnicodeString &buffer) } } -UnicodeSet +UnicodeSet * RuleBasedCollator::getTailoredSet(UErrorCode &status) const { if(U_FAILURE(status)) { - return UnicodeSet(); - } - USet *set = ucol_getTailoredSet(this->ucollator, &status); - if(U_SUCCESS(status)) { - UnicodeSet result(*(const UnicodeSet *)set); - UnicodeString pattern; - uset_close(set); - return result; - } else { - return UnicodeSet(); + return NULL; } + return (UnicodeSet *)ucol_getTailoredSet(this->ucollator, &status); } diff --git a/icu4c/source/i18n/ucol.cpp b/icu4c/source/i18n/ucol.cpp index ede292d5376..506217aeba3 100644 --- a/icu4c/source/i18n/ucol.cpp +++ b/icu4c/source/i18n/ucol.cpp @@ -23,6 +23,7 @@ #include "unicode/unorm.h" #include "unicode/udata.h" #include "unicode/uchar.h" +#include "unicode/caniter.h" #include "ucol_bld.h" #include "ucol_imp.h" @@ -6729,17 +6730,33 @@ ucol_getTailoredSet(const UCollator *coll, UErrorCode *status) const UChar *rules = ucol_getRules(coll, &rulesLen); const UChar *current = NULL; UBool startOfRules = TRUE; - USet *tailored = uset_open(1, 0); + // we internally use the C++ class, for the following reasons: + // 1. we need to utilize canonical iterator, which is a C++ only class + // 2. canonical iterator returns UnicodeStrings - USet cannot take them + // 3. USet is internally really UnicodeSet, C is just a wrapper + UnicodeSet *tailored = new UnicodeSet(); UnicodeString pattern; + CanonicalIterator it("", *status); + + // The idea is to tokenize the rule set. 
For each non-reset token, + // we add all the canonically equivalent FCD sequences ucol_tok_initTokenList(&src, rules, rulesLen, UCA, status); while ((current = ucol_tok_parseNextToken(&src, startOfRules, &parseError, status)) != NULL) { startOfRules = FALSE; if(src.parsedToken.strength != UCOL_TOK_RESET) { const UChar *stuff = src.source+(src.parsedToken.charsOffset); - uset_addString(tailored, stuff, src.parsedToken.charsLen); + it.setSource(UnicodeString(stuff, src.parsedToken.charsLen), *status); + pattern = it.next(); + while(!pattern.isBogus()) { + if(Normalizer::quickCheck(pattern, UNORM_FCD, *status) != UNORM_NO) { + tailored->add(pattern); + } + pattern = it.next(); + } } } ucol_tok_closeTokenList(&src); - return tailored; + return (USet *)tailored; } + diff --git a/icu4c/source/i18n/unicode/coll.h b/icu4c/source/i18n/unicode/coll.h index be8f21d808b..59e2d6cb09f 100644 --- a/icu4c/source/i18n/unicode/coll.h +++ b/icu4c/source/i18n/unicode/coll.h @@ -608,13 +608,15 @@ public: virtual uint32_t getVariableTop(UErrorCode &status) const = 0; /** - * Get an UnicodeSet that contains all the characters and sequences tailored in - * this collator. + * Get an UnicodeSet that contains all the characters and sequences + * tailored in this collator. * @param status error code of the operation - * @return an UnicodeSet object containing all the tailored code points and sequences + * @return a pointer to a UnicodeSet object containing all the + * code points and sequences that may sort differently than + * in the UCA. 
The object must be disposed of by using delete * @draft ICU 2.4 */ - virtual UnicodeSet getTailoredSet(UErrorCode &status) const; + virtual UnicodeSet *getTailoredSet(UErrorCode &status) const; /** @@ -785,10 +787,13 @@ inline UBool Collator::operator!=(const Collator& other) const return (UBool)!(*this == other); } -inline UnicodeSet Collator::getTailoredSet(UErrorCode &status) const +inline UnicodeSet *Collator::getTailoredSet(UErrorCode &status) const { - status = U_UNSUPPORTED_ERROR; - return UnicodeSet(); + if(U_FAILURE(status)) { + return NULL; + } + // everything can be changed + return new UnicodeSet(0, 0x10FFFF); } /* diff --git a/icu4c/source/i18n/unicode/tblcoll.h b/icu4c/source/i18n/unicode/tblcoll.h index 795720def3b..43fc688a093 100644 --- a/icu4c/source/i18n/unicode/tblcoll.h +++ b/icu4c/source/i18n/unicode/tblcoll.h @@ -495,10 +495,12 @@ public: * Get an UnicodeSet that contains all the characters and sequences tailored in * this collator. * @param status error code of the operation - * @return an UnicodeSet object containing all the tailored code points and sequences + * @return a pointer to a UnicodeSet object containing all the + * code points and sequences that may sort differently than + * in the UCA. The object must be disposed of by using delete * @draft ICU 2.4 */ - virtual UnicodeSet getTailoredSet(UErrorCode &status) const; + virtual UnicodeSet *getTailoredSet(UErrorCode &status) const; /** * Thread safe cloning operation. 
diff --git a/icu4c/source/test/cintltst/capitst.c b/icu4c/source/test/cintltst/capitst.c index 1e1c06c0d6b..dd4a42ea512 100644 --- a/icu4c/source/test/cintltst/capitst.c +++ b/icu4c/source/test/cintltst/capitst.c @@ -45,6 +45,7 @@ void addCollAPITest(TestNode** root) addTest(root, &TestGetLocale, "tscoll/capitst/TestGetLocale"); addTest(root, &TestSortKeyBufferOverrun, "tscoll/capitst/TestSortKeyBufferOverrun"); addTest(root, &TestAttribute, "tscoll/capitst/TestAttribute"); + addTest(root, &TestGetTailoredSet, "tscoll/capitst/TestGetTailoredSet"); } @@ -1497,3 +1498,44 @@ static void TestAttribute() ucol_close(coll); } + +void TestGetTailoredSet() { + struct { + char *rules; + char *tests[20]; + int32_t testsize; + } setTest[] = { + { "&a < \\u212b", { "\\u212b", "A\\u030a", "\\u00c5" }, 3}, + { "& S < \\u0161 <<< \\u0160", { "\\u0161", "s\\u030C", "\\u0160", "S\\u030C" }, 4} + }; + + int32_t i = 0, j = 0; + UErrorCode status = U_ZERO_ERROR; + UParseError pError; + + UCollator *coll = NULL; + UChar buff[1024]; + int32_t buffLen = 0; + USet *set = NULL; + + for(i = 0; i < sizeof(setTest)/sizeof(setTest[0]); i++) { + buffLen = u_unescape(setTest[i].rules, buff, 1024); + coll = ucol_openRules(buff, buffLen, UCOL_DEFAULT, UCOL_DEFAULT, &pError, &status); + if(U_SUCCESS(status)) { + set = ucol_getTailoredSet(coll, &status); + if(uset_size(set) != setTest[i].testsize) { + log_err("Tailored set size different (%d) than expected (%d)\n", uset_size(set), setTest[i].testsize); + } + for(j = 0; j < setTest[i].testsize; j++) { + buffLen = u_unescape(setTest[i].tests[j], buff, 1024); + if(!uset_containsString(set, buff, buffLen)) { + log_err("Tailored set doesn't contain %s... 
It should\n", setTest[i].tests[j]); + } + } + uset_close(set); + } else { + log_err("Couldn't open collator with rules %s\n", setTest[i].rules); + } + ucol_close(coll); + } +} \ No newline at end of file diff --git a/icu4c/source/test/cintltst/capitst.h b/icu4c/source/test/cintltst/capitst.h index 2fe52e2d0b4..4b5df5b8f91 100644 --- a/icu4c/source/test/cintltst/capitst.h +++ b/icu4c/source/test/cintltst/capitst.h @@ -86,5 +86,9 @@ * Test getting and setting of attributes */ void TestGetSetAttr(void); + /** + * Test getTailoredSet + */ + void TestGetTailoredSet(void); #endif diff --git a/icu4c/source/test/intltest/apicoll.cpp b/icu4c/source/test/intltest/apicoll.cpp index b516f93ad1b..be81ad92de9 100644 --- a/icu4c/source/test/intltest/apicoll.cpp +++ b/icu4c/source/test/intltest/apicoll.cpp @@ -1661,6 +1661,47 @@ void CollationAPITest::TestBounds(void) { delete coll; } + +void CollationAPITest::TestGetTailoredSet() +{ + struct { + char *rules; + char *tests[20]; + int32_t testsize; + } setTest[] = { + { "&a < \\u212b", { "\\u212b", "A\\u030a", "\\u00c5" }, 3}, + { "& S < \\u0161 <<< \\u0160", { "\\u0161", "s\\u030C", "\\u0160", "S\\u030C" }, 4} + }; + + int32_t i = 0, j = 0; + UErrorCode status = U_ZERO_ERROR; + + RuleBasedCollator *coll = NULL; + UnicodeString buff; + UnicodeSet *set = NULL; + + for(i = 0; i < sizeof(setTest)/sizeof(setTest[0]); i++) { + buff = UnicodeString(setTest[i].rules, "").unescape(); + coll = new RuleBasedCollator(buff, status); + if(U_SUCCESS(status)) { + set = coll->getTailoredSet(status); + if(set->size() != setTest[i].testsize) { + errln("Tailored set size different (%d) than expected (%d)", set->size(), setTest[i].testsize); + } + for(j = 0; j < setTest[i].testsize; j++) { + buff = UnicodeString(setTest[i].tests[j], "").unescape(); + if(!set->contains(buff)) { + errln("Tailored set doesn't contain %s... 
It should", setTest[i].tests[j]); + } + } + delete set; + } else { + errln("Couldn't open collator with rules %s\n", setTest[i].rules); + } + delete coll; + } +} + void CollationAPITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par */) { if (exec) logln("TestSuite CollationAPITest: "); @@ -1684,6 +1725,7 @@ void CollationAPITest::runIndexedTest( int32_t index, UBool exec, const char* &n case 16: name = "TestRules"; if (exec) TestRules(); break; case 17: name = "TestGetLocale"; if (exec) TestGetLocale(); break; case 18: name = "TestBounds"; if (exec) TestBounds(); break; + case 19: name = "TestGetTailoredSet"; if (exec) TestGetTailoredSet(); break; default: name = ""; break; } } diff --git a/icu4c/source/test/intltest/apicoll.h b/icu4c/source/test/intltest/apicoll.h index 6245dd1fbc0..7b8b8822972 100644 --- a/icu4c/source/test/intltest/apicoll.h +++ b/icu4c/source/test/intltest/apicoll.h @@ -140,6 +140,11 @@ public: */ void TestBounds(); + /** + * Tests getTailoredSet API + */ + void TestGetTailoredSet(); + private: // If this is too small for the test data, just increase it. // Just don't make it too large, otherwise the executable will get too big