mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-07 06:25:30 +00:00
ICU-1433 getTailoredSet C/C++ return all the canonically equivalent sequences. Also added tests
X-SVN-Rev: 9867
This commit is contained in:
parent
0fb205837f
commit
150ad44972
8 changed files with 132 additions and 23 deletions
|
@ -321,21 +321,13 @@ void RuleBasedCollator::getRules(UColRuleOption delta, UnicodeString &buffer)
|
|||
}
|
||||
}
|
||||
|
||||
UnicodeSet
|
||||
UnicodeSet *
|
||||
RuleBasedCollator::getTailoredSet(UErrorCode &status) const
|
||||
{
|
||||
if(U_FAILURE(status)) {
|
||||
return UnicodeSet();
|
||||
}
|
||||
USet *set = ucol_getTailoredSet(this->ucollator, &status);
|
||||
if(U_SUCCESS(status)) {
|
||||
UnicodeSet result(*(const UnicodeSet *)set);
|
||||
UnicodeString pattern;
|
||||
uset_close(set);
|
||||
return result;
|
||||
} else {
|
||||
return UnicodeSet();
|
||||
return NULL;
|
||||
}
|
||||
return (UnicodeSet *)ucol_getTailoredSet(this->ucollator, &status);
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -23,6 +23,7 @@
|
|||
#include "unicode/unorm.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/caniter.h"
|
||||
|
||||
#include "ucol_bld.h"
|
||||
#include "ucol_imp.h"
|
||||
|
@ -6729,17 +6730,33 @@ ucol_getTailoredSet(const UCollator *coll, UErrorCode *status)
|
|||
const UChar *rules = ucol_getRules(coll, &rulesLen);
|
||||
const UChar *current = NULL;
|
||||
UBool startOfRules = TRUE;
|
||||
USet *tailored = uset_open(1, 0);
|
||||
// we internally use the C++ class, for the following reasons:
|
||||
// 1. we need to utilize canonical iterator, which is a C++ only class
|
||||
// 2. canonical iterator returns UnicodeStrings - USet cannot take them
|
||||
// 3. USet is internally really UnicodeSet, C is just a wrapper
|
||||
UnicodeSet *tailored = new UnicodeSet();
|
||||
UnicodeString pattern;
|
||||
CanonicalIterator it("", *status);
|
||||
|
||||
|
||||
// The idea is to tokenize the rule set. For each non-reset token,
|
||||
// we add all the canonically equivalent FCD sequences
|
||||
ucol_tok_initTokenList(&src, rules, rulesLen, UCA, status);
|
||||
while ((current = ucol_tok_parseNextToken(&src, startOfRules, &parseError, status)) != NULL) {
|
||||
startOfRules = FALSE;
|
||||
if(src.parsedToken.strength != UCOL_TOK_RESET) {
|
||||
const UChar *stuff = src.source+(src.parsedToken.charsOffset);
|
||||
uset_addString(tailored, stuff, src.parsedToken.charsLen);
|
||||
it.setSource(UnicodeString(stuff, src.parsedToken.charsLen), *status);
|
||||
pattern = it.next();
|
||||
while(!pattern.isBogus()) {
|
||||
if(Normalizer::quickCheck(pattern, UNORM_FCD, *status) != UNORM_NO) {
|
||||
tailored->add(pattern);
|
||||
}
|
||||
pattern = it.next();
|
||||
}
|
||||
}
|
||||
}
|
||||
ucol_tok_closeTokenList(&src);
|
||||
return tailored;
|
||||
return (USet *)tailored;
|
||||
}
|
||||
|
||||
|
|
|
@ -608,13 +608,15 @@ public:
|
|||
virtual uint32_t getVariableTop(UErrorCode &status) const = 0;
|
||||
|
||||
/**
|
||||
* Get an UnicodeSet that contains all the characters and sequences tailored in
|
||||
* this collator.
|
||||
* Get an UnicodeSet that contains all the characters and sequences
|
||||
* tailored in this collator.
|
||||
* @param status error code of the operation
|
||||
* @return an UnicodeSet object containing all the tailored code points and sequences
|
||||
* @return a pointer to a UnicodeSet object containing all the
|
||||
* code points and sequences that may sort differently than
|
||||
* in the UCA. The object must be disposed of by using delete
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual UnicodeSet getTailoredSet(UErrorCode &status) const;
|
||||
virtual UnicodeSet *getTailoredSet(UErrorCode &status) const;
|
||||
|
||||
|
||||
/**
|
||||
|
@ -785,10 +787,13 @@ inline UBool Collator::operator!=(const Collator& other) const
|
|||
return (UBool)!(*this == other);
|
||||
}
|
||||
|
||||
inline UnicodeSet Collator::getTailoredSet(UErrorCode &status) const
|
||||
inline UnicodeSet *Collator::getTailoredSet(UErrorCode &status) const
|
||||
{
|
||||
status = U_UNSUPPORTED_ERROR;
|
||||
return UnicodeSet();
|
||||
if(U_FAILURE(status)) {
|
||||
return NULL;
|
||||
}
|
||||
// everything can be changed
|
||||
return new UnicodeSet(0, 0x10FFFF);
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
@ -495,10 +495,12 @@ public:
|
|||
* Get an UnicodeSet that contains all the characters and sequences tailored in
|
||||
* this collator.
|
||||
* @param status error code of the operation
|
||||
* @return an UnicodeSet object containing all the tailored code points and sequences
|
||||
* @return a pointer to a UnicodeSet object containing all the
|
||||
* code points and sequences that may sort differently than
|
||||
* in the UCA. The object must be disposed of by using delete
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
virtual UnicodeSet getTailoredSet(UErrorCode &status) const;
|
||||
virtual UnicodeSet *getTailoredSet(UErrorCode &status) const;
|
||||
|
||||
/**
|
||||
* Thread safe cloning operation.
|
||||
|
|
|
@ -45,6 +45,7 @@ void addCollAPITest(TestNode** root)
|
|||
addTest(root, &TestGetLocale, "tscoll/capitst/TestGetLocale");
|
||||
addTest(root, &TestSortKeyBufferOverrun, "tscoll/capitst/TestSortKeyBufferOverrun");
|
||||
addTest(root, &TestAttribute, "tscoll/capitst/TestAttribute");
|
||||
addTest(root, &TestGetTailoredSet, "tscoll/capitst/TestGetTailoredSet");
|
||||
|
||||
}
|
||||
|
||||
|
@ -1497,3 +1498,44 @@ static void TestAttribute()
|
|||
|
||||
ucol_close(coll);
|
||||
}
|
||||
|
||||
void TestGetTailoredSet() {
|
||||
struct {
|
||||
char *rules;
|
||||
char *tests[20];
|
||||
int32_t testsize;
|
||||
} setTest[] = {
|
||||
{ "&a < \\u212b", { "\\u212b", "A\\u030a", "\\u00c5" }, 3},
|
||||
{ "& S < \\u0161 <<< \\u0160", { "\\u0161", "s\\u030C", "\\u0160", "S\\u030C" }, 4}
|
||||
};
|
||||
|
||||
int32_t i = 0, j = 0;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UParseError pError;
|
||||
|
||||
UCollator *coll = NULL;
|
||||
UChar buff[1024];
|
||||
int32_t buffLen = 0;
|
||||
USet *set = NULL;
|
||||
|
||||
for(i = 0; i < sizeof(setTest)/sizeof(setTest[0]); i++) {
|
||||
buffLen = u_unescape(setTest[i].rules, buff, 1024);
|
||||
coll = ucol_openRules(buff, buffLen, UCOL_DEFAULT, UCOL_DEFAULT, &pError, &status);
|
||||
if(U_SUCCESS(status)) {
|
||||
set = ucol_getTailoredSet(coll, &status);
|
||||
if(uset_size(set) != setTest[i].testsize) {
|
||||
log_err("Tailored set size different (%d) than expected (%d)\n", uset_size(set), setTest[i].testsize);
|
||||
}
|
||||
for(j = 0; j < setTest[i].testsize; j++) {
|
||||
buffLen = u_unescape(setTest[i].tests[j], buff, 1024);
|
||||
if(!uset_containsString(set, buff, buffLen)) {
|
||||
log_err("Tailored set doesn't contain %s... It should\n", setTest[i].tests[j]);
|
||||
}
|
||||
}
|
||||
uset_close(set);
|
||||
} else {
|
||||
log_err("Couldn't open collator with rules %s\n", setTest[i].rules);
|
||||
}
|
||||
ucol_close(coll);
|
||||
}
|
||||
}
|
|
@ -86,5 +86,9 @@
|
|||
* Test getting and setting of attributes
|
||||
*/
|
||||
void TestGetSetAttr(void);
|
||||
/**
|
||||
* Test getTailoredSet
|
||||
*/
|
||||
void TestGetTailoredSet(void);
|
||||
|
||||
#endif
|
||||
|
|
|
@ -1661,6 +1661,47 @@ void CollationAPITest::TestBounds(void) {
|
|||
delete coll;
|
||||
}
|
||||
|
||||
|
||||
void CollationAPITest::TestGetTailoredSet()
|
||||
{
|
||||
struct {
|
||||
char *rules;
|
||||
char *tests[20];
|
||||
int32_t testsize;
|
||||
} setTest[] = {
|
||||
{ "&a < \\u212b", { "\\u212b", "A\\u030a", "\\u00c5" }, 3},
|
||||
{ "& S < \\u0161 <<< \\u0160", { "\\u0161", "s\\u030C", "\\u0160", "S\\u030C" }, 4}
|
||||
};
|
||||
|
||||
int32_t i = 0, j = 0;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
RuleBasedCollator *coll = NULL;
|
||||
UnicodeString buff;
|
||||
UnicodeSet *set = NULL;
|
||||
|
||||
for(i = 0; i < sizeof(setTest)/sizeof(setTest[0]); i++) {
|
||||
buff = UnicodeString(setTest[i].rules, "").unescape();
|
||||
coll = new RuleBasedCollator(buff, status);
|
||||
if(U_SUCCESS(status)) {
|
||||
set = coll->getTailoredSet(status);
|
||||
if(set->size() != setTest[i].testsize) {
|
||||
errln("Tailored set size different (%d) than expected (%d)", set->size(), setTest[i].testsize);
|
||||
}
|
||||
for(j = 0; j < setTest[i].testsize; j++) {
|
||||
buff = UnicodeString(setTest[i].tests[j], "").unescape();
|
||||
if(!set->contains(buff)) {
|
||||
errln("Tailored set doesn't contain %s... It should", setTest[i].tests[j]);
|
||||
}
|
||||
}
|
||||
delete set;
|
||||
} else {
|
||||
errln("Couldn't open collator with rules %s\n", setTest[i].rules);
|
||||
}
|
||||
delete coll;
|
||||
}
|
||||
}
|
||||
|
||||
void CollationAPITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par */)
|
||||
{
|
||||
if (exec) logln("TestSuite CollationAPITest: ");
|
||||
|
@ -1684,6 +1725,7 @@ void CollationAPITest::runIndexedTest( int32_t index, UBool exec, const char* &n
|
|||
case 16: name = "TestRules"; if (exec) TestRules(); break;
|
||||
case 17: name = "TestGetLocale"; if (exec) TestGetLocale(); break;
|
||||
case 18: name = "TestBounds"; if (exec) TestBounds(); break;
|
||||
case 19: name = "TestGetTailoredSet"; if (exec) TestGetTailoredSet(); break;
|
||||
default: name = ""; break;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -140,6 +140,11 @@ public:
|
|||
*/
|
||||
void TestBounds();
|
||||
|
||||
/**
|
||||
* Tests getTailoredSet API
|
||||
*/
|
||||
void TestGetTailoredSet();
|
||||
|
||||
private:
|
||||
// If this is too small for the test data, just increase it.
|
||||
// Just don't make it too large, otherwise the executable will get too big
|
||||
|
|
Loading…
Add table
Reference in a new issue