ICU-1433 getTailoredSet C/C++ return all the canonically equivalent sequences. Also added tests

X-SVN-Rev: 9867
This commit is contained in:
Vladimir Weinstein 2002-09-17 06:27:51 +00:00
parent 0fb205837f
commit 150ad44972
8 changed files with 132 additions and 23 deletions

View file

@ -321,21 +321,13 @@ void RuleBasedCollator::getRules(UColRuleOption delta, UnicodeString &buffer)
}
}
UnicodeSet
UnicodeSet *
RuleBasedCollator::getTailoredSet(UErrorCode &status) const
{
if(U_FAILURE(status)) {
return UnicodeSet();
}
USet *set = ucol_getTailoredSet(this->ucollator, &status);
if(U_SUCCESS(status)) {
UnicodeSet result(*(const UnicodeSet *)set);
UnicodeString pattern;
uset_close(set);
return result;
} else {
return UnicodeSet();
return NULL;
}
return (UnicodeSet *)ucol_getTailoredSet(this->ucollator, &status);
}

View file

@ -23,6 +23,7 @@
#include "unicode/unorm.h"
#include "unicode/udata.h"
#include "unicode/uchar.h"
#include "unicode/caniter.h"
#include "ucol_bld.h"
#include "ucol_imp.h"
@ -6729,17 +6730,33 @@ ucol_getTailoredSet(const UCollator *coll, UErrorCode *status)
const UChar *rules = ucol_getRules(coll, &rulesLen);
const UChar *current = NULL;
UBool startOfRules = TRUE;
USet *tailored = uset_open(1, 0);
// we internally use the C++ class, for the following reasons:
// 1. we need to utilize canonical iterator, which is a C++ only class
// 2. canonical iterator returns UnicodeStrings - USet cannot take them
// 3. USet is internally really UnicodeSet, C is just a wrapper
UnicodeSet *tailored = new UnicodeSet();
UnicodeString pattern;
CanonicalIterator it("", *status);
// The idea is to tokenize the rule set. For each non-reset token,
// we add all the canonically equivalent FCD sequences
ucol_tok_initTokenList(&src, rules, rulesLen, UCA, status);
while ((current = ucol_tok_parseNextToken(&src, startOfRules, &parseError, status)) != NULL) {
startOfRules = FALSE;
if(src.parsedToken.strength != UCOL_TOK_RESET) {
const UChar *stuff = src.source+(src.parsedToken.charsOffset);
uset_addString(tailored, stuff, src.parsedToken.charsLen);
it.setSource(UnicodeString(stuff, src.parsedToken.charsLen), *status);
pattern = it.next();
while(!pattern.isBogus()) {
if(Normalizer::quickCheck(pattern, UNORM_FCD, *status) != UNORM_NO) {
tailored->add(pattern);
}
pattern = it.next();
}
}
}
ucol_tok_closeTokenList(&src);
return tailored;
return (USet *)tailored;
}

View file

@ -608,13 +608,15 @@ public:
virtual uint32_t getVariableTop(UErrorCode &status) const = 0;
/**
* Get an UnicodeSet that contains all the characters and sequences tailored in
* this collator.
* Get a UnicodeSet that contains all the characters and sequences
* tailored in this collator.
* @param status error code of the operation
* @return an UnicodeSet object containing all the tailored code points and sequences
* @return a pointer to a UnicodeSet object containing all the
* code points and sequences that may sort differently than
* in the UCA. The object must be disposed of by using delete
* @draft ICU 2.4
*/
virtual UnicodeSet getTailoredSet(UErrorCode &status) const;
virtual UnicodeSet *getTailoredSet(UErrorCode &status) const;
/**
@ -785,10 +787,13 @@ inline UBool Collator::operator!=(const Collator& other) const
return (UBool)!(*this == other);
}
inline UnicodeSet Collator::getTailoredSet(UErrorCode &status) const
inline UnicodeSet *Collator::getTailoredSet(UErrorCode &status) const
{
status = U_UNSUPPORTED_ERROR;
return UnicodeSet();
if(U_FAILURE(status)) {
return NULL;
}
// everything can be changed
return new UnicodeSet(0, 0x10FFFF);
}
/*

View file

@ -495,10 +495,12 @@ public:
* Get a UnicodeSet that contains all the characters and sequences tailored in
* this collator.
* @param status error code of the operation
* @return an UnicodeSet object containing all the tailored code points and sequences
* @return a pointer to a UnicodeSet object containing all the
* code points and sequences that may sort differently than
* in the UCA. The object must be disposed of by using delete
* @draft ICU 2.4
*/
virtual UnicodeSet getTailoredSet(UErrorCode &status) const;
virtual UnicodeSet *getTailoredSet(UErrorCode &status) const;
/**
* Thread safe cloning operation.

View file

@ -45,6 +45,7 @@ void addCollAPITest(TestNode** root)
addTest(root, &TestGetLocale, "tscoll/capitst/TestGetLocale");
addTest(root, &TestSortKeyBufferOverrun, "tscoll/capitst/TestSortKeyBufferOverrun");
addTest(root, &TestAttribute, "tscoll/capitst/TestAttribute");
addTest(root, &TestGetTailoredSet, "tscoll/capitst/TestGetTailoredSet");
}
@ -1497,3 +1498,44 @@ static void TestAttribute()
ucol_close(coll);
}
void TestGetTailoredSet() {
struct {
char *rules;
char *tests[20];
int32_t testsize;
} setTest[] = {
{ "&a < \\u212b", { "\\u212b", "A\\u030a", "\\u00c5" }, 3},
{ "& S < \\u0161 <<< \\u0160", { "\\u0161", "s\\u030C", "\\u0160", "S\\u030C" }, 4}
};
int32_t i = 0, j = 0;
UErrorCode status = U_ZERO_ERROR;
UParseError pError;
UCollator *coll = NULL;
UChar buff[1024];
int32_t buffLen = 0;
USet *set = NULL;
for(i = 0; i < sizeof(setTest)/sizeof(setTest[0]); i++) {
buffLen = u_unescape(setTest[i].rules, buff, 1024);
coll = ucol_openRules(buff, buffLen, UCOL_DEFAULT, UCOL_DEFAULT, &pError, &status);
if(U_SUCCESS(status)) {
set = ucol_getTailoredSet(coll, &status);
if(uset_size(set) != setTest[i].testsize) {
log_err("Tailored set size different (%d) than expected (%d)\n", uset_size(set), setTest[i].testsize);
}
for(j = 0; j < setTest[i].testsize; j++) {
buffLen = u_unescape(setTest[i].tests[j], buff, 1024);
if(!uset_containsString(set, buff, buffLen)) {
log_err("Tailored set doesn't contain %s... It should\n", setTest[i].tests[j]);
}
}
uset_close(set);
} else {
log_err("Couldn't open collator with rules %s\n", setTest[i].rules);
}
ucol_close(coll);
}
}

View file

@ -86,5 +86,9 @@
* Test getting and setting of attributes
*/
void TestGetSetAttr(void);
/**
* Test getTailoredSet
*/
void TestGetTailoredSet(void);
#endif

View file

@ -1661,6 +1661,47 @@ void CollationAPITest::TestBounds(void) {
delete coll;
}
void CollationAPITest::TestGetTailoredSet()
{
struct {
char *rules;
char *tests[20];
int32_t testsize;
} setTest[] = {
{ "&a < \\u212b", { "\\u212b", "A\\u030a", "\\u00c5" }, 3},
{ "& S < \\u0161 <<< \\u0160", { "\\u0161", "s\\u030C", "\\u0160", "S\\u030C" }, 4}
};
int32_t i = 0, j = 0;
UErrorCode status = U_ZERO_ERROR;
RuleBasedCollator *coll = NULL;
UnicodeString buff;
UnicodeSet *set = NULL;
for(i = 0; i < sizeof(setTest)/sizeof(setTest[0]); i++) {
buff = UnicodeString(setTest[i].rules, "").unescape();
coll = new RuleBasedCollator(buff, status);
if(U_SUCCESS(status)) {
set = coll->getTailoredSet(status);
if(set->size() != setTest[i].testsize) {
errln("Tailored set size different (%d) than expected (%d)", set->size(), setTest[i].testsize);
}
for(j = 0; j < setTest[i].testsize; j++) {
buff = UnicodeString(setTest[i].tests[j], "").unescape();
if(!set->contains(buff)) {
errln("Tailored set doesn't contain %s... It should", setTest[i].tests[j]);
}
}
delete set;
} else {
errln("Couldn't open collator with rules %s\n", setTest[i].rules);
}
delete coll;
}
}
void CollationAPITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par */)
{
if (exec) logln("TestSuite CollationAPITest: ");
@ -1684,6 +1725,7 @@ void CollationAPITest::runIndexedTest( int32_t index, UBool exec, const char* &n
case 16: name = "TestRules"; if (exec) TestRules(); break;
case 17: name = "TestGetLocale"; if (exec) TestGetLocale(); break;
case 18: name = "TestBounds"; if (exec) TestBounds(); break;
case 19: name = "TestGetTailoredSet"; if (exec) TestGetTailoredSet(); break;
default: name = ""; break;
}
}

View file

@ -140,6 +140,11 @@ public:
*/
void TestBounds();
/**
* Tests getTailoredSet API
*/
void TestGetTailoredSet();
private:
// If this is too small for the test data, just increase it.
// Just don't make it too large, otherwise the executable will get too big