ICU-4020 Add USET_ADD_CASE_MAPPINGS

X-SVN-Rev: 16668
This commit is contained in:
Deborah Goldsmith 2004-10-29 01:19:55 +00:00
parent c160b79336
commit cf9a074264
3 changed files with 89 additions and 17 deletions
icu4c/source
common
test/intltest

View file

@ -55,7 +55,8 @@ enum {
/**
* Enable case insensitive matching. E.g., "[ab]" with this flag
* will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will
* match all except 'a', 'A', 'b', and 'B'.
* match all except 'a', 'A', 'b', and 'B'. This performs a full
* closure over case mappings, e.g. U+017F for s.
* @stable ICU 2.4
*/
USET_CASE_INSENSITIVE = 2,
@ -66,6 +67,17 @@ enum {
* @internal
*/
USET_CASE = 2,
/**
* Enable case insensitive matching. E.g., "[ab]" with this flag
* will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will
* match all except 'a', 'A', 'b', and 'B'. This adds the lower-,
* title-, and uppercase mappings of each existing element in the
* set. Unlike USET_CASE_INSENSITIVE, this is not a full closure
* over case mappings: only the direct lower-, title-, and uppercase
* mappings are added, so e.g. U+212A KELVIN SIGN is not added for
* 'k'.
* @draft ICU 3.2
*/
USET_ADD_CASE_MAPPINGS = 4,
/**
* Enough for any single-code point set
* @internal

View file

@ -23,6 +23,8 @@
#include "unicode/uscript.h"
#include "unicode/symtable.h"
#include "unicode/uset.h"
#include "unicode/locid.h"
#include "unicode/brkiter.h"
#include "uset_imp.h"
#include "ruleiter.h"
#include "cmemory.h"
@ -806,6 +808,9 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
// Apply the requested case handling to the set that was just parsed.
// The two options are mutually exclusive here: when both bits are set,
// USET_CASE_INSENSITIVE takes precedence and USET_ADD_CASE_MAPPINGS is
// ignored.
if ((options & USET_CASE_INSENSITIVE) != 0) {
closeOver(USET_CASE);
}
else if ((options & USET_ADD_CASE_MAPPINGS) != 0) {
closeOver(USET_ADD_CASE_MAPPINGS);
}
// The case handling above runs before any inversion of the set below.
if (invert) {
complement();
}
@ -1404,6 +1409,42 @@ UnicodeSet& UnicodeSet::closeOver(int32_t attribute) {
}
*this = foldSet;
}
else if ((attribute & USET_ADD_CASE_MAPPINGS)) {
    // Replace this set with the lower-, title-, and uppercase mappings
    // (root locale) of every code point and every string it contains.
    //
    // The result is built in a fresh, initially empty set, so an element
    // that is not equal to one of its own three mappings is not carried
    // over into the result.
    UnicodeSet foldSet;
    UnicodeString str;
    UErrorCode status = U_ZERO_ERROR;
    Locale root("");
    // toTitle() is given a word break iterator created for the root locale.
    BreakIterator *bi = BreakIterator::createWordInstance(root, status);
    // If the iterator cannot be created, the set is left unchanged.
    if (U_SUCCESS(status)) {
        int32_t n = getRangeCount();
        for (int32_t i=0; i<n; ++i) {
            UChar32 start = getRangeStart(i);
            UChar32 end = getRangeEnd(i);
            for (UChar32 cp=start; cp<=end; ++cp) {
                // Each mapping starts again from the original code point.
                // Chaining them (title-casing the lowercased text, then
                // upper-casing that) gives wrong results when lowercasing
                // is not reversible.
                str.setTo(cp);
                str.toLower(root);
                foldSet.add(str);
                str.setTo(cp);
                str.toTitle(bi, root);
                foldSet.add(str);
                str.setTo(cp);
                str.toUpper(root);
                foldSet.add(str);
            }
        }
        if (strings != NULL && strings->size() > 0) {
            for (int32_t j=0; j<strings->size(); ++j) {
                const UnicodeString *pStr = (const UnicodeString*) strings->elementAt(j);
                // As above: map the original string each time, not the
                // output of the previous mapping.
                (str = *pStr).toLower(root);
                foldSet.add(str);
                (str = *pStr).toTitle(bi, root);
                foldSet.add(str);
                (str = *pStr).toUpper(root);
                foldSet.add(str);
            }
        }
        *this = foldSet;
    }
    // Release the iterator on every path; deleting a null pointer is a no-op.
    delete bi;
}
return *this;
}

View file

@ -859,25 +859,25 @@ void UnicodeSetTest::TestPropertySet() {
"abcd\\uDC00",
"ef\\uD800\\U00010000",
"[:^lccc=0:]", // Lead canonical class
"\\u0300\\u0301",
"abcd\\u00c0\\u00c5",
"[:^lccc=0:]", // Lead canonical class
"\\u0300\\u0301",
"abcd\\u00c0\\u00c5",
"[:^tccc=0:]", // Trail canonical class
"\\u0300\\u0301\\u00c0\\u00c5",
"abcd",
"[:^tccc=0:]", // Trail canonical class
"\\u0300\\u0301\\u00c0\\u00c5",
"abcd",
"[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
"\\u0300\\u0301\\u00c0\\u00c5",
"abcd",
"[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
"\\u0300\\u0301\\u00c0\\u00c5",
"abcd",
"[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
"",
"abcd\\u0300\\u0301\\u00c0\\u00c5",
"[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
"\\u0F73\\u0F75\\u0F81",
"abcd\\u0300\\u0301\\u00c0\\u00c5",
"[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
"",
"abcd\\u0300\\u0301\\u00c0\\u00c5",
"[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
"\\u0F73\\u0F75\\u0F81",
"abcd\\u0300\\u0301\\u00c0\\u00c5",
};
@ -933,6 +933,7 @@ void UnicodeSetTest::TestCloseOver() {
UErrorCode ec = U_ZERO_ERROR;
char CASE[] = {(char)USET_CASE};
char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
const char* DATA[] = {
// selector, input, output
CASE,
@ -962,6 +963,18 @@ void UnicodeSetTest::TestCloseOver() {
CASE,
"[ABC]","[A-Ca-c]",
CASE_MAPPINGS,
"[aq\\u00DF{Bc}{bC}{Fi}]",
"[aAqQ\\u00DF{Ss}{SS}{Bc}{BC}{bc}{FI}{Fi}{fi}]",
CASE_MAPPINGS,
"[\\u01F1]", // 'DZ'
"[\\u01F1\\u01F2\\u01F3]",
CASE_MAPPINGS,
"[a-z]",
"[A-Za-z]",
NULL
};
@ -1000,6 +1013,12 @@ void UnicodeSetTest::TestCloseOver() {
} else {
expectContainment(v, "defDEF", "abcABC");
}
UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);
if (U_FAILURE(ec)) {
errln("FAIL: construct w/case mappings failed");
} else {
expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
}
}
void UnicodeSetTest::TestEscapePattern() {