mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-13 08:53:20 +00:00
ICU-4020 Add USET_ADD_CASE_MAPPINGS
X-SVN-Rev: 16668
This commit is contained in:
parent
c160b79336
commit
cf9a074264
3 changed files with 89 additions and 17 deletions
icu4c/source
|
@ -55,7 +55,8 @@ enum {
|
|||
/**
|
||||
* Enable case insensitive matching. E.g., "[ab]" with this flag
|
||||
* will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will
|
||||
* match all except 'a', 'A', 'b', and 'B'.
|
||||
* match all except 'a', 'A', 'b', and 'B'. This performs a full
|
||||
* closure over case mappings, e.g. U+017F for s.
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
USET_CASE_INSENSITIVE = 2,
|
||||
|
@ -66,6 +67,17 @@ enum {
|
|||
* @internal
|
||||
*/
|
||||
USET_CASE = 2,
|
||||
|
||||
/**
|
||||
* Enable case insensitive matching. E.g., "[ab]" with this flag
|
||||
* will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will
|
||||
* match all except 'a', 'A', 'b', and 'B'. This adds the lower-,
|
||||
* title-, and uppercase mappings of each existing element in the
|
||||
* set. Unlike USET_CASE_INSENSITIVE, this is not a full closure (e.g. U+212A is not added for k).
|
||||
* @draft ICU 3.2
|
||||
*/
|
||||
USET_ADD_CASE_MAPPINGS = 4,
|
||||
|
||||
/**
|
||||
* Enough for any single-code point set
|
||||
* @internal
|
||||
|
|
|
@ -23,6 +23,8 @@
|
|||
#include "unicode/uscript.h"
|
||||
#include "unicode/symtable.h"
|
||||
#include "unicode/uset.h"
|
||||
#include "unicode/locid.h"
|
||||
#include "unicode/brkiter.h"
|
||||
#include "uset_imp.h"
|
||||
#include "ruleiter.h"
|
||||
#include "cmemory.h"
|
||||
|
@ -806,6 +808,9 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
|
|||
if ((options & USET_CASE_INSENSITIVE) != 0) {
|
||||
closeOver(USET_CASE);
|
||||
}
|
||||
else if ((options & USET_ADD_CASE_MAPPINGS) != 0) {
|
||||
closeOver(USET_ADD_CASE_MAPPINGS);
|
||||
}
|
||||
if (invert) {
|
||||
complement();
|
||||
}
|
||||
|
@ -1404,6 +1409,42 @@ UnicodeSet& UnicodeSet::closeOver(int32_t attribute) {
|
|||
}
|
||||
*this = foldSet;
|
||||
}
|
||||
else if ((attribute & USET_ADD_CASE_MAPPINGS)) {
|
||||
UnicodeSet foldSet;
|
||||
UnicodeString str;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
Locale root("");
|
||||
BreakIterator *bi = BreakIterator::createWordInstance(root, status);
|
||||
if (U_SUCCESS(status)) {
|
||||
int32_t n = getRangeCount();
|
||||
for (int32_t i=0; i<n; ++i) {
|
||||
UChar32 start = getRangeStart(i);
|
||||
UChar32 end = getRangeEnd(i);
|
||||
for (UChar32 cp=start; cp<=end; ++cp) {
|
||||
str.setTo(cp);
|
||||
str.toLower(root);
|
||||
foldSet.add(str);
|
||||
str.toTitle(bi, root);
|
||||
foldSet.add(str);
|
||||
str.toUpper(root);
|
||||
foldSet.add(str);
|
||||
}
|
||||
}
|
||||
if (strings != NULL && strings->size() > 0) {
|
||||
for (int32_t j=0; j<strings->size(); ++j) {
|
||||
str = * (const UnicodeString*) strings->elementAt(j);
|
||||
str.toLower(root);
|
||||
foldSet.add(str);
|
||||
str.toTitle(bi, root);
|
||||
foldSet.add(str);
|
||||
str.toUpper(root);
|
||||
foldSet.add(str);
|
||||
}
|
||||
}
|
||||
delete bi;
|
||||
*this = foldSet;
|
||||
}
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
|
|
@ -859,25 +859,25 @@ void UnicodeSetTest::TestPropertySet() {
|
|||
"abcd\\uDC00",
|
||||
"ef\\uD800\\U00010000",
|
||||
|
||||
"[:^lccc=0:]", // Lead canonical class
|
||||
"\\u0300\\u0301",
|
||||
"abcd\\u00c0\\u00c5",
|
||||
"[:^lccc=0:]", // Lead canonical class
|
||||
"\\u0300\\u0301",
|
||||
"abcd\\u00c0\\u00c5",
|
||||
|
||||
"[:^tccc=0:]", // Trail canonical class
|
||||
"\\u0300\\u0301\\u00c0\\u00c5",
|
||||
"abcd",
|
||||
"[:^tccc=0:]", // Trail canonical class
|
||||
"\\u0300\\u0301\\u00c0\\u00c5",
|
||||
"abcd",
|
||||
|
||||
"[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
|
||||
"\\u0300\\u0301\\u00c0\\u00c5",
|
||||
"abcd",
|
||||
"[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
|
||||
"\\u0300\\u0301\\u00c0\\u00c5",
|
||||
"abcd",
|
||||
|
||||
"[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
|
||||
"",
|
||||
"abcd\\u0300\\u0301\\u00c0\\u00c5",
|
||||
|
||||
"[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
|
||||
"\\u0F73\\u0F75\\u0F81",
|
||||
"abcd\\u0300\\u0301\\u00c0\\u00c5",
|
||||
"[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
|
||||
"",
|
||||
"abcd\\u0300\\u0301\\u00c0\\u00c5",
|
||||
|
||||
"[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
|
||||
"\\u0F73\\u0F75\\u0F81",
|
||||
"abcd\\u0300\\u0301\\u00c0\\u00c5",
|
||||
|
||||
};
|
||||
|
||||
|
@ -933,6 +933,7 @@ void UnicodeSetTest::TestCloseOver() {
|
|||
UErrorCode ec = U_ZERO_ERROR;
|
||||
|
||||
char CASE[] = {(char)USET_CASE};
|
||||
char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
|
||||
const char* DATA[] = {
|
||||
// selector, input, output
|
||||
CASE,
|
||||
|
@ -962,6 +963,18 @@ void UnicodeSetTest::TestCloseOver() {
|
|||
CASE,
|
||||
"[ABC]","[A-Ca-c]",
|
||||
|
||||
CASE_MAPPINGS,
|
||||
"[aq\\u00DF{Bc}{bC}{Fi}]",
|
||||
"[aAqQ\\u00DF{Ss}{SS}{Bc}{BC}{bc}{FI}{Fi}{fi}]",
|
||||
|
||||
CASE_MAPPINGS,
|
||||
"[\\u01F1]", // 'DZ'
|
||||
"[\\u01F1\\u01F2\\u01F3]",
|
||||
|
||||
CASE_MAPPINGS,
|
||||
"[a-z]",
|
||||
"[A-Za-z]",
|
||||
|
||||
NULL
|
||||
};
|
||||
|
||||
|
@ -1000,6 +1013,12 @@ void UnicodeSetTest::TestCloseOver() {
|
|||
} else {
|
||||
expectContainment(v, "defDEF", "abcABC");
|
||||
}
|
||||
UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);
|
||||
if (U_FAILURE(ec)) {
|
||||
errln("FAIL: construct w/case mappings failed");
|
||||
} else {
|
||||
expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
|
||||
}
|
||||
}
|
||||
|
||||
void UnicodeSetTest::TestEscapePattern() {
|
||||
|
|
Loading…
Add table
Reference in a new issue