ICU-4020 Add USET_ADD_CASE_MAPPINGS

X-SVN-Rev: 16668
2025-04-13 08:53:20 +00:00 · 2004-10-29 01:19:55 +00:00 · 2004-10-29 01:19:55 +00:00 · cf9a074264
commit cf9a074264
parent c160b79336
3 changed files with 89 additions and 17 deletions
--- a/icu4c/source/common/unicode/uset.h
+++ b/icu4c/source/common/unicode/uset.h
@ -55,7 +55,8 @@ enum {
    /**
     * Enable case insensitive matching.  E.g., "[ab]" with this flag
     * will match 'a', 'A', 'b', and 'B'.  "[^ab]" with this flag will
-     * match all except 'a', 'A', 'b', and 'B'.
+     * match all except 'a', 'A', 'b', and 'B'. This performs a full
+     * closure over case mappings, e.g. U+017F for s.
     * @stable ICU 2.4
     */
    USET_CASE_INSENSITIVE = 2,  
@ -66,6 +67,17 @@ enum {
     * @internal
     */
    USET_CASE = 2,
+
+    /**
+     * Enable case insensitive matching.  E.g., "[ab]" with this flag
+     * will match 'a', 'A', 'b', and 'B'.  "[^ab]" with this flag will
+     * match all except 'a', 'A', 'b', and 'B'. Unlike a full closure,
+     * this adds only the lower-, title-, and uppercase mappings of
+     * each existing element in the set (e.g. not U+212A for k).
+     * @draft ICU 3.2
+     */
+    USET_ADD_CASE_MAPPINGS = 4,
+
    /**
     * Enough for any single-code point set
     * @internal
--- a/icu4c/source/common/uniset_props.cpp
+++ b/icu4c/source/common/uniset_props.cpp
@ -23,6 +23,8 @@
 #include "unicode/uscript.h"
 #include "unicode/symtable.h"
 #include "unicode/uset.h"
+#include "unicode/locid.h"
+#include "unicode/brkiter.h"
 #include "uset_imp.h"
 #include "ruleiter.h"
 #include "cmemory.h"
@ -806,6 +808,9 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
    if ((options & USET_CASE_INSENSITIVE) != 0) {
        closeOver(USET_CASE);
    }
+    else if ((options & USET_ADD_CASE_MAPPINGS) != 0) {
+        closeOver(USET_ADD_CASE_MAPPINGS);
+    }
    if (invert) {
        complement();
    }
@ -1404,6 +1409,42 @@ UnicodeSet& UnicodeSet::closeOver(int32_t attribute) {
        }
        *this = foldSet;
    }
+    else if ((attribute & USET_ADD_CASE_MAPPINGS)) {
+        UnicodeSet foldSet;
+        UnicodeString str;
+        UErrorCode status = U_ZERO_ERROR;
+        Locale root("");
+        BreakIterator *bi = BreakIterator::createWordInstance(root, status);
+        if (U_SUCCESS(status)) {
+            int32_t n = getRangeCount();
+            for (int32_t i=0; i<n; ++i) {
+                UChar32 start = getRangeStart(i);
+                UChar32 end   = getRangeEnd(i);
+                for (UChar32 cp=start; cp<=end; ++cp) {
+                    str.setTo(cp);
+                    str.toLower(root);
+                    foldSet.add(str);
+                    str.toTitle(bi, root);
+                    foldSet.add(str);
+                    str.toUpper(root);
+                    foldSet.add(str);
+                }
+            }
+            if (strings != NULL && strings->size() > 0) {
+                for (int32_t j=0; j<strings->size(); ++j) {
+                    str = * (const UnicodeString*) strings->elementAt(j);
+                    str.toLower(root);
+                    foldSet.add(str);
+                    str.toTitle(bi, root);
+                    foldSet.add(str);
+                    str.toUpper(root);
+                    foldSet.add(str);
+                }
+            }
+            delete bi;
+            *this = foldSet;
+        }
+    }
    return *this;
 }

--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@ -859,25 +859,25 @@ void UnicodeSetTest::TestPropertySet() {
        "abcd\\uDC00",
        "ef\\uD800\\U00010000",

-		"[:^lccc=0:]", // Lead canonical class
-		"\\u0300\\u0301",
-		"abcd\\u00c0\\u00c5",
+        "[:^lccc=0:]", // Lead canonical class
+        "\\u0300\\u0301",
+        "abcd\\u00c0\\u00c5",

-		"[:^tccc=0:]", // Trail canonical class
-		"\\u0300\\u0301\\u00c0\\u00c5",
-		"abcd",
+        "[:^tccc=0:]", // Trail canonical class
+        "\\u0300\\u0301\\u00c0\\u00c5",
+        "abcd",

-		"[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
-		"\\u0300\\u0301\\u00c0\\u00c5",
-		"abcd",
+        "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
+        "\\u0300\\u0301\\u00c0\\u00c5",
+        "abcd",

-		"[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
-		"",
-		"abcd\\u0300\\u0301\\u00c0\\u00c5",
-		
-		"[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
-		"\\u0F73\\u0F75\\u0F81",
-		"abcd\\u0300\\u0301\\u00c0\\u00c5",
+        "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
+        "",
+        "abcd\\u0300\\u0301\\u00c0\\u00c5",
+        
+        "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
+        "\\u0F73\\u0F75\\u0F81",
+        "abcd\\u0300\\u0301\\u00c0\\u00c5",

    };

@ -933,6 +933,7 @@ void UnicodeSetTest::TestCloseOver() {
    UErrorCode ec = U_ZERO_ERROR;

    char CASE[] = {(char)USET_CASE};
+    char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
    const char* DATA[] = {
        // selector, input, output
        CASE,
@ -962,6 +963,18 @@ void UnicodeSetTest::TestCloseOver() {
        CASE,
        "[ABC]","[A-Ca-c]",

+        CASE_MAPPINGS,
+        "[aq\\u00DF{Bc}{bC}{Fi}]",
+        "[aAqQ\\u00DF{Ss}{SS}{Bc}{BC}{bc}{FI}{Fi}{fi}]",
+
+        CASE_MAPPINGS,
+        "[\\u01F1]", // 'DZ'
+        "[\\u01F1\\u01F2\\u01F3]",
+        
+        CASE_MAPPINGS,
+        "[a-z]",
+        "[A-Za-z]",
+
        NULL
    };

@ -1000,6 +1013,12 @@ void UnicodeSetTest::TestCloseOver() {
    } else {
        expectContainment(v, "defDEF", "abcABC");
    }
+    UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);
+    if (U_FAILURE(ec)) {
+        errln("FAIL: construct w/case mappings failed");
+    } else {
+        expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
+    }
 }

 void UnicodeSetTest::TestEscapePattern() {