ICU-1923 upgrade case folding to Unicode 3.2

X-SVN-Rev: 8769
2025-04-10 07:39:16 +00:00 · 2002-06-03 03:33:44 +00:00 · 2002-06-03 03:33:44 +00:00 · 0a70506190
commit 0a70506190
parent f8749a02dd
5 changed files with 120 additions and 27 deletions
--- a/icu4c/source/common/uchar.c
+++ b/icu4c/source/common/uchar.c
@ -1434,6 +1434,45 @@ u_internalToTitle(UChar32 c, UCharIterator *iter,
 * a full mapping, i.e., a string.
 * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
 * then only the lowercase mapping is stored.
+ *
+ * Some special cases are hardcoded because their conditions cannot be
+ * parsed and processed from CaseFolding.txt.
+ *
+ * Unicode 3.2 CaseFolding.txt specifies for its status field:
+
+# C: common case folding, common mappings shared by both simple and full mappings.
+# F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
+# S: simple case folding, mappings to single characters where different from F.
+# T: special case for uppercase I and dotted uppercase I
+#    - For non-Turkic languages, this mapping is normally not used.
+#    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
+#
+# Usage:
+#  A. To do a simple case folding, use the mappings with status C + S.
+#  B. To do a full case folding, use the mappings with status C + F.
+#
+#    The mappings with status T can be used or omitted depending on the desired case-folding
+#    behavior. (The default option is to exclude them.)
+
+ * Unicode 3.2 has 'T' mappings as follows:
+
+0049; T; 0131; # LATIN CAPITAL LETTER I
+0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
+
+ * while the default mappings for these code points are:
+
+0049; C; 0069; # LATIN CAPITAL LETTER I
+0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
+
+ * Outside of case folding, U+0130 has the simple lowercase mapping U+0069 (UnicodeData.txt).
+ *
+ * In case this code is used with CaseFolding.txt from an older version of Unicode
+ * where CaseFolding.txt contains mappings with a status of 'I' that
+ * have the opposite polarity ('I' mappings are included by default but excluded for Turkic),
+ * we must also hardcode the Unicode 3.2 mappings for the code points
+ * with 'I' mappings.
+ * Unicode 3.1.1 has 'I' mappings for U+0130 and U+0131.
+ * Unicode 3.2 has a 'T' mapping for U+0130, and lowercases U+0131 to itself (see UnicodeData.txt).
 */

 /* return the simple case folding mapping for c */
@ -1467,11 +1506,26 @@ u_foldCase(UChar32 c, uint32_t options) {
                pe=oldPE;
            } else {
                /* special case folding mappings, hardcoded */
-                if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT && (uint32_t)(c-0x130)<=1) {
-                    /* map dotted I and dotless i to U+0069 small i */
-                    return 0x69;
+                if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
+                    /* default mappings */
+                    if(c==0x49) {
+                        /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
+                        return 0x69;
+                    } else if(c==0x130) {
+                        /* CaseFolding.txt has only a full ('F') default mapping for U+0130; for simple folding use its lowercase mapping from UnicodeData.txt */
+                        return 0x69;
+                    }
+                } else {
+                    /* Turkic mappings */
+                    if(c==0x49) {
+                        /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
+                        return 0x131;
+                    } else if(c==0x130) {
+                        /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
+                        return 0x69;
+                    }
                }
-                /* return c itself because it is excluded from case folding */
+                /* return c itself because there is no special mapping for it */
                return c;
            }
        }
@ -1526,12 +1580,33 @@ u_internalFoldCase(UChar32 c,
                return length;
            } else {
                /* special case folding mappings, hardcoded */
-                if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT && (uint32_t)(c-0x130)<=1) {
-                    /* map dotted I and dotless i to U+0069 small i */
-                    result =0x69;
-                    /* goto single; */
+                if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
+                    /* default mappings */
+                    if(c==0x49) {
+                        /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
+                        result=0x69;
+                    } else if(c==0x130) {
+                        /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
+                        if(0<destCapacity) {
+                            dest[0]=0x69;
+                        }
+                        if(1<destCapacity) {
+                            dest[1]=0x307;
+                        }
+                        return 2;
+                    }
+                } else {
+                    /* Turkic mappings */
+                    if(c==0x49) {
+                        /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
+                        result=0x131;
+                    } else if(c==0x130) {
+                        /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
+                        result=0x69;
+                    }
                }
-                /* return c itself because it is excluded from case folding */
+                /* return c itself because there is no special mapping for it */
+                /* goto single; */
            }
        } else if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_LOWERCASE)) {
            i=EXC_LOWERCASE;
--- a/icu4c/source/common/unicode/uchar.h
+++ b/icu4c/source/common/unicode/uchar.h
@ -1746,9 +1746,25 @@ u_toupper(UChar32 c);
 U_CAPI UChar32 U_EXPORT2
 u_totitle(UChar32 c);

-/** Option value for case folding: use all mappings defined in CaseFolding.txt. @draft ICU 1.8 */
+/** Option value for case folding: use default mappings defined in CaseFolding.txt. @stable */
 #define U_FOLD_CASE_DEFAULT 0
-/** Option value for case folding: exclude the mappings for dotted I and dotless i marked with 'I' in CaseFolding.txt. @draft ICU 1.8 */
+
+/**
+ * Option value for case folding:
+ *
+ * Use the modified set of mappings provided in CaseFolding.txt to handle dotted I
+ * and dotless i appropriately for Turkic languages (tr, az).
+ *
+ * Before Unicode 3.2, CaseFolding.txt contained mappings marked with 'I' that
+ * are to be included for default mappings and
+ * excluded for the Turkic-specific mappings.
+ *
+ * Unicode 3.2 CaseFolding.txt instead contains mappings marked with 'T' that
+ * are to be excluded for default mappings and
+ * included for the Turkic-specific mappings.
+ *
+ * @stable
+ */
 #define U_FOLD_CASE_EXCLUDE_SPECIAL_I 1

 /**
--- a/icu4c/source/test/cintltst/cstrcase.c
+++ b/icu4c/source/test/cintltst/cstrcase.c
@ -331,17 +331,19 @@ TestCaseFolding() {
    simple[]={
        /* input, default, exclude special i */
        0x61,   0x61,  0x61,
-        0x49,   0x69,  0x69,
-        0x131,  0x69,  0x131,
+        0x49,   0x69,  0x131,
+        0x130,  0x69,  0x69,
+        0x131,  0x131, 0x131,
        0xdf,   0xdf,  0xdf,
        0xfb03, 0xfb03, 0xfb03,
+        0x1040e,0x10436,0x10436,
        0x5ffff,0x5ffff,0x5ffff
    };

    static const UChar
-    mixed[]=                { 0x61, 0x42, 0x131, 0x3d0, 0xdf,       0xfb03,           0xd93f, 0xdfff },
-    foldedDefault[]=        { 0x61, 0x62, 0x69,  0x3b2, 0x73, 0x73, 0x66, 0x66, 0x69, 0xd93f, 0xdfff },
-    foldedExcludeSpecialI[]={ 0x61, 0x62, 0x131, 0x3b2, 0x73, 0x73, 0x66, 0x66, 0x69, 0xd93f, 0xdfff };
+    mixed[]=                { 0x61, 0x42, 0x130,       0x49,  0x131, 0x3d0, 0xdf,       0xfb03,           0xd93f, 0xdfff },
+    foldedDefault[]=        { 0x61, 0x62, 0x69, 0x307, 0x69,  0x131, 0x3b2, 0x73, 0x73, 0x66, 0x66, 0x69, 0xd93f, 0xdfff },
+    foldedExcludeSpecialI[]={ 0x61, 0x62, 0x69,        0x131, 0x131, 0x3b2, 0x73, 0x73, 0x66, 0x66, 0x69, 0xd93f, 0xdfff };

    UVersionInfo unicodeVersion={ 0, 0, 17, 89 }, unicode_3_1={ 3, 1, 0, 0 };

@ -544,9 +546,9 @@ TestCaseCompare() {
    static const UChar

    mixed[]=               { 0x61, 0x42, 0x131, 0x3a3, 0xdf,       0xfb03,           0xd93f, 0xdfff, 0 },
-    otherDefault[]=        { 0x41, 0x62, 0x69,  0x3c3, 0x73, 0x53, 0x46, 0x66, 0x49, 0xd93f, 0xdfff, 0 },
+    otherDefault[]=        { 0x41, 0x62, 0x131, 0x3c3, 0x73, 0x53, 0x46, 0x66, 0x49, 0xd93f, 0xdfff, 0 },
    otherExcludeSpecialI[]={ 0x41, 0x62, 0x131, 0x3c3, 0x53, 0x73, 0x66, 0x46, 0x69, 0xd93f, 0xdfff, 0 },
-    different[]=           { 0x41, 0x62, 0x69,  0x3c3, 0x73, 0x53, 0x46, 0x66, 0x49, 0xd93f, 0xdffd, 0 };
+    different[]=           { 0x41, 0x62, 0x131, 0x3c3, 0x73, 0x53, 0x46, 0x66, 0x49, 0xd93f, 0xdffd, 0 };

    UVersionInfo unicodeVersion={ 0, 0, 17, 89 }, unicode_3_1={ 3, 1, 0, 0 };

--- a/icu4c/source/test/intltest/strcase.cpp
+++ b/icu4c/source/test/intltest/strcase.cpp
@ -290,8 +290,9 @@ StringCaseTest::TestCaseConversion()
    // test case folding
    {
        UnicodeString
-            s=UNICODE_STRING("A\\u00df\\u00b5\\ufb03\\U0001040c\\u0131", 35).unescape(),
-            f=UNICODE_STRING("ass\\u03bcffi\\U00010434i", 23).unescape(),
+            s=UnicodeString("A\\u00df\\u00b5\\ufb03\\U0001040c\\u0130\\u0131", "").unescape(),
+            f=UnicodeString("ass\\u03bcffi\\U00010434i\\u0307\\u0131", "").unescape(),
+            g=UnicodeString("ass\\u03bcffi\\U00010434i\\u0131", "").unescape(),
            t;

        (t=s).foldCase();
@ -300,10 +301,9 @@ StringCaseTest::TestCaseConversion()
        }

        // alternate handling for dotted I/dotless i (U+0130, U+0131)
-        f.setCharAt(f.length()-1, 0x131);
        (t=s).foldCase(U_FOLD_CASE_EXCLUDE_SPECIAL_I);
-        if(f!=t) {
-            errln("error in foldCase(\"" + s + "\", U_FOLD_CASE_EXCLUDE_SPECIAL_I)=\"" + t + "\" but expected \"" + f + "\"");
+        if(g!=t) {
+            errln("error in foldCase(\"" + s + "\", U_FOLD_CASE_EXCLUDE_SPECIAL_I)=\"" + t + "\" but expected \"" + g + "\"");
        }
    }
 }
--- a/icu4c/source/test/intltest/ustrtest.cpp
+++ b/icu4c/source/test/intltest/ustrtest.cpp
@ -299,10 +299,10 @@ UnicodeStringTest::TestCompare()
    /* test caseCompare() */
    {
        static const UChar
-        _mixed[]=               { 0x61, 0x42, 0x131, 0x3a3, 0xdf,       0xfb03,           0xd93f, 0xdfff, 0 },
-        _otherDefault[]=        { 0x41, 0x62, 0x69,  0x3c3, 0x73, 0x53, 0x46, 0x66, 0x49, 0xd93f, 0xdfff, 0 },
-        _otherExcludeSpecialI[]={ 0x41, 0x62, 0x131, 0x3c3, 0x53, 0x73, 0x66, 0x46, 0x69, 0xd93f, 0xdfff, 0 },
-        _different[]=           { 0x41, 0x62, 0x69,  0x3c3, 0x73, 0x53, 0x46, 0x66, 0x49, 0xd93f, 0xdffd, 0 };
+        _mixed[]=               { 0x61, 0x42, 0x131, 0x3a3, 0xdf,       0x130,       0x49,  0xfb03,           0xd93f, 0xdfff, 0 },
+        _otherDefault[]=        { 0x41, 0x62, 0x131, 0x3c3, 0x73, 0x53, 0x69, 0x307, 0x69,  0x46, 0x66, 0x49, 0xd93f, 0xdfff, 0 },
+        _otherExcludeSpecialI[]={ 0x41, 0x62, 0x131, 0x3c3, 0x53, 0x73, 0x69,        0x131, 0x66, 0x46, 0x69, 0xd93f, 0xdfff, 0 },
+        _different[]=           { 0x41, 0x62, 0x131, 0x3c3, 0x73, 0x53, 0x130,       0x49,  0x46, 0x66, 0x49, 0xd93f, 0xdffd, 0 };

        UnicodeString
            mixed(TRUE, _mixed, -1),