ICU-5219 Dutch IJ Titlecasing

X-SVN-Rev: 24178
2025-04-13 08:53:20 +00:00 · 2008-06-13 21:07:07 +00:00 · 2008-06-13 21:07:07 +00:00 · ddcb89888d
commit ddcb89888d
parent f06268c175
4 changed files with 58 additions and 22 deletions
--- a/icu4c/source/common/ucase.c
+++ b/icu4c/source/common/ucase.c
@@ -805,17 +805,12 @@ ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c) {
 *     zero or more case-ignorable characters.
 */

-enum {
-    LOC_UNKNOWN,
-    LOC_ROOT,
-    LOC_TURKISH,
-    LOC_LITHUANIAN
-};
-
 #define is_a(c) ((c)=='a' || (c)=='A')
+#define is_d(c) ((c)=='d' || (c)=='D')
 #define is_e(c) ((c)=='e' || (c)=='E')
 #define is_i(c) ((c)=='i' || (c)=='I')
 #define is_l(c) ((c)=='l' || (c)=='L')
+#define is_n(c) ((c)=='n' || (c)=='N')
 #define is_r(c) ((c)=='r' || (c)=='R')
 #define is_t(c) ((c)=='t' || (c)=='T')
 #define is_u(c) ((c)=='u' || (c)=='U')
@@ -834,11 +829,11 @@ ucase_getCaseLocale(const char *locale, int32_t *locCache) {
    int32_t result;
    char c;

-    if(locCache!=NULL && (result=*locCache)!=LOC_UNKNOWN) {
+    if(locCache!=NULL && (result=*locCache)!=UCASE_LOC_UNKNOWN) {
        return result;
    }

-    result=LOC_ROOT;
+    result=UCASE_LOC_ROOT;

    /*
     * This function used to use uloc_getLanguage(), but the current code
@@ -859,7 +854,7 @@ ucase_getCaseLocale(const char *locale, int32_t *locCache) {
        if(is_r(c)) {
            c=*locale;
            if(is_sep(c)) {
-                result=LOC_TURKISH;
+                result=UCASE_LOC_TURKISH;
            }
        }
    } else if(is_a(c)) {
@@ -871,7 +866,7 @@ ucase_getCaseLocale(const char *locale, int32_t *locCache) {
                c=*locale;
            }
            if(is_sep(c)) {
-                result=LOC_TURKISH;
+                result=UCASE_LOC_TURKISH;
            }
        }
    } else if(is_l(c)) {
@@ -883,7 +878,19 @@ ucase_getCaseLocale(const char *locale, int32_t *locCache) {
        if(is_t(c)) {
            c=*locale;
            if(is_sep(c)) {
-                result=LOC_LITHUANIAN;
+                result=UCASE_LOC_LITHUANIAN;
+            }
+        }
+    } else if(is_n(c)) {
+        /* nl or nld? */
+        c=*locale++;
+        if(is_l(c)) {
+            c=*locale++;
+            if(is_d(c)) {
+                c=*locale;
+            }
+            if(is_sep(c)) {
+                result=UCASE_LOC_DUTCH;
            }
        }
    }
@@ -1078,7 +1085,7 @@ ucase_toFullLower(const UCaseProps *csp, UChar32 c,
             * then test for characters that have unconditional mappings in SpecialCasing.txt,
             * then get the UnicodeData.txt mappings.
             */
-            if( loc==LOC_LITHUANIAN &&
+            if( loc==UCASE_LOC_LITHUANIAN &&
                    /* base characters, find accents above */
                    (((c==0x49 || c==0x4a || c==0x12e) &&
                        isFollowedByMoreAbove(csp, iter, context)) ||
@@ -1124,7 +1131,7 @@ ucase_toFullLower(const UCaseProps *csp, UChar32 c,
                    return 0; /* will not occur */
                }
            /* # Turkish and Azeri */
-            } else if(loc==LOC_TURKISH && c==0x130) {
+            } else if(loc==UCASE_LOC_TURKISH && c==0x130) {
                /*
                    # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
                    # The following rules handle those cases.
@@ -1133,7 +1140,7 @@ ucase_toFullLower(const UCaseProps *csp, UChar32 c,
                    0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
                 */
                return 0x69;
-            } else if(loc==LOC_TURKISH && c==0x307 && isPrecededBy_I(csp, iter, context)) {
+            } else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(csp, iter, context)) {
                /*
                    # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
                    # This matches the behavior of the canonically equivalent I-dot_above
@@ -1142,7 +1149,7 @@ ucase_toFullLower(const UCaseProps *csp, UChar32 c,
                    0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
                 */
                return 0; /* remove the dot (continue without output) */
-            } else if(loc==LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(csp, iter, context)) {
+            } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(csp, iter, context)) {
                /*
                    # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.

@@ -1219,7 +1226,7 @@ toUpperOrTitle(const UCaseProps *csp, UChar32 c,
            /* use hardcoded conditions and mappings */
            int32_t loc=ucase_getCaseLocale(locale, locCache);

-            if(loc==LOC_TURKISH && c==0x69) {
+            if(loc==UCASE_LOC_TURKISH && c==0x69) {
                /*
                    # Turkish and Azeri

@@ -1232,7 +1239,7 @@ toUpperOrTitle(const UCaseProps *csp, UChar32 c,
                    0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
                */
                return 0x130;
-            } else if(loc==LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(csp, iter, context)) {
+            } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(csp, iter, context)) {
                /*
                    # Lithuanian

--- a/icu4c/source/common/ucase.h
+++ b/icu4c/source/common/ucase.h
@@ -73,6 +73,15 @@ ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *
 U_CFUNC int32_t
 ucase_getCaseLocale(const char *locale, int32_t *locCache);

+/* Casing locale types returned by ucase_getCaseLocale(); UCASE_LOC_UNKNOWN marks a *locCache that holds no result yet */
+enum {
+    UCASE_LOC_UNKNOWN,
+    UCASE_LOC_ROOT,
+    UCASE_LOC_TURKISH,
+    UCASE_LOC_LITHUANIAN,
+    UCASE_LOC_DUTCH
+};
+
 /**
 * Bit mask for getting just the options from a string compare options word
 * that are relevant for case-insensitive string comparison.
--- a/icu4c/source/common/ucasemap.c
+++ b/icu4c/source/common/ucasemap.c
@@ -1,7 +1,7 @@
 /*
 *******************************************************************************
 *
-*   Copyright (C) 2005-2007, International Business Machines
+*   Copyright (C) 2005-2008, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 *******************************************************************************
@@ -359,6 +359,16 @@ _toTitle(UCaseMap *csm,
                c=ucase_toFullTitle(csm->csp, c, utf8_caseContextIterator, csc, &s, csm->locale, &csm->locCache);
                destIndex=appendResult(dest, destIndex, destCapacity, c, s);

+                
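+                /* Special case Dutch IJ titlecasing: if the titlecased letter is I/i and the next source unit is J/j, also emit an uppercase J and exclude that J/j from the lowercased remainder */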
+                if ( titleStart+1 < index && 
+                     ucase_getCaseLocale(csm->locale,&csm->locCache) == UCASE_LOC_DUTCH &&
+                     ( src[titleStart] == 0x0049 || src[titleStart] == 0x0069 ) &&
+                     ( src[titleStart+1] == 0x004A || src[titleStart+1] == 0x006A )) { 
+                            c=0x004A;
+                            destIndex=appendResult(dest, destIndex, destCapacity, c, s);
+                            titleLimit++;
+                }
                /* lowercase [titleLimit..index[ */
                if(titleLimit<index) {
                    if((csm->options&U_TITLECASE_NO_LOWERCASE)==0) {
--- a/icu4c/source/common/ustrcase.c
+++ b/icu4c/source/common/ustrcase.c
@@ -1,7 +1,7 @@
 /*
 *******************************************************************************
 *
-*   Copyright (C) 2001-2007, International Business Machines
+*   Copyright (C) 2001-2008, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 *******************************************************************************
@@ -217,7 +217,7 @@ _toTitle(UCaseMap *csm,
         UErrorCode *pErrorCode) {
    const UChar *s;
    UChar32 c;
-    int32_t prev, titleStart, titleLimit, index, destIndex, length;
+    int32_t prev, titleStart, titleLimit, titleLimitSave, index, indexSave, destIndex, length;
    UBool isFirstIndex;

    if(csm->iter!=NULL) {
@@ -296,7 +296,17 @@ _toTitle(UCaseMap *csm,
                csc->cpStart=titleStart;
                csc->cpLimit=titleLimit;
                c=ucase_toFullTitle(csm->csp, c, utf16_caseContextIterator, csc, &s, csm->locale, &csm->locCache);
-                destIndex=appendResult(dest, destIndex, destCapacity, c, s);
+                destIndex=appendResult(dest, destIndex, destCapacity, c, s); 
+
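+                /* Special case Dutch IJ titlecasing: if the titlecased letter is I/i and the next source unit is J/j, also emit an uppercase J and exclude that J/j from the lowercased remainder */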
+                if ( titleStart+1 < index && 
+                     ucase_getCaseLocale(csm->locale,&csm->locCache) == UCASE_LOC_DUTCH &&
+                     ( src[titleStart] == (UChar32) 0x0049 || src[titleStart] == (UChar32) 0x0069 ) &&
+                     ( src[titleStart+1] == (UChar32) 0x004A || src[titleStart+1] == (UChar32) 0x006A )) { 
+                            c=(UChar32) 0x004A;
+                            destIndex=appendResult(dest, destIndex, destCapacity, c, s);
+                            titleLimit++;
+                }

                /* lowercase [titleLimit..index[ */
                if(titleLimit<index) {