ICU-1501 Updated Upper/Lower/TitlecaseTransliterator and checked in new casing tests for surrogates from Java.

X-SVN-Rev: 7235
2025-04-15 09:45:26 +00:00 · 2001-11-30 23:53:55 +00:00 · 2001-11-30 23:53:55 +00:00 · 6f5df69456
commit 6f5df69456
parent 1022f98094
8 changed files with 231 additions and 21 deletions
--- a/icu4c/source/common/uchar.c
+++ b/icu4c/source/common/uchar.c
@ -1066,11 +1066,11 @@ getCaseLocale(const char *locale) {
        return LOC_ROOT;
    }

-    if( (locale[0]=='t' && locale[1]=='r') ||
-        (locale[0]=='a' && locale[1]=='z')
+    if( (lang[0]=='t' && lang[1]=='r') ||
+        (lang[0]=='a' && lang[1]=='z')
    ) {
        return LOC_TURKISH;
-    } else if(locale[0]=='l' && locale[1]=='t') {
+    } else if(lang[0]=='l' && lang[1]=='t') {
        return LOC_LITHUANIAN;
    } else {
        return LOC_ROOT;
--- a/icu4c/source/data/build/makedata.mak
+++ b/icu4c/source/data/build/makedata.mak
@ -234,6 +234,7 @@ CLEAN :
 	-@erase "*.obj"
 	-@erase "*.brk"
 	-@erase "*.dat"
+    -@erase "*.dll"
 	@cd "$(TESTDATAOUT)"
 	-@erase "*.dat"
 	-@erase "*.cnv"
--- a/icu4c/source/data/makedata.mak
+++ b/icu4c/source/data/makedata.mak
@ -234,6 +234,7 @@ CLEAN :
 	-@erase "*.obj"
 	-@erase "*.brk"
 	-@erase "*.dat"
+    -@erase "*.dll"
 	@cd "$(TESTDATAOUT)"
 	-@erase "*.dat"
 	-@erase "*.cnv"
--- a/icu4c/source/i18n/titletrn.cpp
+++ b/icu4c/source/i18n/titletrn.cpp
@ -136,14 +136,13 @@ void TitlecaseTransliterator::handleTransliterate(

    int32_t i = textPos - offsets.contextStart;
    int32_t limit = offsets.limit - offsets.contextStart;
-    UChar32 cp;
+    UChar32 cp, bufferCH;
    int32_t oldLen;
    int32_t newLen;

    for (; i < limit; ) {
        UErrorCode status = U_ZERO_ERROR;
        int32_t s = i;
-        buffer[0] = original.charAt(s);

        UTF_GET_CHAR(original.getBuffer(), 0, i, original.length(), cp);
        oldLen = UTF_CHAR_LENGTH(cp);
@ -153,7 +152,8 @@ void TitlecaseTransliterator::handleTransliterate(
                newLen = u_internalTitleCase(cp, buffer, u_getMaxCaseExpansion(), loc.getName());
            } else {
                u_strToLower(buffer, u_getMaxCaseExpansion(), original.getBuffer()+s, i-s, loc.getName(), &status);
-                newLen = buffer[0] == original.charAt(s) ? -1 : u_strlen(buffer);
+                UTF_GET_CHAR(buffer, 0, s, u_strlen(buffer), bufferCH);
+                newLen = bufferCH == original.char32At(s) ? -1 : u_strlen(buffer);
            }
            doTitle = !CASED->contains(cp);
            if (newLen >= 0) {
--- a/icu4c/source/i18n/tolowtrn.cpp
+++ b/icu4c/source/i18n/tolowtrn.cpp
@ -86,20 +86,21 @@ void LowercaseTransliterator::handleTransliterate(Replaceable& text,
    
    int32_t i = textPos - offsets.contextStart;
    int32_t limit = offsets.limit - offsets.contextStart;
-    UChar32 cp;
+    UChar32 cp, bufferCH;
    int32_t oldLen;
    
    for (; i < limit; ) { 
        UErrorCode status = U_ZERO_ERROR;
        int32_t s = i;
-        buffer[0] = original.charAt(s);
+        bufferCH = original.char32At(s);

        UTF_GET_CHAR(original.getBuffer(), 0, i, original.length(), cp);
        oldLen = UTF_CHAR_LENGTH(cp);
        i += oldLen;
        u_strToLower(buffer, u_getMaxCaseExpansion(), original.getBuffer()+s, i-s, loc.getName(), &status);
        /* Skip checking of status code here because the buffer should not have overflowed. */
-        if ( buffer[0] != original.charAt(s) ) {
+        UTF_GET_CHAR(buffer, 0, s, u_strlen(buffer), cp);
+        if ( bufferCH != cp ) {
            int len = u_strlen(buffer);
            UnicodeString temp(buffer);
            text.handleReplaceBetween(textPos, textPos + oldLen, temp);
--- a/icu4c/source/i18n/toupptrn.cpp
+++ b/icu4c/source/i18n/toupptrn.cpp
@ -89,20 +89,21 @@ void UppercaseTransliterator::handleTransliterate(Replaceable& text,
    
    int32_t i = textPos - offsets.contextStart;
    int32_t limit = offsets.limit - offsets.contextStart;
-    UChar32 cp;
+    UChar32 cp, bufferCH;
    int32_t oldLen;
    
    for (; i < limit; ) {
        UErrorCode status = U_ZERO_ERROR;
        int32_t s = i;
-        buffer[0] = original.charAt(s);
+        bufferCH = original.char32At(s);

        UTF_GET_CHAR(original.getBuffer(), 0, i, original.length(), cp);
        oldLen = UTF_CHAR_LENGTH(cp);
        i += oldLen;
        u_strToUpper(buffer, u_getMaxCaseExpansion(), original.getBuffer()+s, i-s, loc.getName(), &status);
        /* Skip checking of status code here because the buffer should not have overflowed. */
-        if (buffer[0] != original.charAt(s)) {
+        UTF_GET_CHAR(buffer, 0, s, u_strlen(buffer), cp);
+        if (bufferCH != cp) {
            int len = u_strlen(buffer);
            UnicodeString temp(buffer);
            text.handleReplaceBetween(textPos, textPos + oldLen, temp);
--- a/icu4c/source/test/intltest/transtst.cpp
+++ b/icu4c/source/test/intltest/transtst.cpp
@ -21,6 +21,7 @@
 #include "unicode/uniset.h"
 #include "unicode/unitohex.h"
 #include "unicode/utypes.h"
+#include "unicode/ustring.h"

 /***********************************************************************

@ -143,6 +144,10 @@ TransliteratorTest::runIndexedTest(int32_t index, UBool exec,
        TESTCASE(61,TestEscape);
        TESTCASE(62,TestAnchorMasking);
        TESTCASE(63,TestDisplayName);
+        TESTCASE(64,TestSpecialCases);
+        TESTCASE(65,TestIncrementalProgress);
+        TESTCASE(66,TestSurrogateCasing);
+
        default: name = ""; break;
    }
 }
@ -2964,6 +2969,189 @@ void TransliteratorTest::TestDisplayName() {
    }
 }

+// Deseret letter DEE as one-code-point strings: capital U+10414 and small
+// U+1043C.  Both lie above U+FFFF, so the casing tests in this file use them
+// to exercise supplementary code points.
+const UnicodeString DESERET_DEE((UChar32)0x10414);
+const UnicodeString DESERET_dee((UChar32)0x1043C);
+
+/**
+ * Registers a few rule-based transliterators, then runs a table of
+ * (ID, source, expected result) triples through expect().  The table covers
+ * normalization IDs, the Greek "mp -> b" rule, a compound ID, the
+ * Title/Upper/Lower transliterators (including the Deseret DEE strings), and
+ * the forms of sigma for Greek<->Latin.
+ */
+void TransliteratorTest::TestSpecialCases(void) {
+    // Flat list of (ID, rules) pairs.
+    const UnicodeString registerRules[] = {
+        "Any-Dev1", "x > X; y > Y;",
+        "Any-Dev2", "XY > Z",
+        "Greek-Latin/FAKE", 
+            CharsToUnicodeString
+            ("[^[:L:][:M:]] { \\u03bc\\u03c0 > b ; \\u03bc\\u03c0 } [^[:L:][:M:]] > b ; [^[:L:][:M:]] { [\\u039c\\u03bc][\\u03a0\\u03c0] > B ; [\\u039c\\u03bc][\\u03a0\\u03c0] } [^[:L:][:M:]] > B ;") 
+    };
+
+    // Flat list of (ID, source, expected result) triples.
+    const UnicodeString testCases[] = {
+        // NORMALIZATION, not in C
+        "NFC", CharsToUnicodeString("a\\u0300"), CharsToUnicodeString("\\u00E0"),
+        "NFD", CharsToUnicodeString("\\u00E0"), CharsToUnicodeString("a\\u0300"),
+    
+        // mp -> b BUG
+        "Greek-Latin/UNGEGN", CharsToUnicodeString("(\\u03BC\\u03C0)"), "(b)",
+        "Greek-Latin/FAKE", CharsToUnicodeString("(\\u03BC\\u03C0)"), "(b)",
+    
+        // check for devanagari bug
+        "nfd;Dev1;Dev2;nfc", "xy", "Z",
+
+        // ff, i, dotless-i, I, dotted-I, LJLjlj deseret deeDEE
+        "Title", CharsToUnicodeString("ab'cD ffi\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE, 
+                 CharsToUnicodeString("Ab'cd Ffi\\u0131ii \\u01C8\\u01C9\\u01C9 ") + DESERET_DEE + DESERET_dee, 
+                 
+        //TODO: enable this test once Titlecase works right
+        /*
+        "Title", CharsToUnicodeString("\\uFB00i\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE, 
+                 CharsToUnicodeString("Ffi\\u0131ii \\u01C8\\u01C9\\u01C9 ") + DESERET_DEE + DESERET_dee, 
+                 */
+        "Upper", CharsToUnicodeString("ab'cD \\uFB00i\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE, 
+                 CharsToUnicodeString("AB'CD FFIII\\u0130 \\u01C7\\u01C7\\u01C7 ") + DESERET_DEE + DESERET_DEE,
+        "Lower", CharsToUnicodeString("ab'cD \\uFB00i\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE, 
+                 CharsToUnicodeString("ab'cd \\uFB00i\\u0131ii \\u01C9\\u01C9\\u01C9 ") + DESERET_dee + DESERET_dee,
+    
+         // FORMS OF S
+        "Greek-Latin/UNGEGN",  CharsToUnicodeString("\\u03C3 \\u03C3\\u03C2 \\u03C2\\u03C3"), 
+                               CharsToUnicodeString("s ss s\\u0331s\\u0331") ,
+        "Latin-Greek/UNGEGN",  CharsToUnicodeString("s ss s\\u0331s\\u0331"), 
+                               CharsToUnicodeString("\\u03C3 \\u03C3\\u03C2 \\u03C2\\u03C3") ,
+        "Greek-Latin",  CharsToUnicodeString("\\u03C3 \\u03C3\\u03C2 \\u03C2\\u03C3"), 
+                        CharsToUnicodeString("s ss s\\u0331s\\u0331") ,
+        "Latin-Greek",  CharsToUnicodeString("s ss s\\u0331s\\u0331"), 
+                        CharsToUnicodeString("\\u03C3 \\u03C3\\u03C2 \\u03C2\\u03C3")
+    };
+
+    // Take the element counts from the tables themselves so that adding or
+    // removing a row cannot leave a stale hard-coded bound behind.
+    const int32_t registerRulesLength = (int32_t)(sizeof(registerRules) / sizeof(registerRules[0]));
+    const int32_t testCasesLength = (int32_t)(sizeof(testCases) / sizeof(testCases[0]));
+
+    UParseError pos;
+    // Declared once for both loops: a variable declared in a for-init-statement
+    // goes out of scope at the end of that loop under standard C++, so the
+    // second loop must not rely on the first loop's declaration.
+    int32_t i;
+    for (i = 0; i < registerRulesLength; i+=2) {
+        UErrorCode status = U_ZERO_ERROR;
+
+        Transliterator *t = Transliterator::createFromRules(registerRules[0+i], 
+            registerRules[i+1], UTRANS_FORWARD, pos, status);
+        if (U_FAILURE(status)) {
+            errln("Fails: Unable to create the transliterator from rules.");
+        } else {
+            Transliterator::registerInstance(t);
+        }
+    }
+    for (i = 0; i < testCasesLength; i+=3) {
+        UErrorCode status = U_ZERO_ERROR;
+        Transliterator *t = Transliterator::createInstance(testCases[i+0], UTRANS_FORWARD, pos, status);
+        if (U_FAILURE(status) || t == NULL) {
+            // Report the bad ID and keep going instead of dereferencing a
+            // transliterator that was never created.
+            errln(UnicodeString("FAIL: Unable to create transliterator ") + testCases[i+0]);
+            delete t;
+            continue;
+        }
+        expect(*t, testCases[i+1], testCases[i+2], 0);
+        delete t;
+    }
+}
+
+/**
+ * Formats a code point as an escape sequence for diagnostic output.
+ *
+ * Values up to 0xFFFF are written as "\u" plus at least four hex digits;
+ * larger values are written as "\U" plus eight hex digits, so the two forms
+ * cannot be confused when the text is read back.
+ *
+ * @param ch      the code point to format
+ * @param buffer  destination; must have room for at least 11 chars
+ *                (two for the prefix, eight hex digits, and the NUL)
+ * @return buffer
+ */
+char* Char32ToEscapedChars(UChar32 ch, char* buffer) {
+    if (ch <= 0xFFFF) {
+        // %x expects an unsigned argument, hence the cast.
+        sprintf(buffer, "\\u%04x", (unsigned int)ch);
+    } else {
+        sprintf(buffer, "\\U%08x", (unsigned int)ch);
+    }
+    return buffer;
+}
+
+/**
+ * Checks the low-level case mapping functions against the Deseret DEE
+ * strings: u_totitle() on a single code point, then u_strToUpper() and
+ * u_strToLower() on a mixed-case string.
+ */
+void TransliteratorTest::TestSurrogateCasing (void) {
+    // check that casing handles surrogates
+    // titlecase is currently defective
+    char escaped[20];
+    UChar cased[20];
+
+    // Code point level: title-casing small dee should give capital DEE.
+    UChar32 smallDee;
+    UTF_GET_CHAR(DESERET_dee,0, 0, DESERET_dee.length(), smallDee);
+    UnicodeString titled(u_totitle(smallDee));
+    if (titled != DESERET_DEE) {
+        err("Fails titlecase of surrogates");
+        err(Char32ToEscapedChars(smallDee, escaped)); 
+        err(", ");
+        errln(Char32ToEscapedChars(titled.char32At(0), escaped));
+    }
+
+    // String level: one mixed-case source, with the all-lower and all-upper
+    // strings as the expected results.
+    UnicodeString mixedCase = DESERET_dee + DESERET_DEE;
+    UnicodeString allLower = DESERET_dee + DESERET_dee;
+    UnicodeString allUpper = DESERET_DEE + DESERET_DEE;
+    UErrorCode status = U_ZERO_ERROR;
+
+    u_strToUpper(cased, 20, mixedCase.getBuffer(), mixedCase.length(), NULL, &status);
+    // The result is only inspected when the call succeeded.
+    if (!(U_SUCCESS(status) && (UnicodeString(cased) == allUpper))) {
+        errln("Fails: Can't uppercase surrogates.");
+    }
+
+    status = U_ZERO_ERROR;
+    u_strToLower(cased, 20, mixedCase.getBuffer(), mixedCase.length(), NULL, &status);
+    if (!(U_SUCCESS(status) && (UnicodeString(cased) == allLower))) {
+        errln("Fails: Can't lowercase surrogates.");
+    }
+}
+
+/**
+ * Check to see that incremental gets at least part way through a reasonable string.
+ *
+ * Walks every registered target and variant whose source is "Latin", builds
+ * the ID "source-target/variant", and passes the transliterator and then its
+ * inverse to CheckIncrementalAux().
+ */
+void TransliteratorTest::TestIncrementalProgress(void) {
+    UnicodeString test("The Quick Brown Fox Jumped Over The Lazy Dog.");
+    int32_t i = 0, j=0, k=0;
+    int32_t sources = Transliterator::countAvailableSources();
+    for (i = 0; i < sources; i++) {
+        UnicodeString source;
+        Transliterator::getAvailableSource(i, source);
+        // Restrict the run to transliterators whose source is Latin.
+        if (source != UnicodeString("Latin")) continue;
+        int32_t targets = Transliterator::countAvailableTargets(source);
+        for (j = 0; j < targets; j++) {
+            UnicodeString target;
+            Transliterator::getAvailableTarget(j, source, target);
+            int32_t variants = Transliterator::countAvailableVariants(source, target);
+            for (k =0; k< variants; k++) {
+                UnicodeString variant;
+                UParseError err;
+                UErrorCode status = U_ZERO_ERROR;
+
+                Transliterator::getAvailableVariant(k, source, target, variant);
+                UnicodeString id = source + "-" + target + "/" + variant;
+    
+                Transliterator *t = Transliterator::createInstance(id, UTRANS_FORWARD, err, status);
+                if (U_FAILURE(status) || t == NULL) {
+                    errln("FAIL: Not able to create transliterator from the composed ID.");
+                    // Nothing usable was created; skip this ID rather than
+                    // dereference t below.
+                    delete t;
+                    continue;
+                }
+                status = U_ZERO_ERROR;
+                UnicodeString result = CheckIncrementalAux(t, test);
+                Transliterator *inv = t->createInverse(status);
+                if (U_FAILURE(status) || inv == NULL) {
+                    // Without an inverse there is nothing to run the second
+                    // pass on; report it instead of using the pointer.
+                    errln("FAIL: Not able to create the inverse transliterator.");
+                } else {
+                    CheckIncrementalAux(inv, result);
+                }
+                delete t;
+                delete inv;
+            }
+        }
+    }
+}
+
+/**
+ * Runs one incremental transliteration pass of t over a copy of input,
+ * reports whether the start position advanced, then calls
+ * finishTransliteration() and reports whether start reached limit.
+ *
+ * @param t      the transliterator to exercise; a null pointer is reported
+ *               as a failure
+ * @param input  the text to transliterate; the caller's string is not modified
+ * @return the working copy of input as left by the calls above
+ */
+UnicodeString TransliteratorTest::CheckIncrementalAux(const Transliterator* t, 
+                                                      const UnicodeString& input) {
+    UnicodeString test = input;
+    if (t == NULL) {
+        errln("FAIL: CheckIncrementalAux called without a transliterator.");
+        return test;
+    }
+
+    UErrorCode ec = U_ZERO_ERROR;
+    UTransPosition pos;
+
+    pos.contextStart = 0;
+    pos.contextLimit = input.length();
+    pos.start = 0;
+    pos.limit = input.length();
+
+    // Use the incremental overload, which takes the UTransPosition itself and
+    // updates it.  Passing pos.start and pos.limit as two separate offsets
+    // selects the whole-range overload, which cannot change pos, so the
+    // progress check below could never succeed.
+    t->transliterate(test, pos, ec);
+    if (U_FAILURE(ec)) {
+        errln("FAIL: incremental transliterate() returned an error.");
+        return test;
+    }
+    // An empty range has nothing to consume, so a start of 0 is not a lack
+    // of progress in that case.
+    if (pos.start == 0 && pos.limit != 0) {
+        log("No Progress, ");
+        log(t->getID());
+        log(": ");
+        errln(formatInput(test, input, pos));
+    } else {
+        log("PASS Progress, ");
+        log(t->getID());
+        log(": ");
+        logln(formatInput(test, input, pos));
+    }
+    t->finishTransliteration(test, pos);
+    if (pos.start != pos.limit) {
+        log("Incomplete, ");
+        log(t->getID()); 
+        log(":  ");
+        errln(formatInput(test, input, pos));
+    }
+    return test;
+}
 //======================================================================
 // Support methods
 //======================================================================
@ -3047,6 +3235,7 @@ void TransliteratorTest::expect(const Transliterator& t,
              expectedResult);
 }

+    
 /**
 * @param appendTo result is appended to this param.
 * @param input the string being transliterated
--- a/icu4c/source/test/intltest/transtst.h
+++ b/icu4c/source/test/intltest/transtst.h
@ -259,45 +259,59 @@ class TransliteratorTest : public IntlTest {
    /**
     * Make sure parse errors reference the right line.
     */
-    void TestParseError();
+    void TestParseError(void);

    /**
     * Make sure sets on output are disallowed.
     */
-    void TestOutputSet();
+    void TestOutputSet(void);

    /**
     * Test the use variable range pragma, making sure that use of
     * variable range characters is detected and flagged as an error.
     */
-    void TestVariableRange();
+    void TestVariableRange(void);

    /**
     * Test invalid post context error handling
     */
-    void TestInvalidPostContext();
+    void TestInvalidPostContext(void);

    /**
     * Test ID form variants
     */
-    void TestIDForms();
+    void TestIDForms(void);

    /**
     * Mark's toRules test.
     */
-    void TestToRulesMark();
+    void TestToRulesMark(void);

    /**
     * Test Escape and Unescape transliterators.
     */
-    void TestEscape();
+    void TestEscape(void);

-    void TestAnchorMasking();
+    void TestAnchorMasking(void);

    /**
     * Make sure display names of variants look reasonable.
     */
-    void TestDisplayName();
+    void TestDisplayName(void);
+    
+    /** 
+     * Check to see if case mapping works correctly.
+     */
+    void TestSpecialCases(void);
+    /**
+     * Check to see that incremental gets at least part way through a reasonable string.
+     */
+    void TestIncrementalProgress(void);
+
+    /** 
+     * Check that casing handles surrogates.
+     */
+    void TestSurrogateCasing (void);

    //======================================================================
    // Support methods
@ -333,6 +347,9 @@ class TransliteratorTest : public IntlTest {

    void checkRules(const UnicodeString& label, Transliterator& t2,
                    const UnicodeString& testRulesForward);
+    UnicodeString CheckIncrementalAux(const Transliterator* t, 
+                             const UnicodeString& input);
+
 };

 #endif