From 039ecd6fd02078b7a184417714a45a90d2499812 Mon Sep 17 00:00:00 2001 From: Elango Date: Tue, 15 Feb 2022 23:27:24 +0000 Subject: [PATCH] ICU-21141 Fix titlecase of accented Dutch ij digraph See #1869 --- icu4c/source/common/ucasemap.cpp | 109 +++++++++++++-- icu4c/source/common/ustrcase.cpp | 129 ++++++++++++++---- icu4c/source/test/intltest/strcase.cpp | 101 ++++++++++++++ .../src/com/ibm/icu/impl/CaseMapImpl.java | 120 +++++++++++++--- .../icu/dev/test/lang/UCharacterCaseTest.java | 63 +++++++++ 5 files changed, 465 insertions(+), 57 deletions(-) diff --git a/icu4c/source/common/ucasemap.cpp b/icu4c/source/common/ucasemap.cpp index ed72bda828f..b6e7f2b744d 100644 --- a/icu4c/source/common/ucasemap.cpp +++ b/icu4c/source/common/ucasemap.cpp @@ -420,6 +420,96 @@ void toUpper(int32_t caseLocale, uint32_t options, #if !UCONFIG_NO_BREAK_ITERATION +namespace { + +constexpr uint8_t ACUTE_BYTE0 = u8"\u0301"[0]; + +constexpr uint8_t ACUTE_BYTE1 = u8"\u0301"[1]; + +/** + * Input: c is a letter I with or without acute accent. + * start is the index in src after c, and is less than segmentLimit. + * If a plain i/I is followed by a plain j/J, + * or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute, + * then we output accordingly. + * + * @return the src index after the titlecased sequence, or the start index if no Dutch IJ + */ +int32_t maybeTitleDutchIJ(const uint8_t *src, UChar32 c, int32_t start, int32_t segmentLimit, + ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) { + + int32_t index = start; + bool withAcute = false; + + // If the conditions are met, then the following variables tell us what to output. + int32_t unchanged1 = 0; // code units before the j, or the whole sequence (0..3) + bool doTitleJ = false; // true if the j needs to be titlecased + int32_t unchanged2 = 0; // after the j (0 or 1) + + // next character after the first letter + UChar32 c2; + c2 = src[index++]; + + // Is the first letter an i/I with accent? + if (c == u'I') { + if (c2 == ACUTE_BYTE0 && index < segmentLimit && src[index++] == ACUTE_BYTE1) { + withAcute = true; + unchanged1 = 2; // ACUTE is 2 code units in UTF-8 + if (index == segmentLimit) { return start; } + c2 = src[index++]; + } + } else { // Í + withAcute = true; + } + + // Is the next character a j/J? + if (c2 == u'j') { + doTitleJ = true; + } else if (c2 == u'J') { + ++unchanged1; + } else { + return start; + } + + // A plain i/I must be followed by a plain j/J. + // An i/I with acute must be followed by a j/J with acute. + if (withAcute) { + if ((index + 1) >= segmentLimit || src[index++] != ACUTE_BYTE0 || src[index++] != ACUTE_BYTE1) { + return start; + } + if (doTitleJ) { + unchanged2 = 2; // ACUTE is 2 code units in UTF-8 + } else { + unchanged1 = unchanged1 + 2; // ACUTE is 2 code units in UTF-8 + } + } + + // There must not be another combining mark. + if (index < segmentLimit) { + int32_t cp; + int32_t i = index; + U8_NEXT(src, i, segmentLimit, cp); + uint32_t typeMask = U_GET_GC_MASK(cp); + if ((typeMask & U_GC_M_MASK) != 0) { + return start; + } + } + + // Output the rest of the Dutch IJ. + ByteSinkUtil::appendUnchanged(src + start, unchanged1, sink, options, edits, errorCode); + start += unchanged1; + if (doTitleJ) { + ByteSinkUtil::appendCodePoint(1, u'J', sink, edits); + ++start; + } + ByteSinkUtil::appendUnchanged(src + start, unchanged2, sink, options, edits, errorCode); + + U_ASSERT(start + unchanged2 == index); + return index; +} + +} // namespace + U_CFUNC void U_CALLCONV ucasemap_internalUTF8ToTitle( int32_t caseLocale, uint32_t options, BreakIterator *iter, @@ -505,18 +595,13 @@ ucasemap_internalUTF8ToTitle( /* Special case Dutch IJ titlecasing */ if (titleStart+1 < index && - caseLocale == UCASE_LOC_DUTCH && - (src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) { - if (src[titleStart+1] == 0x006A) { - ByteSinkUtil::appendCodePoint(1, 0x004A, sink, edits); - titleLimit++; - } else if (src[titleStart+1] == 0x004A) { - // Keep the capital J from getting lowercased. - if (!ByteSinkUtil::appendUnchanged(src+titleStart+1, 1, - sink, options, edits, errorCode)) { - return; - } - titleLimit++; + caseLocale == UCASE_LOC_DUTCH) { + if (c < 0) { + c = ~c; + } + + if (c == u'I' || c == u'Í') { + titleLimit = maybeTitleDutchIJ(src, c, titleLimit, index, sink, options, edits, errorCode); } } diff --git a/icu4c/source/common/ustrcase.cpp b/icu4c/source/common/ustrcase.cpp index 36b19e75f2d..acd37a598ab 100644 --- a/icu4c/source/common/ustrcase.cpp +++ b/icu4c/source/common/ustrcase.cpp @@ -36,6 +36,12 @@ #include "ustr_imp.h" #include "uassert.h" +/** + * Code point for COMBINING ACUTE ACCENT + * @internal + */ +#define ACUTE u'\u0301' + U_NAMESPACE_BEGIN namespace { @@ -396,6 +402,93 @@ U_NAMESPACE_USE #if !UCONFIG_NO_BREAK_ITERATION +namespace { + +/** + * Input: c is a letter I with or without acute accent. + * start is the index in src after c, and is less than segmentLimit. + * If a plain i/I is followed by a plain j/J, + * or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute, + * then we output accordingly. + * + * @return the src index after the titlecased sequence, or the start index if no Dutch IJ + */ +int32_t maybeTitleDutchIJ(const UChar *src, UChar32 c, int32_t start, int32_t segmentLimit, + UChar *dest, int32_t &destIndex, int32_t destCapacity, uint32_t options, + icu::Edits *edits) { + + int32_t index = start; + bool withAcute = false; + + // If the conditions are met, then the following variables tell us what to output. + int32_t unchanged1 = 0; // code units before the j, or the whole sequence (0..3) + bool doTitleJ = false; // true if the j needs to be titlecased + int32_t unchanged2 = 0; // after the j (0 or 1) + + // next character after the first letter + UChar c2 = src[index++]; + + // Is the first letter an i/I with accent? + if (c == u'I') { + if (c2 == ACUTE) { + withAcute = true; + unchanged1 = 1; + if (index == segmentLimit) { return start; } + c2 = src[index++]; + } + } else { // Í + withAcute = true; + } + + // Is the next character a j/J? + if (c2 == u'j') { + doTitleJ = true; + } else if (c2 == u'J') { + ++unchanged1; + } else { + return start; + } + + // A plain i/I must be followed by a plain j/J. + // An i/I with acute must be followed by a j/J with acute. + if (withAcute) { + if (index == segmentLimit || src[index++] != ACUTE) { return start; } + if (doTitleJ) { + unchanged2 = 1; + } else { + ++unchanged1; + } + } + + // There must not be another combining mark. + if (index < segmentLimit) { + int32_t cp; + int32_t i = index; + U16_NEXT(src, i, segmentLimit, cp); + uint32_t typeMask = U_GET_GC_MASK(cp); + if ((typeMask & U_GC_M_MASK) != 0) { + return start; + } + } + + // Output the rest of the Dutch IJ. + destIndex = appendUnchanged(dest, destIndex, destCapacity, src + start, unchanged1, options, edits); + start += unchanged1; + if (doTitleJ) { + destIndex = appendUChar(dest, destIndex, destCapacity, u'J'); + if (edits != nullptr) { + edits->addReplace(1, 1); + } + ++start; + } + destIndex = appendUnchanged(dest, destIndex, destCapacity, src + start, unchanged2, options, edits); + + U_ASSERT(start + unchanged2 == index); + return index; +} + +} // namespace + U_CFUNC int32_t U_CALLCONV ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *iter, UChar *dest, int32_t destCapacity, @@ -412,14 +505,14 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it csc.limit=srcLength; int32_t destIndex=0; int32_t prev=0; - UBool isFirstIndex=TRUE; + bool isFirstIndex=true; /* titlecasing loop */ while(prevfirst(); } else { index=iter->next(); @@ -446,7 +539,7 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it // Stop with titleStartaddReplace(1, 1); - } - titleLimit++; - } else if (src[titleStart+1] == 0x004A) { - // Keep the capital J from getting lowercased. - destIndex=appendUnchanged(dest, destIndex, destCapacity, - src+titleStart+1, 1, options, edits); - if(destIndex<0) { - errorCode=U_INDEX_OUTOFBOUNDS_ERROR; - return 0; - } - titleLimit++; + caseLocale == UCASE_LOC_DUTCH) { + if (c < 0) { + c = ~c; + } + + if (c == u'I' || c == u'Í') { + titleLimit = maybeTitleDutchIJ(src, c, titleStart + 1, index, + dest, destIndex, destCapacity, options, + edits); } } diff --git a/icu4c/source/test/intltest/strcase.cpp b/icu4c/source/test/intltest/strcase.cpp index 006bcd64ed3..14df2a36bdb 100644 --- a/icu4c/source/test/intltest/strcase.cpp +++ b/icu4c/source/test/intltest/strcase.cpp @@ -51,6 +51,7 @@ public: void *iter, const char *localeID, uint32_t options); void TestCasing(); void TestTitleOptions(); + void TestDutchTitle(); void TestFullCaseFoldingIterator(); void TestGreekUpper(); void TestArmenian(); @@ -95,6 +96,7 @@ StringCaseTest::runIndexedTest(int32_t index, UBool exec, const char *&name, cha #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILE_IO && !UCONFIG_NO_LEGACY_CONVERSION TESTCASE_AUTO(TestCasing); TESTCASE_AUTO(TestTitleOptions); + TESTCASE_AUTO(TestDutchTitle); #endif TESTCASE_AUTO(TestFullCaseFoldingIterator); TESTCASE_AUTO(TestGreekUpper); @@ -451,6 +453,7 @@ StringCaseTest::TestCasingImpl(const UnicodeString &input, } if(result!=output) { dataerrln("error: UnicodeString.%s() got a wrong result for a test case from casing.res", name); + dataerrln(UnicodeString("input = [") + input + "], expected = [" + output + "], actual = [" + result + "]"); } #if !UCONFIG_NO_BREAK_ITERATION if(whichCase==TEST_TITLE && options==0) { @@ -667,6 +670,104 @@ StringCaseTest::TestTitleOptions() { #endif } +#if !UCONFIG_NO_BREAK_ITERATION +void StringCaseTest::TestDutchTitle() { + IcuTestErrorCode errorCode(*this, "TestDutchTitle"); + + Locale nl("nl"); // Dutch + LocalPointer iter( + BreakIterator::createWordInstance(nl, errorCode)); + + // Dutch titlecase check in English + TestCasingImpl( + u"ijssel igloo IJMUIDEN", + u"Ijssel Igloo Ijmuiden", + TEST_TITLE, + nullptr, + "en", + 0); + + // Dutch titlecase check in Dutch + TestCasingImpl( + u"ijssel igloo IJMUIDEN", + u"IJssel Igloo IJmuiden", + TEST_TITLE, + nullptr, + "nl", + 0); + + // Dutch titlecase check in Dutch with nolowercase option + if (U_SUCCESS(errorCode)) { + iter->setText(u"ijssel igloo IjMUIdEN iPoD ijenough"); + TestCasingImpl( + u"ijssel igloo IjMUIdEN iPoD ijenough", + u"IJssel Igloo IJMUIdEN IPoD IJenough", + TEST_TITLE, + nullptr, + "nl", + U_TITLECASE_NO_LOWERCASE); + } + + errorCode.reset(); + + // Accented IJ testing + + struct dutchTitleTestCase { + const UnicodeString input; + const UnicodeString expectedFull; + const UnicodeString expectedOnlyChanged; + } dutchTitleTestCases[] = { + // input, expectedFull, expectedOnlyChanged + {u"ij", u"IJ", u"IJ"}, + {u"IJ", u"IJ", u""}, + {u"íj́", u"ÍJ́", u"ÍJ"}, + {u"ÍJ́", u"ÍJ́", u""}, + {u"íJ́", u"ÍJ́", u"Í"}, + {u"Ij́", u"Ij́", u""}, + {u"ij́", u"Ij́", u"I"}, + {u"ïj́", u"Ïj́", u"Ï"}, + {u"íj\u0308", u"Íj\u0308", u"Í"}, + {u"íj́\U0001D16E", u"Íj́\U0001D16E", u"Í"}, + {u"íj\u1ABE", u"Íj\u1ABE", u"Í"}, + + {u"ijabc", u"IJabc", u"IJ"}, + {u"IJabc", u"IJabc", u""}, + {u"íj́abc", u"ÍJ́abc", u"ÍJ"}, + {u"ÍJ́abc", u"ÍJ́abc", u""}, + {u"íJ́abc", u"ÍJ́abc", u"Í"}, + {u"Ij́abc", u"Ij́abc", u""}, + {u"ij́abc", u"Ij́abc", u"I"}, + {u"ïj́abc", u"Ïj́abc", u"Ï"}, + {u"íjabc\u0308", u"Íjabc\u0308", u"Í"}, + {u"íj́abc\U0001D16E", u"ÍJ́abc\U0001D16E", u"ÍJ"}, + {u"íjabc\u1ABE", u"Íjabc\u1ABE", u"Í"}, + }; + + for (const auto& cas : dutchTitleTestCases) { + const UnicodeString &input = cas.input; + const UnicodeString &expectedFull = cas.expectedFull; + const UnicodeString &expectedOnlyChanged = cas.expectedOnlyChanged; + + for (const auto& isOnlyChanged : {true, false}) { + uint32_t testOptions = U_TITLECASE_NO_LOWERCASE + | (isOnlyChanged ? U_OMIT_UNCHANGED_TEXT : 0); + + const UnicodeString &expected = isOnlyChanged ? expectedOnlyChanged : expectedFull; + + TestCasingImpl( + input, + expected, + TEST_TITLE, + nullptr, + "nl", + testOptions + ); + } + + } +} +#endif + void StringCaseTest::TestFullCaseFoldingIterator() { UnicodeString ffi=UNICODE_STRING_SIMPLE("ffi"); diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/CaseMapImpl.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/CaseMapImpl.java index 9c48035acc6..052e52c592f 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/CaseMapImpl.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/CaseMapImpl.java @@ -70,6 +70,10 @@ public final class CaseMapImpl { cpStart=cpLimit=limit; } + public void moveTo(int i) { + cpStart=cpLimit=i; + } + /** * Iterate forward through the string to fetch the next code point * to be case-mapped, and set the context indexes for it. @@ -189,6 +193,13 @@ public final class CaseMapImpl { return options | newOption; } + private static final char ACUTE = '\u0301'; + + private static final int U_GC_M_MASK = + (1 << UCharacterCategory.NON_SPACING_MARK) | + (1 << UCharacterCategory.COMBINING_SPACING_MARK) | + (1 << UCharacterCategory.ENCLOSING_MARK); + private static final int LNS = (1 << UCharacterCategory.UPPERCASE_LETTER) | (1 << UCharacterCategory.LOWERCASE_LETTER) | @@ -726,34 +737,25 @@ public final class CaseMapImpl { } if(titleStart int maybeTitleDutchIJ( + CharSequence src, int c, int start, int segmentLimit, + A dest, int options, Edits edits) throws IOException { + int index = start; + boolean withAcute = false; + + // If the conditions are met, then the following variables tell us what to output. + int unchanged1 = 0; // code units before the j, or the whole sequence (0..3) + boolean doTitleJ = false; // true if the j needs to be titlecased + int unchanged2 = 0; // after the j (0 or 1) + + // next character after the first letter + char c2 = src.charAt(index++); + + // Is the first letter an i/I with accent? + if (c == 'I') { + if (c2 == ACUTE) { + withAcute = true; + unchanged1 = 1; + if (index == segmentLimit) { return start; } + c2 = src.charAt(index++); + } + } else { // Í + withAcute = true; + } + // Is the next character a j/J? + if (c2 == 'j') { + doTitleJ = true; + } else if (c2 == 'J') { + ++unchanged1; + } else { + return start; + } + // A plain i/I must be followed by a plain j/J. + // An i/I with acute must be followed by a j/J with acute. + if (withAcute) { + if (index == segmentLimit || src.charAt(index++) != ACUTE) { return start; } + if (doTitleJ) { + unchanged2 = 1; + } else { + ++unchanged1; + } + } + // There must not be another combining mark. + if (index < segmentLimit) { + int cp = Character.codePointAt(src, index); + int bit = 1 << UCharacter.getType(cp); + if ((bit & U_GC_M_MASK) != 0) { + return start; + } + } + // Output the rest of the Dutch IJ. + appendUnchanged(src, start, unchanged1, dest, options, edits); + start += unchanged1; + if (doTitleJ) { + dest.append('J'); + if (edits != null) { + edits.addReplace(1, 1); + } + ++start; + } + appendUnchanged(src, start, unchanged2, dest, options, edits); + assert start + unchanged2 == index; + return index; + } + public static String fold(int options, CharSequence src) { if (src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0) { if (src.length() == 0) { diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java index 4562ee9a270..f56f2950e31 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java @@ -13,6 +13,7 @@ package com.ibm.icu.dev.test.lang; import java.io.BufferedReader; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.List; import java.util.Locale; @@ -23,6 +24,7 @@ import org.junit.runners.JUnit4; import com.ibm.icu.dev.test.TestFmwk; import com.ibm.icu.dev.test.TestUtil; +import com.ibm.icu.impl.CaseMapImpl; import com.ibm.icu.impl.Utility; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.lang.UProperty; @@ -467,6 +469,67 @@ public final class UCharacterCaseTest extends TestFmwk assertEquals("Dutch titlecase check in Dutch with nolowercase option", "IJssel Igloo IJMUIdEN IPoD IJenough", UCharacter.toTitleCase(LOC_DUTCH, "ijssel igloo IjMUIdEN iPoD ijenough", iter, options)); + + // Accented IJ testing + + String[][] dutchIJCasesData = { + // input, expectedFull, expOnlyChanged + {"ij", "IJ", "IJ"}, + {"IJ", "IJ", ""}, + {"íj́", "ÍJ́", "ÍJ"}, + {"ÍJ́", "ÍJ́", ""}, + {"íJ́", "ÍJ́", "Í"}, + {"Ij́", "Ij́", ""}, + {"ij́", "Ij́", "I"}, + {"ïj́", "Ïj́", "Ï"}, + {"íj\u0308", "Íj\u0308", "Í"}, + {"íj́\uD834\uDD6E", "Íj́\uD834\uDD6E", "Í"}, // \uD834\uDD6E == \U0001D16E + {"íj\u1ABE", "Íj\u1ABE", "Í"}, + + {"ijabc", "IJabc", "IJ"}, + {"IJabc", "IJabc", ""}, + {"íj́abc", "ÍJ́abc", "ÍJ"}, + {"ÍJ́abc", "ÍJ́abc", ""}, + {"íJ́abc", "ÍJ́abc", "Í"}, + {"Ij́abc", "Ij́abc", ""}, + {"ij́abc", "Ij́abc", "I"}, + {"ïj́abc", "Ïj́abc", "Ï"}, + {"íjabc\u0308", "Íjabc\u0308", "Í"}, + {"íj́abc\uD834\uDD6E", "ÍJ́abc\uD834\uDD6E", "ÍJ"}, + {"íjabc\u1ABE", "Íjabc\u1ABE", "Í"}, + }; + + for (String[] caseDatum : dutchIJCasesData) { + String input = caseDatum[0]; + String expectedFull = caseDatum[1]; + String expectedOnlyChanged = caseDatum[2]; + + for (boolean isOnlyChanged : Arrays.asList(true, false)) { + String testMsg = "Dutch accented ij" + + (isOnlyChanged ? ", only changes" : ""); + + int testOptions = UCharacter.TITLECASE_NO_LOWERCASE + | (isOnlyChanged ? CaseMapImpl.OMIT_UNCHANGED_TEXT : 0); + + CaseMap.Title titleCaseMapBase = CaseMap.toTitle().noLowercase(); + CaseMap.Title titleCaseMap = isOnlyChanged ? titleCaseMapBase.omitUnchangedText() : titleCaseMapBase; + + String expected = isOnlyChanged ? expectedOnlyChanged : expectedFull; + + // Newer API for title casing + StringBuilder resultBuilder = new StringBuilder(); + Edits edits = new Edits(); + titleCaseMap.apply(DUTCH_LOCALE_, null, input, resultBuilder, edits); + String result = resultBuilder.toString(); + assertEquals(testMsg + ", [" + input + "]", + expected, result); + + // Older API for title casing (vs. Newer API) + String oldApiResult = UCharacter.toTitleCase(LOC_DUTCH, input, null, testOptions); + assertEquals(testMsg + ", Title.apply() vs UCharacter.toTitleCase()" + ", [" + input + "]", + result, oldApiResult); + } + } } @Test