ICU-21141 Fix titlecase of accented Dutch ij digraph

See #1869
This commit is contained in:
Elango 2022-02-15 23:27:24 +00:00
parent 398489b915
commit 039ecd6fd0
5 changed files with 465 additions and 57 deletions

View file

@ -420,6 +420,96 @@ void toUpper(int32_t caseLocale, uint32_t options,
#if !UCONFIG_NO_BREAK_ITERATION
namespace {

// The two UTF-8 bytes that encode U+0301 COMBINING ACUTE ACCENT.
constexpr uint8_t ACUTE_BYTE0 = u8"\u0301"[0];
constexpr uint8_t ACUTE_BYTE1 = u8"\u0301"[1];

/**
 * Handles the remainder of a Dutch IJ digraph at the start of a titlecased segment (UTF-8).
 *
 * Input: c is a letter I with or without acute accent (u'I' or u'Í').
 * start is the index in src after c.
 * If a plain i/I is followed by a plain j/J,
 * or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute,
 * and no further combining mark follows, then we output accordingly:
 * a lowercase j is replaced with J, all other bytes of the sequence are appended as unchanged.
 *
 * @param src          UTF-8 source text
 * @param c            the first letter, u'I' or u'Í'
 * @param start        index in src just after the first letter
 * @param segmentLimit end of the current segment; nothing at or beyond it is part of the digraph
 * @param sink         receives the output bytes
 * @param options      case mapping options, passed through to ByteSinkUtil::appendUnchanged
 * @param edits        optional edits recorder, passed through to the ByteSinkUtil calls
 * @param errorCode    passed through to ByteSinkUtil::appendUnchanged
 * @return the src index after the titlecased sequence, or the start index if no Dutch IJ
 */
int32_t maybeTitleDutchIJ(const uint8_t *src, UChar32 c, int32_t start, int32_t segmentLimit,
                          ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) {
    // A multi-byte first letter (Í is two bytes in UTF-8) can end exactly at the
    // segment limit. Then nothing follows it and src[start] must not be read.
    if (start >= segmentLimit) {
        return start;
    }

    int32_t index = start;
    bool withAcute = false;

    // If the conditions are met, then the following variables tell us what to output.
    // In UTF-8 the combining acute occupies 2 bytes and j/J occupies 1 byte.
    int32_t unchanged1 = 0;  // bytes before the j, or the whole sequence (0..5)
    bool doTitleJ = false;   // true if the j needs to be titlecased
    int32_t unchanged2 = 0;  // bytes after the j (0 or 2)

    // next byte after the first letter
    UChar32 c2;
    c2 = src[index++];

    // Is the first letter an i/I with accent?
    if (c == u'I') {
        if (c2 == ACUTE_BYTE0 && index < segmentLimit && src[index++] == ACUTE_BYTE1) {
            withAcute = true;
            unchanged1 = 2;  // ACUTE is 2 code units in UTF-8
            if (index == segmentLimit) { return start; }
            c2 = src[index++];
        }
        // If only the first acute byte matched, c2 is still that byte, which is
        // neither j nor J, so the check below returns start.
    } else {  // Í
        withAcute = true;
    }

    // Is the next character a j/J?
    if (c2 == u'j') {
        doTitleJ = true;
    } else if (c2 == u'J') {
        ++unchanged1;
    } else {
        return start;
    }

    // A plain i/I must be followed by a plain j/J.
    // An i/I with acute must be followed by a j/J with acute.
    if (withAcute) {
        // Both bytes of the acute must lie inside the segment.
        if ((index + 1) >= segmentLimit || src[index++] != ACUTE_BYTE0 || src[index++] != ACUTE_BYTE1) {
            return start;
        }
        if (doTitleJ) {
            unchanged2 = 2;  // ACUTE is 2 code units in UTF-8
        } else {
            unchanged1 = unchanged1 + 2;  // ACUTE is 2 code units in UTF-8
        }
    }

    // There must not be another combining mark.
    if (index < segmentLimit) {
        int32_t cp;
        int32_t i = index;
        U8_NEXT(src, i, segmentLimit, cp);
        uint32_t typeMask = U_GET_GC_MASK(cp);
        if ((typeMask & U_GC_M_MASK) != 0) {
            return start;
        }
    }

    // Output the rest of the Dutch IJ.
    // NOTE(review): the results of the append calls are not checked here;
    // confirm that the caller inspects errorCode afterwards.
    ByteSinkUtil::appendUnchanged(src + start, unchanged1, sink, options, edits, errorCode);
    start += unchanged1;
    if (doTitleJ) {
        ByteSinkUtil::appendCodePoint(1, u'J', sink, edits);
        ++start;
    }
    ByteSinkUtil::appendUnchanged(src + start, unchanged2, sink, options, edits, errorCode);

    U_ASSERT(start + unchanged2 == index);
    return index;
}

}  // namespace
U_CFUNC void U_CALLCONV
ucasemap_internalUTF8ToTitle(
int32_t caseLocale, uint32_t options, BreakIterator *iter,
@ -505,18 +595,13 @@ ucasemap_internalUTF8ToTitle(
/* Special case Dutch IJ titlecasing */
if (titleStart+1 < index &&
caseLocale == UCASE_LOC_DUTCH &&
(src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) {
if (src[titleStart+1] == 0x006A) {
ByteSinkUtil::appendCodePoint(1, 0x004A, sink, edits);
titleLimit++;
} else if (src[titleStart+1] == 0x004A) {
// Keep the capital J from getting lowercased.
if (!ByteSinkUtil::appendUnchanged(src+titleStart+1, 1,
sink, options, edits, errorCode)) {
return;
}
titleLimit++;
caseLocale == UCASE_LOC_DUTCH) {
if (c < 0) {
c = ~c;
}
if (c == u'I' || c == u'Í') {
titleLimit = maybeTitleDutchIJ(src, c, titleLimit, index, sink, options, edits, errorCode);
}
}

View file

@ -36,6 +36,12 @@
#include "ustr_imp.h"
#include "uassert.h"
/**
 * UTF-16 code unit for U+0301 COMBINING ACUTE ACCENT.
 * A typed constant rather than a macro, so that the name obeys normal scoping rules;
 * its type is the same as that of the u'\u0301' literal.
 * @internal
 */
static constexpr char16_t ACUTE = u'\u0301';
U_NAMESPACE_BEGIN
namespace {
@ -396,6 +402,93 @@ U_NAMESPACE_USE
#if !UCONFIG_NO_BREAK_ITERATION
namespace {
/**
* Input: c is a letter I with or without acute accent.
* start is the index in src after c, and is less than segmentLimit.
* If a plain i/I is followed by a plain j/J,
* or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute,
* then we output accordingly.
*
* @return the src index after the titlecased sequence, or the start index if no Dutch IJ
*/
int32_t maybeTitleDutchIJ(const UChar *src, UChar32 c, int32_t start, int32_t segmentLimit,
UChar *dest, int32_t &destIndex, int32_t destCapacity, uint32_t options,
icu::Edits *edits) {
int32_t index = start;
bool withAcute = false;
// If the conditions are met, then the following variables tell us what to output.
int32_t unchanged1 = 0; // code units before the j, or the whole sequence (0..3)
bool doTitleJ = false; // true if the j needs to be titlecased
int32_t unchanged2 = 0; // after the j (0 or 1)
// next character after the first letter
UChar c2 = src[index++];
// Is the first letter an i/I with accent?
if (c == u'I') {
if (c2 == ACUTE) {
withAcute = true;
unchanged1 = 1;
if (index == segmentLimit) { return start; }
c2 = src[index++];
}
} else { // Í
withAcute = true;
}
// Is the next character a j/J?
if (c2 == u'j') {
doTitleJ = true;
} else if (c2 == u'J') {
++unchanged1;
} else {
return start;
}
// A plain i/I must be followed by a plain j/J.
// An i/I with acute must be followed by a j/J with acute.
if (withAcute) {
if (index == segmentLimit || src[index++] != ACUTE) { return start; }
if (doTitleJ) {
unchanged2 = 1;
} else {
++unchanged1;
}
}
// There must not be another combining mark.
if (index < segmentLimit) {
int32_t cp;
int32_t i = index;
U16_NEXT(src, i, segmentLimit, cp);
uint32_t typeMask = U_GET_GC_MASK(cp);
if ((typeMask & U_GC_M_MASK) != 0) {
return start;
}
}
// Output the rest of the Dutch IJ.
destIndex = appendUnchanged(dest, destIndex, destCapacity, src + start, unchanged1, options, edits);
start += unchanged1;
if (doTitleJ) {
destIndex = appendUChar(dest, destIndex, destCapacity, u'J');
if (edits != nullptr) {
edits->addReplace(1, 1);
}
++start;
}
destIndex = appendUnchanged(dest, destIndex, destCapacity, src + start, unchanged2, options, edits);
U_ASSERT(start + unchanged2 == index);
return index;
}
} // namespace
U_CFUNC int32_t U_CALLCONV
ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *iter,
UChar *dest, int32_t destCapacity,
@ -412,14 +505,14 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it
csc.limit=srcLength;
int32_t destIndex=0;
int32_t prev=0;
UBool isFirstIndex=TRUE;
bool isFirstIndex=true;
/* titlecasing loop */
while(prev<srcLength) {
/* find next index where to titlecase */
int32_t index;
if(isFirstIndex) {
isFirstIndex=FALSE;
isFirstIndex=false;
index=iter->first();
} else {
index=iter->next();
@ -446,7 +539,7 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it
// Stop with titleStart<titleLimit<=index
// if there is a character to be titlecased,
// or else stop with titleStart==titleLimit==index.
UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
bool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {
titleStart=titleLimit;
if(titleLimit==index) {
@ -479,27 +572,15 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it
/* Special case Dutch IJ titlecasing */
if (titleStart+1 < index &&
caseLocale == UCASE_LOC_DUTCH &&
(src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) {
if (src[titleStart+1] == 0x006A) {
destIndex=appendUChar(dest, destIndex, destCapacity, 0x004A);
if(destIndex<0) {
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
}
if(edits!=NULL) {
edits->addReplace(1, 1);
}
titleLimit++;
} else if (src[titleStart+1] == 0x004A) {
// Keep the capital J from getting lowercased.
destIndex=appendUnchanged(dest, destIndex, destCapacity,
src+titleStart+1, 1, options, edits);
if(destIndex<0) {
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
}
titleLimit++;
caseLocale == UCASE_LOC_DUTCH) {
if (c < 0) {
c = ~c;
}
if (c == u'I' || c == u'Í') {
titleLimit = maybeTitleDutchIJ(src, c, titleStart + 1, index,
dest, destIndex, destCapacity, options,
edits);
}
}

View file

@ -51,6 +51,7 @@ public:
void *iter, const char *localeID, uint32_t options);
void TestCasing();
void TestTitleOptions();
void TestDutchTitle();
void TestFullCaseFoldingIterator();
void TestGreekUpper();
void TestArmenian();
@ -95,6 +96,7 @@ StringCaseTest::runIndexedTest(int32_t index, UBool exec, const char *&name, cha
#if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILE_IO && !UCONFIG_NO_LEGACY_CONVERSION
TESTCASE_AUTO(TestCasing);
TESTCASE_AUTO(TestTitleOptions);
TESTCASE_AUTO(TestDutchTitle);
#endif
TESTCASE_AUTO(TestFullCaseFoldingIterator);
TESTCASE_AUTO(TestGreekUpper);
@ -451,6 +453,7 @@ StringCaseTest::TestCasingImpl(const UnicodeString &input,
}
if(result!=output) {
dataerrln("error: UnicodeString.%s() got a wrong result for a test case from casing.res", name);
dataerrln(UnicodeString("input = [") + input + "], expected = [" + output + "], actual = [" + result + "]");
}
#if !UCONFIG_NO_BREAK_ITERATION
if(whichCase==TEST_TITLE && options==0) {
@ -667,6 +670,104 @@ StringCaseTest::TestTitleOptions() {
#endif
}
#if !UCONFIG_NO_BREAK_ITERATION
void StringCaseTest::TestDutchTitle() {
IcuTestErrorCode errorCode(*this, "TestDutchTitle");
Locale nl("nl"); // Dutch
LocalPointer<BreakIterator> iter(
BreakIterator::createWordInstance(nl, errorCode));
// Dutch titlecase check in English
TestCasingImpl(
u"ijssel igloo IJMUIDEN",
u"Ijssel Igloo Ijmuiden",
TEST_TITLE,
nullptr,
"en",
0);
// Dutch titlecase check in Dutch
TestCasingImpl(
u"ijssel igloo IJMUIDEN",
u"IJssel Igloo IJmuiden",
TEST_TITLE,
nullptr,
"nl",
0);
// Dutch titlecase check in Dutch with nolowercase option
if (U_SUCCESS(errorCode)) {
iter->setText(u"ijssel igloo IjMUIdEN iPoD ijenough");
TestCasingImpl(
u"ijssel igloo IjMUIdEN iPoD ijenough",
u"IJssel Igloo IJMUIdEN IPoD IJenough",
TEST_TITLE,
nullptr,
"nl",
U_TITLECASE_NO_LOWERCASE);
}
errorCode.reset();
// Accented IJ testing
struct dutchTitleTestCase {
const UnicodeString input;
const UnicodeString expectedFull;
const UnicodeString expectedOnlyChanged;
} dutchTitleTestCases[] = {
// input, expectedFull, expectedOnlyChanged
{u"ij", u"IJ", u"IJ"},
{u"IJ", u"IJ", u""},
{u"íj́", u"ÍJ́", u"ÍJ"},
{u"ÍJ́", u"ÍJ́", u""},
{u"íJ́", u"ÍJ́", u"Í"},
{u"Ij́", u"Ij́", u""},
{u"ij́", u"Ij́", u"I"},
{u"ïj́", u"Ïj́", u"Ï"},
{u"íj\u0308", u"Íj\u0308", u"Í"},
{u"íj́\U0001D16E", u"Íj́\U0001D16E", u"Í"},
{u"íj\u1ABE", u"Íj\u1ABE", u"Í"},
{u"ijabc", u"IJabc", u"IJ"},
{u"IJabc", u"IJabc", u""},
{u"íj́abc", u"ÍJ́abc", u"ÍJ"},
{u"ÍJ́abc", u"ÍJ́abc", u""},
{u"íJ́abc", u"ÍJ́abc", u"Í"},
{u"Ij́abc", u"Ij́abc", u""},
{u"ij́abc", u"Ij́abc", u"I"},
{u"ïj́abc", u"Ïj́abc", u"Ï"},
{u"íjabc\u0308", u"Íjabc\u0308", u"Í"},
{u"íj́abc\U0001D16E", u"ÍJ́abc\U0001D16E", u"ÍJ"},
{u"íjabc\u1ABE", u"Íjabc\u1ABE", u"Í"},
};
for (const auto& cas : dutchTitleTestCases) {
const UnicodeString &input = cas.input;
const UnicodeString &expectedFull = cas.expectedFull;
const UnicodeString &expectedOnlyChanged = cas.expectedOnlyChanged;
for (const auto& isOnlyChanged : {true, false}) {
uint32_t testOptions = U_TITLECASE_NO_LOWERCASE
| (isOnlyChanged ? U_OMIT_UNCHANGED_TEXT : 0);
const UnicodeString &expected = isOnlyChanged ? expectedOnlyChanged : expectedFull;
TestCasingImpl(
input,
expected,
TEST_TITLE,
nullptr,
"nl",
testOptions
);
}
}
}
#endif
void
StringCaseTest::TestFullCaseFoldingIterator() {
UnicodeString ffi=UNICODE_STRING_SIMPLE("ffi");

View file

@ -70,6 +70,10 @@ public final class CaseMapImpl {
cpStart=cpLimit=limit;
}
/**
 * Repositions the iterator: both the start and the limit of the
 * current code point are set to the given index.
 *
 * @param i the new value for cpStart and cpLimit
 */
public void moveTo(int i) {
    cpLimit = i;
    cpStart = i;
}
/**
* Iterate forward through the string to fetch the next code point
* to be case-mapped, and set the context indexes for it.
@ -189,6 +193,13 @@ public final class CaseMapImpl {
return options | newOption;
}
private static final char ACUTE = '\u0301';
private static final int U_GC_M_MASK =
(1 << UCharacterCategory.NON_SPACING_MARK) |
(1 << UCharacterCategory.COMBINING_SPACING_MARK) |
(1 << UCharacterCategory.ENCLOSING_MARK);
private static final int LNS =
(1 << UCharacterCategory.UPPERCASE_LETTER) |
(1 << UCharacterCategory.LOWERCASE_LETTER) |
@ -726,34 +737,25 @@ public final class CaseMapImpl {
}
if(titleStart<index) {
int titleLimit=iter.getCPLimit();
// titlecase c which is from [titleStart..titleLimit[
c = UCaseProps.INSTANCE.toFullTitle(c, iter, dest, caseLocale);
appendResult(c, dest, iter.getCPLength(), options, edits);
// Special case Dutch IJ titlecasing
int titleLimit;
if (titleStart+1 < index && caseLocale == UCaseProps.LOC_DUTCH) {
char c1 = src.charAt(titleStart);
if ((c1 == 'i' || c1 == 'I')) {
char c2 = src.charAt(titleStart+1);
if (c2 == 'j') {
dest.append('J');
if (edits != null) {
edits.addReplace(1, 1);
}
c = iter.nextCaseMapCP();
titleLimit++;
assert c == c2;
assert titleLimit == iter.getCPLimit();
} else if (c2 == 'J') {
// Keep the capital J from getting lowercased.
appendUnchanged(src, titleStart + 1, 1, dest, options, edits);
c = iter.nextCaseMapCP();
titleLimit++;
assert c == c2;
assert titleLimit == iter.getCPLimit();
}
if (c < 0) {
c = ~c;
}
if (c == 'I' || c == 'Í') {
titleLimit = maybeTitleDutchIJ(src, c, titleStart + 1, index, dest, options, edits);
iter.moveTo(titleLimit);
}
else {
titleLimit = iter.getCPLimit();
}
} else {
titleLimit = iter.getCPLimit();
}
// lowercase [titleLimit..index[
@ -779,6 +781,82 @@ public final class CaseMapImpl {
}
}
/**
 * Looks for the second half of a Dutch IJ digraph right after an already handled
 * first letter and, when found, appends it to dest.
 *
 * <p>c is the first letter: I, or precomposed I with acute.
 * start is the index in src just after that letter and must be below segmentLimit.
 * A digraph is recognized when a plain i/I is followed by a plain j/J, or when an
 * i/I with acute (precomposed or decomposed) is followed by a j/J with acute,
 * and no other combining mark follows the sequence.
 * A lowercase j is replaced with J; everything else is appended as unchanged text.
 *
 * @return the src index after the titlecased sequence, or the start index if no Dutch IJ
 * @throws IOException propagated from appending to dest
 */
private static <A extends Appendable> int maybeTitleDutchIJ(
        CharSequence src, int c, int start, int segmentLimit,
        A dest, int options, Edits edits) throws IOException {
    int pos = start;
    char next = src.charAt(pos++);

    // Anything other than a plain I is the precomposed Í, which carries its own acute.
    boolean accented = c != 'I';
    // Number of chars to copy unchanged before the j (or the whole sequence): 0..3
    int prefixLength = 0;
    if (!accented && next == ACUTE) {
        // Decomposed form: plain I followed by a combining acute.
        accented = true;
        prefixLength = 1;
        if (pos == segmentLimit) {
            return start;
        }
        next = src.charAt(pos++);
    }

    // The letter after the (possibly accented) I must be j or J.
    final boolean lowerJ = next == 'j';
    if (!lowerJ) {
        if (next != 'J') {
            return start;
        }
        ++prefixLength;  // an uppercase J is copied as is
    }

    // Number of chars to copy unchanged after a titlecased j: 0 or 1
    int suffixLength = 0;
    if (accented) {
        // An accented i/I requires an accented j/J; a plain one requires a plain j/J.
        if (pos == segmentLimit) {
            return start;
        }
        char mark = src.charAt(pos++);
        if (mark != ACUTE) {
            return start;
        }
        if (lowerJ) {
            suffixLength = 1;
        } else {
            ++prefixLength;
        }
    }

    // Reject the sequence if yet another combining mark is attached to it.
    if (pos < segmentLimit) {
        int following = Character.codePointAt(src, pos);
        int typeBit = 1 << UCharacter.getType(following);
        if ((typeBit & U_GC_M_MASK) != 0) {
            return start;
        }
    }

    // Emit the remainder of the digraph.
    int outPos = start;
    appendUnchanged(src, outPos, prefixLength, dest, options, edits);
    outPos += prefixLength;
    if (lowerJ) {
        dest.append('J');
        if (edits != null) {
            edits.addReplace(1, 1);
        }
        ++outPos;
    }
    appendUnchanged(src, outPos, suffixLength, dest, options, edits);

    assert outPos + suffixLength == pos;
    return pos;
}
public static String fold(int options, CharSequence src) {
if (src.length() <= 100 && (options & OMIT_UNCHANGED_TEXT) == 0) {
if (src.length() == 0) {

View file

@ -13,6 +13,7 @@ package com.ibm.icu.dev.test.lang;
import java.io.BufferedReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
@ -23,6 +24,7 @@ import org.junit.runners.JUnit4;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.dev.test.TestUtil;
import com.ibm.icu.impl.CaseMapImpl;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
@ -467,6 +469,67 @@ public final class UCharacterCaseTest extends TestFmwk
assertEquals("Dutch titlecase check in Dutch with nolowercase option",
"IJssel Igloo IJMUIdEN IPoD IJenough",
UCharacter.toTitleCase(LOC_DUTCH, "ijssel igloo IjMUIdEN iPoD ijenough", iter, options));
// Accented IJ testing
String[][] dutchIJCasesData = {
// input, expectedFull, expOnlyChanged
{"ij", "IJ", "IJ"},
{"IJ", "IJ", ""},
{"íj́", "ÍJ́", "ÍJ"},
{"ÍJ́", "ÍJ́", ""},
{"íJ́", "ÍJ́", "Í"},
{"Ij́", "Ij́", ""},
{"ij́", "Ij́", "I"},
{"ïj́", "Ïj́", "Ï"},
{"íj\u0308", "Íj\u0308", "Í"},
{"íj́\uD834\uDD6E", "Íj́\uD834\uDD6E", "Í"}, // \uD834\uDD6E == \U0001D16E
{"íj\u1ABE", "Íj\u1ABE", "Í"},
{"ijabc", "IJabc", "IJ"},
{"IJabc", "IJabc", ""},
{"íj́abc", "ÍJ́abc", "ÍJ"},
{"ÍJ́abc", "ÍJ́abc", ""},
{"íJ́abc", "ÍJ́abc", "Í"},
{"Ij́abc", "Ij́abc", ""},
{"ij́abc", "Ij́abc", "I"},
{"ïj́abc", "Ïj́abc", "Ï"},
{"íjabc\u0308", "Íjabc\u0308", "Í"},
{"íj́abc\uD834\uDD6E", "ÍJ́abc\uD834\uDD6E", "ÍJ"},
{"íjabc\u1ABE", "Íjabc\u1ABE", "Í"},
};
for (String[] caseDatum : dutchIJCasesData) {
String input = caseDatum[0];
String expectedFull = caseDatum[1];
String expectedOnlyChanged = caseDatum[2];
for (boolean isOnlyChanged : Arrays.asList(true, false)) {
String testMsg = "Dutch accented ij"
+ (isOnlyChanged ? ", only changes" : "");
int testOptions = UCharacter.TITLECASE_NO_LOWERCASE
| (isOnlyChanged ? CaseMapImpl.OMIT_UNCHANGED_TEXT : 0);
CaseMap.Title titleCaseMapBase = CaseMap.toTitle().noLowercase();
CaseMap.Title titleCaseMap = isOnlyChanged ? titleCaseMapBase.omitUnchangedText() : titleCaseMapBase;
String expected = isOnlyChanged ? expectedOnlyChanged : expectedFull;
// Newer API for title casing
StringBuilder resultBuilder = new StringBuilder();
Edits edits = new Edits();
titleCaseMap.apply(DUTCH_LOCALE_, null, input, resultBuilder, edits);
String result = resultBuilder.toString();
assertEquals(testMsg + ", [" + input + "]",
expected, result);
// Older API for title casing (vs. Newer API)
String oldApiResult = UCharacter.toTitleCase(LOC_DUTCH, input, null, testOptions);
assertEquals(testMsg + ", Title.apply() vs UCharacter.toTitleCase()" + ", [" + input + "]",
result, oldApiResult);
}
}
}
@Test