mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-15 09:45:26 +00:00
ICU-1501 Updated Upper/Lower/TitlecaseTransliterator and checked in new casing tests for surrogates from Java.
X-SVN-Rev: 7235
This commit is contained in:
parent
1022f98094
commit
6f5df69456
8 changed files with 231 additions and 21 deletions
|
@ -1066,11 +1066,11 @@ getCaseLocale(const char *locale) {
|
|||
return LOC_ROOT;
|
||||
}
|
||||
|
||||
if( (locale[0]=='t' && locale[1]=='r') ||
|
||||
(locale[0]=='a' && locale[1]=='z')
|
||||
if( (lang[0]=='t' && lang[1]=='r') ||
|
||||
(lang[0]=='a' && lang[1]=='z')
|
||||
) {
|
||||
return LOC_TURKISH;
|
||||
} else if(locale[0]=='l' && locale[1]=='t') {
|
||||
} else if(lang[0]=='l' && lang[1]=='t') {
|
||||
return LOC_LITHUANIAN;
|
||||
} else {
|
||||
return LOC_ROOT;
|
||||
|
|
|
@ -234,6 +234,7 @@ CLEAN :
|
|||
-@erase "*.obj"
|
||||
-@erase "*.brk"
|
||||
-@erase "*.dat"
|
||||
-@erase "*.dll"
|
||||
@cd "$(TESTDATAOUT)"
|
||||
-@erase "*.dat"
|
||||
-@erase "*.cnv"
|
||||
|
|
|
@ -234,6 +234,7 @@ CLEAN :
|
|||
-@erase "*.obj"
|
||||
-@erase "*.brk"
|
||||
-@erase "*.dat"
|
||||
-@erase "*.dll"
|
||||
@cd "$(TESTDATAOUT)"
|
||||
-@erase "*.dat"
|
||||
-@erase "*.cnv"
|
||||
|
|
|
@ -136,14 +136,13 @@ void TitlecaseTransliterator::handleTransliterate(
|
|||
|
||||
int32_t i = textPos - offsets.contextStart;
|
||||
int32_t limit = offsets.limit - offsets.contextStart;
|
||||
UChar32 cp;
|
||||
UChar32 cp, bufferCH;
|
||||
int32_t oldLen;
|
||||
int32_t newLen;
|
||||
|
||||
for (; i < limit; ) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
int32_t s = i;
|
||||
buffer[0] = original.charAt(s);
|
||||
|
||||
UTF_GET_CHAR(original.getBuffer(), 0, i, original.length(), cp);
|
||||
oldLen = UTF_CHAR_LENGTH(cp);
|
||||
|
@ -153,7 +152,8 @@ void TitlecaseTransliterator::handleTransliterate(
|
|||
newLen = u_internalTitleCase(cp, buffer, u_getMaxCaseExpansion(), loc.getName());
|
||||
} else {
|
||||
u_strToLower(buffer, u_getMaxCaseExpansion(), original.getBuffer()+s, i-s, loc.getName(), &status);
|
||||
newLen = buffer[0] == original.charAt(s) ? -1 : u_strlen(buffer);
|
||||
UTF_GET_CHAR(buffer, 0, s, u_strlen(buffer), bufferCH);
|
||||
newLen = bufferCH == original.char32At(s) ? -1 : u_strlen(buffer);
|
||||
}
|
||||
doTitle = !CASED->contains(cp);
|
||||
if (newLen >= 0) {
|
||||
|
|
|
@ -86,20 +86,21 @@ void LowercaseTransliterator::handleTransliterate(Replaceable& text,
|
|||
|
||||
int32_t i = textPos - offsets.contextStart;
|
||||
int32_t limit = offsets.limit - offsets.contextStart;
|
||||
UChar32 cp;
|
||||
UChar32 cp, bufferCH;
|
||||
int32_t oldLen;
|
||||
|
||||
for (; i < limit; ) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
int32_t s = i;
|
||||
buffer[0] = original.charAt(s);
|
||||
bufferCH = original.char32At(s);
|
||||
|
||||
UTF_GET_CHAR(original.getBuffer(), 0, i, original.length(), cp);
|
||||
oldLen = UTF_CHAR_LENGTH(cp);
|
||||
i += oldLen;
|
||||
u_strToLower(buffer, u_getMaxCaseExpansion(), original.getBuffer()+s, i-s, loc.getName(), &status);
|
||||
/* Skip checking of status code here because the buffer should not have overflowed. */
|
||||
if ( buffer[0] != original.charAt(s) ) {
|
||||
UTF_GET_CHAR(buffer, 0, s, u_strlen(buffer), cp);
|
||||
if ( bufferCH != cp ) {
|
||||
int len = u_strlen(buffer);
|
||||
UnicodeString temp(buffer);
|
||||
text.handleReplaceBetween(textPos, textPos + oldLen, temp);
|
||||
|
|
|
@ -89,20 +89,21 @@ void UppercaseTransliterator::handleTransliterate(Replaceable& text,
|
|||
|
||||
int32_t i = textPos - offsets.contextStart;
|
||||
int32_t limit = offsets.limit - offsets.contextStart;
|
||||
UChar32 cp;
|
||||
UChar32 cp, bufferCH;
|
||||
int32_t oldLen;
|
||||
|
||||
for (; i < limit; ) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
int32_t s = i;
|
||||
buffer[0] = original.charAt(s);
|
||||
bufferCH = original.char32At(s);
|
||||
|
||||
UTF_GET_CHAR(original.getBuffer(), 0, i, original.length(), cp);
|
||||
oldLen = UTF_CHAR_LENGTH(cp);
|
||||
i += oldLen;
|
||||
u_strToUpper(buffer, u_getMaxCaseExpansion(), original.getBuffer()+s, i-s, loc.getName(), &status);
|
||||
/* Skip checking of status code here because the buffer should not have overflowed. */
|
||||
if (buffer[0] != original.charAt(s)) {
|
||||
UTF_GET_CHAR(buffer, 0, s, u_strlen(buffer), cp);
|
||||
if (bufferCH != cp) {
|
||||
int len = u_strlen(buffer);
|
||||
UnicodeString temp(buffer);
|
||||
text.handleReplaceBetween(textPos, textPos + oldLen, temp);
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
#include "unicode/uniset.h"
|
||||
#include "unicode/unitohex.h"
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/ustring.h"
|
||||
|
||||
/***********************************************************************
|
||||
|
||||
|
@ -143,6 +144,10 @@ TransliteratorTest::runIndexedTest(int32_t index, UBool exec,
|
|||
TESTCASE(61,TestEscape);
|
||||
TESTCASE(62,TestAnchorMasking);
|
||||
TESTCASE(63,TestDisplayName);
|
||||
TESTCASE(64,TestSpecialCases);
|
||||
TESTCASE(65,TestIncrementalProgress);
|
||||
TESTCASE(66,TestSurrogateCasing);
|
||||
|
||||
default: name = ""; break;
|
||||
}
|
||||
}
|
||||
|
@ -2964,6 +2969,189 @@ void TransliteratorTest::TestDisplayName() {
|
|||
}
|
||||
}
|
||||
|
||||
// Single-code-point string holding U+10414, the Deseret capital letter "DEE".
// It lies outside the BMP, so the casing tests use it to exercise surrogate-pair handling.
const UnicodeString DESERET_DEE((UChar32)0x10414);
|
||||
// Single-code-point string holding U+1043C, the Deseret small letter "dee";
// the casing tests expect it to upper/titlecase to DESERET_DEE and DESERET_DEE to lowercase to it.
const UnicodeString DESERET_dee((UChar32)0x1043C);
|
||||
|
||||
void TransliteratorTest::TestSpecialCases(void) {
|
||||
const UnicodeString registerRules[] = {
|
||||
"Any-Dev1", "x > X; y > Y;",
|
||||
"Any-Dev2", "XY > Z",
|
||||
"Greek-Latin/FAKE",
|
||||
CharsToUnicodeString
|
||||
("[^[:L:][:M:]] { \\u03bc\\u03c0 > b ; \\u03bc\\u03c0 } [^[:L:][:M:]] > b ; [^[:L:][:M:]] { [\\u039c\\u03bc][\\u03a0\\u03c0] > B ; [\\u039c\\u03bc][\\u03a0\\u03c0] } [^[:L:][:M:]] > B ;")
|
||||
};
|
||||
|
||||
const UnicodeString testCases[] = {
|
||||
// NORMALIZATION, not in C
|
||||
"NFC", CharsToUnicodeString("a\\u0300"), CharsToUnicodeString("\\u00E0"),
|
||||
"NFD", CharsToUnicodeString("\\u00E0"), CharsToUnicodeString("a\\u0300"),
|
||||
|
||||
// mp -> b BUG
|
||||
"Greek-Latin/UNGEGN", CharsToUnicodeString("(\\u03BC\\u03C0)"), "(b)",
|
||||
"Greek-Latin/FAKE", CharsToUnicodeString("(\\u03BC\\u03C0)"), "(b)",
|
||||
|
||||
// check for devanagari bug
|
||||
"nfd;Dev1;Dev2;nfc", "xy", "Z",
|
||||
|
||||
// ff, i, dotless-i, I, dotted-I, LJLjlj deseret deeDEE
|
||||
"Title", CharsToUnicodeString("ab'cD ffi\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE,
|
||||
CharsToUnicodeString("Ab'cd Ffi\\u0131ii \\u01C8\\u01C9\\u01C9 ") + DESERET_DEE + DESERET_dee,
|
||||
|
||||
//TODO: enable this test once Titlecase works right
|
||||
/*
|
||||
"Title", CharsToUnicodeString("\\uFB00i\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE,
|
||||
CharsToUnicodeString("Ffi\\u0131ii \\u01C8\\u01C9\\u01C9 ") + DESERET_DEE + DESERET_dee,
|
||||
*/
|
||||
"Upper", CharsToUnicodeString("ab'cD \\uFB00i\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE,
|
||||
CharsToUnicodeString("AB'CD FFIII\\u0130 \\u01C7\\u01C7\\u01C7 ") + DESERET_DEE + DESERET_DEE,
|
||||
"Lower", CharsToUnicodeString("ab'cD \\uFB00i\\u0131I\\u0130 \\u01C7\\u01C8\\u01C9 ") + DESERET_dee + DESERET_DEE,
|
||||
CharsToUnicodeString("ab'cd \\uFB00i\\u0131ii \\u01C9\\u01C9\\u01C9 ") + DESERET_dee + DESERET_dee,
|
||||
|
||||
// FORMS OF S
|
||||
"Greek-Latin/UNGEGN", CharsToUnicodeString("\\u03C3 \\u03C3\\u03C2 \\u03C2\\u03C3"),
|
||||
CharsToUnicodeString("s ss s\\u0331s\\u0331") ,
|
||||
"Latin-Greek/UNGEGN", CharsToUnicodeString("s ss s\\u0331s\\u0331"),
|
||||
CharsToUnicodeString("\\u03C3 \\u03C3\\u03C2 \\u03C2\\u03C3") ,
|
||||
"Greek-Latin", CharsToUnicodeString("\\u03C3 \\u03C3\\u03C2 \\u03C2\\u03C3"),
|
||||
CharsToUnicodeString("s ss s\\u0331s\\u0331") ,
|
||||
"Latin-Greek", CharsToUnicodeString("s ss s\\u0331s\\u0331"),
|
||||
CharsToUnicodeString("\\u03C3 \\u03C3\\u03C2 \\u03C2\\u03C3")
|
||||
};
|
||||
|
||||
UParseError pos;
|
||||
for (int32_t i = 0; i < 6 /*registerRules.length*/; i+=2) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
Transliterator *t = Transliterator::createFromRules(registerRules[0+i],
|
||||
registerRules[i+1], UTRANS_FORWARD, pos, status);
|
||||
if (U_FAILURE(status)) {
|
||||
errln("Fails: Unable to create the transliterator from rules.");
|
||||
} else {
|
||||
Transliterator::registerInstance(t);
|
||||
}
|
||||
}
|
||||
for (i = 0; i < 36 /*testCases.length*/; i+=3) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
Transliterator *t = Transliterator::createInstance(testCases[i+0], UTRANS_FORWARD, pos, status);
|
||||
expect(*t, testCases[i+1], testCases[i+2], 0);
|
||||
delete t;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Writes a textual escape for a code point into a caller-supplied buffer.
 *
 * Code points up to U+FFFF are written as "\u" followed by 4 hex digits;
 * larger code points are written as "\U" followed by 8 hex digits. The
 * capital U matters: a "\u" escape only consumes 4 hex digits, so an
 * 8-digit value needs the "\U" form to be read back as one code point.
 *
 * @param ch     the code point to format
 * @param buffer destination; must have room for at least 11 chars
 *               (2 prefix chars + 8 hex digits + terminating NUL)
 * @return       buffer, to allow use directly inside an expression
 */
char* Char32ToEscapedChars(UChar32 ch, char* buffer) {
    // %x expects an unsigned int, so convert explicitly rather than
    // passing the signed code point type through varargs.
    const unsigned int value = static_cast<unsigned int>(ch);
    if (ch <= 0xFFFF) {
        sprintf(buffer, "\\u%04x", value);
    } else {
        sprintf(buffer, "\\U%08x", value);
    }
    return buffer;
}
|
||||
|
||||
void TransliteratorTest::TestSurrogateCasing (void) {
|
||||
// check that casing handles surrogates
|
||||
// titlecase is currently defective
|
||||
char buffer[20];
|
||||
UChar buffer2[20];
|
||||
UChar32 dee;
|
||||
UTF_GET_CHAR(DESERET_dee,0, 0, DESERET_dee.length(), dee);
|
||||
UnicodeString DEE(u_totitle(dee));
|
||||
if (DEE != DESERET_DEE) {
|
||||
err("Fails titlecase of surrogates");
|
||||
err(Char32ToEscapedChars(dee, buffer));
|
||||
err(", ");
|
||||
errln(Char32ToEscapedChars(DEE.char32At(0), buffer));
|
||||
}
|
||||
|
||||
UnicodeString deeDEETest=DESERET_dee + DESERET_DEE;
|
||||
UnicodeString deedeeTest = DESERET_dee + DESERET_dee;
|
||||
UnicodeString DEEDEETest = DESERET_DEE + DESERET_DEE;
|
||||
UErrorCode status= U_ZERO_ERROR;
|
||||
|
||||
u_strToUpper(buffer2, 20, deeDEETest.getBuffer(), deeDEETest.length(), NULL, &status);
|
||||
if (U_FAILURE(status) || (UnicodeString(buffer2)!= DEEDEETest)) {
|
||||
errln("Fails: Can't uppercase surrogates.");
|
||||
}
|
||||
|
||||
status= U_ZERO_ERROR;
|
||||
u_strToLower(buffer2, 20, deeDEETest.getBuffer(), deeDEETest.length(), NULL, &status);
|
||||
if (U_FAILURE(status) || (UnicodeString(buffer2)!= deedeeTest)) {
|
||||
errln("Fails: Can't lowercase surrogates.");
|
||||
}
|
||||
}
|
||||
|
||||
// Check to see that incremental gets at least part way through a reasonable string.
|
||||
|
||||
void TransliteratorTest::TestIncrementalProgress(void) {
|
||||
UnicodeString test("The Quick Brown Fox Jumped Over The Lazy Dog.");
|
||||
int32_t i = 0, j=0, k=0;
|
||||
int32_t sources = Transliterator::countAvailableSources();
|
||||
for (i = 0; i < sources; i++) {
|
||||
UnicodeString source;
|
||||
Transliterator::getAvailableSource(i, source);
|
||||
if (source != UnicodeString("Latin")) continue;
|
||||
int32_t targets = Transliterator::countAvailableTargets(source);
|
||||
for (j = 0; j < targets; j++) {
|
||||
UnicodeString target;
|
||||
Transliterator::getAvailableTarget(j, source, target);
|
||||
int32_t variants = Transliterator::countAvailableVariants(source, target);
|
||||
for (k =0; k< variants; k++) {
|
||||
UnicodeString variant;
|
||||
UParseError err;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
Transliterator::getAvailableVariant(k, source, target, variant);
|
||||
UnicodeString id = source + "-" + target + "/" + variant;
|
||||
|
||||
Transliterator *t = Transliterator::createInstance(id, UTRANS_FORWARD, err, status);
|
||||
if (U_FAILURE(status)) {
|
||||
errln("FAIL: Not able to create transliterator from the composed ID.");
|
||||
}
|
||||
status = U_ZERO_ERROR;
|
||||
UnicodeString result = CheckIncrementalAux(t, test);
|
||||
Transliterator *inv = t->createInverse(status);
|
||||
CheckIncrementalAux(inv, result);
|
||||
delete t;
|
||||
delete inv;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Runs one incremental transliteration pass over a copy of the input and
 * checks that it made progress, then finishes the transliteration and
 * checks that the whole range was consumed.
 *
 * @param t     transliterator to exercise; if NULL the input is returned
 *              unchanged and nothing is checked
 * @param input text to transliterate
 * @return      the working copy of the text after both steps
 */
UnicodeString TransliteratorTest::CheckIncrementalAux(const Transliterator* t,
                                                      const UnicodeString& input) {
    if (t == NULL) {
        return input;
    }

    UTransPosition pos;
    UnicodeString test = input;

    pos.contextStart = 0;
    pos.contextLimit = input.length();
    pos.start = 0;
    pos.limit = input.length();

    // Use the incremental overload that takes the UTransPosition itself.
    // Passing pos.start / pos.limit as plain integers selects the
    // non-incremental overload and leaves pos untouched, which makes the
    // progress check below fail for every transliterator.
    UErrorCode status = U_ZERO_ERROR;
    t->transliterate(test, pos, status);
    if (U_FAILURE(status)) {
        log("Transliterate error, ");
        log(t->getID());
        errln(": incremental transliterate() reported a failure.");
        return test;
    }

    if (pos.start == 0) {
        log("No Progress, ");
        log(t->getID());
        log(": ");
        errln(formatInput(test, input, pos));
    } else {
        log("PASS Progress, ");
        log(t->getID());
        log(": ");
        logln(formatInput(test, input, pos));
    }

    // After finishing, nothing between start and limit may remain pending.
    t->finishTransliteration(test, pos);
    if (pos.start != pos.limit) {
        log("Incomplete, ");
        log(t->getID());
        log(": ");
        errln(formatInput(test, input, pos));
    }
    return test;
}
|
||||
//======================================================================
|
||||
// Support methods
|
||||
//======================================================================
|
||||
|
@ -3047,6 +3235,7 @@ void TransliteratorTest::expect(const Transliterator& t,
|
|||
expectedResult);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @param appendTo result is appended to this param.
|
||||
* @param input the string being transliterated
|
||||
|
|
|
@ -259,45 +259,59 @@ class TransliteratorTest : public IntlTest {
|
|||
/**
|
||||
* Make sure parse errors reference the right line.
|
||||
*/
|
||||
void TestParseError();
|
||||
void TestParseError(void);
|
||||
|
||||
/**
|
||||
* Make sure sets on output are disallowed.
|
||||
*/
|
||||
void TestOutputSet();
|
||||
void TestOutputSet(void);
|
||||
|
||||
/**
|
||||
* Test the use variable range pragma, making sure that use of
|
||||
* variable range characters is detected and flagged as an error.
|
||||
*/
|
||||
void TestVariableRange();
|
||||
void TestVariableRange(void);
|
||||
|
||||
/**
|
||||
* Test invalid post context error handling
|
||||
*/
|
||||
void TestInvalidPostContext();
|
||||
void TestInvalidPostContext(void);
|
||||
|
||||
/**
|
||||
* Test ID form variants
|
||||
*/
|
||||
void TestIDForms();
|
||||
void TestIDForms(void);
|
||||
|
||||
/**
|
||||
* Mark's toRules test.
|
||||
*/
|
||||
void TestToRulesMark();
|
||||
void TestToRulesMark(void);
|
||||
|
||||
/**
|
||||
* Test Escape and Unescape transliterators.
|
||||
*/
|
||||
void TestEscape();
|
||||
void TestEscape(void);
|
||||
|
||||
void TestAnchorMasking();
|
||||
void TestAnchorMasking(void);
|
||||
|
||||
/**
|
||||
* Make sure display names of variants look reasonable.
|
||||
*/
|
||||
void TestDisplayName();
|
||||
void TestDisplayName(void);
|
||||
|
||||
/**
|
||||
* Check to see if case mapping works correctly.
|
||||
*/
|
||||
void TestSpecialCases(void);
|
||||
/**
|
||||
* Check to see that incremental gets at least part way through a reasonable string.
|
||||
*/
|
||||
void TestIncrementalProgress(void);
|
||||
|
||||
/**
|
||||
* Check that casing handles surrogates.
|
||||
*/
|
||||
void TestSurrogateCasing (void);
|
||||
|
||||
//======================================================================
|
||||
// Support methods
|
||||
|
@ -333,6 +347,9 @@ class TransliteratorTest : public IntlTest {
|
|||
|
||||
void checkRules(const UnicodeString& label, Transliterator& t2,
|
||||
const UnicodeString& testRulesForward);
|
||||
UnicodeString CheckIncrementalAux(const Transliterator* t,
|
||||
const UnicodeString& input);
|
||||
|
||||
};
|
||||
|
||||
#endif
|
||||
|
|
Loading…
Add table
Reference in a new issue