diff --git a/icu4c/source/common/ucasemap.cpp b/icu4c/source/common/ucasemap.cpp index 3a1423685fc..391140d6c5e 100644 --- a/icu4c/source/common/ucasemap.cpp +++ b/icu4c/source/common/ucasemap.cpp @@ -20,6 +20,8 @@ #include "unicode/utypes.h" #include "unicode/brkiter.h" +#include "unicode/casemap.h" +#include "unicode/edits.h" #include "unicode/ubrk.h" #include "unicode/uloc.h" #include "unicode/ustring.h" @@ -32,10 +34,32 @@ #include "unicode/utf16.h" #include "cmemory.h" #include "cstring.h" +#include "uassert.h" #include "ucase.h" #include "ucasemap_imp.h" #include "ustr_imp.h" +U_NAMESPACE_BEGIN + +namespace { + +// TODO: share with UTF-16? inline in ucasemap_imp.h? +int32_t checkOverflowAndEditsError(int32_t destIndex, int32_t destCapacity, + Edits *edits, UErrorCode &errorCode) { + if (U_SUCCESS(errorCode)) { + if (destIndex > destCapacity) { + errorCode = U_BUFFER_OVERFLOW_ERROR; + } else if (edits != NULL) { + edits->copyErrorTo(errorCode); + } + } + return destIndex; +} + +} // namespace + +U_NAMESPACE_END + U_NAMESPACE_USE /* UCaseMap service object -------------------------------------------------- */ @@ -124,12 +148,13 @@ ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode) { /* UTF-8 string case mappings ----------------------------------------------- */ -/* TODO(markus): Move to a new, separate utf8case.c file. */ +/* TODO(markus): Move to a new, separate utf8case.cpp file. */ /* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */ static inline int32_t appendResult(uint8_t *dest, int32_t destIndex, int32_t destCapacity, - int32_t result, const UChar *s) { + int32_t result, const UChar *s, + int32_t cpLength, uint32_t options, icu::Edits *edits) { UChar32 c; int32_t length; UErrorCode errorCode; @@ -137,86 +162,126 @@ appendResult(uint8_t *dest, int32_t destIndex, int32_t destCapacity, /* decode the result */ if(result<0) { /* (not) original code point */ + if(edits!=NULL) { + edits->addUnchanged(cpLength); + if(options & UCASEMAP_OMIT_UNCHANGED_TEXT) { + return destIndex; + } + } c=~result; - length=U8_LENGTH(c); - } else if(result<=UCASE_MAX_STRING_LENGTH) { - c=U_SENTINEL; - length=result; + if(destIndex(INT32_MAX-destIndex)) { + return -1; // integer overflow + } + if(edits!=NULL) { + edits->addReplace(cpLength, length); + } + // We might have an overflow, but we know the actual length. + return destIndex+length; + } else if(destIndexaddReplace(cpLength, 1); + } + return destIndex; + } else { + c=result; + length=U8_LENGTH(c); + if(edits!=NULL) { + edits->addReplace(cpLength, length); + } + } } + // c>=0 single code point if(length>(INT32_MAX-destIndex)) { return -1; // integer overflow } if(destIndex=0) { - /* code point */ - UBool isError=FALSE; - U8_APPEND(dest, destIndex, destCapacity, c, isError); - if(isError) { - /* overflow, nothing written */ - destIndex+=length; - } - } else { - /* string */ - int32_t destLength; - errorCode=U_ZERO_ERROR; - u_strToUTF8( - (char *)(dest+destIndex), destCapacity-destIndex, &destLength, - s, length, - &errorCode); - if(U_FAILURE(errorCode) && errorCode != U_BUFFER_OVERFLOW_ERROR) { - return -1; - } - if(destLength>(INT32_MAX-destIndex)) { - return -1; // integer overflow - } - destIndex+=destLength; - /* we might have an overflow, but we know the actual length */ + UBool isError=FALSE; + U8_APPEND(dest, destIndex, destCapacity, c, isError); + if(isError) { + /* overflow, nothing written */ + destIndex+=length; } } else { /* preflight */ - if(c>=0) { - destIndex+=length; - } else { - int32_t destLength; - errorCode=U_ZERO_ERROR; - u_strToUTF8( - NULL, 0, &destLength, - s, length, - &errorCode); - if(U_FAILURE(errorCode) && errorCode != U_BUFFER_OVERFLOW_ERROR) { - return -1; - } - if(destLength>(INT32_MAX-destIndex)) { - return -1; // integer overflow - } - destIndex+=destLength; - } + destIndex+=length; } return destIndex; } static inline int32_t -appendUChar(uint8_t *dest, int32_t destIndex, int32_t destCapacity, UChar c) { - int32_t length=U8_LENGTH(c); - if(length>(INT32_MAX-destIndex)) { +appendASCII(uint8_t *dest, int32_t destIndex, int32_t destCapacity, uint8_t c) { + if(destIndex> 6) | 0xc0); } +static inline uint8_t getTwoByteTrail(UChar32 c) { return (uint8_t)((c & 0x3f) | 0x80); } + +static inline int32_t +appendTwoBytes(uint8_t *dest, int32_t destIndex, int32_t destCapacity, UChar32 c) { + U_ASSERT(0x370 <= c && c <= 0x3ff); // 2-byte UTF-8, main Greek block + if(2>(INT32_MAX-destIndex)) { + return -1; // integer overflow + } + int32_t limit=destIndex+2; if(limit<=destCapacity) { - U8_APPEND_UNSAFE(dest, destIndex, c); + dest+=destIndex; + dest[0]=getTwoByteLead(c); + dest[1]=getTwoByteTrail(c); } return limit; } static inline int32_t -appendString(uint8_t *dest, int32_t destIndex, int32_t destCapacity, - const uint8_t *s, int32_t length) { +appendTwoBytes(uint8_t *dest, int32_t destIndex, int32_t destCapacity, const char *s) { + if(2>(INT32_MAX-destIndex)) { + return -1; // integer overflow + } + int32_t limit=destIndex+2; + if(limit<=destCapacity) { + dest+=destIndex; + dest[0]=(uint8_t)s[0]; + dest[1]=(uint8_t)s[1]; + } + return limit; +} + +static inline int32_t +appendUnchanged(uint8_t *dest, int32_t destIndex, int32_t destCapacity, + const uint8_t *s, int32_t length, uint32_t options, icu::Edits *edits) { if(length>0) { + if(edits!=NULL) { + edits->addUnchanged(length); + if(options & UCASEMAP_OMIT_UNCHANGED_TEXT) { + return destIndex; + } + } if(length>(INT32_MAX-destIndex)) { return -1; // integer overflow } @@ -265,47 +330,41 @@ utf8_caseContextIterator(void *context, int8_t dir) { * context [0..srcLength[ into account. */ static int32_t -_caseMap(int32_t caseLocale, uint32_t /* TODO: options */, UCaseMapFull *map, +_caseMap(int32_t caseLocale, uint32_t options, UCaseMapFull *map, uint8_t *dest, int32_t destCapacity, const uint8_t *src, UCaseContext *csc, int32_t srcStart, int32_t srcLimit, - UErrorCode *pErrorCode) { - const UChar *s = NULL; - UChar32 c, c2 = 0; - int32_t srcIndex, destIndex; - + icu::Edits *edits, + UErrorCode &errorCode) { /* case mapping loop */ - srcIndex=srcStart; - destIndex=0; + int32_t srcIndex=srcStart; + int32_t destIndex=0; while(srcIndexcpStart=srcIndex; + int32_t cpStart; + csc->cpStart=cpStart=srcIndex; + UChar32 c; U8_NEXT(src, srcIndex, srcLimit, c); csc->cpLimit=srcIndex; if(c<0) { // Malformed UTF-8. - destIndex=appendString(dest, destIndex, destCapacity, src+csc->cpStart, srcIndex-csc->cpStart); + destIndex=appendUnchanged(dest, destIndex, destCapacity, + src+cpStart, srcIndex-cpStart, options, edits); if(destIndex<0) { - *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + errorCode=U_INDEX_OUTOFBOUNDS_ERROR; return 0; } continue; } + const UChar *s; c=map(c, utf8_caseContextIterator, csc, &s, caseLocale); - if((destIndexdestCapacity) { - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - } return destIndex; } @@ -316,13 +375,9 @@ ucasemap_internalUTF8ToTitle( int32_t caseLocale, uint32_t options, BreakIterator *iter, uint8_t *dest, int32_t destCapacity, const uint8_t *src, int32_t srcLength, - UErrorCode *pErrorCode) { - const UChar *s; - UChar32 c; - int32_t prev, titleStart, titleLimit, idx, destIndex; - UBool isFirstIndex; - - if(U_FAILURE(*pErrorCode)) { + icu::Edits *edits, + UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { return 0; } @@ -330,21 +385,22 @@ ucasemap_internalUTF8ToTitle( UCaseContext csc=UCASECONTEXT_INITIALIZER; csc.p=(void *)src; csc.limit=srcLength; - destIndex=0; - prev=0; - isFirstIndex=TRUE; + int32_t destIndex=0; + int32_t prev=0; + UBool isFirstIndex=TRUE; /* titlecasing loop */ while(prevfirst(); + index=iter->first(); } else { - idx=iter->next(); + index=iter->next(); } - if(idx==UBRK_DONE || idx>srcLength) { - idx=srcLength; + if(index==UBRK_DONE || index>srcLength) { + index=srcLength; } /* @@ -360,29 +416,32 @@ ucasemap_internalUTF8ToTitle( * b) first case letter (titlecase) [titleStart..titleLimit[ * c) subsequent characters (lowercase) [titleLimit..index[ */ - if(prev=0) { csc.cpStart=titleStart; csc.cpLimit=titleLimit; + const UChar *s; c=ucase_toFullTitle(c, utf8_caseContextIterator, &csc, &s, caseLocale); - destIndex=appendResult(dest, destIndex, destCapacity, c, s); + destIndex=appendResult(dest, destIndex, destCapacity, c, s, + titleLimit-titleStart, options, edits); } else { // Malformed UTF-8. - destIndex=appendString(dest, destIndex, destCapacity, src+titleStart, titleLimit-titleStart); + destIndex=appendUnchanged(dest, destIndex, destCapacity, + src+titleStart, titleLimit-titleStart, options, edits); } if(destIndex<0) { - *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + errorCode=U_INDEX_OUTOFBOUNDS_ERROR; return 0; } /* Special case Dutch IJ titlecasing */ - if (titleStart+1 < idx && + if (titleStart+1 < index && caseLocale == UCASE_LOC_DUTCH && - (src[titleStart] == 0x0049 || src[titleStart] == 0x0069) && - (src[titleStart+1] == 0x004A || src[titleStart+1] == 0x006A)) { - destIndex=appendUChar(dest, destIndex, destCapacity, 0x004A); - titleLimit++; + (src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) { + if (src[titleStart+1] == 0x006A) { + destIndex=appendASCII(dest, destIndex, destCapacity, 0x004A); + if(destIndex<0) { + errorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + if(edits!=NULL) { + edits->addReplace(1, 1); + } + titleLimit++; + } else if (src[titleStart+1] == 0x004A) { + // Keep the capital J from getting lowercased. + destIndex=appendUnchanged(dest, destIndex, destCapacity, + src+titleStart+1, 1, options, edits); + if(destIndex<0) { + errorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + titleLimit++; + } } + /* lowercase [titleLimit..index[ */ - if(titleLimitdestCapacity) { - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - } - return destIndex; + return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode); } #endif @@ -471,10 +549,11 @@ UBool isFollowedByCasedLetter(const uint8_t *s, int32_t i, int32_t length) { } // Keep this consistent with the UTF-16 version in ustrcase.cpp and the Java version in CaseMap.java. -int32_t toUpper(int32_t caseLocale, uint32_t /* TODO: options */, +int32_t toUpper(uint32_t options, uint8_t *dest, int32_t destCapacity, const uint8_t *src, int32_t srcLength, - UErrorCode *pErrorCode) { + Edits *edits, + UErrorCode &errorCode) { int32_t destIndex=0; uint32_t state = 0; for (int32_t i = 0; i < srcLength;) { @@ -550,40 +629,75 @@ int32_t toUpper(int32_t caseLocale, uint32_t /* TODO: options */, data &= ~HAS_EITHER_DIALYTIKA; } } - destIndex=appendUChar(dest, destIndex, destCapacity, (UChar)upper); - if (destIndex >= 0 && (data & HAS_EITHER_DIALYTIKA) != 0) { - destIndex=appendUChar(dest, destIndex, destCapacity, 0x308); // restore or add a dialytika + + UBool change = TRUE; + if (edits != NULL) { + // Find out first whether we are changing the text. + U_ASSERT(0x370 <= upper && upper <= 0x3ff); // 2-byte UTF-8, main Greek block + change = (i + 2) > nextIndex || + src[i] != getTwoByteLead(upper) || src[i + 1] != getTwoByteTrail(upper) || + numYpogegrammeni > 0; + int32_t i2 = i + 2; + if ((data & HAS_EITHER_DIALYTIKA) != 0) { + change |= (i2 + 2) > nextIndex || + src[i2] != (uint8_t)u8"\u0308"[0] || + src[i2 + 1] != (uint8_t)u8"\u0308"[1]; + i2 += 2; + } + if (addTonos) { + change |= (i2 + 2) > nextIndex || + src[i2] != (uint8_t)u8"\u0301"[0] || + src[i2 + 1] != (uint8_t)u8"\u0301"[1]; + i2 += 2; + } + int32_t oldLength = nextIndex - i; + int32_t newLength = (i2 - i) + numYpogegrammeni * 2; // 2 bytes per U+0399 + change |= oldLength != newLength; + if (change) { + if (edits != NULL) { + edits->addReplace(oldLength, newLength); + } + } else { + if (edits != NULL) { + edits->addUnchanged(oldLength); + } + // Write unchanged text? + change = (options & UCASEMAP_OMIT_UNCHANGED_TEXT) == 0; + } } - if (destIndex >= 0 && addTonos) { - destIndex=appendUChar(dest, destIndex, destCapacity, 0x301); - } - while (destIndex >= 0 && numYpogegrammeni > 0) { - destIndex=appendUChar(dest, destIndex, destCapacity, 0x399); - --numYpogegrammeni; - } - if(destIndex<0) { - *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; - return 0; - } - } else if(c>=0) { - const UChar *s; - UChar32 c2 = 0; - c=ucase_toFullUpper(c, NULL, NULL, &s, caseLocale); - if((destIndex= 0 && (data & HAS_EITHER_DIALYTIKA) != 0) { + destIndex=appendTwoBytes(dest, destIndex, destCapacity, u8"\u0308"); // restore or add a dialytika + } + if (destIndex >= 0 && addTonos) { + destIndex=appendTwoBytes(dest, destIndex, destCapacity, u8"\u0301"); + } + while (destIndex >= 0 && numYpogegrammeni > 0) { + destIndex=appendTwoBytes(dest, destIndex, destCapacity, u8"\u0399"); + --numYpogegrammeni; + } if(destIndex<0) { - *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + errorCode=U_INDEX_OUTOFBOUNDS_ERROR; return 0; } } + } else if(c>=0) { + const UChar *s; + c=ucase_toFullUpper(c, NULL, NULL, &s, UCASE_LOC_GREEK); + destIndex = appendResult(dest, destIndex, destCapacity, c, s, + nextIndex - i, options, edits); + if (destIndex < 0) { + errorCode = U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } } else { // Malformed UTF-8. - destIndex=appendString(dest, destIndex, destCapacity, src+i, nextIndex-i); + destIndex=appendUnchanged(dest, destIndex, destCapacity, + src+i, nextIndex-i, options, edits); if(destIndex<0) { - *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + errorCode=U_INDEX_OUTOFBOUNDS_ERROR; return 0; } } @@ -591,9 +705,6 @@ int32_t toUpper(int32_t caseLocale, uint32_t /* TODO: options */, state = nextState; } - if(destIndex>destCapacity) { - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - } return destIndex; } @@ -604,77 +715,76 @@ static int32_t U_CALLCONV ucasemap_internalUTF8ToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED uint8_t *dest, int32_t destCapacity, const uint8_t *src, int32_t srcLength, - UErrorCode *pErrorCode) { + icu::Edits *edits, + UErrorCode &errorCode) { UCaseContext csc=UCASECONTEXT_INITIALIZER; csc.p=(void *)src; csc.limit=srcLength; - return _caseMap( + int32_t destIndex = _caseMap( caseLocale, options, ucase_toFullLower, dest, destCapacity, src, &csc, 0, srcLength, - pErrorCode); + edits, errorCode); + return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode); } static int32_t U_CALLCONV ucasemap_internalUTF8ToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED uint8_t *dest, int32_t destCapacity, const uint8_t *src, int32_t srcLength, - UErrorCode *pErrorCode) { + icu::Edits *edits, + UErrorCode &errorCode) { + int32_t destIndex; if (caseLocale == UCASE_LOC_GREEK) { - return GreekUpper::toUpper(caseLocale, options, dest, destCapacity, src, srcLength, pErrorCode); + destIndex = GreekUpper::toUpper(options, dest, destCapacity, + src, srcLength, edits, errorCode); + } else { + UCaseContext csc=UCASECONTEXT_INITIALIZER; + csc.p=(void *)src; + csc.limit=srcLength; + destIndex = _caseMap( + caseLocale, options, ucase_toFullUpper, + dest, destCapacity, + src, &csc, 0, srcLength, + edits, errorCode); } - UCaseContext csc=UCASECONTEXT_INITIALIZER; - csc.p=(void *)src; - csc.limit=srcLength; - return _caseMap( - caseLocale, options, ucase_toFullUpper, - dest, destCapacity, - src, &csc, 0, srcLength, - pErrorCode); + return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode); } static int32_t U_CALLCONV ucasemap_internalUTF8Fold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED uint8_t *dest, int32_t destCapacity, const uint8_t *src, int32_t srcLength, - UErrorCode *pErrorCode) { - int32_t srcIndex, destIndex; - - const UChar *s; - UChar32 c, c2; - int32_t start; - + icu::Edits *edits, + UErrorCode &errorCode) { /* case mapping loop */ - srcIndex=destIndex=0; - while(srcIndexdestCapacity) { - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - } - return destIndex; + return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode); } U_CFUNC int32_t @@ -682,11 +792,12 @@ ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_P uint8_t *dest, int32_t destCapacity, const uint8_t *src, int32_t srcLength, UTF8CaseMapper *stringCaseMapper, - UErrorCode *pErrorCode) { + icu::Edits *edits, + UErrorCode &errorCode) { int32_t destLength; /* check argument values */ - if(U_FAILURE(*pErrorCode)) { + if(U_FAILURE(errorCode)) { return 0; } if( destCapacity<0 || @@ -694,7 +805,7 @@ ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_P src==NULL || srcLength<-1 ) { - *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + errorCode=U_ILLEGAL_ARGUMENT_ERROR; return 0; } @@ -708,13 +819,16 @@ ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_P ((src>=dest && src<(dest+destCapacity)) || (dest>=src && dest<(src+srcLength))) ) { - *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + errorCode=U_ILLEGAL_ARGUMENT_ERROR; return 0; } + if(edits!=NULL) { + edits->reset(); + } destLength=stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR - dest, destCapacity, src, srcLength, pErrorCode); - return u_terminateChars((char *)dest, destCapacity, destLength, pErrorCode); + dest, destCapacity, src, srcLength, edits, errorCode); + return u_terminateChars((char *)dest, destCapacity, destLength, &errorCode); } /* public API functions */ @@ -728,7 +842,7 @@ ucasemap_utf8ToLower(const UCaseMap *csm, csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL (uint8_t *)dest, destCapacity, (const uint8_t *)src, srcLength, - ucasemap_internalUTF8ToLower, pErrorCode); + ucasemap_internalUTF8ToLower, NULL, *pErrorCode); } U_CAPI int32_t U_EXPORT2 @@ -740,7 +854,7 @@ ucasemap_utf8ToUpper(const UCaseMap *csm, csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL (uint8_t *)dest, destCapacity, (const uint8_t *)src, srcLength, - ucasemap_internalUTF8ToUpper, pErrorCode); + ucasemap_internalUTF8ToUpper, NULL, *pErrorCode); } U_CAPI int32_t U_EXPORT2 @@ -752,5 +866,45 @@ ucasemap_utf8FoldCase(const UCaseMap *csm, UCASE_LOC_ROOT, csm->options, UCASEMAP_BREAK_ITERATOR_NULL (uint8_t *)dest, destCapacity, (const uint8_t *)src, srcLength, - ucasemap_internalUTF8Fold, pErrorCode); + ucasemap_internalUTF8Fold, NULL, *pErrorCode); } + +U_NAMESPACE_BEGIN + +int32_t CaseMap::utf8ToLower( + const char *locale, uint32_t options, + const char *src, int32_t srcLength, + char *dest, int32_t destCapacity, Edits *edits, + UErrorCode &errorCode) { + return ucasemap_mapUTF8( + ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL + (uint8_t *)dest, destCapacity, + (const uint8_t *)src, srcLength, + ucasemap_internalUTF8ToLower, edits, errorCode); +} + +int32_t CaseMap::utf8ToUpper( + const char *locale, uint32_t options, + const char *src, int32_t srcLength, + char *dest, int32_t destCapacity, Edits *edits, + UErrorCode &errorCode) { + return ucasemap_mapUTF8( + ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL + (uint8_t *)dest, destCapacity, + (const uint8_t *)src, srcLength, + ucasemap_internalUTF8ToUpper, edits, errorCode); +} + +int32_t CaseMap::utf8Fold( + uint32_t options, + const char *src, int32_t srcLength, + char *dest, int32_t destCapacity, Edits *edits, + UErrorCode &errorCode) { + return ucasemap_mapUTF8( + UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL + (uint8_t *)dest, destCapacity, + (const uint8_t *)src, srcLength, + ucasemap_internalUTF8Fold, edits, errorCode); +} + +U_NAMESPACE_END diff --git a/icu4c/source/common/ucasemap_imp.h b/icu4c/source/common/ucasemap_imp.h index e400f4af1d0..79204226b00 100644 --- a/icu4c/source/common/ucasemap_imp.h +++ b/icu4c/source/common/ucasemap_imp.h @@ -172,7 +172,8 @@ UTF8CaseMapper(int32_t caseLocale, uint32_t options, #endif uint8_t *dest, int32_t destCapacity, const uint8_t *src, int32_t srcLength, - UErrorCode *pErrorCode); + icu::Edits *edits, + UErrorCode &errorCode); #if !UCONFIG_NO_BREAK_ITERATION @@ -182,7 +183,8 @@ ucasemap_internalUTF8ToTitle(int32_t caseLocale, uint32_t options, icu::BreakIterator *iter, uint8_t *dest, int32_t destCapacity, const uint8_t *src, int32_t srcLength, - UErrorCode *pErrorCode); + icu::Edits *edits, + UErrorCode &errorCode); #endif @@ -195,7 +197,8 @@ ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_P uint8_t *dest, int32_t destCapacity, const uint8_t *src, int32_t srcLength, UTF8CaseMapper *stringCaseMapper, - UErrorCode *pErrorCode); + icu::Edits *edits, + UErrorCode &errorCode); U_NAMESPACE_BEGIN namespace GreekUpper { diff --git a/icu4c/source/common/ucasemap_titlecase_brkiter.cpp b/icu4c/source/common/ucasemap_titlecase_brkiter.cpp index e956894cf83..a253850fa29 100644 --- a/icu4c/source/common/ucasemap_titlecase_brkiter.cpp +++ b/icu4c/source/common/ucasemap_titlecase_brkiter.cpp @@ -23,11 +23,45 @@ #include "unicode/brkiter.h" #include "unicode/ubrk.h" +#include "unicode/casemap.h" #include "unicode/ucasemap.h" #include "cmemory.h" #include "ucase.h" #include "ucasemap_imp.h" +U_NAMESPACE_BEGIN + +int32_t CaseMap::utf8ToTitle( + const char *locale, uint32_t options, BreakIterator *iter, + const char *src, int32_t srcLength, + char *dest, int32_t destCapacity, Edits *edits, + UErrorCode &errorCode) { + if (U_FAILURE(errorCode)) { + return 0; + } + UText utext=UTEXT_INITIALIZER; + utext_openUTF8(&utext, src, srcLength, &errorCode); + LocalPointer ownedIter; + if(iter==NULL) { + iter=BreakIterator::createWordInstance(Locale(locale), errorCode); + ownedIter.adoptInstead(iter); + } + if(U_FAILURE(errorCode)) { + utext_close(&utext); + return 0; + } + iter->setText(&utext, errorCode); + int32_t length=ucasemap_mapUTF8( + ustrcase_getCaseLocale(locale), options, iter, + (uint8_t *)dest, destCapacity, + (const uint8_t *)src, srcLength, + ucasemap_internalUTF8ToTitle, edits, errorCode); + utext_close(&utext); + return length; +} + +U_NAMESPACE_END + U_NAMESPACE_USE U_CAPI const UBreakIterator * U_EXPORT2 @@ -65,7 +99,7 @@ ucasemap_utf8ToTitle(UCaseMap *csm, csm->caseLocale, csm->options, csm->iter, (uint8_t *)dest, destCapacity, (const uint8_t *)src, srcLength, - ucasemap_internalUTF8ToTitle, pErrorCode); + ucasemap_internalUTF8ToTitle, NULL, *pErrorCode); utext_close(&utext); return length; } diff --git a/icu4c/source/common/unicode/casemap.h b/icu4c/source/common/unicode/casemap.h index 2e685eef7ae..98184820d53 100644 --- a/icu4c/source/common/unicode/casemap.h +++ b/icu4c/source/common/unicode/casemap.h @@ -47,6 +47,7 @@ public: * without writing any of the result string. * @param edits Records edits for index mapping, working with styled text, * and getting only changes (if any). + * The Edits contents is undefined if any error occurs. * This function calls edits->reset() first. edits can be NULL. * @param errorCode Reference to an in/out error code value * which must not indicate a failure before the function call. @@ -81,6 +82,7 @@ public: * without writing any of the result string. * @param edits Records edits for index mapping, working with styled text, * and getting only changes (if any). + * The Edits contents is undefined if any error occurs. * This function calls edits->reset() first. edits can be NULL. * @param errorCode Reference to an in/out error code value * which must not indicate a failure before the function call. @@ -127,6 +129,7 @@ public: * without writing any of the result string. * @param edits Records edits for index mapping, working with styled text, * and getting only changes (if any). + * The Edits contents is undefined if any error occurs. * This function calls edits->reset() first. edits can be NULL. * @param errorCode Reference to an in/out error code value * which must not indicate a failure before the function call. @@ -168,6 +171,7 @@ public: * without writing any of the result string. * @param edits Records edits for index mapping, working with styled text, * and getting only changes (if any). + * The Edits contents is undefined if any error occurs. * This function calls edits->reset() first. edits can be NULL. * @param errorCode Reference to an in/out error code value * which must not indicate a failure before the function call. @@ -184,6 +188,164 @@ public: char16_t *dest, int32_t destCapacity, Edits *edits, UErrorCode &errorCode); + /** + * Lowercases a UTF-8 string and optionally records edits. + * Casing is locale-dependent and context-sensitive. + * The result may be longer or shorter than the original. + * The source string and the destination buffer must not overlap. + * + * @param locale The locale ID. ("" = root locale, NULL = default locale.) + * @param options Options bit set, usually 0. See UCASEMAP_OMIT_UNCHANGED_TEXT. + * @param src The original string. + * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. + * @param dest A buffer for the result string. The result will be NUL-terminated if + * the buffer is large enough. + * The contents is undefined in case of failure. + * @param destCapacity The size of the buffer (number of bytes). If it is 0, then + * dest may be NULL and the function will only return the length of the result + * without writing any of the result string. + * @param edits Records edits for index mapping, working with styled text, + * and getting only changes (if any). + * The Edits contents is undefined if any error occurs. + * This function calls edits->reset() first. edits can be NULL. + * @param errorCode Reference to an in/out error code value + * which must not indicate a failure before the function call. + * @return The length of the result string, if successful. + * When the result would be longer than destCapacity, + * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set. + * + * @see ucasemap_utf8ToLower + * @draft ICU 59 + */ + static int32_t utf8ToLower( + const char *locale, uint32_t options, + const char *src, int32_t srcLength, + char *dest, int32_t destCapacity, Edits *edits, + UErrorCode &errorCode); + + /** + * Uppercases a UTF-8 string and optionally records edits. + * Casing is locale-dependent and context-sensitive. + * The result may be longer or shorter than the original. + * The source string and the destination buffer must not overlap. + * + * @param locale The locale ID. ("" = root locale, NULL = default locale.) + * @param options Options bit set, usually 0. See UCASEMAP_OMIT_UNCHANGED_TEXT. + * @param src The original string. + * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. + * @param dest A buffer for the result string. The result will be NUL-terminated if + * the buffer is large enough. + * The contents is undefined in case of failure. + * @param destCapacity The size of the buffer (number of bytes). If it is 0, then + * dest may be NULL and the function will only return the length of the result + * without writing any of the result string. + * @param edits Records edits for index mapping, working with styled text, + * and getting only changes (if any). + * The Edits contents is undefined if any error occurs. + * This function calls edits->reset() first. edits can be NULL. + * @param errorCode Reference to an in/out error code value + * which must not indicate a failure before the function call. + * @return The length of the result string, if successful. + * When the result would be longer than destCapacity, + * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set. + * + * @see ucasemap_utf8ToUpper + * @draft ICU 59 + */ + static int32_t utf8ToUpper( + const char *locale, uint32_t options, + const char *src, int32_t srcLength, + char *dest, int32_t destCapacity, Edits *edits, + UErrorCode &errorCode); + +#if !UCONFIG_NO_BREAK_ITERATION + + /** + * Titlecases a UTF-8 string and optionally records edits. + * Casing is locale-dependent and context-sensitive. + * The result may be longer or shorter than the original. + * The source string and the destination buffer must not overlap. + * + * Titlecasing uses a break iterator to find the first characters of words + * that are to be titlecased. It titlecases those characters and lowercases + * all others. (This can be modified with options bits.) + * + * @param locale The locale ID. ("" = root locale, NULL = default locale.) + * @param options Options bit set, usually 0. See UCASEMAP_OMIT_UNCHANGED_TEXT, + * U_TITLECASE_NO_LOWERCASE, U_TITLECASE_NO_BREAK_ADJUSTMENT. + * @param iter A break iterator to find the first characters of words that are to be titlecased. + * It is set to the source string (setText()) + * and used one or more times for iteration (first() and next()). + * If NULL, then a word break iterator for the locale is used + * (or something equivalent). + * @param src The original string. + * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. + * @param dest A buffer for the result string. The result will be NUL-terminated if + * the buffer is large enough. + * The contents is undefined in case of failure. + * @param destCapacity The size of the buffer (number of bytes). If it is 0, then + * dest may be NULL and the function will only return the length of the result + * without writing any of the result string. + * @param edits Records edits for index mapping, working with styled text, + * and getting only changes (if any). + * The Edits contents is undefined if any error occurs. + * This function calls edits->reset() first. edits can be NULL. + * @param errorCode Reference to an in/out error code value + * which must not indicate a failure before the function call. + * @return The length of the result string, if successful. + * When the result would be longer than destCapacity, + * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set. + * + * @see ucasemap_utf8ToTitle + * @draft ICU 59 + */ + static int32_t utf8ToTitle( + const char *locale, uint32_t options, BreakIterator *iter, + const char *src, int32_t srcLength, + char *dest, int32_t destCapacity, Edits *edits, + UErrorCode &errorCode); + +#endif // UCONFIG_NO_BREAK_ITERATION + + /** + * Case-folds a UTF-8 string and optionally records edits. + * + * Case folding is locale-independent and not context-sensitive, + * but there is an option for whether to include or exclude mappings for dotted I + * and dotless i that are marked with 'T' in CaseFolding.txt. + * + * The result may be longer or shorter than the original. + * The source string and the destination buffer must not overlap. + * + * @param options Options bit set, usually 0. See UCASEMAP_OMIT_UNCHANGED_TEXT, + * U_FOLD_CASE_DEFAULT, U_FOLD_CASE_EXCLUDE_SPECIAL_I. + * @param src The original string. + * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. + * @param dest A buffer for the result string. The result will be NUL-terminated if + * the buffer is large enough. + * The contents is undefined in case of failure. + * @param destCapacity The size of the buffer (number of bytes). If it is 0, then + * dest may be NULL and the function will only return the length of the result + * without writing any of the result string. + * @param edits Records edits for index mapping, working with styled text, + * and getting only changes (if any). + * The Edits contents is undefined if any error occurs. + * This function calls edits->reset() first. edits can be NULL. + * @param errorCode Reference to an in/out error code value + * which must not indicate a failure before the function call. + * @return The length of the result string, if successful. + * When the result would be longer than destCapacity, + * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set. + * + * @see ucasemap_utf8FoldCase + * @draft ICU 59 + */ + static int32_t utf8Fold( + uint32_t options, + const char *src, int32_t srcLength, + char *dest, int32_t destCapacity, Edits *edits, + UErrorCode &errorCode); + private: CaseMap() = delete; CaseMap(const CaseMap &other) = delete; diff --git a/icu4c/source/common/ustrcase.cpp b/icu4c/source/common/ustrcase.cpp index 0e38a42e103..b12e7a7c0b3 100644 --- a/icu4c/source/common/ustrcase.cpp +++ b/icu4c/source/common/ustrcase.cpp @@ -1000,7 +1000,7 @@ int32_t toUpper(uint32_t options, state = nextState; } - return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode); + return destIndex; } } // namespace GreekUpper @@ -1031,17 +1031,20 @@ ustrcase_internalToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_IT const UChar *src, int32_t srcLength, icu::Edits *edits, UErrorCode &errorCode) { + int32_t destIndex; if (caseLocale == UCASE_LOC_GREEK) { - return GreekUpper::toUpper(options, dest, destCapacity, src, srcLength, edits, errorCode); + destIndex = GreekUpper::toUpper(options, dest, destCapacity, + src, srcLength, edits, errorCode); + } else { + UCaseContext csc=UCASECONTEXT_INITIALIZER; + csc.p=(void *)src; + csc.limit=srcLength; + destIndex = _caseMap( + caseLocale, options, ucase_toFullUpper, + dest, destCapacity, + src, &csc, 0, srcLength, + edits, errorCode); } - UCaseContext csc=UCASECONTEXT_INITIALIZER; - csc.p=(void *)src; - csc.limit=srcLength; - int32_t destIndex = _caseMap( - caseLocale, options, ucase_toFullUpper, - dest, destCapacity, - src, &csc, 0, srcLength, - edits, errorCode); return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode); } diff --git a/icu4c/source/test/intltest/strcase.cpp b/icu4c/source/test/intltest/strcase.cpp index 99a11af5ac1..037a090ea3f 100644 --- a/icu4c/source/test/intltest/strcase.cpp +++ b/icu4c/source/test/intltest/strcase.cpp @@ -59,10 +59,11 @@ public: void TestBufferOverflow(); void TestEdits(); void TestCaseMapWithEdits(); + void TestCaseMapUTF8WithEdits(); void TestLongUnicodeString(); private: - void assertGreekUpper(const char *s, const char *expected); + void assertGreekUpper(const char16_t *s, const char16_t *expected); void checkEditsIter( const UnicodeString &name, Edits::Iterator ei1, Edits::Iterator ei2, // two equal iterators const EditChange expected[], int32_t expLength, UBool withUnchanged, @@ -96,6 +97,7 @@ StringCaseTest::runIndexedTest(int32_t index, UBool exec, const char *&name, cha TESTCASE_AUTO(TestBufferOverflow); TESTCASE_AUTO(TestEdits); TESTCASE_AUTO(TestCaseMapWithEdits); + TESTCASE_AUTO(TestCaseMapUTF8WithEdits); TESTCASE_AUTO(TestLongUnicodeString); TESTCASE_AUTO_END; } @@ -629,9 +631,9 @@ StringCaseTest::TestFullCaseFoldingIterator() { } void -StringCaseTest::assertGreekUpper(const char *s, const char *expected) { - UnicodeString s16 = UnicodeString(s).unescape(); - UnicodeString expected16 = UnicodeString(expected).unescape(); +StringCaseTest::assertGreekUpper(const char16_t *s, const char16_t *expected) { + UnicodeString s16(s); + UnicodeString expected16(expected); UnicodeString msg = UnicodeString("UnicodeString::toUpper/Greek(\"") + s16 + "\")"; UnicodeString result16(s16); result16.toUpper(GREEK_LOCALE_); @@ -713,86 +715,31 @@ StringCaseTest::assertGreekUpper(const char *s, const char *expected) { void StringCaseTest::TestGreekUpper() { - // See UCharacterCaseTest.java for human-readable strings. - // http://bugs.icu-project.org/trac/ticket/5456 - assertGreekUpper("\\u03AC\\u03B4\\u03B9\\u03BA\\u03BF\\u03C2, " - "\\u03BA\\u03B5\\u03AF\\u03BC\\u03B5\\u03BD\\u03BF, " - "\\u03AF\\u03C1\\u03B9\\u03B4\\u03B1", - "\\u0391\\u0394\\u0399\\u039A\\u039F\\u03A3, " - "\\u039A\\u0395\\u0399\\u039C\\u0395\\u039D\\u039F, " - "\\u0399\\u03A1\\u0399\\u0394\\u0391"); + assertGreekUpper(u"άδικος, κείμενο, ίριδα", u"ΑΔΙΚΟΣ, ΚΕΙΜΕΝΟ, ΙΡΙΔΑ"); // https://bugzilla.mozilla.org/show_bug.cgi?id=307039 // https://bug307039.bmoattachments.org/attachment.cgi?id=194893 - assertGreekUpper("\\u03A0\\u03B1\\u03C4\\u03AC\\u03C4\\u03B1", - "\\u03A0\\u0391\\u03A4\\u0391\\u03A4\\u0391"); - assertGreekUpper("\\u0391\\u03AD\\u03C1\\u03B1\\u03C2, " - "\\u039C\\u03C5\\u03C3\\u03C4\\u03AE\\u03C1\\u03B9\\u03BF, " - "\\u03A9\\u03C1\\u03B1\\u03AF\\u03BF", - "\\u0391\\u0395\\u03A1\\u0391\\u03A3, " - "\\u039C\\u03A5\\u03A3\\u03A4\\u0397\\u03A1\\u0399\\u039F, " - "\\u03A9\\u03A1\\u0391\\u0399\\u039F"); - assertGreekUpper("\\u039C\\u03B1\\u0390\\u03BF\\u03C5, \\u03A0\\u03CC\\u03C1\\u03BF\\u03C2, " - "\\u03A1\\u03CD\\u03B8\\u03BC\\u03B9\\u03C3\\u03B7", - "\\u039C\\u0391\\u03AA\\u039F\\u03A5, \\u03A0\\u039F\\u03A1\\u039F\\u03A3, " - "\\u03A1\\u03A5\\u0398\\u039C\\u0399\\u03A3\\u0397"); - assertGreekUpper("\\u03B0, \\u03A4\\u03B7\\u03C1\\u03CE, \\u039C\\u03AC\\u03B9\\u03BF\\u03C2", - "\\u03AB, \\u03A4\\u0397\\u03A1\\u03A9, \\u039C\\u0391\\u03AA\\u039F\\u03A3"); - assertGreekUpper("\\u03AC\\u03C5\\u03BB\\u03BF\\u03C2", - "\\u0391\\u03AB\\u039B\\u039F\\u03A3"); - assertGreekUpper("\\u0391\\u03AB\\u039B\\u039F\\u03A3", - "\\u0391\\u03AB\\u039B\\u039F\\u03A3"); - assertGreekUpper("\\u0386\\u03BA\\u03BB\\u03B9\\u03C4\\u03B1 " - "\\u03C1\\u03AE\\u03BC\\u03B1\\u03C4\\u03B1 \\u03AE " - "\\u03AC\\u03BA\\u03BB\\u03B9\\u03C4\\u03B5\\u03C2 " - "\\u03BC\\u03B5\\u03C4\\u03BF\\u03C7\\u03AD\\u03C2", - "\\u0391\\u039A\\u039B\\u0399\\u03A4\\u0391 " - "\\u03A1\\u0397\\u039C\\u0391\\u03A4\\u0391 \\u0397\\u0301 " - "\\u0391\\u039A\\u039B\\u0399\\u03A4\\u0395\\u03A3 " - "\\u039C\\u0395\\u03A4\\u039F\\u03A7\\u0395\\u03A3"); + assertGreekUpper(u"Πατάτα", u"ΠΑΤΑΤΑ"); + assertGreekUpper(u"Αέρας, Μυστήριο, Ωραίο", u"ΑΕΡΑΣ, ΜΥΣΤΗΡΙΟ, ΩΡΑΙΟ"); + assertGreekUpper(u"Μαΐου, Πόρος, Ρύθμιση", u"ΜΑΪΟΥ, ΠΟΡΟΣ, ΡΥΘΜΙΣΗ"); + assertGreekUpper(u"ΰ, Τηρώ, Μάιος", u"Ϋ, ΤΗΡΩ, ΜΑΪΟΣ"); + assertGreekUpper(u"άυλος", u"ΑΫΛΟΣ"); + assertGreekUpper(u"ΑΫΛΟΣ", u"ΑΫΛΟΣ"); + assertGreekUpper(u"Άκλιτα ρήματα ή άκλιτες μετοχές", u"ΑΚΛΙΤΑ ΡΗΜΑΤΑ Ή ΑΚΛΙΤΕΣ ΜΕΤΟΧΕΣ"); // http://www.unicode.org/udhr/d/udhr_ell_monotonic.html - assertGreekUpper("\\u0395\\u03C0\\u03B5\\u03B9\\u03B4\\u03AE \\u03B7 " - "\\u03B1\\u03BD\\u03B1\\u03B3\\u03BD\\u03CE\\u03C1\\u03B9\\u03C3\\u03B7 " - "\\u03C4\\u03B7\\u03C2 \\u03B1\\u03BE\\u03B9\\u03BF\\u03C0\\u03C1\\u03AD" - "\\u03C0\\u03B5\\u03B9\\u03B1\\u03C2", - "\\u0395\\u03A0\\u0395\\u0399\\u0394\\u0397 \\u0397 " - "\\u0391\\u039D\\u0391\\u0393\\u039D\\u03A9\\u03A1\\u0399\\u03A3\\u0397 " - "\\u03A4\\u0397\\u03A3 \\u0391\\u039E\\u0399\\u039F\\u03A0\\u03A1\\u0395" - "\\u03A0\\u0395\\u0399\\u0391\\u03A3"); - assertGreekUpper("\\u03BD\\u03BF\\u03BC\\u03B9\\u03BA\\u03BF\\u03CD \\u03AE " - "\\u03B4\\u03B9\\u03B5\\u03B8\\u03BD\\u03BF\\u03CD\\u03C2", - "\\u039D\\u039F\\u039C\\u0399\\u039A\\u039F\\u03A5 \\u0397\\u0301 " - "\\u0394\\u0399\\u0395\\u0398\\u039D\\u039F\\u03A5\\u03A3"); + assertGreekUpper(u"Επειδή η αναγνώριση της αξιοπρέπειας", u"ΕΠΕΙΔΗ Η ΑΝΑΓΝΩΡΙΣΗ ΤΗΣ ΑΞΙΟΠΡΕΠΕΙΑΣ"); + assertGreekUpper(u"νομικού ή διεθνούς", u"ΝΟΜΙΚΟΥ Ή ΔΙΕΘΝΟΥΣ"); // http://unicode.org/udhr/d/udhr_ell_polytonic.html - assertGreekUpper("\\u1F18\\u03C0\\u03B5\\u03B9\\u03B4\\u1F74 \\u1F21 " - "\\u1F00\\u03BD\\u03B1\\u03B3\\u03BD\\u1F7D\\u03C1\\u03B9\\u03C3\\u03B7", - "\\u0395\\u03A0\\u0395\\u0399\\u0394\\u0397 \\u0397 " - "\\u0391\\u039D\\u0391\\u0393\\u039D\\u03A9\\u03A1\\u0399\\u03A3\\u0397"); - assertGreekUpper("\\u03BD\\u03BF\\u03BC\\u03B9\\u03BA\\u03BF\\u1FE6 \\u1F22 " - "\\u03B4\\u03B9\\u03B5\\u03B8\\u03BD\\u03BF\\u1FE6\\u03C2", - "\\u039D\\u039F\\u039C\\u0399\\u039A\\u039F\\u03A5 \\u0397\\u0301 " - "\\u0394\\u0399\\u0395\\u0398\\u039D\\u039F\\u03A5\\u03A3"); + assertGreekUpper(u"Ἐπειδὴ ἡ ἀναγνώριση", u"ΕΠΕΙΔΗ Η ΑΝΑΓΝΩΡΙΣΗ"); + assertGreekUpper(u"νομικοῦ ἢ διεθνοῦς", u"ΝΟΜΙΚΟΥ Ή ΔΙΕΘΝΟΥΣ"); // From Google bug report - assertGreekUpper("\\u039D\\u03AD\\u03BF, " - "\\u0394\\u03B7\\u03BC\\u03B9\\u03BF\\u03C5\\u03C1\\u03B3\\u03AF\\u03B1", - "\\u039D\\u0395\\u039F, " - "\\u0394\\u0397\\u039C\\u0399\\u039F\\u03A5\\u03A1\\u0393\\u0399\\u0391"); + assertGreekUpper(u"Νέο, Δημιουργία", u"ΝΕΟ, ΔΗΜΙΟΥΡΓΙΑ"); // http://crbug.com/234797 - assertGreekUpper("\\u0395\\u03BB\\u03AC\\u03C4\\u03B5 \\u03BD\\u03B1 \\u03C6\\u03AC\\u03C4\\u03B5 " - "\\u03C4\\u03B1 \\u03BA\\u03B1\\u03BB\\u03CD\\u03C4\\u03B5\\u03C1\\u03B1 " - "\\u03C0\\u03B1\\u03CA\\u03B4\\u03AC\\u03BA\\u03B9\\u03B1!", - "\\u0395\\u039B\\u0391\\u03A4\\u0395 \\u039D\\u0391 \\u03A6\\u0391\\u03A4\\u0395 " - "\\u03A4\\u0391 \\u039A\\u0391\\u039B\\u03A5\\u03A4\\u0395\\u03A1\\u0391 " - "\\u03A0\\u0391\\u03AA\\u0394\\u0391\\u039A\\u0399\\u0391!"); - assertGreekUpper("\\u039C\\u03B1\\u0390\\u03BF\\u03C5, \\u03C4\\u03C1\\u03CC\\u03BB\\u03B5\\u03CA", - "\\u039C\\u0391\\u03AA\\u039F\\u03A5, \\u03A4\\u03A1\\u039F\\u039B\\u0395\\u03AA"); - assertGreekUpper("\\u03A4\\u03BF \\u03AD\\u03BD\\u03B1 \\u03AE \\u03C4\\u03BF " - "\\u03AC\\u03BB\\u03BB\\u03BF.", - "\\u03A4\\u039F \\u0395\\u039D\\u0391 \\u0397\\u0301 \\u03A4\\u039F " - "\\u0391\\u039B\\u039B\\u039F."); + assertGreekUpper(u"Ελάτε να φάτε τα καλύτερα παϊδάκια!", u"ΕΛΑΤΕ ΝΑ ΦΑΤΕ ΤΑ ΚΑΛΥΤΕΡΑ ΠΑΪΔΑΚΙΑ!"); + assertGreekUpper(u"Μαΐου, τρόλεϊ", u"ΜΑΪΟΥ, ΤΡΟΛΕΪ"); + assertGreekUpper(u"Το ένα ή το άλλο.", u"ΤΟ ΕΝΑ Ή ΤΟ ΑΛΛΟ."); // http://multilingualtypesetting.co.uk/blog/greek-typesetting-tips/ - assertGreekUpper("\\u03C1\\u03C9\\u03BC\\u03AD\\u03B9\\u03BA\\u03B1", - "\\u03A1\\u03A9\\u039C\\u0395\\u03AA\\u039A\\u0391"); + assertGreekUpper(u"ρωμέικα", u"ΡΩΜΕΪΚΑ"); } void @@ -939,7 +886,7 @@ void StringCaseTest::checkEditsIter( } } // TODO: remove casts from u"" when merging into trunk - UnicodeString msg = UnicodeString(name).append((const UChar *)u" end"); + UnicodeString msg = UnicodeString(name).append(u" end"); assertFalse(msg, ei1.next(errorCode)); assertFalse(msg, ei1.hasChange()); assertEquals(msg, 0, ei1.oldLength()); @@ -979,10 +926,10 @@ void StringCaseTest::TestEdits() { { FALSE, 10003, 10003 }, { TRUE, 103103, 104013 } }; - checkEditsIter((const UChar *)u"coarse", + checkEditsIter(u"coarse", edits.getCoarseIterator(), edits.getCoarseIterator(), coarseExpectedChanges, UPRV_LENGTHOF(coarseExpectedChanges), TRUE, errorCode); - checkEditsIter((const UChar *)u"coarse changes", + checkEditsIter(u"coarse changes", edits.getCoarseChangesIterator(), edits.getCoarseChangesIterator(), coarseExpectedChanges, UPRV_LENGTHOF(coarseExpectedChanges), FALSE, errorCode); @@ -996,10 +943,10 @@ void StringCaseTest::TestEdits() { { TRUE, 3000, 4000 }, { TRUE, 100000, 100000 } }; - checkEditsIter((const UChar *)u"fine", + checkEditsIter(u"fine", edits.getFineIterator(), edits.getFineIterator(), fineExpectedChanges, UPRV_LENGTHOF(fineExpectedChanges), TRUE, errorCode); - checkEditsIter((const UChar *)u"fine changes", + checkEditsIter(u"fine changes", edits.getFineChangesIterator(), edits.getFineChangesIterator(), fineExpectedChanges, UPRV_LENGTHOF(fineExpectedChanges), FALSE, errorCode); @@ -1016,23 +963,23 @@ void StringCaseTest::TestCaseMapWithEdits() { Edits edits; int32_t length = CaseMap::toLower("tr", UCASEMAP_OMIT_UNCHANGED_TEXT, - (const UChar *)u"IstanBul", 8, dest, UPRV_LENGTHOF(dest), &edits, errorCode); - assertEquals((const UChar *)u"toLower(Istanbul)", UnicodeString((const UChar *)u"ıb"), UnicodeString(TRUE, dest, length)); + u"IstanBul", 8, dest, UPRV_LENGTHOF(dest), &edits, errorCode); + assertEquals(u"toLower(IstanBul)", UnicodeString(u"ıb"), UnicodeString(TRUE, dest, length)); static const EditChange lowerExpectedChanges[] = { { TRUE, 1, 1 }, { FALSE, 4, 4 }, { TRUE, 1, 1 }, { FALSE, 2, 2 } }; - checkEditsIter((const UChar *)u"toLower(Istanbul)", + checkEditsIter(u"toLower(IstanBul)", edits.getFineIterator(), edits.getFineIterator(), lowerExpectedChanges, UPRV_LENGTHOF(lowerExpectedChanges), TRUE, errorCode); edits.reset(); length = CaseMap::toUpper("el", UCASEMAP_OMIT_UNCHANGED_TEXT, - (const UChar *)u"Πατάτα", 6, dest, UPRV_LENGTHOF(dest), &edits, errorCode); - assertEquals((const UChar *)u"toUpper(Πατάτα)", UnicodeString((const UChar *)u"ΑΤΑΤΑ"), UnicodeString(TRUE, dest, length)); + u"Πατάτα", 6, dest, UPRV_LENGTHOF(dest), &edits, errorCode); + assertEquals(u"toUpper(Πατάτα)", UnicodeString(u"ΑΤΑΤΑ"), UnicodeString(TRUE, dest, length)); static const EditChange upperExpectedChanges[] = { { FALSE, 1, 1 }, { TRUE, 1, 1 }, @@ -1041,7 +988,7 @@ void StringCaseTest::TestCaseMapWithEdits() { { TRUE, 1, 1 }, { TRUE, 1, 1 } }; - checkEditsIter((const UChar *)u"toUpper(Πατάτα)", + checkEditsIter(u"toUpper(Πατάτα)", edits.getFineIterator(), edits.getFineIterator(), upperExpectedChanges, UPRV_LENGTHOF(upperExpectedChanges), TRUE, errorCode); @@ -1051,23 +998,23 @@ void StringCaseTest::TestCaseMapWithEdits() { UCASEMAP_OMIT_UNCHANGED_TEXT | U_TITLECASE_NO_BREAK_ADJUSTMENT | U_TITLECASE_NO_LOWERCASE, - NULL, (const UChar *)u"IjssEL IglOo", 12, + NULL, u"IjssEL IglOo", 12, dest, UPRV_LENGTHOF(dest), &edits, errorCode); - assertEquals((const UChar *)u"toTitle(IjssEL IglOo)", UnicodeString((const UChar *)u"J"), UnicodeString(TRUE, dest, length)); + assertEquals(u"toTitle(IjssEL IglOo)", UnicodeString(u"J"), UnicodeString(TRUE, dest, length)); static const EditChange titleExpectedChanges[] = { { FALSE, 1, 1 }, { TRUE, 1, 1 }, { FALSE, 10, 10 } }; - checkEditsIter((const UChar *)u"toTitle(IjssEL IglOo)", + checkEditsIter(u"toTitle(IjssEL IglOo)", edits.getFineIterator(), edits.getFineIterator(), titleExpectedChanges, UPRV_LENGTHOF(titleExpectedChanges), TRUE, errorCode); edits.reset(); length = CaseMap::fold(UCASEMAP_OMIT_UNCHANGED_TEXT | U_FOLD_CASE_EXCLUDE_SPECIAL_I, - (const UChar *)u"IßtanBul", 8, dest, UPRV_LENGTHOF(dest), &edits, errorCode); - assertEquals((const UChar *)u"foldCase(IßtanBul)", UnicodeString((const UChar *)u"ıssb"), UnicodeString(TRUE, dest, length)); + u"IßtanBul", 8, dest, UPRV_LENGTHOF(dest), &edits, errorCode); + assertEquals(u"foldCase(IßtanBul)", UnicodeString(u"ıssb"), UnicodeString(TRUE, dest, length)); static const EditChange foldExpectedChanges[] = { { TRUE, 1, 1 }, { TRUE, 1, 2 }, @@ -1075,7 +1022,82 @@ void StringCaseTest::TestCaseMapWithEdits() { { TRUE, 1, 1 }, { FALSE, 2, 2 } }; - checkEditsIter((const UChar *)u"foldCase(IßtanBul)", + checkEditsIter(u"foldCase(IßtanBul)", + edits.getFineIterator(), edits.getFineIterator(), + foldExpectedChanges, UPRV_LENGTHOF(foldExpectedChanges), + TRUE, errorCode); +} + +void StringCaseTest::TestCaseMapUTF8WithEdits() { + IcuTestErrorCode errorCode(*this, "TestEdits"); + char dest[50]; + Edits edits; + + int32_t length = CaseMap::utf8ToLower("tr", UCASEMAP_OMIT_UNCHANGED_TEXT, + u8"IstanBul", 8, dest, UPRV_LENGTHOF(dest), &edits, errorCode); + assertEquals(u"toLower(IstanBul)", UnicodeString(u"ıb"), + UnicodeString::fromUTF8(StringPiece(dest, length))); + static const EditChange lowerExpectedChanges[] = { + { TRUE, 1, 2 }, + { FALSE, 4, 4 }, + { TRUE, 1, 1 }, + { FALSE, 2, 2 } + }; + checkEditsIter(u"toLower(IstanBul)", + edits.getFineIterator(), edits.getFineIterator(), + lowerExpectedChanges, UPRV_LENGTHOF(lowerExpectedChanges), + TRUE, errorCode); + + edits.reset(); + length = CaseMap::utf8ToUpper("el", UCASEMAP_OMIT_UNCHANGED_TEXT, + u8"Πατάτα", 6 * 2, dest, UPRV_LENGTHOF(dest), &edits, errorCode); + assertEquals(u"toUpper(Πατάτα)", UnicodeString(u"ΑΤΑΤΑ"), + UnicodeString::fromUTF8(StringPiece(dest, length))); + static const EditChange upperExpectedChanges[] = { + { FALSE, 2, 2 }, + { TRUE, 2, 2 }, + { TRUE, 2, 2 }, + { TRUE, 2, 2 }, + { TRUE, 2, 2 }, + { TRUE, 2, 2 } + }; + checkEditsIter(u"toUpper(Πατάτα)", + edits.getFineIterator(), edits.getFineIterator(), + upperExpectedChanges, UPRV_LENGTHOF(upperExpectedChanges), + TRUE, errorCode); + + edits.reset(); + length = CaseMap::utf8ToTitle("nl", + UCASEMAP_OMIT_UNCHANGED_TEXT | + U_TITLECASE_NO_BREAK_ADJUSTMENT | + U_TITLECASE_NO_LOWERCASE, + NULL, u8"IjssEL IglOo", 12, + dest, UPRV_LENGTHOF(dest), &edits, errorCode); + assertEquals(u"toTitle(IjssEL IglOo)", UnicodeString(u"J"), + UnicodeString::fromUTF8(StringPiece(dest, length))); + static const EditChange titleExpectedChanges[] = { + { FALSE, 1, 1 }, + { TRUE, 1, 1 }, + { FALSE, 10, 10 } + }; + checkEditsIter(u"toTitle(IjssEL IglOo)", + edits.getFineIterator(), edits.getFineIterator(), + titleExpectedChanges, UPRV_LENGTHOF(titleExpectedChanges), + TRUE, errorCode); + + edits.reset(); + length = CaseMap::utf8Fold(UCASEMAP_OMIT_UNCHANGED_TEXT | U_FOLD_CASE_EXCLUDE_SPECIAL_I, + u8"IßtanBul", 1 + 2 + 6, dest, UPRV_LENGTHOF(dest), &edits, errorCode); + assertEquals(u"foldCase(IßtanBul)", UnicodeString(u"ıssb"), + UnicodeString::fromUTF8(StringPiece(dest, length))); + static const EditChange foldExpectedChanges[] = { + { TRUE, 1, 2 }, + { TRUE, 2, 2 }, + { FALSE, 3, 3 }, + { TRUE, 1, 1 }, + { FALSE, 2, 2 } + }; + checkEditsIter(u"foldCase(IßtanBul)", edits.getFineIterator(), edits.getFineIterator(), foldExpectedChanges, UPRV_LENGTHOF(foldExpectedChanges), TRUE, errorCode);