diff --git a/icu4c/source/common/unicode/ucasemap.h b/icu4c/source/common/unicode/ucasemap.h index a97609d150e..25a1af25dd5 100644 --- a/icu4c/source/common/unicode/ucasemap.h +++ b/icu4c/source/common/unicode/ucasemap.h @@ -117,13 +117,41 @@ public: */ Edits() : array(stackArray), capacity(STACK_CAPACITY), length(0), delta(0), - errorCode(U_ZERO_ERROR) {} + omit(FALSE), errorCode(U_ZERO_ERROR) {} ~Edits(); + /** * Resets the data but may not release memory. * @internal ICU 59 technology preview */ void reset(); + + /** + * Controls whether the case mapping function is to write or omit + * characters that do not change. + * The complete result can be computed by applying just the changes + * to the original string. + * @see omitUnchanged + * @see writeUnchanged + * @internal ICU 59 technology preview + */ + Edits &setWriteUnchanged(UBool write) { + omit = !write; + return *this; + } + /** + * @return TRUE if the case mapping function is to omit characters that do not change. + * @see setWriteUnchanged + * @internal ICU 59 technology preview + */ + UBool omitUnchanged() const { return omit; } + /** + * @return TRUE if the case mapping function is to write characters that do not change. + * @see setWriteUnchanged + * @internal ICU 59 technology preview + */ + UBool writeUnchanged() const { return !omit; } + /** * Adds a record for an unchanged segment of text. * @internal ICU 59 technology preview @@ -148,6 +176,75 @@ public: * @internal ICU 59 technology preview */ int32_t lengthDelta() const { return delta; } + /** + * @return TRUE if there are any change edits + * @internal ICU 59 technology preview + */ + UBool hasChanges() const; + + /** + * Access to the list of edits. + * @see getCoarseIterator + * @see getFineIterator + * @internal ICU 59 technology preview + */ + struct Iterator final : public UMemory { + /** + * Advances to the next edit. 
+ * @return TRUE if there is another edit + * @internal ICU 59 technology preview + */ + UBool next(UErrorCode &errorCode); + + /** + * TRUE if this edit replaces oldLength units with newLength different ones. + * FALSE if oldLength units remain unchanged. + * @internal ICU 59 technology preview + */ + UBool changed; + /** + * Number of units in the original string which are replaced or remain unchanged. + * @internal ICU 59 technology preview + */ + int32_t oldLength; + /** + * Number of units in the modified string, if changed is TRUE. + * Same as oldLength if changed is FALSE. + * @internal ICU 59 technology preview + */ + int32_t newLength; + + private: + friend class Edits; + + Iterator(const uint16_t *a, int32_t len, UBool crs) : changed(FALSE), oldLength(0), newLength(0), // defined even before the first next() + array(a), index(0), length(len), width(0), remaining(0), coarse(crs) {} + + int32_t readLength(int32_t head); + + const uint16_t *array; + int32_t index, length; + int32_t width, remaining; + UBool coarse; + }; + + /** + * Returns an Iterator for coarse-grained changes for simple string updates. + * @return an Iterator that merges adjacent changes. + * @internal ICU 59 technology preview + */ + Iterator getCoarseIterator() const { + return Iterator(array, length, TRUE); + } + + /** + * Returns an Iterator for fine-grained changes for modifying text with metadata. + * @return an Iterator that separates adjacent changes. 
+ * @internal ICU 59 technology preview + */ + Iterator getFineIterator() const { + return Iterator(array, length, FALSE); + } private: Edits(const Edits &) = delete; @@ -165,6 +262,7 @@ private: int32_t capacity; int32_t length; int32_t delta; + UBool omit; UErrorCode errorCode; uint16_t stackArray[STACK_CAPACITY]; }; @@ -188,7 +286,9 @@ private: * * @internal ICU 59 technology preview */ -#define UCASEMAP_OMIT_UNCHANGED 0x4000 +// TODO: does not work well as an option because we would need to set/reset it on UCaseMaps +// that are often const, replaced for now by Edits.setWriteUnchanged(UBool) +// #define UCASEMAP_OMIT_UNCHANGED 0x4000 #endif // U_HIDE_INTERNAL_API @@ -520,6 +620,8 @@ ucasemap_utf8FoldCase(const UCaseMap *csm, const char *src, int32_t srcLength, UErrorCode *pErrorCode); +#if U_SHOW_CPLUSPLUS_API + // Not #ifndef U_HIDE_INTERNAL_API because UnicodeString needs the UStringCaseMapper. /** * Internal string case mapping function type. @@ -535,4 +637,5 @@ UStringCaseMapper(const UCaseMap *csm, icu::Edits *edits, UErrorCode *pErrorCode); +#endif // U_SHOW_CPLUSPLUS_API #endif diff --git a/icu4c/source/common/unistr_case.cpp b/icu4c/source/common/unistr_case.cpp index 1715b6ec66e..32fb20e87e6 100644 --- a/icu4c/source/common/unistr_case.cpp +++ b/icu4c/source/common/unistr_case.cpp @@ -25,6 +25,7 @@ #include "unicode/ustring.h" #include "unicode/unistr.h" #include "unicode/uchar.h" +#include "uassert.h" #include "uelement.h" #include "ustr_imp.h" @@ -94,49 +95,104 @@ UnicodeString::caseMap(const UCaseMap *csm, return *this; } + UChar oldBuffer[2 * US_STACKBUF_SIZE]; + UChar *oldArray; + int32_t oldLength = length(); + int32_t newLength; + UBool writable = isBufferWritable(); + UErrorCode errorCode = U_ZERO_ERROR; + + // Try to avoid heap-allocating a new character array for this string. + if (writable ? 
oldLength <= UPRV_LENGTHOF(oldBuffer) : oldLength < US_STACKBUF_SIZE) { + // Short string: Copy the contents into a temporary buffer and + // case-map back into the current array, or into the stack buffer. + UChar *buffer = getArrayStart(); + int32_t capacity; + oldArray = oldBuffer; + u_memcpy(oldBuffer, buffer, oldLength); + if (writable) { + capacity = getCapacity(); + } else { + // Switch from the read-only alias or shared heap buffer to the stack buffer. + if (!cloneArrayIfNeeded(US_STACKBUF_SIZE, US_STACKBUF_SIZE, /* doCopyArray= */ FALSE)) { + return *this; + } + U_ASSERT(fUnion.fFields.fLengthAndFlags & kUsingStackBuffer); + buffer = fUnion.fStackFields.fBuffer; + capacity = US_STACKBUF_SIZE; + } + newLength = stringCaseMapper(csm, buffer, capacity, oldArray, oldLength, NULL, &errorCode); + if (U_SUCCESS(errorCode)) { + setLength(newLength); + return *this; + } else if (errorCode == U_BUFFER_OVERFLOW_ERROR) { + // common overflow handling below + } else { + setToBogus(); + return *this; + } + } else { + // Longer string or read-only buffer: + // Collect only changes and then apply them to this string. + // Case mapping often changes only small parts of a string, + // and often does not change its length. 
+ oldArray = getArrayStart(); + Edits edits; + edits.setWriteUnchanged(FALSE); + UChar replacementChars[200]; + int32_t replacementLength = stringCaseMapper( + csm, replacementChars, UPRV_LENGTHOF(replacementChars), + oldArray, oldLength, &edits, &errorCode); + UErrorCode editsError = U_ZERO_ERROR; + if (edits.setErrorCode(editsError)) { + setToBogus(); + return *this; + } + newLength = oldLength + edits.lengthDelta(); + if (U_SUCCESS(errorCode)) { + if (!cloneArrayIfNeeded(newLength, newLength)) { + return *this; + } + int32_t index = 0; // index into this string + int32_t replIndex = 0; // index into replacementChars + for (Edits::Iterator iter = edits.getCoarseIterator(); iter.next(errorCode);) { + if (iter.changed) { + doReplace(index, iter.oldLength, replacementChars, replIndex, iter.newLength); + replIndex += iter.newLength; + } + index += iter.newLength; + } + if (U_FAILURE(errorCode)) { + setToBogus(); + } + U_ASSERT(replIndex == replacementLength); + return *this; + } else if (errorCode == U_BUFFER_OVERFLOW_ERROR) { + // common overflow handling below + } else { + setToBogus(); + return *this; + } + } + + // Handle buffer overflow, newLength is known. // We need to allocate a new buffer for the internal string case mapping function. // This is very similar to how doReplace() keeps the old array pointer // and deletes the old array itself after it is done. // In addition, we are forcing cloneArrayIfNeeded() to always allocate a new array. 
- UChar oldStackBuffer[US_STACKBUF_SIZE]; - UChar *oldArray; - int32_t oldLength; - - if(fUnion.fFields.fLengthAndFlags&kUsingStackBuffer) { - // copy the stack buffer contents because it will be overwritten - oldArray = oldStackBuffer; - oldLength = getShortLength(); - u_memcpy(oldStackBuffer, fUnion.fStackFields.fBuffer, oldLength); - } else { - oldArray = getArrayStart(); - oldLength = length(); - } - - int32_t capacity; - if(oldLength <= US_STACKBUF_SIZE) { - capacity = US_STACKBUF_SIZE; - } else { - capacity = oldLength + 20; - } int32_t *bufferToDelete = 0; - if(!cloneArrayIfNeeded(capacity, capacity, FALSE, &bufferToDelete, TRUE)) { + if (!cloneArrayIfNeeded(newLength, newLength, FALSE, &bufferToDelete, TRUE)) { return *this; } - - // Case-map, and if the result is too long, then reallocate and repeat. - UErrorCode errorCode; - int32_t newLength; - do { - errorCode = U_ZERO_ERROR; - newLength = stringCaseMapper(csm, getArrayStart(), getCapacity(), - oldArray, oldLength, &errorCode); - setLength(newLength); - } while(errorCode==U_BUFFER_OVERFLOW_ERROR && cloneArrayIfNeeded(newLength, newLength, FALSE)); - + errorCode = U_ZERO_ERROR; + newLength = stringCaseMapper(csm, getArrayStart(), getCapacity(), + oldArray, oldLength, NULL, &errorCode); if (bufferToDelete) { uprv_free(bufferToDelete); } - if(U_FAILURE(errorCode)) { + if (U_SUCCESS(errorCode)) { + setLength(newLength); + } else { setToBogus(); } return *this; diff --git a/icu4c/source/common/unistr_titlecase_brkiter.cpp b/icu4c/source/common/unistr_titlecase_brkiter.cpp index 3d6737cfc5e..c909133cdbe 100644 --- a/icu4c/source/common/unistr_titlecase_brkiter.cpp +++ b/icu4c/source/common/unistr_titlecase_brkiter.cpp @@ -32,9 +32,10 @@ static int32_t U_CALLCONV unistr_case_internalToTitle(const UCaseMap *csm, UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, + icu::Edits *edits, UErrorCode *pErrorCode) { ubrk_setText(csm->iter, src, srcLength, pErrorCode); - return 
ustrcase_internalToTitle(csm, dest, destCapacity, src, srcLength, pErrorCode); + return ustrcase_internalToTitle(csm, dest, destCapacity, src, srcLength, edits, pErrorCode); } /* diff --git a/icu4c/source/common/ustr_imp.h b/icu4c/source/common/ustr_imp.h index 12a9ac9ca31..544d05c3768 100644 --- a/icu4c/source/common/ustr_imp.h +++ b/icu4c/source/common/ustr_imp.h @@ -102,6 +102,10 @@ uprv_haveProperties(UErrorCode *pErrorCode); /*U_CFUNC int8_t uprv_loadPropsData(UErrorCode *errorCode);*/ +#ifdef __cplusplus +// TODO: Consider moving these case mapping definitions +// into a new internal header like ucasemap_imp.h. + /* * Internal string casing functions implementing * ustring.h/ustrcase.c and UnicodeString case mapping functions. @@ -117,10 +121,6 @@ struct UCaseMap { uint32_t options; }; -#ifndef __UCASEMAP_H__ -typedef struct UCaseMap UCaseMap; -#endif - #if UCONFIG_NO_BREAK_ITERATION # define UCASEMAP_INITIALIZER { NULL, { 0 }, 0, 0 } #else @@ -209,8 +209,6 @@ ucasemap_mapUTF8(const UCaseMap *csm, UTF8CaseMapper *stringCaseMapper, UErrorCode *pErrorCode); -#ifdef __cplusplus - U_NAMESPACE_BEGIN namespace GreekUpper { diff --git a/icu4c/source/common/ustr_titlecase_brkiter.cpp b/icu4c/source/common/ustr_titlecase_brkiter.cpp index 21a53be4ff7..d5e5a2c2415 100644 --- a/icu4c/source/common/ustr_titlecase_brkiter.cpp +++ b/icu4c/source/common/ustr_titlecase_brkiter.cpp @@ -66,7 +66,7 @@ u_strToTitle(UChar *dest, int32_t destCapacity, &csm, dest, destCapacity, src, srcLength, - ustrcase_internalToTitle, pErrorCode); + ustrcase_internalToTitle, NULL, pErrorCode); if(titleIter==NULL && csm.iter!=NULL) { ubrk_close(csm.iter); } diff --git a/icu4c/source/common/ustrcase.cpp b/icu4c/source/common/ustrcase.cpp index 7459e3572ed..c833345788a 100644 --- a/icu4c/source/common/ustrcase.cpp +++ b/icu4c/source/common/ustrcase.cpp @@ -208,12 +208,120 @@ UBool Edits::growArray() { } UBool Edits::setErrorCode(UErrorCode &outErrorCode) { - if(U_FAILURE(outErrorCode)) { return 
TRUE; } - if(U_SUCCESS(errorCode)) { return FALSE; } + if (U_FAILURE(outErrorCode)) { return TRUE; } + if (U_SUCCESS(errorCode)) { return FALSE; } outErrorCode = errorCode; return TRUE; } +UBool Edits::hasChanges() const { + if (delta != 0) { + return TRUE; + } + for (int32_t i = 0; i < length; ++i) { + if (array[i] > MAX_UNCHANGED) { + return TRUE; + } + } + return FALSE; +} + +UBool Edits::Iterator::next(UErrorCode &errorCode) { + if (U_FAILURE(errorCode)) { return FALSE; } + // Always set all relevant public fields: Do not rely on them not having been touched. + if (remaining > 0) { + // Fine-grained iterator: Continue a sequence of equal-length changes. + changed = TRUE; + oldLength = newLength = width; + --remaining; + return TRUE; + } + if (index >= length) { + return FALSE; + } + int32_t u = array[index++]; + if (u <= MAX_UNCHANGED) { + // Combine adjacent unchanged ranges. + changed = FALSE; + oldLength = u + 1; + while (index < length && (u = array[index]) <= MAX_UNCHANGED) { + ++index; + if (u >= (INT32_MAX - oldLength)) { + errorCode = U_INDEX_OUTOFBOUNDS_ERROR; + return FALSE; + } + oldLength += u + 1; + } + newLength = oldLength; + return TRUE; + } + changed = TRUE; + if (u <= MAX_SHORT_CHANGE) { + if (coarse) { + int32_t w = u >> 12; + int32_t len = (u & 0xfff) + 1; + oldLength = newLength = w * len; + } else { + // Split a sequence of equal-length changes that was compressed into one unit. + oldLength = newLength = width = u >> 12; + remaining = u & 0xfff; + return TRUE; + } + } else { + U_ASSERT(u <= 0x7fff); + oldLength = readLength((u >> 6) & 0x3f); + newLength = readLength(u & 0x3f); + if (!coarse) { + return TRUE; + } + } + // Combine adjacent changes. 
+ while (index < length && (u = array[index]) > MAX_UNCHANGED) { + ++index; + if (u <= MAX_SHORT_CHANGE) { + int32_t w = u >> 12; + int32_t len = (u & 0xfff) + 1; + len = w * len; + if (len > (INT32_MAX - oldLength) || len > (INT32_MAX - newLength)) { + errorCode = U_INDEX_OUTOFBOUNDS_ERROR; + return FALSE; + } + oldLength += len; + newLength += len; + } else { + U_ASSERT(u <= 0x7fff); + int32_t oldLen = readLength((u >> 6) & 0x3f); + int32_t newLen = readLength(u & 0x3f); + if (oldLen > (INT32_MAX - oldLength) || newLen > (INT32_MAX - newLength)) { + errorCode = U_INDEX_OUTOFBOUNDS_ERROR; + return FALSE; + } + oldLength += oldLen; + newLength += newLen; + } + } + return TRUE; +} + +int32_t Edits::Iterator::readLength(int32_t head) { + if (head < LENGTH_IN_1TRAIL) { + return head; + } else if (head < LENGTH_IN_2TRAIL) { + U_ASSERT(index < length); + U_ASSERT(array[index] >= 0x8000); + return array[index++] & 0x7fff; // drop the 0x8000 trail-unit marker bit, as the two-trail case does + } else { + U_ASSERT((index + 2) <= length); + U_ASSERT(array[index] >= 0x8000); + U_ASSERT(array[index + 1] >= 0x8000); + int32_t len = ((head & 1) << 30) | + ((int32_t)(array[index] & 0x7fff) << 15) | + (array[index + 1] & 0x7fff); + index += 2; + return len; + } +} + U_NAMESPACE_END U_NAMESPACE_USE @@ -224,7 +332,7 @@ U_NAMESPACE_USE static inline int32_t appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity, int32_t result, const UChar *s, - uint32_t options, int32_t cpLength, icu::Edits *edits) { + int32_t cpLength, icu::Edits *edits) { UChar32 c; int32_t length; @@ -233,9 +341,9 @@ appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity, /* (not) original code point */ if(edits!=NULL) { edits->addUnchanged(cpLength); - } - if(options & UCASEMAP_OMIT_UNCHANGED) { - return destIndex; + if(edits->omitUnchanged()) { + return destIndex; + } } c=~result; if(destIndex0) { + if(edits!=NULL) { + edits->addUnchanged(length); + if(edits->omitUnchanged()) { + return destIndex; 
+ } + } if(length>(INT32_MAX-destIndex)) { return -1; // integer 
overflow } @@ -315,9 +429,6 @@ appendUnchanged(UChar *dest, int32_t destIndex, int32_t destCapacity, u_memcpy(dest+destIndex, s, length); } destIndex+=length; - if(edits!=NULL) { - edits->addUnchanged(length); - } } return destIndex; } @@ -379,7 +490,7 @@ _caseMap(const UCaseMap *csm, UCaseMapFull *map, const UChar *s; c=map(csm->csp, c, utf16_caseContextIterator, csc, &s, csm->locale, &locCache); destIndex = appendResult(dest, destIndex, destCapacity, c, s, - csm->options, srcIndex - cpStart, edits); + srcIndex - cpStart, edits); if (destIndex < 0) { *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; return 0; @@ -482,7 +593,7 @@ ustrcase_internalToTitle(const UCaseMap *csm, c=ucase_toFullTitle(csm->csp, c, utf16_caseContextIterator, &csc, &s, csm->locale, &locCache); destIndex=appendResult(dest, destIndex, destCapacity, c, s, - csm->options, titleLimit-titleStart, edits); + titleLimit-titleStart, edits); if(destIndex<0) { *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; return 0; @@ -1104,7 +1215,7 @@ int32_t toUpper(const UCaseMap *csm, } UBool change; - if ((csm->options & UCASEMAP_OMIT_UNCHANGED) == 0 && edits == NULL) { + if (edits == NULL) { change = TRUE; // common, simple usage } else { // Find out first whether we are changing the text. @@ -1130,7 +1241,7 @@ int32_t toUpper(const UCaseMap *csm, edits->addUnchanged(oldLength); } // Write unchanged text? 
- change |= (csm->options & UCASEMAP_OMIT_UNCHANGED) == 0; + change = edits->writeUnchanged(); } } @@ -1155,7 +1266,7 @@ int32_t toUpper(const UCaseMap *csm, const UChar *s; c=ucase_toFullUpper(csm->csp, c, NULL, NULL, &s, csm->locale, &locCache); destIndex = appendResult(dest, destIndex, destCapacity, c, s, - csm->options, nextIndex - i, edits); + nextIndex - i, edits); if (destIndex < 0) { *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; return 0; @@ -1228,7 +1339,7 @@ ustrcase_internalFold(const UCaseMap *csm, const UChar *s; c = ucase_toFullFolding(csm->csp, c, &s, csm->options); destIndex = appendResult(dest, destIndex, destCapacity, c, s, - csm->options, srcIndex - cpStart, edits); + srcIndex - cpStart, edits); if (destIndex < 0) { *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; return 0;