ICU-12410 add Edits::Iterator and change UnicodeString case mappings to get & apply Edits for longer strings

X-SVN-Rev: 39547
This commit is contained in:
Markus Scherer 2017-01-04 21:20:08 +00:00
parent 6fc7fe3b65
commit 20994f490a
6 changed files with 327 additions and 58 deletions

View file

@ -117,13 +117,41 @@ public:
*/
Edits() :
array(stackArray), capacity(STACK_CAPACITY), length(0), delta(0),
errorCode(U_ZERO_ERROR) {}
omit(FALSE), errorCode(U_ZERO_ERROR) {}
~Edits();
/**
* Resets the data but may not release memory.
* @internal ICU 59 technology preview
*/
void reset();
/**
* Controls whether the case mapping function is to write or omit
* characters that do not change.
* The complete result can be computed by applying just the changes
* to the original string.
* @see omitUnchanged
* @see writeUnchanged
* @internal ICU 59 technology preview
*/
Edits &setWriteUnchanged(UBool write) {
    // The flag is stored inverted: the member says whether unchanged text is omitted.
    omit = write ? FALSE : TRUE;
    // Return this object so that calls can be chained.
    return *this;
}
/**
* @return TRUE if the case mapping function is to omit characters that do not change.
* @see setWriteUnchanged
* @internal ICU 59 technology preview
*/
UBool omitUnchanged() const {
    // Plain read of the flag that setWriteUnchanged() stores.
    return omit;
}
/**
* @return TRUE if the case mapping function is to write characters that do not change.
* @see setWriteUnchanged
* @internal ICU 59 technology preview
*/
UBool writeUnchanged() const {
    // Logical inverse of omitUnchanged().
    return omit ? FALSE : TRUE;
}
/**
* Adds a record for an unchanged segment of text.
* @internal ICU 59 technology preview
@ -148,6 +176,75 @@ public:
* @internal ICU 59 technology preview
*/
int32_t lengthDelta() const {
    // Plain read of the delta member.
    return delta;
}
/**
* @return TRUE if there are any change edits
* @internal ICU 59 technology preview
*/
UBool hasChanges() const;
/**
* Access to the list of edits.
* @see getCoarseIterator
* @see getFineIterator
* @internal ICU 59 technology preview
*/
struct Iterator final : public UMemory {
/**
* Advances to the next edit.
* @return TRUE if there is another edit
* @internal ICU 59 technology preview
*/
UBool next(UErrorCode &errorCode);
/**
* TRUE if this edit replaces oldLength units with newLength different ones.
* FALSE if oldLength units remain unchanged.
* @internal ICU 59 technology preview
*/
UBool changed;
/**
* Number of units in the original string which are replaced or remain unchanged.
* @internal ICU 59 technology preview
*/
int32_t oldLength;
/**
* Number of units in the modified string, if changed is TRUE.
* Same as oldLength if changed is FALSE.
* @internal ICU 59 technology preview
*/
int32_t newLength;
private:
friend class Edits;
Iterator(const uint16_t *a, int32_t len, UBool crs) :
array(a), index(0), length(len), width(0), remaining(0), coarse(crs) {}
int32_t readLength(int32_t head);
const uint16_t *array;
int32_t index, length;
int32_t width, remaining;
UBool coarse;
};
/**
* Returns an Iterator for coarse-grained changes for simple string updates.
* @return an Iterator that merges adjacent changes.
* @internal ICU 59 technology preview
*/
Iterator getCoarseIterator() const {
    // The third constructor argument selects coarse iteration.
    const UBool mergeAdjacentChanges = TRUE;
    return Iterator(array, length, mergeAdjacentChanges);
}
/**
* Returns an Iterator for fine-grained changes for modifying text with metadata.
* @return an Iterator that separates adjacent changes.
* @internal ICU 59 technology preview
*/
Iterator getFineIterator() const {
    // The third constructor argument selects coarse iteration; FALSE keeps changes separate.
    const UBool mergeAdjacentChanges = FALSE;
    return Iterator(array, length, mergeAdjacentChanges);
}
private:
Edits(const Edits &) = delete;
@ -165,6 +262,7 @@ private:
int32_t capacity;
int32_t length;
int32_t delta;
UBool omit;
UErrorCode errorCode;
uint16_t stackArray[STACK_CAPACITY];
};
@ -188,7 +286,9 @@ private:
*
* @internal ICU 59 technology preview
*/
#define UCASEMAP_OMIT_UNCHANGED 0x4000
// TODO: does not work well as an option because we would need to set/reset it on UCaseMaps
// that are often const, replaced for now by Edits.setWriteUnchanged(UBool)
// #define UCASEMAP_OMIT_UNCHANGED 0x4000
#endif // U_HIDE_INTERNAL_API
@ -520,6 +620,8 @@ ucasemap_utf8FoldCase(const UCaseMap *csm,
const char *src, int32_t srcLength,
UErrorCode *pErrorCode);
#if U_SHOW_CPLUSPLUS_API
// Not #ifndef U_HIDE_INTERNAL_API because UnicodeString needs the UStringCaseMapper.
/**
* Internal string case mapping function type.
@ -535,4 +637,5 @@ UStringCaseMapper(const UCaseMap *csm,
icu::Edits *edits,
UErrorCode *pErrorCode);
#endif // U_SHOW_CPLUSPLUS_API
#endif

View file

@ -25,6 +25,7 @@
#include "unicode/ustring.h"
#include "unicode/unistr.h"
#include "unicode/uchar.h"
#include "uassert.h"
#include "uelement.h"
#include "ustr_imp.h"
@ -94,49 +95,104 @@ UnicodeString::caseMap(const UCaseMap *csm,
return *this;
}
UChar oldBuffer[2 * US_STACKBUF_SIZE];
UChar *oldArray;
int32_t oldLength = length();
int32_t newLength;
UBool writable = isBufferWritable();
UErrorCode errorCode = U_ZERO_ERROR;
// Try to avoid heap-allocating a new character array for this string.
if (writable ? oldLength <= UPRV_LENGTHOF(oldBuffer) : oldLength < US_STACKBUF_SIZE) {
// Short string: Copy the contents into a temporary buffer and
// case-map back into the current array, or into the stack buffer.
UChar *buffer = getArrayStart();
int32_t capacity;
oldArray = oldBuffer;
u_memcpy(oldBuffer, buffer, oldLength);
if (writable) {
capacity = getCapacity();
} else {
// Switch from the read-only alias or shared heap buffer to the stack buffer.
if (!cloneArrayIfNeeded(US_STACKBUF_SIZE, US_STACKBUF_SIZE, /* doCopyArray= */ FALSE)) {
return *this;
}
U_ASSERT(fUnion.fFields.fLengthAndFlags & kUsingStackBuffer);
buffer = fUnion.fStackFields.fBuffer;
capacity = US_STACKBUF_SIZE;
}
newLength = stringCaseMapper(csm, buffer, capacity, oldArray, oldLength, NULL, &errorCode);
if (U_SUCCESS(errorCode)) {
setLength(newLength);
return *this;
} else if (errorCode == U_BUFFER_OVERFLOW_ERROR) {
// common overflow handling below
} else {
setToBogus();
return *this;
}
} else {
// Longer string or read-only buffer:
// Collect only changes and then apply them to this string.
// Case mapping often changes only small parts of a string,
// and often does not change its length.
oldArray = getArrayStart();
Edits edits;
edits.setWriteUnchanged(FALSE);
UChar replacementChars[200];
int32_t replacementLength = stringCaseMapper(
csm, replacementChars, UPRV_LENGTHOF(replacementChars),
oldArray, oldLength, &edits, &errorCode);
UErrorCode editsError = U_ZERO_ERROR;
if (edits.setErrorCode(editsError)) {
setToBogus();
return *this;
}
newLength = oldLength + edits.lengthDelta();
if (U_SUCCESS(errorCode)) {
if (!cloneArrayIfNeeded(newLength, newLength)) {
return *this;
}
int32_t index = 0; // index into this string
int32_t replIndex = 0; // index into replacementChars
for (Edits::Iterator iter = edits.getCoarseIterator(); iter.next(errorCode);) {
if (iter.changed) {
doReplace(index, iter.oldLength, replacementChars, replIndex, iter.newLength);
replIndex += iter.newLength;
}
index += iter.newLength;
}
if (U_FAILURE(errorCode)) {
setToBogus();
}
U_ASSERT(replIndex == replacementLength);
return *this;
} else if (errorCode == U_BUFFER_OVERFLOW_ERROR) {
// common overflow handling below
} else {
setToBogus();
return *this;
}
}
// Handle buffer overflow, newLength is known.
// We need to allocate a new buffer for the internal string case mapping function.
// This is very similar to how doReplace() keeps the old array pointer
// and deletes the old array itself after it is done.
// In addition, we are forcing cloneArrayIfNeeded() to always allocate a new array.
UChar oldStackBuffer[US_STACKBUF_SIZE];
UChar *oldArray;
int32_t oldLength;
if(fUnion.fFields.fLengthAndFlags&kUsingStackBuffer) {
// copy the stack buffer contents because it will be overwritten
oldArray = oldStackBuffer;
oldLength = getShortLength();
u_memcpy(oldStackBuffer, fUnion.fStackFields.fBuffer, oldLength);
} else {
oldArray = getArrayStart();
oldLength = length();
}
int32_t capacity;
if(oldLength <= US_STACKBUF_SIZE) {
capacity = US_STACKBUF_SIZE;
} else {
capacity = oldLength + 20;
}
int32_t *bufferToDelete = 0;
if(!cloneArrayIfNeeded(capacity, capacity, FALSE, &bufferToDelete, TRUE)) {
if (!cloneArrayIfNeeded(newLength, newLength, FALSE, &bufferToDelete, TRUE)) {
return *this;
}
// Case-map, and if the result is too long, then reallocate and repeat.
UErrorCode errorCode;
int32_t newLength;
do {
errorCode = U_ZERO_ERROR;
newLength = stringCaseMapper(csm, getArrayStart(), getCapacity(),
oldArray, oldLength, &errorCode);
setLength(newLength);
} while(errorCode==U_BUFFER_OVERFLOW_ERROR && cloneArrayIfNeeded(newLength, newLength, FALSE));
errorCode = U_ZERO_ERROR;
newLength = stringCaseMapper(csm, getArrayStart(), getCapacity(),
oldArray, oldLength, NULL, &errorCode);
if (bufferToDelete) {
uprv_free(bufferToDelete);
}
if(U_FAILURE(errorCode)) {
if (U_SUCCESS(errorCode)) {
setLength(newLength);
} else {
setToBogus();
}
return *this;

View file

@ -32,9 +32,10 @@ static int32_t U_CALLCONV
unistr_case_internalToTitle(const UCaseMap *csm,
UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
icu::Edits *edits,
UErrorCode *pErrorCode) {
ubrk_setText(csm->iter, src, srcLength, pErrorCode);
return ustrcase_internalToTitle(csm, dest, destCapacity, src, srcLength, pErrorCode);
return ustrcase_internalToTitle(csm, dest, destCapacity, src, srcLength, edits, pErrorCode);
}
/*

View file

@ -102,6 +102,10 @@ uprv_haveProperties(UErrorCode *pErrorCode);
/*U_CFUNC int8_t
uprv_loadPropsData(UErrorCode *errorCode);*/
#ifdef __cplusplus
// TODO: Consider moving these case mapping definitions
// into a new internal header like ucasemap_imp.h.
/*
* Internal string casing functions implementing
* ustring.h/ustrcase.c and UnicodeString case mapping functions.
@ -117,10 +121,6 @@ struct UCaseMap {
uint32_t options;
};
#ifndef __UCASEMAP_H__
typedef struct UCaseMap UCaseMap;
#endif
#if UCONFIG_NO_BREAK_ITERATION
# define UCASEMAP_INITIALIZER { NULL, { 0 }, 0, 0 }
#else
@ -209,8 +209,6 @@ ucasemap_mapUTF8(const UCaseMap *csm,
UTF8CaseMapper *stringCaseMapper,
UErrorCode *pErrorCode);
#ifdef __cplusplus
U_NAMESPACE_BEGIN
namespace GreekUpper {

View file

@ -66,7 +66,7 @@ u_strToTitle(UChar *dest, int32_t destCapacity,
&csm,
dest, destCapacity,
src, srcLength,
ustrcase_internalToTitle, pErrorCode);
ustrcase_internalToTitle, NULL, pErrorCode);
if(titleIter==NULL && csm.iter!=NULL) {
ubrk_close(csm.iter);
}

View file

@ -208,12 +208,120 @@ UBool Edits::growArray() {
}
/**
 * Copies an error recorded by this Edits object into the caller's error code.
 * A failure that is already present in outErrorCode is never overwritten.
 * @param outErrorCode in/out: set to this object's errorCode only when it holds
 *        a success code on entry and this object has recorded a failure
 * @return TRUE if outErrorCode indicates a failure on return
 */
UBool Edits::setErrorCode(UErrorCode &outErrorCode) {
    // The caller already has a failure: keep it.
    if (U_FAILURE(outErrorCode)) { return TRUE; }
    // Nothing went wrong here either.
    if (U_SUCCESS(errorCode)) { return FALSE; }
    outErrorCode = errorCode;
    return TRUE;
}
/**
 * @return TRUE if delta is non-zero or if any stored unit is greater than MAX_UNCHANGED
 */
UBool Edits::hasChanges() const {
    // Non-zero delta: report changes without scanning the array.
    if (delta != 0) { return TRUE; }
    // Otherwise look for the first unit outside the unchanged range.
    int32_t pos = 0;
    while (pos < length) {
        if (array[pos++] > MAX_UNCHANGED) { return TRUE; }
    }
    return FALSE;
}
// Advances to the next edit and stores its description in changed, oldLength and newLength.
// Adjacent unchanged ranges are always merged into one edit.
// A coarse iterator also merges adjacent changes; a fine-grained one reports each
// change separately, including each member of a compressed run of equal-length changes.
// errorCode: in/out error code. If it already indicates a failure, nothing is read.
//            It is set to U_INDEX_OUTOFBOUNDS_ERROR if a merged length would exceed INT32_MAX.
// Returns TRUE if an edit was produced; FALSE at the end of the array or on error.
UBool Edits::Iterator::next(UErrorCode &errorCode) {
if (U_FAILURE(errorCode)) { return FALSE; }
// Always set all relevant public fields: Do not rely on them not having been touched.
if (remaining > 0) {
// Fine-grained iterator: Continue a sequence of equal-length changes.
changed = TRUE;
oldLength = newLength = width;
--remaining;
return TRUE;
}
if (index >= length) {
// All units have been consumed.
return FALSE;
}
// Read the head unit of the next edit.
int32_t u = array[index++];
if (u <= MAX_UNCHANGED) {
// Combine adjacent unchanged ranges.
// Each unchanged unit stores its length minus one.
changed = FALSE;
oldLength = u + 1;
while (index < length && (u = array[index]) <= MAX_UNCHANGED) {
++index;
// Adding u + 1 must not push oldLength past INT32_MAX.
if (u >= (INT32_MAX - oldLength)) {
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
return FALSE;
}
oldLength += u + 1;
}
newLength = oldLength;
return TRUE;
}
changed = TRUE;
if (u <= MAX_SHORT_CHANGE) {
// Short form: one unit describes a run of changes that all have the same length
// and do not change the length of the text.
if (coarse) {
// w: length of each change (bits 12 and up).
// len: number of changes in the run (low 12 bits, stored minus one).
int32_t w = u >> 12;
int32_t len = (u & 0xfff) + 1;
oldLength = newLength = w * len;
} else {
// Split a sequence of equal-length changes that was compressed into one unit.
oldLength = newLength = width = u >> 12;
// The first change of the run is returned now; the rest are returned by later calls.
remaining = u & 0xfff;
return TRUE;
}
} else {
// Long form: the old length head is in bits 6..11, the new length head in bits 0..5.
// readLength() may consume further units from the array.
U_ASSERT(u <= 0x7fff);
oldLength = readLength((u >> 6) & 0x3f);
newLength = readLength(u & 0x3f);
if (!coarse) {
// Fine-grained iterator: report this change by itself.
return TRUE;
}
}
// Combine adjacent changes.
// Only the coarse iterator gets here.
while (index < length && (u = array[index]) > MAX_UNCHANGED) {
++index;
if (u <= MAX_SHORT_CHANGE) {
// Short form: add the total length of the run to both sides.
int32_t w = u >> 12;
int32_t len = (u & 0xfff) + 1;
len = w * len;
if (len > (INT32_MAX - oldLength) || len > (INT32_MAX - newLength)) {
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
return FALSE;
}
oldLength += len;
newLength += len;
} else {
// Long form: decode both lengths, then add each to its own side.
U_ASSERT(u <= 0x7fff);
int32_t oldLen = readLength((u >> 6) & 0x3f);
int32_t newLen = readLength(u & 0x3f);
if (oldLen > (INT32_MAX - oldLength) || newLen > (INT32_MAX - newLength)) {
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
return FALSE;
}
oldLength += oldLen;
newLength += newLen;
}
}
return TRUE;
}
/**
 * Decodes one length of a long-form change.
 * Advances index past any trail units that it reads.
 * @param head the 6-bit length head taken from the change's first unit:
 *        below LENGTH_IN_1TRAIL it is the length itself;
 *        below LENGTH_IN_2TRAIL the length is in one trail unit;
 *        otherwise the length is spread over the lowest head bit and two trail units
 * @return the decoded length
 */
int32_t Edits::Iterator::readLength(int32_t head) {
    if (head < LENGTH_IN_1TRAIL) {
        // Small length stored directly in the head.
        return head;
    } else if (head < LENGTH_IN_2TRAIL) {
        // One trail unit. Bit 15 only marks the unit as a trail unit (see the assertion),
        // so it has to be masked off, just as in the two-trail-unit case below.
        U_ASSERT(index < length);
        U_ASSERT(array[index] >= 0x8000);
        return array[index++] & 0x7fff;
    } else {
        // Two trail units carry 15 bits each; the lowest head bit supplies bit 30.
        U_ASSERT((index + 2) <= length);
        U_ASSERT(array[index] >= 0x8000);
        U_ASSERT(array[index + 1] >= 0x8000);
        int32_t len = ((head & 1) << 30) |
                ((int32_t)(array[index] & 0x7fff) << 15) |
                (array[index + 1] & 0x7fff);
        index += 2;
        return len;
    }
}
U_NAMESPACE_END
U_NAMESPACE_USE
@ -224,7 +332,7 @@ U_NAMESPACE_USE
static inline int32_t
appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity,
int32_t result, const UChar *s,
uint32_t options, int32_t cpLength, icu::Edits *edits) {
int32_t cpLength, icu::Edits *edits) {
UChar32 c;
int32_t length;
@ -233,9 +341,9 @@ appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity,
/* (not) original code point */
if(edits!=NULL) {
edits->addUnchanged(cpLength);
}
if(options & UCASEMAP_OMIT_UNCHANGED) {
return destIndex;
if(edits->omitUnchanged()) {
return destIndex;
}
}
c=~result;
if(destIndex<destCapacity && c<=0xffff) { // BMP slightly-fastpath
@ -308,6 +416,12 @@ static inline int32_t
appendUnchanged(UChar *dest, int32_t destIndex, int32_t destCapacity,
const UChar *s, int32_t length, icu::Edits *edits) {
if(length>0) {
if(edits!=NULL) {
edits->addUnchanged(length);
if(edits->omitUnchanged()) {
return destIndex;
}
}
if(length>(INT32_MAX-destIndex)) {
return -1; // integer overflow
}
@ -315,9 +429,6 @@ appendUnchanged(UChar *dest, int32_t destIndex, int32_t destCapacity,
u_memcpy(dest+destIndex, s, length);
}
destIndex+=length;
if(edits!=NULL) {
edits->addUnchanged(length);
}
}
return destIndex;
}
@ -379,7 +490,7 @@ _caseMap(const UCaseMap *csm, UCaseMapFull *map,
const UChar *s;
c=map(csm->csp, c, utf16_caseContextIterator, csc, &s, csm->locale, &locCache);
destIndex = appendResult(dest, destIndex, destCapacity, c, s,
csm->options, srcIndex - cpStart, edits);
srcIndex - cpStart, edits);
if (destIndex < 0) {
*pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
@ -482,7 +593,7 @@ ustrcase_internalToTitle(const UCaseMap *csm,
c=ucase_toFullTitle(csm->csp, c, utf16_caseContextIterator, &csc, &s,
csm->locale, &locCache);
destIndex=appendResult(dest, destIndex, destCapacity, c, s,
csm->options, titleLimit-titleStart, edits);
titleLimit-titleStart, edits);
if(destIndex<0) {
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
@ -1104,7 +1215,7 @@ int32_t toUpper(const UCaseMap *csm,
}
UBool change;
if ((csm->options & UCASEMAP_OMIT_UNCHANGED) == 0 && edits == NULL) {
if (edits == NULL) {
change = TRUE; // common, simple usage
} else {
// Find out first whether we are changing the text.
@ -1130,7 +1241,7 @@ int32_t toUpper(const UCaseMap *csm,
edits->addUnchanged(oldLength);
}
// Write unchanged text?
change |= (csm->options & UCASEMAP_OMIT_UNCHANGED) == 0;
change = edits->writeUnchanged();
}
}
@ -1155,7 +1266,7 @@ int32_t toUpper(const UCaseMap *csm,
const UChar *s;
c=ucase_toFullUpper(csm->csp, c, NULL, NULL, &s, csm->locale, &locCache);
destIndex = appendResult(dest, destIndex, destCapacity, c, s,
csm->options, nextIndex - i, edits);
nextIndex - i, edits);
if (destIndex < 0) {
*pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
@ -1228,7 +1339,7 @@ ustrcase_internalFold(const UCaseMap *csm,
const UChar *s;
c = ucase_toFullFolding(csm->csp, c, &s, csm->options);
destIndex = appendResult(dest, destIndex, destCapacity, c, s,
csm->options, srcIndex - cpStart, edits);
srcIndex - cpStart, edits);
if (destIndex < 0) {
*pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
return 0;