ICU-13270 icu::Edits add numberOfChanges(); Edits::Iterator add findDestinationIndex(), destinationIndexFromSourceIndex(), sourceIndexFromDestinationIndex()

X-SVN-Rev: 40286
This commit is contained in:
Markus Scherer 2017-07-24 22:43:53 +00:00
parent 112d214d29
commit 9a3a03c417
7 changed files with 487 additions and 94 deletions

View file

@ -40,7 +40,7 @@ Edits::~Edits() {
}
void Edits::reset() {
length = delta = 0;
length = delta = numChanges = 0;
}
void Edits::addUnchanged(int32_t unchangedLength) {
@ -76,6 +76,7 @@ void Edits::addReplace(int32_t oldLength, int32_t newLength) {
if(oldLength == newLength && 0 < oldLength && oldLength <= MAX_SHORT_WIDTH) {
// Replacement of short oldLength text units by same-length new text.
// Merge into previous short-replacement record, if any.
++numChanges;
int32_t last = lastUnit();
if(MAX_UNCHANGED < last && last < MAX_SHORT_CHANGE &&
(last >> 12) == oldLength && (last & 0xfff) < MAX_SHORT_CHANGE_LENGTH) {
@ -93,6 +94,7 @@ void Edits::addReplace(int32_t oldLength, int32_t newLength) {
if (oldLength == 0 && newLength == 0) {
return;
}
++numChanges;
int32_t newDelta = newLength - oldLength;
if (newDelta != 0) {
if ((newDelta > 0 && delta >= 0 && newDelta > (INT32_MAX - delta)) ||
@ -182,18 +184,6 @@ UBool Edits::copyErrorTo(UErrorCode &outErrorCode) {
return TRUE;
}
/**
 * Reports whether any change edit has been recorded.
 * @return TRUE if the length delta is nonzero or if any stored unit
 *         is greater than MAX_UNCHANGED; FALSE otherwise
 */
UBool Edits::hasChanges() const {
    // Scan the stored units only when the length delta alone is inconclusive.
    if (delta == 0) {
        int32_t i = 0;
        while (i < length && array[i] <= MAX_UNCHANGED) {
            ++i;
        }
        // Stopping before the end means a unit above MAX_UNCHANGED was seen.
        return i < length ? TRUE : FALSE;
    }
    return TRUE;
}
Edits::Iterator::Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs) :
array(a), index(0), length(len), remaining(0),
onlyChanges_(oc), coarse(crs),
@ -308,39 +298,97 @@ UBool Edits::Iterator::next(UBool onlyChanges, UErrorCode &errorCode) {
return TRUE;
}
UBool Edits::Iterator::findSourceIndex(int32_t i, UErrorCode &errorCode) {
if (U_FAILURE(errorCode) || i < 0) { return FALSE; }
if (i < srcIndex) {
int32_t Edits::Iterator::findIndex(int32_t i, UBool findSource, UErrorCode &errorCode) {
if (U_FAILURE(errorCode) || i < 0) { return -1; }
int32_t spanStart, spanLength;
if (findSource) { // find source index
spanStart = srcIndex;
spanLength = oldLength_;
} else { // find destination index
spanStart = destIndex;
spanLength = newLength_;
}
// If we are at the start or limit of an empty span, then we search from
// the start of the string so that we always return
// the first of several consecutive empty spans, for consistent results.
// We do not currently track the properties of the previous span,
// so for now we always reset if we are at the start of the current span.
if (i <= spanStart) {
// Reset the iterator to the start.
index = remaining = oldLength_ = newLength_ = srcIndex = replIndex = destIndex = 0;
} else if (i < (srcIndex + oldLength_)) {
} else if (i < (spanStart + spanLength)) {
// The index is in the current span.
return TRUE;
return 0;
}
while (next(FALSE, errorCode)) {
if (i < (srcIndex + oldLength_)) {
// The index is in the current span.
return TRUE;
if (findSource) {
spanStart = srcIndex;
spanLength = oldLength_;
} else {
spanStart = destIndex;
spanLength = newLength_;
}
if (i == spanStart || i < (spanStart + spanLength)) {
// The index is in the current span, or at an empty one.
return 0;
}
if (remaining > 0) {
// Is the index in one of the remaining compressed edits?
// srcIndex is the start of the current span, before the remaining ones.
int32_t len = (remaining + 1) * oldLength_;
if (i < (srcIndex + len)) {
int32_t n = (i - srcIndex) / oldLength_; // 1 <= n <= remaining
len = n * oldLength_;
// spanStart is the start of the current span, before the remaining ones.
int32_t len = (remaining + 1) * spanLength;
if (i < (spanStart + len)) {
int32_t n = (i - spanStart) / spanLength; // 1 <= n <= remaining
len = n * spanLength;
srcIndex += len;
replIndex += len;
destIndex += len;
remaining -= n;
return TRUE;
return 0;
}
// Make next() skip all of these edits at once.
oldLength_ = newLength_ = len;
remaining = 0;
}
}
return FALSE;
return 1;
}
/**
 * Maps a source index to the corresponding destination index.
 * Repositions the iterator via findIndex().
 *
 * @param i source index
 * @param errorCode ICU error code, passed through to findIndex()
 * @return 0 if findIndex() reports an error or i<0; otherwise the mapped destination index
 */
int32_t Edits::Iterator::destinationIndexFromSourceIndex(int32_t i, UErrorCode &errorCode) {
    // findIndex() yields -1 (error or i<0), 0 (span found) or 1 (i>=string length).
    const int32_t result = findIndex(i, TRUE, errorCode);
    if (result < 0) {
        return 0;
    }
    // At/after the end of the string, or exactly at the start of the found span.
    if (result > 0 || srcIndex == i) {
        return destIndex;
    }
    // Strictly inside the span: a change maps to its end,
    // an unchanged span keeps the same offset.
    const int32_t offset = changed ? newLength_ : (i - srcIndex);
    return destIndex + offset;
}
/**
 * Maps a destination index to the corresponding source index.
 * Repositions the iterator via findIndex().
 *
 * @param i destination index
 * @param errorCode ICU error code, passed through to findIndex()
 * @return 0 if findIndex() reports an error or i<0; otherwise the mapped source index
 */
int32_t Edits::Iterator::sourceIndexFromDestinationIndex(int32_t i, UErrorCode &errorCode) {
    // findIndex() yields -1 (error or i<0), 0 (span found) or 1 (i>=string length).
    const int32_t result = findIndex(i, FALSE, errorCode);
    if (result < 0) {
        return 0;
    }
    // At/after the end of the string, or exactly at the start of the found span.
    if (result > 0 || destIndex == i) {
        return srcIndex;
    }
    // Strictly inside the span: a change maps to its end,
    // an unchanged span keeps the same offset.
    const int32_t offset = changed ? oldLength_ : (i - destIndex);
    return srcIndex + offset;
}
U_NAMESPACE_END

View file

@ -36,7 +36,7 @@ public:
* @draft ICU 59
*/
Edits() :
array(stackArray), capacity(STACK_CAPACITY), length(0), delta(0),
array(stackArray), capacity(STACK_CAPACITY), length(0), delta(0), numChanges(0),
errorCode(U_ZERO_ERROR) {}
/**
* Destructor.
@ -66,6 +66,9 @@ public:
* Sets the UErrorCode if an error occurred while recording edits.
* Preserves older error codes in the outErrorCode.
* Normally called from inside ICU string transformation functions, not user code.
* @param outErrorCode Set to an error code if it does not contain one already
* and an error occurred while recording edits.
* Otherwise unchanged.
* @return TRUE if U_FAILURE(outErrorCode)
* @draft ICU 59
*/
@ -81,7 +84,13 @@ public:
* @return TRUE if there are any change edits
* @draft ICU 59
*/
UBool hasChanges() const;
UBool hasChanges() const { return numChanges != 0; }
/**
* @return the number of change edits
* @draft ICU 60
*/
int32_t numberOfChanges() const { return numChanges; }
/**
* Access to the list of edits.
@ -103,6 +112,9 @@ public:
/**
* Advances to the next edit.
* @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
* or else the function returns immediately. Check for U_FAILURE()
* on output or use with function chaining. (See User Guide for details.)
* @return TRUE if there is another edit
* @draft ICU 59
*/
@ -121,10 +133,86 @@ public:
* if the source index is out of bounds for the source string.
*
* @param i source index
* @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
* or else the function returns immediately. Check for U_FAILURE()
* on output or use with function chaining. (See User Guide for details.)
* @return TRUE if the edit for the source index was found
* @draft ICU 59
*/
UBool findSourceIndex(int32_t i, UErrorCode &errorCode);
UBool findSourceIndex(int32_t i, UErrorCode &errorCode) {
return findIndex(i, TRUE, errorCode) == 0;
}
/**
* Finds the edit that contains the destination index.
* The destination index may be found in a non-change
* even if normal iteration would skip non-changes.
* Normal iteration can continue from a found edit.
*
* The iterator state before this search logically does not matter.
* (It may affect the performance of the search.)
*
* The iterator state after this search is undefined
* if the destination index is out of bounds for the destination string.
*
* @param i destination index
* @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
* or else the function returns immediately. Check for U_FAILURE()
* on output or use with function chaining. (See User Guide for details.)
* @return TRUE if the edit for the destination index was found;
* FALSE if errorCode indicates a failure, if i is negative,
* or if i is at or beyond the destination string length
* @draft ICU 60
*/
UBool findDestinationIndex(int32_t i, UErrorCode &errorCode) {
// findIndex() returns 0 only when a containing edit was found.
return findIndex(i, FALSE, errorCode) == 0;
}
/**
* Returns the destination index corresponding to the given source index.
* If the source index is inside a change edit (not at its start),
* then the destination index at the end of that edit is returned,
* since there is no information about index mapping inside a change edit.
*
* (This means that indexes to the start and middle of an edit,
* for example around a grapheme cluster, are mapped to indexes
* encompassing the entire edit.
* The alternative, mapping an interior index to the start,
* would map such an interval to an empty one.)
*
* This operation will usually but not always modify this object.
* The iterator state after this search is undefined.
*
* @param i source index
* @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
* or else the function returns immediately. Check for U_FAILURE()
* on output or use with function chaining. (See User Guide for details.)
* @return destination index; undefined if i is not 0..string length
* @draft ICU 60
*/
int32_t destinationIndexFromSourceIndex(int32_t i, UErrorCode &errorCode);
/**
* Returns the source index corresponding to the given destination index.
* If the destination index is inside a change edit (not at its start),
* then the source index at the end of that edit is returned,
* since there is no information about index mapping inside a change edit.
*
* (This means that indexes to the start and middle of an edit,
* for example around a grapheme cluster, are mapped to indexes
* encompassing the entire edit.
* The alternative, mapping an interior index to the start,
* would map such an interval to an empty one.)
*
* This operation will usually but not always modify this object.
* The iterator state after this search is undefined.
*
* @param i destination index
* @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
* or else the function returns immediately. Check for U_FAILURE()
* on output or use with function chaining. (See User Guide for details.)
* @return source index; undefined if i is not 0..string length
* @draft ICU 60
*/
int32_t sourceIndexFromDestinationIndex(int32_t i, UErrorCode &errorCode);
/**
* @return TRUE if this edit replaces oldLength() units with newLength() different ones.
@ -170,6 +258,8 @@ public:
void updateIndexes();
UBool noNext();
UBool next(UBool onlyChanges, UErrorCode &errorCode);
/** @return -1: error or i<0; 0: found; 1: i>=string length */
int32_t findIndex(int32_t i, UBool findSource, UErrorCode &errorCode);
const uint16_t *array;
int32_t index, length;
@ -234,6 +324,7 @@ private:
int32_t capacity;
int32_t length;
int32_t delta;
int32_t numChanges;
UErrorCode errorCode;
uint16_t stackArray[STACK_CAPACITY];
};

View file

@ -906,13 +906,15 @@ void StringCaseTest::TestBufferOverflow() {
void StringCaseTest::TestEdits() {
IcuTestErrorCode errorCode(*this, "TestEdits");
Edits edits;
assertFalse("new Edits", edits.hasChanges());
assertFalse("new Edits hasChanges", edits.hasChanges());
assertEquals("new Edits numberOfChanges", 0, edits.numberOfChanges());
assertEquals("new Edits", 0, edits.lengthDelta());
edits.addUnchanged(1); // multiple unchanged ranges are combined
edits.addUnchanged(10000); // too long, and they are split
edits.addReplace(0, 0);
edits.addUnchanged(2);
assertFalse("unchanged 10003", edits.hasChanges());
assertFalse("unchanged 10003 hasChanges", edits.hasChanges());
assertEquals("unchanged 10003 numberOfChanges", 0, edits.numberOfChanges());
assertEquals("unchanged 10003", 0, edits.lengthDelta());
edits.addReplace(1, 1); // multiple short equal-length edits are compressed
edits.addUnchanged(0);
@ -922,7 +924,8 @@ void StringCaseTest::TestEdits() {
edits.addReplace(100, 0);
edits.addReplace(3000, 4000); // variable-length encoding
edits.addReplace(100000, 100000);
assertTrue("some edits", edits.hasChanges());
assertTrue("some edits hasChanges", edits.hasChanges());
assertEquals("some edits numberOfChanges", 7, edits.numberOfChanges());
assertEquals("some edits", 10 - 100 + 1000, edits.lengthDelta());
UErrorCode outErrorCode = U_ZERO_ERROR;
assertFalse("edits done: copyErrorTo", edits.copyErrorTo(outErrorCode));
@ -956,7 +959,8 @@ void StringCaseTest::TestEdits() {
fineExpectedChanges, UPRV_LENGTHOF(fineExpectedChanges), FALSE, errorCode);
edits.reset();
assertFalse("reset", edits.hasChanges());
assertFalse("reset hasChanges", edits.hasChanges());
assertEquals("reset numberOfChanges", 0, edits.numberOfChanges());
assertEquals("reset", 0, edits.lengthDelta());
Edits::Iterator ei = edits.getCoarseChangesIterator();
assertFalse("reset then iterator", ei.next(errorCode));

View file

@ -71,32 +71,35 @@ void TestUtility::checkEditsIter(
Edits::Iterator ei1, Edits::Iterator ei2, // two equal iterators
const EditChange expected[], int32_t expLength, UBool withUnchanged,
UErrorCode &errorCode) {
test.assertFalse(name, ei2.findSourceIndex(-1, errorCode));
test.assertFalse(name + u":" + __LINE__, ei2.findSourceIndex(-1, errorCode));
test.assertFalse(name + u":" + __LINE__, ei2.findDestinationIndex(-1, errorCode));
int32_t expSrcIndex = 0;
int32_t expDestIndex = 0;
int32_t expReplIndex = 0;
int32_t expSrcIndexFromDest = 0; // for sourceIndexFromDestinationIndex()
int32_t expDestIndexFromSrc = 0; // for destinationIndexFromSourceIndex()
for (int32_t expIndex = 0; expIndex < expLength; ++expIndex) {
const EditChange &expect = expected[expIndex];
UnicodeString msg = UnicodeString(name).append(u' ') + expIndex;
if (withUnchanged || expect.change) {
test.assertTrue(msg, ei1.next(errorCode));
test.assertEquals(msg, expect.change, ei1.hasChange());
test.assertEquals(msg, expect.oldLength, ei1.oldLength());
test.assertEquals(msg, expect.newLength, ei1.newLength());
test.assertEquals(msg, expSrcIndex, ei1.sourceIndex());
test.assertEquals(msg, expDestIndex, ei1.destinationIndex());
test.assertEquals(msg, expReplIndex, ei1.replacementIndex());
test.assertTrue(msg + u":" + __LINE__, ei1.next(errorCode));
test.assertEquals(msg + u":" + __LINE__, expect.change, ei1.hasChange());
test.assertEquals(msg + u":" + __LINE__, expect.oldLength, ei1.oldLength());
test.assertEquals(msg + u":" + __LINE__, expect.newLength, ei1.newLength());
test.assertEquals(msg + u":" + __LINE__, expSrcIndex, ei1.sourceIndex());
test.assertEquals(msg + u":" + __LINE__, expDestIndex, ei1.destinationIndex());
test.assertEquals(msg + u":" + __LINE__, expReplIndex, ei1.replacementIndex());
}
if (expect.oldLength > 0) {
test.assertTrue(msg, ei2.findSourceIndex(expSrcIndex, errorCode));
test.assertEquals(msg, expect.change, ei2.hasChange());
test.assertEquals(msg, expect.oldLength, ei2.oldLength());
test.assertEquals(msg, expect.newLength, ei2.newLength());
test.assertEquals(msg, expSrcIndex, ei2.sourceIndex());
test.assertEquals(msg, expDestIndex, ei2.destinationIndex());
test.assertEquals(msg, expReplIndex, ei2.replacementIndex());
if (expect.oldLength > 0 && expDestIndex == expDestIndexFromSrc) {
test.assertTrue(msg + u":" + __LINE__, ei2.findSourceIndex(expSrcIndex, errorCode));
test.assertEquals(msg + u":" + __LINE__, expect.change, ei2.hasChange());
test.assertEquals(msg + u":" + __LINE__, expect.oldLength, ei2.oldLength());
test.assertEquals(msg + u":" + __LINE__, expect.newLength, ei2.newLength());
test.assertEquals(msg + u":" + __LINE__, expSrcIndex, ei2.sourceIndex());
test.assertEquals(msg + u":" + __LINE__, expDestIndex, ei2.destinationIndex());
test.assertEquals(msg + u":" + __LINE__, expReplIndex, ei2.replacementIndex());
if (!withUnchanged) {
// For some iterators, move past the current range
// so that findSourceIndex() has to look before the current index.
@ -105,20 +108,75 @@ void TestUtility::checkEditsIter(
}
}
expSrcIndex += expect.oldLength;
expDestIndex += expect.newLength;
if (expect.newLength > 0 && expSrcIndex == expSrcIndexFromDest) {
test.assertTrue(msg + u":" + __LINE__, ei2.findDestinationIndex(expDestIndex, errorCode));
test.assertEquals(msg + u":" + __LINE__, expect.change, ei2.hasChange());
test.assertEquals(msg + u":" + __LINE__, expect.oldLength, ei2.oldLength());
test.assertEquals(msg + u":" + __LINE__, expect.newLength, ei2.newLength());
test.assertEquals(msg + u":" + __LINE__, expSrcIndex, ei2.sourceIndex());
test.assertEquals(msg + u":" + __LINE__, expDestIndex, ei2.destinationIndex());
test.assertEquals(msg + u":" + __LINE__, expReplIndex, ei2.replacementIndex());
if (!withUnchanged) {
// For some iterators, move past the current range
// so that findSourceIndex() has to look before the current index.
ei2.next(errorCode);
ei2.next(errorCode);
}
}
// Span starts.
test.assertEquals(name + u":" + __LINE__, expDestIndexFromSrc,
ei2.destinationIndexFromSourceIndex(expSrcIndex, errorCode));
test.assertEquals(name + u":" + __LINE__, expSrcIndexFromDest,
ei2.sourceIndexFromDestinationIndex(expDestIndex, errorCode));
// Inside unchanged span map offsets 1:1.
if (!expect.change && expect.oldLength >= 2) {
test.assertEquals(name + u":" + __LINE__, expDestIndex + 1,
ei2.destinationIndexFromSourceIndex(expSrcIndex + 1, errorCode));
test.assertEquals(name + u":" + __LINE__, expSrcIndex + 1,
ei2.sourceIndexFromDestinationIndex(expDestIndex + 1, errorCode));
}
// Inside change span map to the span limit.
int32_t expSrcLimit = expSrcIndex + expect.oldLength;
int32_t expDestLimit = expDestIndex + expect.newLength;
if (expect.change) {
if (expect.oldLength >= 2) {
test.assertEquals(name + u":" + __LINE__, expDestLimit,
ei2.destinationIndexFromSourceIndex(expSrcIndex + 1, errorCode));
}
if (expect.newLength >= 2) {
test.assertEquals(name + u":" + __LINE__, expSrcLimit,
ei2.sourceIndexFromDestinationIndex(expDestIndex + 1, errorCode));
}
}
expSrcIndex = expSrcLimit;
expDestIndex = expDestLimit;
if (expect.change) {
expReplIndex += expect.newLength;
}
if (expect.newLength > 0) {
expSrcIndexFromDest = expSrcIndex;
}
if (expect.oldLength > 0) {
expDestIndexFromSrc = expDestIndex;
}
}
UnicodeString msg = UnicodeString(name).append(u" end");
test.assertFalse(msg, ei1.next(errorCode));
test.assertFalse(msg, ei1.hasChange());
test.assertEquals(msg, 0, ei1.oldLength());
test.assertEquals(msg, 0, ei1.newLength());
test.assertEquals(msg, expSrcIndex, ei1.sourceIndex());
test.assertEquals(msg, expDestIndex, ei1.destinationIndex());
test.assertEquals(msg, expReplIndex, ei1.replacementIndex());
test.assertFalse(msg + u":" + __LINE__, ei1.next(errorCode));
test.assertFalse(msg + u":" + __LINE__, ei1.hasChange());
test.assertEquals(msg + u":" + __LINE__, 0, ei1.oldLength());
test.assertEquals(msg + u":" + __LINE__, 0, ei1.newLength());
test.assertEquals(msg + u":" + __LINE__, expSrcIndex, ei1.sourceIndex());
test.assertEquals(msg + u":" + __LINE__, expDestIndex, ei1.destinationIndex());
test.assertEquals(msg + u":" + __LINE__, expReplIndex, ei1.replacementIndex());
test.assertFalse(name, ei2.findSourceIndex(expSrcIndex, errorCode));
test.assertFalse(name + u":" + __LINE__, ei2.findSourceIndex(expSrcIndex, errorCode));
test.assertFalse(name + u":" + __LINE__, ei2.findDestinationIndex(expDestIndex, errorCode));
test.assertEquals(name + u":" + __LINE__, expDestIndex,
ei2.destinationIndexFromSourceIndex(expSrcIndex, errorCode));
test.assertEquals(name + u":" + __LINE__, expSrcIndex,
ei2.sourceIndexFromDestinationIndex(expDestIndex, errorCode));
}

View file

@ -1562,6 +1562,8 @@ BasicNormalizerTest::TestNormalizeUTF8WithEdits() {
{ TRUE, 6, 3 }, // 가\u3133→ 갃
{ FALSE, 2, 2 } // 2 spaces
};
assertTrue("normalizeUTF8 with Edits hasChanges", edits.hasChanges());
assertEquals("normalizeUTF8 with Edits numberOfChanges", 9, edits.numberOfChanges());
TestUtility::checkEditsIter(*this, u"normalizeUTF8 with Edits",
edits.getFineIterator(), edits.getFineIterator(),
expectedChanges, UPRV_LENGTHOF(expectedChanges),
@ -1577,6 +1579,8 @@ BasicNormalizerTest::TestNormalizeUTF8WithEdits() {
nfkc_cf->normalizeUTF8(U_OMIT_UNCHANGED_TEXT, src, sink, &edits, errorCode);
assertSuccess("normalizeUTF8 omit unchanged", errorCode.get());
assertEquals("normalizeUTF8 omit unchanged", expected.c_str(), result.c_str());
assertTrue("normalizeUTF8 omit unchanged hasChanges", edits.hasChanges());
assertEquals("normalizeUTF8 omit unchanged numberOfChanges", 9, edits.numberOfChanges());
TestUtility::checkEditsIter(*this, u"normalizeUTF8 omit unchanged",
edits.getFineIterator(), edits.getFineIterator(),
expectedChanges, UPRV_LENGTHOF(expectedChanges),
@ -1604,6 +1608,8 @@ BasicNormalizerTest::TestNormalizeUTF8WithEdits() {
{ TRUE, 6, 3 }, // 가\u3133→ 갃
{ FALSE, 2, 2 } // 2 spaces
};
assertTrue("filtered normalizeUTF8 hasChanges", edits.hasChanges());
assertEquals("filtered normalizeUTF8 numberOfChanges", 7, edits.numberOfChanges());
TestUtility::checkEditsIter(*this, u"filtered normalizeUTF8",
edits.getFineIterator(), edits.getFineIterator(),
filteredChanges, UPRV_LENGTHOF(filteredChanges),
@ -1621,6 +1627,8 @@ BasicNormalizerTest::TestNormalizeUTF8WithEdits() {
fn2.normalizeUTF8(U_OMIT_UNCHANGED_TEXT, src, sink, &edits, errorCode);
assertSuccess("filtered normalizeUTF8 omit unchanged", errorCode.get());
assertEquals("filtered normalizeUTF8 omit unchanged", expected.c_str(), result.c_str());
assertTrue("filtered normalizeUTF8 omit unchanged hasChanges", edits.hasChanges());
assertEquals("filtered normalizeUTF8 omit unchanged numberOfChanges", 7, edits.numberOfChanges());
TestUtility::checkEditsIter(*this, u"filtered normalizeUTF8 omit unchanged",
edits.getFineIterator(), edits.getFineIterator(),
filteredChanges, UPRV_LENGTHOF(filteredChanges),

View file

@ -36,6 +36,7 @@ public final class Edits {
private char[] array;
private int length;
private int delta;
private int numChanges;
/**
* Constructs an empty object.
@ -52,7 +53,7 @@ public final class Edits {
* @provisional This API might change or be removed in a future release.
*/
public void reset() {
length = delta = 0;
length = delta = numChanges = 0;
}
private void setLastUnit(int last) {
@ -105,6 +106,7 @@ public final class Edits {
if(oldLength == newLength && 0 < oldLength && oldLength <= MAX_SHORT_WIDTH) {
// Replacement of short oldLength text units by same-length new text.
// Merge into previous short-replacement record, if any.
++numChanges;
int last = lastUnit();
if(MAX_UNCHANGED < last && last < MAX_SHORT_CHANGE &&
(last >> 12) == oldLength && (last & 0xfff) < MAX_SHORT_CHANGE_LENGTH) {
@ -123,6 +125,7 @@ public final class Edits {
if (oldLength == 0 && newLength == 0) {
return;
}
++numChanges;
int newDelta = newLength - oldLength;
if (newDelta != 0) {
if ((newDelta > 0 && delta >= 0 && newDelta > (Integer.MAX_VALUE - delta)) ||
@ -202,17 +205,14 @@ public final class Edits {
* @draft ICU 59
* @provisional This API might change or be removed in a future release.
*/
public boolean hasChanges() {
if (delta != 0) {
return true;
}
for (int i = 0; i < length; ++i) {
if (array[i] > MAX_UNCHANGED) {
return true;
}
}
return false;
}
public boolean hasChanges() { return numChanges != 0; }
/**
* @return the number of change edits
* @draft ICU 60
* @provisional This API might change or be removed in a future release.
*/
public int numberOfChanges() { return numChanges; }
/**
* Access to the list of edits.
@ -374,38 +374,162 @@ public final class Edits {
* @provisional This API might change or be removed in a future release.
*/
public boolean findSourceIndex(int i) {
if (i < 0) { return false; }
if (i < srcIndex) {
return findIndex(i, true) == 0;
}
/**
* Finds the edit that contains the destination index.
* The destination index may be found in a non-change
* even if normal iteration would skip non-changes.
* Normal iteration can continue from a found edit.
*
* <p>The iterator state before this search logically does not matter.
* (It may affect the performance of the search.)
*
* <p>The iterator state after this search is undefined
* if the destination index is out of bounds for the destination string.
*
* @param i destination index
* @return true if the edit for the destination index was found;
* false if i is negative or at or beyond the destination string length
* @draft ICU 60
* @provisional This API might change or be removed in a future release.
*/
public boolean findDestinationIndex(int i) {
// findIndex() returns 0 only when a containing edit was found.
return findIndex(i, false) == 0;
}
/** @return -1: error or i<0; 0: found; 1: i>=string length */
private int findIndex(int i, boolean findSource) {
if (i < 0) { return -1; }
int spanStart, spanLength;
if (findSource) { // find source index
spanStart = srcIndex;
spanLength = oldLength_;
} else { // find destination index
spanStart = destIndex;
spanLength = newLength_;
}
// If we are at the start or limit of an empty span, then we search from
// the start of the string so that we always return
// the first of several consecutive empty spans, for consistent results.
// We do not currently track the properties of the previous span,
// so for now we always reset if we are at the start of the current span.
if (i <= spanStart) {
// Reset the iterator to the start.
index = remaining = oldLength_ = newLength_ = srcIndex = replIndex = destIndex = 0;
} else if (i < (srcIndex + oldLength_)) {
} else if (i < (spanStart + spanLength)) {
// The index is in the current span.
return true;
return 0;
}
while (next(false)) {
if (i < (srcIndex + oldLength_)) {
// The index is in the current span.
return true;
if (findSource) {
spanStart = srcIndex;
spanLength = oldLength_;
} else {
spanStart = destIndex;
spanLength = newLength_;
}
if (i == spanStart || i < (spanStart + spanLength)) {
// The index is in the current span, or at an empty one.
return 0;
}
if (remaining > 0) {
// Is the index in one of the remaining compressed edits?
// srcIndex is the start of the current span, before the remaining ones.
int len = (remaining + 1) * oldLength_;
if (i < (srcIndex + len)) {
int n = (i - srcIndex) / oldLength_; // 1 <= n <= remaining
len = n * oldLength_;
// spanStart is the start of the current span, before the remaining ones.
int len = (remaining + 1) * spanLength;
if (i < (spanStart + len)) {
int n = (i - spanStart) / spanLength; // 1 <= n <= remaining
len = n * spanLength;
srcIndex += len;
replIndex += len;
destIndex += len;
remaining -= n;
return true;
return 0;
}
// Make next() skip all of these edits at once.
oldLength_ = newLength_ = len;
remaining = 0;
}
}
return false;
return 1;
}
/**
* Returns the destination index corresponding to the given source index.
* If the source index is inside a change edit (not at its start),
* then the destination index at the end of that edit is returned,
* since there is no information about index mapping inside a change edit.
*
* <p>(This means that indexes to the start and middle of an edit,
* for example around a grapheme cluster, are mapped to indexes
* encompassing the entire edit.
* The alternative, mapping an interior index to the start,
* would map such an interval to an empty one.)
*
* <p>This operation will usually but not always modify this object.
* The iterator state after this search is undefined.
*
* @param i source index
* @return destination index; undefined if i is not 0..string length
* @draft ICU 60
* @provisional This API might change or be removed in a future release.
*/
public int destinationIndexFromSourceIndex(int i) {
    // findIndex() yields -1 (i<0), 0 (span found) or 1 (i>=string length).
    final int result = findIndex(i, true);
    if (result < 0) {
        return 0;
    }
    // At/after the end of the string, or exactly at the start of the found span.
    if (result > 0 || srcIndex == i) {
        return destIndex;
    }
    // Strictly inside the span: a change maps to its end,
    // an unchanged span keeps the same offset.
    final int offset = changed ? newLength_ : (i - srcIndex);
    return destIndex + offset;
}
/**
* Returns the source index corresponding to the given destination index.
* If the destination index is inside a change edit (not at its start),
* then the source index at the end of that edit is returned,
* since there is no information about index mapping inside a change edit.
*
* <p>(This means that indexes to the start and middle of an edit,
* for example around a grapheme cluster, are mapped to indexes
* encompassing the entire edit.
* The alternative, mapping an interior index to the start,
* would map such an interval to an empty one.)
*
* <p>This operation will usually but not always modify this object.
* The iterator state after this search is undefined.
*
* @param i destination index
* @return source index; undefined if i is not 0..string length
* @draft ICU 60
* @provisional This API might change or be removed in a future release.
*/
public int sourceIndexFromDestinationIndex(int i) {
    // findIndex() yields -1 (i<0), 0 (span found) or 1 (i>=string length).
    final int result = findIndex(i, false);
    if (result < 0) {
        return 0;
    }
    // At/after the end of the string, or exactly at the start of the found span.
    if (result > 0 || destIndex == i) {
        return srcIndex;
    }
    // Strictly inside the span: a change maps to its end,
    // an unchanged span keeps the same offset.
    final int offset = changed ? oldLength_ : (i - destIndex);
    return srcIndex + offset;
}
/**

View file

@ -781,10 +781,13 @@ public final class UCharacterCaseTest extends TestFmwk
String name, Edits.Iterator ei1, Edits.Iterator ei2, // two equal iterators
EditChange[] expected, boolean withUnchanged) {
assertFalse(name, ei2.findSourceIndex(-1));
assertFalse(name, ei2.findDestinationIndex(-1));
int expSrcIndex = 0;
int expDestIndex = 0;
int expReplIndex = 0;
int expSrcIndexFromDest = 0; // for sourceIndexFromDestinationIndex()
int expDestIndexFromSrc = 0; // for destinationIndexFromSourceIndex()
for (int expIndex = 0; expIndex < expected.length; ++expIndex) {
EditChange expect = expected[expIndex];
String msg = name + ' ' + expIndex;
@ -798,7 +801,7 @@ public final class UCharacterCaseTest extends TestFmwk
assertEquals(msg, expReplIndex, ei1.replacementIndex());
}
if (expect.oldLength > 0) {
if (expect.oldLength > 0 && expDestIndex == expDestIndexFromSrc) {
assertTrue(msg, ei2.findSourceIndex(expSrcIndex));
assertEquals(msg, expect.change, ei2.hasChange());
assertEquals(msg, expect.oldLength, ei2.oldLength());
@ -814,11 +817,61 @@ public final class UCharacterCaseTest extends TestFmwk
}
}
expSrcIndex += expect.oldLength;
expDestIndex += expect.newLength;
if (expect.newLength > 0 && expSrcIndex == expSrcIndexFromDest) {
assertTrue(msg, ei2.findDestinationIndex(expDestIndex));
assertEquals(msg, expect.change, ei2.hasChange());
assertEquals(msg, expect.oldLength, ei2.oldLength());
assertEquals(msg, expect.newLength, ei2.newLength());
assertEquals(msg, expSrcIndex, ei2.sourceIndex());
assertEquals(msg, expDestIndex, ei2.destinationIndex());
assertEquals(msg, expReplIndex, ei2.replacementIndex());
if (!withUnchanged) {
// For some iterators, move past the current range
// so that findSourceIndex() has to look before the current index.
ei2.next();
ei2.next();
}
}
// Span starts.
assertEquals(name, expDestIndexFromSrc,
ei2.destinationIndexFromSourceIndex(expSrcIndex));
assertEquals(name, expSrcIndexFromDest,
ei2.sourceIndexFromDestinationIndex(expDestIndex));
// Inside unchanged span map offsets 1:1.
if (!expect.change && expect.oldLength >= 2) {
assertEquals(name, expDestIndex + 1,
ei2.destinationIndexFromSourceIndex(expSrcIndex + 1));
assertEquals(name, expSrcIndex + 1,
ei2.sourceIndexFromDestinationIndex(expDestIndex + 1));
}
// Inside change span map to the span limit.
int expSrcLimit = expSrcIndex + expect.oldLength;
int expDestLimit = expDestIndex + expect.newLength;
if (expect.change) {
if (expect.oldLength >= 2) {
assertEquals(name, expDestLimit,
ei2.destinationIndexFromSourceIndex(expSrcIndex + 1));
}
if (expect.newLength >= 2) {
assertEquals(name, expSrcLimit,
ei2.sourceIndexFromDestinationIndex(expDestIndex + 1));
}
}
expSrcIndex = expSrcLimit;
expDestIndex = expDestLimit;
if (expect.change) {
expReplIndex += expect.newLength;
}
if (expect.newLength > 0) {
expSrcIndexFromDest = expSrcIndex;
}
if (expect.oldLength > 0) {
expDestIndexFromSrc = expDestIndex;
}
}
String msg = name + " end";
assertFalse(msg, ei1.next());
@ -830,18 +883,23 @@ public final class UCharacterCaseTest extends TestFmwk
assertEquals(msg, expReplIndex, ei1.replacementIndex());
assertFalse(name, ei2.findSourceIndex(expSrcIndex));
assertFalse(name, ei2.findDestinationIndex(expDestIndex));
assertEquals(name, expDestIndex, ei2.destinationIndexFromSourceIndex(expSrcIndex));
assertEquals(name, expSrcIndex, ei2.sourceIndexFromDestinationIndex(expDestIndex));
}
@Test
public void TestEdits() {
Edits edits = new Edits();
assertFalse("new Edits", edits.hasChanges());
assertFalse("new Edits hasChanges", edits.hasChanges());
assertEquals("new Edits numberOfChanges", 0, edits.numberOfChanges());
assertEquals("new Edits", 0, edits.lengthDelta());
edits.addUnchanged(1); // multiple unchanged ranges are combined
edits.addUnchanged(10000); // too long, and they are split
edits.addReplace(0, 0);
edits.addUnchanged(2);
assertFalse("unchanged 10003", edits.hasChanges());
assertFalse("unchanged 10003 hasChanges", edits.hasChanges());
assertEquals("unchanged 10003 numberOfChanges", 0, edits.numberOfChanges());
assertEquals("unchanged 10003", 0, edits.lengthDelta());
edits.addReplace(1, 1); // multiple short equal-length edits are compressed
edits.addUnchanged(0);
@ -851,7 +909,8 @@ public final class UCharacterCaseTest extends TestFmwk
edits.addReplace(100, 0);
edits.addReplace(3000, 4000); // variable-length encoding
edits.addReplace(100000, 100000);
assertTrue("some edits", edits.hasChanges());
assertTrue("some edits hasChanges", edits.hasChanges());
assertEquals("some edits numberOfChanges", 7, edits.numberOfChanges());
assertEquals("some edits", 10 - 100 + 1000, edits.lengthDelta());
EditChange[] coarseExpectedChanges = new EditChange[] {
@ -883,7 +942,8 @@ public final class UCharacterCaseTest extends TestFmwk
fineExpectedChanges, false);
edits.reset();
assertFalse("reset", edits.hasChanges());
assertFalse("reset hasChanges", edits.hasChanges());
assertEquals("reset numberOfChanges", 0, edits.numberOfChanges());
assertEquals("reset", 0, edits.lengthDelta());
Edits.Iterator ei = edits.getCoarseChangesIterator();
assertFalse("reset then iterator", ei.next());