diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/Edits.java b/icu4j/main/classes/core/src/com/ibm/icu/text/Edits.java index 9f413cf9fd3..5dc36c87115 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/Edits.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/Edits.java @@ -409,12 +409,7 @@ public final class Edits { spanStart = destIndex; spanLength = newLength_; } - // If we are at the start or limit of an empty span, then we search from - // the start of the string so that we always return - // the first of several consecutive empty spans, for consistent results. - // We do not currently track the properties of the previous span, - // so for now we always reset if we are at the start of the current span. - if (i <= spanStart) { + if (i < spanStart) { // Reset the iterator to the start. index = remaining = oldLength_ = newLength_ = srcIndex = replIndex = destIndex = 0; } else if (i < (spanStart + spanLength)) { @@ -429,8 +424,8 @@ public final class Edits { spanStart = destIndex; spanLength = newLength_; } - if (i == spanStart || i < (spanStart + spanLength)) { - // The index is in the current span, or at an empty one. + if (i < (spanStart + spanLength)) { + // The index is in the current span. return 0; } if (remaining > 0) { @@ -615,4 +610,167 @@ public final class Edits { public Iterator getFineIterator() { return new Iterator(array, length, false, false); } + + /** + * Merges the two input Edits and appends the result to this object. + * + *
+ * <p>Consider two string transformations (for example, normalization and case mapping)
+ * where each records Edits in addition to writing an output string.
+ * Edits ab reflect how substrings of input string a
+ * map to substrings of intermediate string b.
+ * Edits bc reflect how substrings of intermediate string b
+ * map to substrings of output string c.
+ * This function merges ab and bc such that the additional edits
+ * recorded in this object reflect how substrings of input string a
+ * map to substrings of output string c.
+ *
+ * <p>If unrelated Edits are passed in where the output string of the first
+ * has a different length than the input string of the second,
+ * then an IllegalArgumentException is thrown.
+ *
+ * @param ab reflects how substrings of input string a
+ * map to substrings of intermediate string b.
+ * @param bc reflects how substrings of intermediate string b
+ * map to substrings of output string c.
+ * @return this, with the merged edits appended
+ * @draft ICU 60
+ * @provisional This API might change or be removed in a future release.
+ */
+ public Edits mergeAndAppend(Edits ab, Edits bc) {
+ // Picture string a --(Edits ab)--> string b --(Edits bc)--> string c.
+ // Parallel iteration over both Edits.
+ Iterator abIter = ab.getFineIterator();
+ Iterator bcIter = bc.getFineIterator();
+ boolean abHasNext = true, bcHasNext = true;
+ // Copy iterator state into local variables, so that we can modify and subdivide spans.
+ // ab old & new length, bc old & new length
+ int aLength = 0, ab_bLength = 0, bc_bLength = 0, cLength = 0;
+ // When we have different-intermediate-length changes, we accumulate a larger change.
+ int pending_aLength = 0, pending_cLength = 0;
+ for (;;) {
+ // At this point, for each of the two iterators:
+ // Either we are done with the locally cached current edit,
+ // and its intermediate-string length has been reset,
+ // or we will continue to work with a truncated remainder of this edit.
+ //
+ // If the current edit is done, and the iterator has not yet reached the end,
+ // then we fetch the next edit. This is true for at least one of the iterators.
+ //
+ // Normally it does not matter whether we fetch from ab and then bc or vice versa.
+ // However, the result is observably different when
+ // ab deletions meet bc insertions at the same intermediate-string index.
+ // Some users expect the bc insertions to come first, so we fetch from bc first.
+ if (bc_bLength == 0) {
+ // Call next() only while it has not yet returned false, and remember its result.
+ if (bcHasNext && (bcHasNext = bcIter.next())) {
+ bc_bLength = bcIter.oldLength();
+ cLength = bcIter.newLength();
+ if (bc_bLength == 0) {
+ // insertion
+ // If no ab remainder is cached, or the current ab span is unchanged,
+ // emit the insertion now together with any pending change.
+ // Otherwise fold it into the pending change.
+ if (ab_bLength == 0 || !abIter.hasChange()) {
+ addReplace(pending_aLength, pending_cLength + cLength);
+ pending_aLength = pending_cLength = 0;
+ } else {
+ pending_cLength += cLength;
+ }
+ continue;
+ }
+ }
+ // else see if the other iterator is done, too.
+ }
+ if (ab_bLength == 0) {
+ // Call next() only while it has not yet returned false, and remember its result.
+ if (abHasNext && (abHasNext = abIter.next())) {
+ aLength = abIter.oldLength();
+ ab_bLength = abIter.newLength();
+ if (ab_bLength == 0) {
+ // deletion
+ // If the cached bc length still equals the iterator's full old length
+ // (nothing of it has been consumed), or the current bc span is unchanged,
+ // emit the deletion now together with any pending change.
+ // Otherwise fold it into the pending change.
+ if (bc_bLength == bcIter.oldLength() || !bcIter.hasChange()) {
+ addReplace(pending_aLength + aLength, pending_cLength);
+ pending_aLength = pending_cLength = 0;
+ } else {
+ pending_aLength += aLength;
+ }
+ continue;
+ }
+ } else if (bc_bLength == 0) {
+ // Both iterators are done at the same time:
+ // The intermediate-string lengths match.
+ break;
+ } else {
+ throw new IllegalArgumentException(
+ "The ab output string is shorter than the bc input string.");
+ }
+ }
+ if (bc_bLength == 0) {
+ throw new IllegalArgumentException(
+ "The bc input string is shorter than the ab output string.");
+ }
+ // Done fetching: ab_bLength > 0 && bc_bLength > 0
+
+ // The current state has two parts:
+ // - Past: We accumulate a longer ac edit in the "pending" variables.
+ // - Current: We have copies of the current ab/bc edits in local variables.
+ // At least one side is newly fetched.
+ // One side might be a truncated remainder of an edit we fetched earlier.
+
+ if (!abIter.hasChange() && !bcIter.hasChange()) {
+ // An unchanged span all the way from string a to string c.
+ if (pending_aLength != 0 || pending_cLength != 0) {
+ addReplace(pending_aLength, pending_cLength);
+ pending_aLength = pending_cLength = 0;
+ }
+ int unchangedLength = aLength <= cLength ? aLength : cLength;
+ addUnchanged(unchangedLength);
+ // Shrink both cached spans by the emitted length;
+ // each side's b-length is set to the same remainder as its a/c length.
+ ab_bLength = aLength -= unchangedLength;
+ bc_bLength = cLength -= unchangedLength;
+ // At least one of the unchanged spans is now empty.
+ continue;
+ }
+ if (!abIter.hasChange() && bcIter.hasChange()) {
+ // Unchanged a->b but changed b->c.
+ if (ab_bLength >= bc_bLength) {
+ // Split the longer unchanged span into change + remainder.
+ addReplace(pending_aLength + bc_bLength, pending_cLength + cLength);
+ pending_aLength = pending_cLength = 0;
+ aLength = ab_bLength -= bc_bLength;
+ bc_bLength = 0;
+ continue;
+ }
+ // Handle the shorter unchanged span below like a change.
+ } else if (abIter.hasChange() && !bcIter.hasChange()) {
+ // Changed a->b and then unchanged b->c.
+ if (ab_bLength <= bc_bLength) {
+ // Split the longer unchanged span into change + remainder.
+ addReplace(pending_aLength + aLength, pending_cLength + ab_bLength);
+ pending_aLength = pending_cLength = 0;
+ cLength = bc_bLength -= ab_bLength;
+ ab_bLength = 0;
+ continue;
+ }
+ // Handle the shorter unchanged span below like a change.
+ } else { // both abIter.hasChange() && bcIter.hasChange()
+ if (ab_bLength == bc_bLength) {
+ // Changes on both sides up to the same position. Emit & reset.
+ addReplace(pending_aLength + aLength, pending_cLength + cLength);
+ pending_aLength = pending_cLength = 0;
+ ab_bLength = bc_bLength = 0;
+ continue;
+ }
+ }
+ // Accumulate the a->c change, reset the shorter side,
+ // keep a remainder of the longer one.
+ pending_aLength += aLength;
+ pending_cLength += cLength;
+ if (ab_bLength < bc_bLength) {
+ bc_bLength -= ab_bLength;
+ cLength = ab_bLength = 0;
+ } else { // ab_bLength > bc_bLength
+ ab_bLength -= bc_bLength;
+ aLength = bc_bLength = 0;
+ }
+ }
+ // Flush a change that was still being accumulated when both iterators ended.
+ if (pending_aLength != 0 || pending_cLength != 0) {
+ addReplace(pending_aLength, pending_cLength);
+ }
+ return this;
+ }
}
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java
index 1df35243816..a0c3dee246c 100644
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java
@@ -13,6 +13,7 @@ package com.ibm.icu.dev.test.lang;
import java.io.BufferedReader;
import java.util.ArrayList;
+import java.util.Collections;
import java.util.List;
import java.util.Locale;
@@ -777,6 +778,88 @@ public final class UCharacterCaseTest extends TestFmwk
}
}
+ /**
+ * Formats the iterator's current edit for comparison in test messages:
+ * "old->new" when the edit is a change, "old=new" when it is unchanged.
+ */
+ private static String printOneEdit(Edits.Iterator ei) {
+ // Pick the separator first so hasChange() is still queried before the lengths.
+ String separator = ei.hasChange() ? "->" : "=";
+ return "" + ei.oldLength() + separator + ei.newLength();
+ }
+
+ /**
+ * Maps a destination index to a source index according to the expected edits.
+ * A destination index can occur multiple times when there are source deletions.
+ * Map according to the last occurrence, normally in a non-empty destination span.
+ * Simplest is to search from the back.
+ *
+ * @param expected the expected edits, in string order
+ * @param srcLength the total source length covered by the expected edits
+ * @param destLength the total destination length covered by the expected edits
+ * @param index the destination index to map
+ * @return the source index for the span start, the span end (inside a change),
+ * or the 1:1 offset (inside an unchanged span); srcLength if index is
+ * at or beyond destLength
+ */
+ private static int srcIndexFromDest(
+ EditChange[] expected, int srcLength, int destLength, int index) {
+ // srcIndex/destIndex are the limits of the span currently being examined.
+ int srcIndex = srcLength;
+ int destIndex = destLength;
+ int i = expected.length;
+ while (index < destIndex && i > 0) {
+ --i;
+ int prevSrcIndex = srcIndex - expected[i].oldLength;
+ int prevDestIndex = destIndex - expected[i].newLength;
+ if (index == prevDestIndex) {
+ // Exactly at the start of this span.
+ return prevSrcIndex;
+ } else if (index > prevDestIndex) {
+ if (expected[i].change) {
+ // In a change span, map to its end.
+ return srcIndex;
+ } else {
+ // In an unchanged span, offset within it.
+ return prevSrcIndex + (index - prevDestIndex);
+ }
+ }
+ // index lies before this span: step back to the previous one.
+ srcIndex = prevSrcIndex;
+ destIndex = prevDestIndex;
+ }
+ // index is outside the string.
+ return srcIndex;
+ }
+
+ /**
+ * Maps a source index to a destination index according to the expected edits.
+ * Searches from the back, so that when several spans start at the same
+ * source index the last one in the array determines the result.
+ *
+ * @param expected the expected edits, in string order
+ * @param srcLength the total source length covered by the expected edits
+ * @param destLength the total destination length covered by the expected edits
+ * @param index the source index to map
+ * @return the destination index for the span start, the span end (inside a change),
+ * or the 1:1 offset (inside an unchanged span); destLength if index is
+ * at or beyond srcLength
+ */
+ private static int destIndexFromSrc(
+ EditChange[] expected, int srcLength, int destLength, int index) {
+ // srcIndex/destIndex are the limits of the span currently being examined.
+ int srcIndex = srcLength;
+ int destIndex = destLength;
+ int i = expected.length;
+ while (index < srcIndex && i > 0) {
+ --i;
+ int prevSrcIndex = srcIndex - expected[i].oldLength;
+ int prevDestIndex = destIndex - expected[i].newLength;
+ if (index == prevSrcIndex) {
+ // Exactly at the start of this span.
+ return prevDestIndex;
+ } else if (index > prevSrcIndex) {
+ if (expected[i].change) {
+ // In a change span, map to its end.
+ return destIndex;
+ } else {
+ // In an unchanged span, offset within it.
+ return prevDestIndex + (index - prevSrcIndex);
+ }
+ }
+ // index lies before this span: step back to the previous one.
+ srcIndex = prevSrcIndex;
+ destIndex = prevDestIndex;
+ }
+ // index is outside the string.
+ return destIndex;
+ }
+
+ /**
+ * Asserts that two Edits objects yield the same sequence of fine-grained edits.
+ * Both iterators are advanced in lockstep; after each step the next() results
+ * and the printed form of the current edits are compared. Iteration stops
+ * after the first step at which either iterator reports its end.
+ */
+ private void checkEqualEdits(String name, Edits e1, Edits e2) {
+ Edits.Iterator first = e1.getFineIterator();
+ Edits.Iterator second = e2.getFineIterator();
+ int position = 0;
+ boolean firstHasNext;
+ boolean secondHasNext;
+ do {
+ firstHasNext = first.next();
+ secondHasNext = second.next();
+ assertEquals(name + " next()[" + position + "]", firstHasNext, secondHasNext);
+ assertEquals(name + " edit[" + position + "]", printOneEdit(first), printOneEdit(second));
+ ++position;
+ } while (firstHasNext && secondHasNext);
+ }
+
private static void checkEditsIter(
String name, Edits.Iterator ei1, Edits.Iterator ei2, // two equal iterators
EditChange[] expected, boolean withUnchanged) {
@@ -786,8 +869,6 @@ public final class UCharacterCaseTest extends TestFmwk
int expSrcIndex = 0;
int expDestIndex = 0;
int expReplIndex = 0;
- int expSrcIndexFromDest = 0; // for sourceIndexFromDestinationIndex()
- int expDestIndexFromSrc = 0; // for destinationIndexFromSourceIndex()
for (int expIndex = 0; expIndex < expected.length; ++expIndex) {
EditChange expect = expected[expIndex];
String msg = name + ' ' + expIndex;
@@ -801,7 +882,7 @@ public final class UCharacterCaseTest extends TestFmwk
assertEquals(msg, expReplIndex, ei1.replacementIndex());
}
- if (expect.oldLength > 0 && expDestIndex == expDestIndexFromSrc) {
+ if (expect.oldLength > 0) {
assertTrue(msg, ei2.findSourceIndex(expSrcIndex));
assertEquals(msg, expect.change, ei2.hasChange());
assertEquals(msg, expect.oldLength, ei2.oldLength());
@@ -817,7 +898,7 @@ public final class UCharacterCaseTest extends TestFmwk
}
}
- if (expect.newLength > 0 && expSrcIndex == expSrcIndexFromDest) {
+ if (expect.newLength > 0) {
assertTrue(msg, ei2.findDestinationIndex(expDestIndex));
assertEquals(msg, expect.change, ei2.hasChange());
assertEquals(msg, expect.oldLength, ei2.oldLength());
@@ -833,45 +914,11 @@ public final class UCharacterCaseTest extends TestFmwk
}
}
- // Span starts.
- assertEquals(name, expDestIndexFromSrc,
- ei2.destinationIndexFromSourceIndex(expSrcIndex));
- assertEquals(name, expSrcIndexFromDest,
- ei2.sourceIndexFromDestinationIndex(expDestIndex));
-
- // Inside unchanged span map offsets 1:1.
- if (!expect.change && expect.oldLength >= 2) {
- assertEquals(name, expDestIndex + 1,
- ei2.destinationIndexFromSourceIndex(expSrcIndex + 1));
- assertEquals(name, expSrcIndex + 1,
- ei2.sourceIndexFromDestinationIndex(expDestIndex + 1));
- }
-
- // Inside change span map to the span limit.
- int expSrcLimit = expSrcIndex + expect.oldLength;
- int expDestLimit = expDestIndex + expect.newLength;
- if (expect.change) {
- if (expect.oldLength >= 2) {
- assertEquals(name, expDestLimit,
- ei2.destinationIndexFromSourceIndex(expSrcIndex + 1));
- }
- if (expect.newLength >= 2) {
- assertEquals(name, expSrcLimit,
- ei2.sourceIndexFromDestinationIndex(expDestIndex + 1));
- }
- }
-
- expSrcIndex = expSrcLimit;
- expDestIndex = expDestLimit;
+ expSrcIndex += expect.oldLength;
+ expDestIndex += expect.newLength;
if (expect.change) {
expReplIndex += expect.newLength;
}
- if (expect.newLength > 0) {
- expSrcIndexFromDest = expSrcIndex;
- }
- if (expect.oldLength > 0) {
- expDestIndexFromSrc = expDestIndex;
- }
}
String msg = name + " end";
assertFalse(msg, ei1.next());
@@ -884,8 +931,49 @@ public final class UCharacterCaseTest extends TestFmwk
assertFalse(name, ei2.findSourceIndex(expSrcIndex));
assertFalse(name, ei2.findDestinationIndex(expDestIndex));
- assertEquals(name, expDestIndex, ei2.destinationIndexFromSourceIndex(expSrcIndex));
- assertEquals(name, expSrcIndex, ei2.sourceIndexFromDestinationIndex(expDestIndex));
+
+ // Check mapping of all indexes against a simple implementation
+ // that works on the expected changes.
+ // Iterate once forward, once backward, to cover more runtime conditions.
+ int srcLength = expSrcIndex;
+ int destLength = expDestIndex;
+ List