diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/Edits.java b/icu4j/main/classes/core/src/com/ibm/icu/text/Edits.java index 9f413cf9fd3..5dc36c87115 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/Edits.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/Edits.java @@ -409,12 +409,7 @@ public final class Edits { spanStart = destIndex; spanLength = newLength_; } - // If we are at the start or limit of an empty span, then we search from - // the start of the string so that we always return - // the first of several consecutive empty spans, for consistent results. - // We do not currently track the properties of the previous span, - // so for now we always reset if we are at the start of the current span. - if (i <= spanStart) { + if (i < spanStart) { // Reset the iterator to the start. index = remaining = oldLength_ = newLength_ = srcIndex = replIndex = destIndex = 0; } else if (i < (spanStart + spanLength)) { @@ -429,8 +424,8 @@ public final class Edits { spanStart = destIndex; spanLength = newLength_; } - if (i == spanStart || i < (spanStart + spanLength)) { - // The index is in the current span, or at an empty one. + if (i < (spanStart + spanLength)) { + // The index is in the current span. return 0; } if (remaining > 0) { @@ -615,4 +610,167 @@ public final class Edits { public Iterator getFineIterator() { return new Iterator(array, length, false, false); } + + /** + * Merges the two input Edits and appends the result to this object. + * + *

Consider two string transformations (for example, normalization and case mapping) + * where each records Edits in addition to writing an output string.
+ * Edits ab reflect how substrings of input string a + * map to substrings of intermediate string b.
+ * Edits bc reflect how substrings of intermediate string b + * map to substrings of output string c.
+ * This function merges ab and bc such that the additional edits + * recorded in this object reflect how substrings of input string a + * map to substrings of output string c. + * + *

If unrelated Edits are passed in where the output string of the first + * has a different length than the input string of the second, + * then an IllegalArgumentException is thrown. + * + * @param ab reflects how substrings of input string a + * map to substrings of intermediate string b. + * @param bc reflects how substrings of intermediate string b + * map to substrings of output string c. + * @return this, with the merged edits appended + * @draft ICU 60 + * @provisional This API might change or be removed in a future release. + */ + public Edits mergeAndAppend(Edits ab, Edits bc) { + // Picture string a --(Edits ab)--> string b --(Edits bc)--> string c. + // Parallel iteration over both Edits. + Iterator abIter = ab.getFineIterator(); + Iterator bcIter = bc.getFineIterator(); + boolean abHasNext = true, bcHasNext = true; + // Copy iterator state into local variables, so that we can modify and subdivide spans. + // ab old & new length, bc old & new length + int aLength = 0, ab_bLength = 0, bc_bLength = 0, cLength = 0; + // When we have different-intermediate-length changes, we accumulate a larger change. + int pending_aLength = 0, pending_cLength = 0; + for (;;) { + // At this point, for each of the two iterators: + // Either we are done with the locally cached current edit, + // and its intermediate-string length has been reset, + // or we will continue to work with a truncated remainder of this edit. + // + // If the current edit is done, and the iterator has not yet reached the end, + // then we fetch the next edit. This is true for at least one of the iterators. + // + // Normally it does not matter whether we fetch from ab and then bc or vice versa. + // However, the result is observably different when + // ab deletions meet bc insertions at the same intermediate-string index. + // Some users expect the bc insertions to come first, so we fetch from bc first. + if (bc_bLength == 0) { + if (bcHasNext && (bcHasNext = bcIter.next())) { + bc_bLength = bcIter.oldLength(); + cLength = bcIter.newLength(); + if (bc_bLength == 0) { + // insertion + if (ab_bLength == 0 || !abIter.hasChange()) { + addReplace(pending_aLength, pending_cLength + cLength); + pending_aLength = pending_cLength = 0; + } else { + pending_cLength += cLength; + } + continue; + } + } + // else see if the other iterator is done, too. + } + if (ab_bLength == 0) { + if (abHasNext && (abHasNext = abIter.next())) { + aLength = abIter.oldLength(); + ab_bLength = abIter.newLength(); + if (ab_bLength == 0) { + // deletion + if (bc_bLength == bcIter.oldLength() || !bcIter.hasChange()) { + addReplace(pending_aLength + aLength, pending_cLength); + pending_aLength = pending_cLength = 0; + } else { + pending_aLength += aLength; + } + continue; + } + } else if (bc_bLength == 0) { + // Both iterators are done at the same time: + // The intermediate-string lengths match. + break; + } else { + throw new IllegalArgumentException( + "The ab output string is shorter than the bc input string."); + } + } + if (bc_bLength == 0) { + throw new IllegalArgumentException( + "The bc input string is shorter than the ab output string."); + } + // Done fetching: ab_bLength > 0 && bc_bLength > 0 + + // The current state has two parts: + // - Past: We accumulate a longer ac edit in the "pending" variables. + // - Current: We have copies of the current ab/bc edits in local variables. + // At least one side is newly fetched. + // One side might be a truncated remainder of an edit we fetched earlier. + + if (!abIter.hasChange() && !bcIter.hasChange()) { + // An unchanged span all the way from string a to string c. + if (pending_aLength != 0 || pending_cLength != 0) { + addReplace(pending_aLength, pending_cLength); + pending_aLength = pending_cLength = 0; + } + int unchangedLength = aLength <= cLength ? aLength : cLength; + addUnchanged(unchangedLength); + ab_bLength = aLength -= unchangedLength; + bc_bLength = cLength -= unchangedLength; + // At least one of the unchanged spans is now empty. + continue; + } + if (!abIter.hasChange() && bcIter.hasChange()) { + // Unchanged a->b but changed b->c. + if (ab_bLength >= bc_bLength) { + // Split the longer unchanged span into change + remainder. + addReplace(pending_aLength + bc_bLength, pending_cLength + cLength); + pending_aLength = pending_cLength = 0; + aLength = ab_bLength -= bc_bLength; + bc_bLength = 0; + continue; + } + // Handle the shorter unchanged span below like a change. + } else if (abIter.hasChange() && !bcIter.hasChange()) { + // Changed a->b and then unchanged b->c. + if (ab_bLength <= bc_bLength) { + // Split the longer unchanged span into change + remainder. + addReplace(pending_aLength + aLength, pending_cLength + ab_bLength); + pending_aLength = pending_cLength = 0; + cLength = bc_bLength -= ab_bLength; + ab_bLength = 0; + continue; + } + // Handle the shorter unchanged span below like a change. + } else { // both abIter.hasChange() && bcIter.hasChange() + if (ab_bLength == bc_bLength) { + // Changes on both sides up to the same position. Emit & reset. + addReplace(pending_aLength + aLength, pending_cLength + cLength); + pending_aLength = pending_cLength = 0; + ab_bLength = bc_bLength = 0; + continue; + } + } + // Accumulate the a->c change, reset the shorter side, + // keep a remainder of the longer one. + pending_aLength += aLength; + pending_cLength += cLength; + if (ab_bLength < bc_bLength) { + bc_bLength -= ab_bLength; + cLength = ab_bLength = 0; + } else { // ab_bLength > bc_bLength + ab_bLength -= bc_bLength; + aLength = bc_bLength = 0; + } + } + if (pending_aLength != 0 || pending_cLength != 0) { + addReplace(pending_aLength, pending_cLength); + } + return this; + } } diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java index 1df35243816..a0c3dee246c 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java @@ -13,6 +13,7 @@ package com.ibm.icu.dev.test.lang; import java.io.BufferedReader; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import java.util.Locale; @@ -777,6 +778,88 @@ public final class UCharacterCaseTest extends TestFmwk } } + private static String printOneEdit(Edits.Iterator ei) { + if (ei.hasChange()) { + return "" + ei.oldLength() + "->" + ei.newLength(); + } else { + return "" + ei.oldLength() + "=" + ei.newLength(); + } + } + + /** + * Maps indexes according to the expected edits. + * A destination index can occur multiple times when there are source deletions. + * Map according to the last occurrence, normally in a non-empty destination span. + * Simplest is to search from the back. + */ + private static int srcIndexFromDest( + EditChange expected[], int srcLength, int destLength, int index) { + int srcIndex = srcLength; + int destIndex = destLength; + int i = expected.length; + while (index < destIndex && i > 0) { + --i; + int prevSrcIndex = srcIndex - expected[i].oldLength; + int prevDestIndex = destIndex - expected[i].newLength; + if (index == prevDestIndex) { + return prevSrcIndex; + } else if (index > prevDestIndex) { + if (expected[i].change) { + // In a change span, map to its end. + return srcIndex; + } else { + // In an unchanged span, offset within it. + return prevSrcIndex + (index - prevDestIndex); + } + } + srcIndex = prevSrcIndex; + destIndex = prevDestIndex; + } + // index is outside the string. + return srcIndex; + } + + private static int destIndexFromSrc( + EditChange expected[], int srcLength, int destLength, int index) { + int srcIndex = srcLength; + int destIndex = destLength; + int i = expected.length; + while (index < srcIndex && i > 0) { + --i; + int prevSrcIndex = srcIndex - expected[i].oldLength; + int prevDestIndex = destIndex - expected[i].newLength; + if (index == prevSrcIndex) { + return prevDestIndex; + } else if (index > prevSrcIndex) { + if (expected[i].change) { + // In a change span, map to its end. + return destIndex; + } else { + // In an unchanged span, offset within it. + return prevDestIndex + (index - prevSrcIndex); + } + } + srcIndex = prevSrcIndex; + destIndex = prevDestIndex; + } + // index is outside the string. + return destIndex; + } + + private void checkEqualEdits(String name, Edits e1, Edits e2) { + Edits.Iterator ei1 = e1.getFineIterator(); + Edits.Iterator ei2 = e2.getFineIterator(); + for (int i = 0;; ++i) { + boolean ei1HasNext = ei1.next(); + boolean ei2HasNext = ei2.next(); + assertEquals(name + " next()[" + i + "]", ei1HasNext, ei2HasNext); + assertEquals(name + " edit[" + i + "]", printOneEdit(ei1), printOneEdit(ei2)); + if (!ei1HasNext || !ei2HasNext) { + break; + } + } + } + private static void checkEditsIter( String name, Edits.Iterator ei1, Edits.Iterator ei2, // two equal iterators EditChange[] expected, boolean withUnchanged) { @@ -786,8 +869,6 @@ public final class UCharacterCaseTest extends TestFmwk int expSrcIndex = 0; int expDestIndex = 0; int expReplIndex = 0; - int expSrcIndexFromDest = 0; // for sourceIndexFromDestinationIndex() - int expDestIndexFromSrc = 0; // for destinationIndexFromSourceIndex() for (int expIndex = 0; expIndex < expected.length; ++expIndex) { EditChange expect = expected[expIndex]; String msg = name + ' ' + expIndex; @@ -801,7 +882,7 @@ public final class UCharacterCaseTest extends TestFmwk assertEquals(msg, expReplIndex, ei1.replacementIndex()); } - if (expect.oldLength > 0 && expDestIndex == expDestIndexFromSrc) { + if (expect.oldLength > 0) { assertTrue(msg, ei2.findSourceIndex(expSrcIndex)); assertEquals(msg, expect.change, ei2.hasChange()); assertEquals(msg, expect.oldLength, ei2.oldLength()); @@ -817,7 +898,7 @@ public final class UCharacterCaseTest extends TestFmwk } } - if (expect.newLength > 0 && expSrcIndex == expSrcIndexFromDest) { + if (expect.newLength > 0) { assertTrue(msg, ei2.findDestinationIndex(expDestIndex)); assertEquals(msg, expect.change, ei2.hasChange()); assertEquals(msg, expect.oldLength, ei2.oldLength()); @@ -833,45 +914,11 @@ public final class UCharacterCaseTest extends TestFmwk } } - // Span starts. - assertEquals(name, expDestIndexFromSrc, - ei2.destinationIndexFromSourceIndex(expSrcIndex)); - assertEquals(name, expSrcIndexFromDest, - ei2.sourceIndexFromDestinationIndex(expDestIndex)); - - // Inside unchanged span map offsets 1:1. - if (!expect.change && expect.oldLength >= 2) { - assertEquals(name, expDestIndex + 1, - ei2.destinationIndexFromSourceIndex(expSrcIndex + 1)); - assertEquals(name, expSrcIndex + 1, - ei2.sourceIndexFromDestinationIndex(expDestIndex + 1)); - } - - // Inside change span map to the span limit. - int expSrcLimit = expSrcIndex + expect.oldLength; - int expDestLimit = expDestIndex + expect.newLength; - if (expect.change) { - if (expect.oldLength >= 2) { - assertEquals(name, expDestLimit, - ei2.destinationIndexFromSourceIndex(expSrcIndex + 1)); - } - if (expect.newLength >= 2) { - assertEquals(name, expSrcLimit, - ei2.sourceIndexFromDestinationIndex(expDestIndex + 1)); - } - } - - expSrcIndex = expSrcLimit; - expDestIndex = expDestLimit; + expSrcIndex += expect.oldLength; + expDestIndex += expect.newLength; if (expect.change) { expReplIndex += expect.newLength; } - if (expect.newLength > 0) { - expSrcIndexFromDest = expSrcIndex; - } - if (expect.oldLength > 0) { - expDestIndexFromSrc = expDestIndex; - } } String msg = name + " end"; assertFalse(msg, ei1.next()); @@ -884,8 +931,49 @@ public final class UCharacterCaseTest extends TestFmwk assertFalse(name, ei2.findSourceIndex(expSrcIndex)); assertFalse(name, ei2.findDestinationIndex(expDestIndex)); - assertEquals(name, expDestIndex, ei2.destinationIndexFromSourceIndex(expSrcIndex)); - assertEquals(name, expSrcIndex, ei2.sourceIndexFromDestinationIndex(expDestIndex)); + + // Check mapping of all indexes against a simple implementation + // that works on the expected changes. + // Iterate once forward, once backward, to cover more runtime conditions. + int srcLength = expSrcIndex; + int destLength = expDestIndex; + List srcIndexes = new ArrayList(); + List destIndexes = new ArrayList(); + srcIndexes.add(-1); + destIndexes.add(-1); + int srcIndex = 0; + int destIndex = 0; + for (int i = 0; i < expected.length; ++i) { + if (expected[i].oldLength > 0) { + srcIndexes.add(srcIndex); + if (expected[i].oldLength > 1) { + srcIndexes.add(srcIndex + 1); + } + } + if (expected[i].newLength > 0) { + destIndexes.add(destIndex); + if (expected[i].newLength > 0) { + destIndexes.add(destIndex + 1); + } + } + srcIndex += expected[i].oldLength; + destIndex += expected[i].newLength; + } + srcIndexes.add(srcLength); + destIndexes.add(destLength); + srcIndexes.add(srcLength + 1); + destIndexes.add(destLength + 1); + Collections.reverse(destIndexes); + for (int i : srcIndexes) { + assertEquals(name + " destIndexFromSrc(" + i + "):", + destIndexFromSrc(expected, srcLength, destLength, i), + ei2.destinationIndexFromSourceIndex(i)); + } + for (int i : destIndexes) { + assertEquals(name + " srcIndexFromDest(" + i + "):", + srcIndexFromDest(expected, srcLength, destLength, i), + ei2.sourceIndexFromDestinationIndex(i)); + } } @Test @@ -949,6 +1037,167 @@ public final class UCharacterCaseTest extends TestFmwk assertFalse("reset then iterator", ei.next()); } + @Test + public void TestMergeEdits() { + Edits ab = new Edits(), bc = new Edits(), ac = new Edits(), expected_ac = new Edits(); + + // Simple: Two parallel non-changes. + ab.addUnchanged(2); + bc.addUnchanged(2); + expected_ac.addUnchanged(2); + + // Simple: Two aligned changes. + ab.addReplace(3, 2); + bc.addReplace(2, 1); + expected_ac.addReplace(3, 1); + + // Unequal non-changes. + ab.addUnchanged(5); + bc.addUnchanged(3); + expected_ac.addUnchanged(3); + // ab ahead by 2 + + // Overlapping changes accumulate until they share a boundary. + ab.addReplace(4, 3); + bc.addReplace(3, 2); + ab.addReplace(4, 3); + bc.addReplace(3, 2); + ab.addReplace(4, 3); + bc.addReplace(3, 2); + bc.addUnchanged(4); + expected_ac.addReplace(14, 8); + // bc ahead by 2 + + // Balance out intermediate-string lengths. + ab.addUnchanged(2); + expected_ac.addUnchanged(2); + + // Insert something and delete it: Should disappear. + ab.addReplace(0, 5); + ab.addReplace(0, 2); + bc.addReplace(7, 0); + + // Parallel change to make a new boundary. + ab.addReplace(1, 2); + bc.addReplace(2, 3); + expected_ac.addReplace(1, 3); + + // Multiple ab deletions should remain separate at the boundary. + ab.addReplace(1, 0); + ab.addReplace(2, 0); + ab.addReplace(3, 0); + expected_ac.addReplace(1, 0); + expected_ac.addReplace(2, 0); + expected_ac.addReplace(3, 0); + + // Unequal non-changes can be split for another boundary. + ab.addUnchanged(2); + bc.addUnchanged(1); + expected_ac.addUnchanged(1); + // ab ahead by 1 + + // Multiple bc insertions should create a boundary and remain separate. + bc.addReplace(0, 4); + bc.addReplace(0, 5); + bc.addReplace(0, 6); + expected_ac.addReplace(0, 4); + expected_ac.addReplace(0, 5); + expected_ac.addReplace(0, 6); + // ab ahead by 1 + + // Multiple ab deletions in the middle of a bc change are merged. + bc.addReplace(2, 2); + // bc ahead by 1 + ab.addReplace(1, 0); + ab.addReplace(2, 0); + ab.addReplace(3, 0); + ab.addReplace(4, 1); + expected_ac.addReplace(11, 2); + + // Multiple bc insertions in the middle of an ab change are merged. + ab.addReplace(5, 6); + bc.addReplace(3, 3); + // ab ahead by 3 + bc.addReplace(0, 4); + bc.addReplace(0, 5); + bc.addReplace(0, 6); + bc.addReplace(3, 7); + expected_ac.addReplace(5, 25); + + // Delete around a deletion. + ab.addReplace(4, 4); + ab.addReplace(3, 0); + ab.addUnchanged(2); + bc.addReplace(2, 2); + bc.addReplace(4, 0); + expected_ac.addReplace(9, 2); + + // Insert into an insertion. + ab.addReplace(0, 2); + bc.addReplace(1, 1); + bc.addReplace(0, 8); + bc.addUnchanged(4); + expected_ac.addReplace(0, 10); + // bc ahead by 3 + + // Balance out intermediate-string lengths. + ab.addUnchanged(3); + expected_ac.addUnchanged(3); + + // Deletions meet insertions. + // Output order is arbitrary in principle, but we expect insertions first + // and want to keep it that way. + ab.addReplace(2, 0); + ab.addReplace(4, 0); + ab.addReplace(6, 0); + bc.addReplace(0, 1); + bc.addReplace(0, 3); + bc.addReplace(0, 5); + expected_ac.addReplace(0, 1); + expected_ac.addReplace(0, 3); + expected_ac.addReplace(0, 5); + expected_ac.addReplace(2, 0); + expected_ac.addReplace(4, 0); + expected_ac.addReplace(6, 0); + + // End with a non-change, so that further edits are never reordered. + ab.addUnchanged(1); + bc.addUnchanged(1); + expected_ac.addUnchanged(1); + + ac.mergeAndAppend(ab, bc); + checkEqualEdits("ab+bc", expected_ac, ac); + + // Append more Edits. + Edits ab2 = new Edits(), bc2 = new Edits(); + ab2.addUnchanged(5); + bc2.addReplace(1, 2); + bc2.addUnchanged(4); + expected_ac.addReplace(1, 2); + expected_ac.addUnchanged(4); + ac.mergeAndAppend(ab2, bc2); + checkEqualEdits("ab2+bc2", expected_ac, ac); + + // Append empty edits. + Edits empty = new Edits(); + ac.mergeAndAppend(empty, empty); + checkEqualEdits("empty+empty", expected_ac, ac); + + // Error: Append more edits with mismatched intermediate-string lengths. + Edits mismatch = new Edits(); + mismatch.addReplace(1, 1); + try { + ac.mergeAndAppend(ab2, mismatch); + fail("ab2+mismatch did not yield IllegalArgumentException"); + } catch (IllegalArgumentException expected) { + } + try { + ac.mergeAndAppend(mismatch, bc2); + fail("mismatch+bc2 did not yield IllegalArgumentException"); + } catch (IllegalArgumentException expected) { + } + } + @Test public void TestCaseMapWithEdits() { StringBuilder sb = new StringBuilder();