ICU-22845 Better iterations for the ICU4J UnicodeSet

This commit is contained in:
Mihai Nita 2024-08-08 22:17:31 +00:00 committed by Mihai Nita
parent 66ba09973a
commit b5b3e16afa
2 changed files with 400 additions and 8 deletions

View file

@ -17,7 +17,12 @@ import java.util.Collections;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.SortedSet;
import java.util.Spliterator;
import java.util.TreeSet;
import java.util.function.IntConsumer;
import java.util.stream.IntStream;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import com.ibm.icu.impl.BMPSet;
import com.ibm.icu.impl.CharacterPropertiesImpl;
@ -278,14 +283,22 @@ import com.ibm.icu.util.VersionInfo;
* </tr>
* </table>
* </blockquote>
* <p>To iterate over contents of UnicodeSet, the following are available:
* <ul><li>{@link #ranges()} to iterate through the ranges</li>
* <li>{@link #strings()} to iterate through the strings</li>
* <li>{@link #iterator()} to iterate through the entire contents in a single loop.
* That method is, however, not particularly efficient, since it "boxes" each code point into a String.
*
* <p>To iterate over contents of {@code UnicodeSet}, the following are available:
* <ul>
* <li>to iterate over the ranges: {@link #ranges()}, {@link #rangeStream()}</li>
* <li>to iterate over the strings: {@link #strings()}, {@link #stringStream()}</li>
* <li>to iterate over the code points: {@link #codePoints()}, {@link #codePointStream()}</li>
* <li>to iterate over the entire contents in a single loop: this class itself is {@link Iterable},
* or use {@link #stream()}.<br>
* All of these methods are, however, not particularly efficient,
* since they convert each individual code point to a {@code String}.
* </ul>
* All of the above can be used in <b>for</b> loops.
* The {@link com.ibm.icu.text.UnicodeSetIterator UnicodeSetIterator} can also be used, but not in <b>for</b> loops.
*
* <p>The iterator and stream methods work as expected in idiomatic Java usage.<br>
* The {@link UnicodeSetIterator} cannot be used in <b>for</b> loops, and it is not very Java-idiomatic, because it is old.
* But it might be faster in certain use cases. We recommend that you measure in performance sensitive code.<br>
*
* <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
*
* @author Alan Liu
@ -5127,5 +5140,221 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
CharacterPropertiesImpl.clear();
XSYMBOL_TABLE = xSymbolTable;
}
/**
 * Streams the code point ranges of this {@code UnicodeSet} as {@link EntryRange} values.
 * This is the stream counterpart of {@link #ranges()}.
 *
 * <p><b>Warnings:</b>
 * <ul>
 * <li>One single {@link EntryRange} instance is delivered for every element; only its contents
 *     are reset. Use each range while it is being delivered and do not keep references to it
 *     (for example by collecting the stream into a list).
 * <li>Strings are not part of this stream; to cover the full contents, iterate over the
 *     strings as well.
 * <li>For speed, {@code UnicodeSet} iteration does not check for concurrent modification.<br>
 *     Do not alter the {@code UnicodeSet} while iterating.
 * </ul>
 *
 * @return a {@link Stream} of {@link EntryRange}
 *
 * @draft ICU 76
 */
public Stream<EntryRange> rangeStream() {
    // Deliberately created as a non-parallel stream: the underlying iterator always
    // returns the same EntryRange object, so elements must be consumed one at a time.
    final boolean parallel = false;
    return StreamSupport.stream(ranges().spliterator(), parallel);
}
/**
 * Streams the strings (as opposed to the individual code points) of this {@code UnicodeSet}.
 * This is the stream counterpart of {@link #strings()}.
 *
 * <p><b>Warnings:</b>
 * <ul>
 * <li>Code points are not part of this stream; to cover the full contents, iterate over the
 *     ranges or the code points as well.
 * <li>For speed, {@code UnicodeSet} iteration does not check for concurrent modification.<br>
 *     Do not alter the {@code UnicodeSet} while iterating.
 * </ul>
 *
 * @return a {@link Stream} of {@code String}
 *
 * @draft ICU 76
 */
public Stream<String> stringStream() {
    final Stream<String> stringsOnly = strings().stream();
    return stringsOnly;
}
/**
 * Streams the code points of this {@code UnicodeSet} as primitive {@code int} values,
 * so no boxing is involved.
 *
 * <p><b>Warnings:</b>
 * <ul>
 * <li>Strings are not part of this stream; to cover the full contents, iterate over the
 *     strings as well.
 * <li>For speed, {@code UnicodeSet} iteration does not check for concurrent modification.<br>
 *     Do not alter the {@code UnicodeSet} while iterating.
 * </ul>
 *
 * @return an {@link IntStream} of Unicode code point values
 *
 * @draft ICU 76
 */
public IntStream codePointStream() {
    final Spliterator.OfInt codePointSource = new CodePointSpliterator(this);
    return StreamSupport.intStream(codePointSource, false);
}
/**
 * Returns a stream over the entire contents of this {@code UnicodeSet}, with every element
 * delivered as a {@code String}.
 *
 * <p>The stream is built on this set's own {@link #spliterator()}, so it delivers the same
 * elements as iterating over this {@code UnicodeSet} directly.
 *
 * <p><b>Warnings:</b>
 * <ul>
 * <li>This is a convenience method, but it is not particularly efficient, because each
 *     individual code point is converted to a {@code String}.<br>
 *     When only code points are needed, prefer {@link #codePointStream()}.
 * <li>For speed, {@code UnicodeSet} iteration does not check for concurrent modification.<br>
 *     Do not alter the {@code UnicodeSet} while iterating.
 * </ul>
 *
 * @return a {@link Stream} of {@code String}
 *
 * @draft ICU 76
 */
public Stream<String> stream() {
    return StreamSupport.stream(spliterator(), false);
}
/**
 * Returns an {@link Iterable} over all the code points in this set, suitable for use in
 * an enhanced <b>for</b> loop.
 *
 * <p><b>Warnings:</b>
 * <ul>
 * <li>This is a convenience method, but comes with a performance penalty
 *     because every {@code int} code point is boxed into an {@code Integer}.<br>
 *     For an efficient but old alternative use {@link UnicodeSetIterator#next()}.
 * <li>Strings are not covered; to iterate over the full contents, iterate over the
 *     strings as well.
 * <li>For speed, {@code UnicodeSet} iteration does not check for concurrent modification.<br>
 *     Do not alter the {@code UnicodeSet} while iterating.
 * </ul>
 *
 * @return an {@link Iterable} over all the code points
 *
 * @draft ICU 76
 */
public Iterable<Integer> codePoints() {
    final UnicodeSet source = this;
    return new CodePointIterable(source);
}
private class CodePointIterable implements Iterable<Integer> {
private final UnicodeSet unicodeSet;
CodePointIterable(UnicodeSet unicodeSet) {
this.unicodeSet = unicodeSet;
}
@Override
public Iterator<Integer> iterator() {
return new CodePointIterator(unicodeSet);
}
}
private class CodePointIterator implements Iterator<Integer> {
private final CodePointIteratorInt cpi;
CodePointIterator(UnicodeSet unicodeSet) {
cpi = new CodePointIteratorInt(unicodeSet);
}
@Override
public boolean hasNext() {
return cpi.hasNext();
}
@Override
public Integer next() {
return cpi.next();
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
}
/**
 * {@link Spliterator.OfInt} over the code points of a {@code UnicodeSet}, used to build
 * the {@code IntStream} of code points. It never splits.
 */
private static class CodePointSpliterator implements Spliterator.OfInt {
    private static final int CHARACTERISTICS =
            Spliterator.SIZED | Spliterator.ORDERED | Spliterator.DISTINCT | Spliterator.NONNULL;

    private final UnicodeSet unicodeSet;
    private final CodePointIteratorInt cpi;

    /**
     * @param unicodeSet the set whose code points will be traversed
     */
    CodePointSpliterator(UnicodeSet unicodeSet) {
        this.unicodeSet = unicodeSet;
        this.cpi = new CodePointIteratorInt(unicodeSet);
    }

    /**
     * Passes the next code point, if any, to {@code action}.
     *
     * @return {@code true} if a code point was consumed, {@code false} if none remain
     * @throws NullPointerException if {@code action} is {@code null}
     */
    @Override
    public boolean tryAdvance(IntConsumer action) {
        // The null check comes first so that a null action is rejected even when exhausted.
        if (action == null) {
            throw new NullPointerException();
        }
        if (!cpi.hasNext()) {
            return false;
        }
        action.accept(cpi.next());
        return true;
    }

    /**
     * Always declines to split.
     *
     * <p>From the {@link Spliterator#trySplit()} documentation:
     * "This method may return null for any reason, including emptiness, inability to split
     * after traversal has commenced, data structure constraints, and efficiency considerations."
     */
    @Override
    public Spliterator.OfInt trySplit() {
        return null;
    }

    /**
     * @return the set's {@code size()} minus the number of its strings
     */
    @Override
    public long estimateSize() {
        final int total = unicodeSet.size();
        final int stringCount = unicodeSet.strings.size();
        return total - stringCount;
    }

    /**
     * @return the base characteristics, plus {@link Spliterator#IMMUTABLE} when the set is frozen
     */
    @Override
    public int characteristics() {
        int flags = CHARACTERISTICS;
        if (unicodeSet.isFrozen()) {
            flags |= Spliterator.IMMUTABLE;
        }
        return flags;
    }
}
/**
 * Code point cursor working on the primitive {@code int}. It is the shared engine behind
 * both the {@code Iterator<Integer>} (boxed values) and the {@code Spliterator.OfInt}
 * (primitive values).
 *
 * <p>It has the shape of an {@code Iterator<Integer>} ({@code hasNext()} / {@code next()}),
 * but because {@code next()} returns a primitive {@code int} it cannot implement
 * {@code Iterator}.
 *
 * <p>Entries of {@code list} are consumed in pairs: a range start (inclusive) followed by a
 * range limit (exclusive).
 */
private static class CodePointIteratorInt {
    private final int[] list;
    private final int lastRange;
    private int currentRange;
    private int rangeStart;
    private int rangeLimit;

    /**
     * Positions the cursor on the first range of {@code unicodeSet}, if there is one.
     *
     * @param unicodeSet the set whose {@code list} / {@code len} are traversed
     */
    public CodePointIteratorInt(UnicodeSet unicodeSet) {
        list = unicodeSet.list;
        lastRange = unicodeSet.len - 1;
        rangeStart = list[0];
        if (lastRange > 0) { // not an empty list
            rangeLimit = list[1];
            currentRange = 2;
        } else {
            // start == limit gives an empty current range, so hasNext() is false right away
            rangeLimit = rangeStart; // should be HIGH, the guard value
            currentRange = 1;
        }
    }

    /**
     * @return {@code true} while the current range has code points left or another range follows
     */
    public boolean hasNext() {
        final boolean currentRangeDone = rangeStart >= rangeLimit;
        final boolean noMoreRanges = currentRange >= lastRange;
        return !(currentRangeDone && noMoreRanges);
    }

    /**
     * @return the next code point
     * @throws NoSuchElementException if all ranges have been consumed
     */
    public int next() {
        if (rangeStart < rangeLimit) {
            return rangeStart++;
        }
        if (currentRange >= lastRange) {
            throw new NoSuchElementException();
        }
        // Current range is used up: load the next (start, limit) pair.
        rangeStart = list[currentRange++];
        rangeLimit = list[currentRange++];
        return rangeStart++;
    }
}
}
//eof

View file

@ -21,6 +21,7 @@ import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.SortedSet;
import java.util.StringJoiner;
@ -61,6 +62,16 @@ public class UnicodeSetTest extends CoreTestFmwk {
static final String NOT = "%%%%";
// Used to test iterators and streams.
// Each iteration test below loops over all of these sets, so the list mixes boundary
// cases (lowest and highest code points) with one richer set.
final static UnicodeSet[] UNICODE_SETS_TO_ITERATE = {
UnicodeSet.EMPTY,
new UnicodeSet(0, 0), // one code point, zero
new UnicodeSet(0x0000, 0x0010), // from zero
new UnicodeSet(0x10FFFF, 0x10FFFF), // one code point, max
new UnicodeSet(0x10FF00, 0x10FFFF), // end in max
populateUnicodeSet() // patterns, single code points, ranges and strings
};
private static final boolean isCccValue(int ccc) {
switch (ccc) {
case 0:
@ -841,7 +852,7 @@ public class UnicodeSetTest extends CoreTestFmwk {
}
}
}
@Test
public void TestSetRelation() {
@ -3198,4 +3209,156 @@ public class UnicodeSetTest extends CoreTestFmwk {
assertFalse("[:Basic_Emoji:].complement() --> no bicycle", notBasic.contains("🚲"));
}
}
/**
 * Builds the frozen, mixed-content set used by the iteration tests: property patterns,
 * single code points, code point ranges (inside and above the BMP) and strings.
 *
 * @return a frozen {@code UnicodeSet}
 */
private static UnicodeSet populateUnicodeSet() {
    // Trying to cover the most interesting combinations:
    final UnicodeSet unicodeSet = new UnicodeSet();

    // Patterns.
    // applyPattern() replaces the contents of the set, so two consecutive calls would keep
    // only the second property. A single union pattern keeps both.
    unicodeSet.applyPattern("[\\p{sc=Ethi}\\p{Number}]");

    // Single code point
    unicodeSet.add('X'); // adds a code point in the ASCII range
    unicodeSet.add('Σ'); // adds a code point in the Greek block
    unicodeSet.add(0x1F600); // adds a code point above BMP (😀)

    // Code point ranges and from CharSequence
    unicodeSet.add('A', 'F'); // adds a code point range in the ASCII range
    unicodeSet.add('α', 'ζ'); // adds a code point range in the Greek block
    unicodeSet.add(0x1F347, 0x1F353); // adds a code point range above BMP (🍇-🍓)

    // Strings
    unicodeSet.add("world"); // adds string
    unicodeSet.addAll("one", "two", "three"); // adds strings

    return (UnicodeSet) unicodeSet.freeze();
}
/**
 * Checks that {@code codePoints()} and {@code codePointStream()} deliver the same code
 * points, in the same order, as the {@code UnicodeSetIterator} reference.
 */
@Test
public void testIterationMethodsCodepoints() {
    for (UnicodeSet set : UNICODE_SETS_TO_ITERATE) {
        // Reference result: walk the set with UnicodeSetIterator and stop at the first string.
        StringJoiner reference = newResult();
        UnicodeSetIterator refIter = new UnicodeSetIterator(set);
        while (refIter.next()) {
            if (refIter.codepoint == UnicodeSetIterator.IS_STRING) {
                break;
            }
            reference.add(codePointToString(refIter.codepoint));
        }
        final String expected = reference.toString();

        // Boxed iteration
        StringJoiner viaIterable = newResult();
        set.codePoints().forEach(cp -> viaIterable.add(codePointToString(cp)));
        assertEquals("code points :: codePoints", expected, viaIterable.toString());

        // Primitive stream
        StringJoiner viaStream = newResult();
        set.codePointStream()
                .mapToObj(UnicodeSetTest::codePointToString)
                .forEach(viaStream::add);
        assertEquals("code points :: codePointStream", expected, viaStream.toString());
    }
}
/**
 * Checks that {@code rangeStream()} delivers the same ranges, in the same order,
 * as {@code ranges()}.
 */
@Test
public void testIterationMethodsRanges() {
    for (UnicodeSet set : UNICODE_SETS_TO_ITERATE) {
        // Reference result, from the Iterable
        StringJoiner viaIterable = newResult();
        set.ranges().forEach(range -> viaIterable.add(rangeToString(range)));

        // Each range is formatted as soon as it is delivered; the EntryRange is never retained.
        StringJoiner viaStream = newResult();
        set.rangeStream().forEach(range -> {
            final String formatted = rangeToString(range);
            viaStream.add(formatted);
        });

        assertEquals("ranges :: rangeStream", viaIterable.toString(), viaStream.toString());
    }
}
/**
 * Checks that {@code stringStream()} delivers the same strings, in the same order,
 * as {@code strings()}.
 */
@Test
public void testIterationMethodsStrings() {
    for (UnicodeSet set : UNICODE_SETS_TO_ITERATE) {
        // Reference result, from the collection of strings
        StringJoiner viaCollection = newResult();
        set.strings().forEach(s -> viaCollection.add(stringToString(s)));

        StringJoiner viaStream = newResult();
        set.stringStream().forEach(s -> viaStream.add(stringToString(s)));

        assertEquals("strings :: stringStream", viaCollection.toString(), viaStream.toString());
    }
}
/**
 * Iterates on strings AND on code points (converted to strings): checks that
 * {@code stream()} delivers the same elements, in the same order, as iterating the set itself.
 */
@Test
public void testIterationMethodsCodepointsAndStrings() {
    for (UnicodeSet set : UNICODE_SETS_TO_ITERATE) {
        // Reference result, from the set's own iterator
        StringJoiner viaIterator = newResult();
        Iterator<String> elements = set.iterator();
        while (elements.hasNext()) {
            viaIterator.add(stringToString(elements.next()));
        }

        StringJoiner viaStream = newResult();
        set.stream().forEach(element -> viaStream.add(stringToString(element)));

        assertEquals("code points + strings :: stream", viaIterator.toString(), viaStream.toString());
    }
}
/**
 * Calls {@code next()} once more than there are code points and checks that this, and only
 * this, call throws {@link NoSuchElementException}.
 *
 * <p>The expectation is scoped to the final call on purpose: with an annotation-level
 * expected exception, a {@code NoSuchElementException} thrown too early by one of the two
 * valid {@code next()} calls would also make the test pass.
 */
@Test
public void testMisuseIterator() {
    UnicodeSet us = new UnicodeSet(0x10FFFE, 0x10FFFF);
    assertEquals("Even length, last range ends at HIGH", 2, us.size());

    Iterator<Integer> cpIter = us.codePoints().iterator();
    assertEquals("", (Integer) 0x10FFFE, cpIter.next());
    assertEquals("", (Integer) 0x10FFFF, cpIter.next());

    boolean exceptionMissing = true;
    try {
        cpIter.next(); // NoSuchElementException
    } catch (NoSuchElementException expected) {
        exceptionMissing = false;
    }
    assertFalse("next() past the last code point must throw NoSuchElementException", exceptionMissing);
}
/*
 * Formatting helpers shared by the iteration tests.
 * Their only purpose is to make a failure message easy to read: code points are rendered
 * as something like U+03A3(Σ), ranges as start-end, and strings are wrapped in double quotes.
 */

/** Formats one code point as its U+XXXX notation followed by the character in parentheses. */
private static String codePointToString(int cp) {
    return String.format("U+%04X(%s)", cp, UTF16.valueOf(cp));
}

/** Formats an {@code EntryRange} from its {@code codepoint} and {@code codepointEnd} fields. */
private static String rangeToString(EntryRange range) {
    final int first = range.codepoint;
    final int last = range.codepointEnd;
    return rangeToString(first, last);
}

/** Formats a range as the two formatted code points joined by a hyphen. */
private static String rangeToString(int cpStart, int cpEnd) {
    final String start = codePointToString(cpStart);
    final String end = codePointToString(cpEnd);
    return String.format("%s-%s", start, end);
}

/** Wraps a string in double quotes. */
private static String stringToString(String str) {
    final String quoted = String.format("\"%s\"", str);
    return quoted;
}

/** Creates the joiner used to accumulate one comma-separated test result. */
private static StringJoiner newResult() {
    final String separator = ", ";
    return new StringJoiner(separator);
}
/**
 * Verbose-only check: sums all code points through {@code codePointStream()} once as is
 * and once after {@code parallel()}, prints both timings, and verifies that the sums agree.
 */
@Test
public void testParallelStreams() {
    if (isVerbose()) {
        final UnicodeSet allCodePoints = UnicodeSet.ALL_CODE_POINTS;

        final long normalBegin = System.nanoTime();
        final int sumNormal = allCodePoints.codePointStream().sum();
        final long timeNormal = System.nanoTime() - normalBegin;
        System.out.println("codePointStream.normal : " + timeNormal);

        final long parallelBegin = System.nanoTime();
        final int sumParallel = allCodePoints.codePointStream().parallel().sum();
        final long timeParallel = System.nanoTime() - parallelBegin;
        System.out.println("codePointStream.parallel : " + timeParallel);

        // The original author reported a ratio of about 1.4-1.5 on their machines.
        // It is unlikely to have any practical benefit; the real purpose is to check that
        // the parallel stream works the same as the normal one, by comparing the sums.
        final double speedRatio = (double) timeNormal / timeParallel;
        System.out.println("Speed (normal/parallel) : " + speedRatio);

        assertEquals("normal and parallel give different sum", sumNormal, sumParallel);
    }
}
}