mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-10 15:42:14 +00:00
ICU-22845 Better iterations for the ICU4J UnicodeSet
This commit is contained in:
parent
66ba09973a
commit
b5b3e16afa
2 changed files with 400 additions and 8 deletions
|
@ -17,7 +17,12 @@ import java.util.Collections;
|
|||
import java.util.Iterator;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.SortedSet;
|
||||
import java.util.Spliterator;
|
||||
import java.util.TreeSet;
|
||||
import java.util.function.IntConsumer;
|
||||
import java.util.stream.IntStream;
|
||||
import java.util.stream.Stream;
|
||||
import java.util.stream.StreamSupport;
|
||||
|
||||
import com.ibm.icu.impl.BMPSet;
|
||||
import com.ibm.icu.impl.CharacterPropertiesImpl;
|
||||
|
@ -278,14 +283,22 @@ import com.ibm.icu.util.VersionInfo;
|
|||
* </tr>
|
||||
* </table>
|
||||
* </blockquote>
|
||||
* <p>To iterate over contents of UnicodeSet, the following are available:
|
||||
* <ul><li>{@link #ranges()} to iterate through the ranges</li>
|
||||
* <li>{@link #strings()} to iterate through the strings</li>
|
||||
* <li>{@link #iterator()} to iterate through the entire contents in a single loop.
|
||||
* That method is, however, not particularly efficient, since it "boxes" each code point into a String.
|
||||
*
|
||||
* <p>To iterate over contents of {@code UnicodeSet}, the following are available:
|
||||
* <ul>
|
||||
* <li>to iterate over the ranges: {@link #ranges()}, {@link #rangeStream()}</li>
|
||||
* <li>to iterate over the strings: {@link #strings()}, {@link #stringStream()}</li>
|
||||
* <li>to iterate over the code points: {@link #codePoints()}, {@link #codePointStream()}</li>
|
||||
* <li>to iterate over the entire contents in a single loop: this class itself is {@link Iterable},
|
||||
* or use {@link #stream()}.<br>
|
||||
* All of these methods are, however, not particularly efficient,
|
||||
* since they convert each individual code point to a {@code String}.
|
||||
* </ul>
|
||||
* All of the above can be used in <b>for</b> loops.
|
||||
* The {@link com.ibm.icu.text.UnicodeSetIterator UnicodeSetIterator} can also be used, but not in <b>for</b> loops.
|
||||
*
|
||||
* <p>The iterator and stream methods work as expected in idiomatic Java usage.<br>
|
||||
* The {@link UnicodeSetIterator} cannot be used in <b>for</b> loops, and it is not very Java-idiomatic, because it is old.
|
||||
* But it might be faster in certain use cases. We recommend that you measure in performance sensitive code.<br>
|
||||
*
|
||||
* <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
|
||||
*
|
||||
* @author Alan Liu
|
||||
|
@ -5127,5 +5140,221 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
|||
CharacterPropertiesImpl.clear();
|
||||
XSYMBOL_TABLE = xSymbolTable;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a {@link Stream} of {@link EntryRange} values from this {@code UnicodeSet}.
|
||||
*
|
||||
* <p><b>Warnings:</b>
|
||||
* <ul>
|
||||
* <li>The {@link EntryRange} instance is the same each time; the contents are just reset.
|
||||
* <li>To iterate over the full contents, you have to also iterate over the strings.
|
||||
* <li>For speed, {@code UnicodeSet} iteration does not check for concurrent modification.<br>
|
||||
* Do not alter the {@code UnicodeSet} while iterating.
|
||||
* </ul>
|
||||
*
|
||||
* @return a {@link Stream} of {@link EntryRange}
|
||||
*
|
||||
* @draft ICU 76
|
||||
*/
|
||||
public Stream<EntryRange> rangeStream() {
|
||||
// Must use false to never make this parallel because the iterator always returns the same EntryRange object.
|
||||
return StreamSupport.stream(ranges().spliterator(), false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a {@link Stream} of {@code String} values from this {@code UnicodeSet}.
|
||||
*
|
||||
* <p><b>Warnings:</b>
|
||||
* <ul>
|
||||
* <li>To iterate over the full contents, you have to also iterate over the ranges or code points.
|
||||
* <li>For speed, {@code UnicodeSet} iteration does not check for concurrent modification.<br>
|
||||
* Do not alter the {@code UnicodeSet} while iterating.
|
||||
* </ul>
|
||||
*
|
||||
* @return a {@link Stream} of {@code String}
|
||||
*
|
||||
* @draft ICU 76
|
||||
*/
|
||||
public Stream<String> stringStream() {
|
||||
return strings().stream();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an {@link IntStream} of Unicode code point values from this {@code UnicodeSet}.
|
||||
*
|
||||
* <p><b>Warnings:</b>
|
||||
* <ul>
|
||||
* <li>To iterate over the full contents, you have to also iterate over the strings.
|
||||
* <li>For speed, {@code UnicodeSet} iteration does not check for concurrent modification.<br>
|
||||
* Do not alter the {@code UnicodeSet} while iterating.
|
||||
* </ul>
|
||||
*
|
||||
* @return an {@link IntStream} of Unicode code point values
|
||||
*
|
||||
* @draft ICU 76
|
||||
*/
|
||||
public IntStream codePointStream() {
|
||||
return StreamSupport.intStream(new CodePointSpliterator(this), false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a stream of {@code String} values from this {@code UnicodeSet}.
|
||||
*
|
||||
* <p><b>Warnings:</b>
|
||||
* <ul>
|
||||
* <li>To iterate over the full contents, you have to also iterate over the strings.
|
||||
* <li>For speed, {@code UnicodeSet} iteration does not check for concurrent modification.<br>
|
||||
* Do not alter the {@code UnicodeSet} while iterating.
|
||||
* </ul>
|
||||
*
|
||||
* @return a {@link Stream} of {@code String}
|
||||
*
|
||||
* @draft ICU 76
|
||||
*/
|
||||
public Stream<String> stream() {
|
||||
return StreamSupport.stream(spliterator(), false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an {@link Iterable} for iteration over all the code points in this set.
|
||||
*
|
||||
* <p><b>Warnings:</b>
|
||||
* <ul>
|
||||
* <li>This is a convenience method, but comes with a performance penalty
|
||||
* because it boxes {@code int} into {@code Integer}.<br>
|
||||
* For an efficient but old alternative use {@link UnicodeSetIterator#next()}.
|
||||
* <li>To iterate over the full contents, you have to also iterate over the strings.
|
||||
* <li>For speed, {@code UnicodeSet} iteration does not check for concurrent modification.<br>
|
||||
* Do not alter the {@code UnicodeSet} while iterating.
|
||||
* </ul>
|
||||
*
|
||||
* @return an {@link Iterable} over all the code points
|
||||
*
|
||||
* @draft ICU 76
|
||||
*/
|
||||
public Iterable<Integer> codePoints() {
|
||||
return new CodePointIterable(this);
|
||||
}
|
||||
|
||||
private class CodePointIterable implements Iterable<Integer> {
|
||||
private final UnicodeSet unicodeSet;
|
||||
|
||||
CodePointIterable(UnicodeSet unicodeSet) {
|
||||
this.unicodeSet = unicodeSet;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterator<Integer> iterator() {
|
||||
return new CodePointIterator(unicodeSet);
|
||||
}
|
||||
}
|
||||
|
||||
private class CodePointIterator implements Iterator<Integer> {
|
||||
private final CodePointIteratorInt cpi;
|
||||
|
||||
CodePointIterator(UnicodeSet unicodeSet) {
|
||||
cpi = new CodePointIteratorInt(unicodeSet);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return cpi.hasNext();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Integer next() {
|
||||
return cpi.next();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
}
|
||||
|
||||
private static class CodePointSpliterator implements Spliterator.OfInt {
|
||||
private final static int CHARACTERISTICS = Spliterator.SIZED | Spliterator.ORDERED | Spliterator.DISTINCT | Spliterator.NONNULL;
|
||||
|
||||
private final UnicodeSet unicodeSet;
|
||||
private final CodePointIteratorInt cpi;
|
||||
|
||||
CodePointSpliterator(UnicodeSet unicodeSet) {
|
||||
this.unicodeSet = unicodeSet;
|
||||
cpi = new CodePointIteratorInt(unicodeSet);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long estimateSize() {
|
||||
return unicodeSet.size() - unicodeSet.strings.size();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int characteristics() {
|
||||
return unicodeSet.isFrozen() ? Spliterator.IMMUTABLE | CHARACTERISTICS : CHARACTERISTICS;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Spliterator.OfInt trySplit() {
|
||||
/* From the doc:
|
||||
* > This method may return null for any reason, including emptiness, inability to split after
|
||||
* > traversal has commenced, data structure constraints, and efficiency considerations.
|
||||
*/
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean tryAdvance(IntConsumer action) {
|
||||
if (action == null) {
|
||||
throw new NullPointerException();
|
||||
}
|
||||
|
||||
if (cpi.hasNext()) {
|
||||
action.accept(cpi.next());
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* This class is optimized to iterate on code points and will be used to implement both
|
||||
* the Iterator<Integer> (Integer, boxed value) and the Spliterator.OfInt (int primitive).
|
||||
* It looks exactly like an Iterator<Integer>, but works on the primitive int,
|
||||
* so it can't implement Iterator.
|
||||
*/
|
||||
static private class CodePointIteratorInt {
|
||||
private final int[] list;
|
||||
private final int lastRange;
|
||||
private int currentRange = 0;
|
||||
private int rangeStart;
|
||||
private int rangeLimit;
|
||||
|
||||
public CodePointIteratorInt(UnicodeSet unicodeSet) {
|
||||
this.list = unicodeSet.list;
|
||||
lastRange = unicodeSet.len - 1;
|
||||
currentRange = 0;
|
||||
rangeStart = list[currentRange++];
|
||||
if (lastRange > 0) { // not an empty list
|
||||
rangeLimit = list[currentRange++];
|
||||
} else {
|
||||
rangeLimit = rangeStart; // should be HIGH, the guard value
|
||||
}
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return rangeStart < rangeLimit || currentRange < lastRange;
|
||||
}
|
||||
|
||||
public int next() {
|
||||
if (rangeStart >= rangeLimit) {
|
||||
if (currentRange >= lastRange) {
|
||||
throw new NoSuchElementException();
|
||||
}
|
||||
rangeStart = list[currentRange++];
|
||||
rangeLimit = list[currentRange++];
|
||||
}
|
||||
return rangeStart++;
|
||||
}
|
||||
}
|
||||
}
|
||||
//eof
|
||||
|
|
|
@ -21,6 +21,7 @@ import java.util.Iterator;
|
|||
import java.util.LinkedHashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.Set;
|
||||
import java.util.SortedSet;
|
||||
import java.util.StringJoiner;
|
||||
|
@ -61,6 +62,16 @@ public class UnicodeSetTest extends CoreTestFmwk {
|
|||
|
||||
static final String NOT = "%%%%";
|
||||
|
||||
// Used to test iterators and streams
|
||||
final static UnicodeSet[] UNICODE_SETS_TO_ITERATE = {
|
||||
UnicodeSet.EMPTY,
|
||||
new UnicodeSet(0, 0), // one code point, zero
|
||||
new UnicodeSet(0x0000, 0x0010), // from zero
|
||||
new UnicodeSet(0x10FFFF, 0x10FFFF), // one code point, max
|
||||
new UnicodeSet(0x10FF00, 0x10FFFF), // end in max
|
||||
populateUnicodeSet()
|
||||
};
|
||||
|
||||
private static final boolean isCccValue(int ccc) {
|
||||
switch (ccc) {
|
||||
case 0:
|
||||
|
@ -841,7 +852,7 @@ public class UnicodeSetTest extends CoreTestFmwk {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void TestSetRelation() {
|
||||
|
||||
|
@ -3198,4 +3209,156 @@ public class UnicodeSetTest extends CoreTestFmwk {
|
|||
assertFalse("[:Basic_Emoji:].complement() --> no bicycle", notBasic.contains("🚲"));
|
||||
}
|
||||
}
|
||||
|
||||
private static UnicodeSet populateUnicodeSet() {
|
||||
// Trying to cover the most interesting combinations:
|
||||
final UnicodeSet unicodeSet = new UnicodeSet();
|
||||
// Patterns
|
||||
unicodeSet.applyPattern("\\p{sc=Ethi}");
|
||||
unicodeSet.applyPattern("\\p{Number}");
|
||||
// Single code point
|
||||
unicodeSet.add('X'); // adds a code point in the ASCII range
|
||||
unicodeSet.add('Σ'); // adds a code point in the Greek block
|
||||
unicodeSet.add(0x1F600); // adds a code point above BMP (😀)
|
||||
// Code point ranges and from CharSequence
|
||||
unicodeSet.add('A', 'F'); // adds a code point range in the ASCII range
|
||||
unicodeSet.add('α', 'ζ'); // adds a code point range in the Greek block
|
||||
unicodeSet.add(0x1F347, 0x1F353); // adds a code point range above BMP (🍇-🍓)
|
||||
// Strings
|
||||
unicodeSet.add("world"); // adds string
|
||||
unicodeSet.addAll("one", "two", "three"); // adds strings
|
||||
return (UnicodeSet) unicodeSet.freeze();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testIterationMethodsCodepoints() {
|
||||
for (UnicodeSet us : UNICODE_SETS_TO_ITERATE) {
|
||||
// Build the reference test result
|
||||
UnicodeSetIterator usi = new UnicodeSetIterator(us);
|
||||
StringJoiner joiner = newResult();
|
||||
while (usi.next() && usi.codepoint != UnicodeSetIterator.IS_STRING) {
|
||||
joiner.add(codePointToString(usi.codepoint));
|
||||
}
|
||||
String expected = joiner.toString();
|
||||
|
||||
StringJoiner fromIterable = newResult();
|
||||
for (Integer cp : us.codePoints()) {
|
||||
fromIterable.add(codePointToString(cp));
|
||||
}
|
||||
assertEquals("code points :: codePoints", expected, fromIterable.toString());
|
||||
|
||||
StringJoiner fromStream = newResult();
|
||||
us.codePointStream().forEach(cp -> fromStream.add(codePointToString(cp)));
|
||||
assertEquals("code points :: codePointStream", expected, fromStream.toString());
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testIterationMethodsRanges() {
|
||||
for (UnicodeSet us : UNICODE_SETS_TO_ITERATE) {
|
||||
// Build the reference test result
|
||||
StringJoiner expected = newResult();
|
||||
for (EntryRange r : us.ranges()) {
|
||||
expected.add(rangeToString(r));
|
||||
}
|
||||
|
||||
StringJoiner actual = newResult();
|
||||
us.rangeStream().forEach(r -> actual.add(rangeToString(r)));
|
||||
assertEquals("ranges :: rangeStream", expected.toString(), actual.toString());
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testIterationMethodsStrings() {
|
||||
for (UnicodeSet us : UNICODE_SETS_TO_ITERATE) {
|
||||
// Build the reference test result
|
||||
StringJoiner expected = newResult();
|
||||
for (String str : us.strings()) {
|
||||
expected.add(stringToString(str));
|
||||
}
|
||||
|
||||
StringJoiner actual = newResult();
|
||||
us.stringStream().map(UnicodeSetTest::stringToString).forEach(actual::add);
|
||||
assertEquals("strings :: stringStream", expected.toString(), actual.toString());
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
// Iterates on strings AND on code points (converted to strings)
|
||||
public void testIterationMethodsCodepointsAndStrings() {
|
||||
for (UnicodeSet us : UNICODE_SETS_TO_ITERATE) {
|
||||
// Build the reference test result
|
||||
StringJoiner expected = newResult();
|
||||
for (String str : us) {
|
||||
expected.add(stringToString(str));
|
||||
}
|
||||
|
||||
StringJoiner fromStream = newResult();
|
||||
us.stream().map(UnicodeSetTest::stringToString).forEach(fromStream::add);
|
||||
|
||||
assertEquals("code points + strings :: stream", expected.toString(), fromStream.toString());
|
||||
}
|
||||
}
|
||||
|
||||
@Test(expected = NoSuchElementException.class)
|
||||
public void testMisuseIterator() {
|
||||
UnicodeSet us = new UnicodeSet(0x10FFFE, 0x10FFFF);
|
||||
assertEquals("Even length, last range ends at HIGH", 2, us.size());
|
||||
Iterator<Integer> cpIter = us.codePoints().iterator();
|
||||
assertEquals("", (Integer) 0x10FFFE, cpIter.next());
|
||||
assertEquals("", (Integer) 0x10FFFF, cpIter.next());
|
||||
cpIter.next(); // NoSuchElementException
|
||||
}
|
||||
|
||||
/*
|
||||
* Helper methods for testing various iterations.
|
||||
* The main goal is to make the results of any possible failures more readable
|
||||
* by formatting code points to something like U+03A3(Σ) and wrapping strings in double quotes.
|
||||
*/
|
||||
|
||||
private static String codePointToString(int cp) {
|
||||
String fromCodePoint = UTF16.valueOf(cp);
|
||||
return String.format("U+%04X(%s)", cp, fromCodePoint);
|
||||
}
|
||||
|
||||
private static String rangeToString(EntryRange range) {
|
||||
return rangeToString(range.codepoint, range.codepointEnd);
|
||||
}
|
||||
|
||||
private static String rangeToString(int cpStart, int cpEnd) {
|
||||
return String.format("%s-%s", codePointToString(cpStart), codePointToString(cpEnd));
|
||||
}
|
||||
|
||||
private static String stringToString(String str) {
|
||||
return String.format("\"%s\"", str);
|
||||
}
|
||||
|
||||
private static StringJoiner newResult() {
|
||||
return new StringJoiner(", ");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testParallelStreams() {
|
||||
if (!isVerbose()) {
|
||||
return;
|
||||
}
|
||||
UnicodeSet us = UnicodeSet.ALL_CODE_POINTS;
|
||||
|
||||
long start = System.nanoTime();
|
||||
int sumNormal = us.codePointStream().sum();
|
||||
long timeNormal = System.nanoTime() - start;
|
||||
System.out.println("codePointStream.normal : " + timeNormal);
|
||||
|
||||
start = System.nanoTime();
|
||||
int sumParallel = us.codePointStream().parallel().sum();
|
||||
long timeParallel = System.nanoTime() - start;
|
||||
System.out.println("codePointStream.parallel : " + timeParallel);
|
||||
|
||||
// On my machines this is about 1.4-1.5, so it is a bit faster.
|
||||
// Unlikely to have any practical benefit, this is more to test that the
|
||||
// parallel stream works the same as the normal one, by comparing the sums.
|
||||
System.out.println("Speed (normal/parallel) : " + (double) timeNormal / timeParallel);
|
||||
|
||||
assertEquals("normal and parallel give different sum", sumNormal, sumParallel);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue