ICU-22845 Better iterations for the ICU4J UnicodeSet

2025-04-10 15:42:14 +00:00 · 2024-08-08 22:17:31 +00:00 · 2024-08-08 22:17:31 +00:00 · b5b3e16afa
commit b5b3e16afa
parent 66ba09973a
2 changed files with 400 additions and 8 deletions
--- a/icu4j/main/core/src/main/java/com/ibm/icu/text/UnicodeSet.java
+++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/UnicodeSet.java
@ -17,7 +17,12 @@ import java.util.Collections;
 import java.util.Iterator;
 import java.util.NoSuchElementException;
 import java.util.SortedSet;
+import java.util.Spliterator;
 import java.util.TreeSet;
+import java.util.function.IntConsumer;
+import java.util.stream.IntStream;
+import java.util.stream.Stream;
+import java.util.stream.StreamSupport;

 import com.ibm.icu.impl.BMPSet;
 import com.ibm.icu.impl.CharacterPropertiesImpl;
@ -278,14 +283,22 @@ import com.ibm.icu.util.VersionInfo;
 *     </tr>
 *   </table>
 * </blockquote>
- * <p>To iterate over contents of UnicodeSet, the following are available:
- * <ul><li>{@link #ranges()} to iterate through the ranges</li>
- * <li>{@link #strings()} to iterate through the strings</li>
- * <li>{@link #iterator()} to iterate through the entire contents in a single loop.
- * That method is, however, not particularly efficient, since it "boxes" each code point into a String.
+ *
+ * <p>To iterate over contents of {@code UnicodeSet}, the following are available:
+ * <ul>
+ *   <li>to iterate over the ranges: {@link #ranges()}, {@link #rangeStream()}</li>
+ *   <li>to iterate over the strings: {@link #strings()}, {@link #stringStream()}</li>
+ *   <li>to iterate over the code points: {@link #codePoints()}, {@link #codePointStream()}</li>
+ *   <li>to iterate over the entire contents in a single loop: this class itself is {@link Iterable},
+ *       or use {@link #stream()}.<br>
+ *       Both are, however, not particularly efficient,
+ *       since they convert each individual code point to a {@code String}.
 * </ul>
- * All of the above can be used in <b>for</b> loops.
- * The {@link com.ibm.icu.text.UnicodeSetIterator UnicodeSetIterator} can also be used, but not in <b>for</b> loops.
+ *
+ * <p>The iterator and stream methods work as expected in idiomatic Java usage.<br>
+ * The older {@link UnicodeSetIterator} cannot be used in <b>for</b> loops and is not very Java-idiomatic,
+ * but it might be faster in certain use cases. We recommend that you measure in performance-sensitive code.<br>
+ *
 * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
 *
 * @author Alan Liu
@ -5127,5 +5140,221 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
        CharacterPropertiesImpl.clear();
        XSYMBOL_TABLE = xSymbolTable;
    }
+
+    /**
+     * Streams the code point ranges of this {@code UnicodeSet} as {@link EntryRange} values.
+     *
+     * <p><b>Warnings:</b>
+     * <ul>
+     *   <li>Every element is the same {@link EntryRange} instance; its contents are just reset
+     *       for each range.
+     *   <li>Strings are not part of this stream; to cover the full contents,
+     *       you have to also iterate over the strings.
+     *   <li>For speed, {@code UnicodeSet} iteration does not check for concurrent modification.<br>
+     *       Do not alter the {@code UnicodeSet} while iterating.
+     * </ul>
+     *
+     * @return a sequential {@link Stream} of {@link EntryRange}
+     *
+     * @draft ICU 76
+     */
+    public Stream<EntryRange> rangeStream() {
+        // The ranges() iterator always hands out the same EntryRange object, so the stream
+        // must never be parallel: it is created with parallel == false.
+        final Spliterator<EntryRange> rangeSpliterator = ranges().spliterator();
+        return StreamSupport.stream(rangeSpliterator, false);
+    }
+
+    /**
+     * Streams the strings of this {@code UnicodeSet}, i.e. the elements of {@link #strings()}.
+     *
+     * <p><b>Warnings:</b>
+     * <ul>
+     *   <li>Code points are not part of this stream; to cover the full contents,
+     *       you have to also iterate over the ranges or code points.
+     *   <li>For speed, {@code UnicodeSet} iteration does not check for concurrent modification.<br>
+     *       Do not alter the {@code UnicodeSet} while iterating.
+     * </ul>
+     *
+     * @return a {@link Stream} of {@code String}
+     *
+     * @draft ICU 76
+     */
+    public Stream<String> stringStream() {
+        // Delegates to the stream of the collection returned by strings().
+        return this.strings().stream();
+    }
+
+    /**
+     * Streams the Unicode code points of this {@code UnicodeSet} as primitive {@code int} values.
+     *
+     * <p><b>Warnings:</b>
+     * <ul>
+     *   <li>Strings are not part of this stream; to cover the full contents,
+     *       you have to also iterate over the strings.
+     *   <li>For speed, {@code UnicodeSet} iteration does not check for concurrent modification.<br>
+     *       Do not alter the {@code UnicodeSet} while iterating.
+     * </ul>
+     *
+     * @return a sequential {@link IntStream} of Unicode code point values
+     *
+     * @draft ICU 76
+     */
+    public IntStream codePointStream() {
+        final Spliterator.OfInt codePointSource = new CodePointSpliterator(this);
+        return StreamSupport.intStream(codePointSource, false);
+    }
+
+    /**
+     * Returns a {@link Stream} over the entire contents of this {@code UnicodeSet}, as
+     * {@code String} values. The stream is built from this set's {@code spliterator()}, i.e. from
+     * the same iteration as using this set as an {@link Iterable}.
+     *
+     * <p><b>Warnings:</b>
+     * <ul>
+     *   <li>This is convenient but not particularly efficient, since each individual code point
+     *       is converted to a {@code String}.<br>
+     *       To process only the code points, consider {@link #codePointStream()}.
+     *   <li>For speed, {@code UnicodeSet} iteration does not check for concurrent modification.<br>
+     *       Do not alter the {@code UnicodeSet} while iterating.
+     * </ul>
+     *
+     * @return a sequential {@link Stream} of {@code String}
+     *
+     * @draft ICU 76
+     */
+    public Stream<String> stream() {
+        return StreamSupport.stream(spliterator(), false);
+    }
+
+    /**
+     * Returns an {@link Iterable} over all the code points in this set, for use in
+     * <b>for</b> loops.
+     *
+     * <p><b>Warnings:</b>
+     * <ul>
+     *   <li>This is a convenience method, but comes with a performance penalty
+     *       because it boxes {@code int} into {@code Integer}.<br>
+     *       For an efficient but old alternative use {@link UnicodeSetIterator#next()}.
+     *   <li>Strings are not visited; to cover the full contents,
+     *       you have to also iterate over the strings.
+     *   <li>For speed, {@code UnicodeSet} iteration does not check for concurrent modification.<br>
+     *       Do not alter the {@code UnicodeSet} while iterating.
+     * </ul>
+     *
+     * @return an {@link Iterable} over all the code points
+     *
+     * @draft ICU 76
+     */
+    public Iterable<Integer> codePoints() {
+        final Iterable<Integer> boxedCodePoints = new CodePointIterable(this);
+        return boxedCodePoints;
+    }
+
+    /**
+     * {@link Iterable} view over the code points of a {@code UnicodeSet}, boxed as {@code Integer}.
+     * Each call to {@link #iterator()} creates a new iterator.
+     *
+     * <p>Declared {@code static} because the set is passed in explicitly; no hidden reference
+     * to an enclosing {@code UnicodeSet} instance is needed.
+     */
+    private static class CodePointIterable implements Iterable<Integer> {
+        private final UnicodeSet unicodeSet;
+
+        CodePointIterable(UnicodeSet unicodeSet) {
+            this.unicodeSet = unicodeSet;
+        }
+
+        @Override
+        public Iterator<Integer> iterator() {
+            return new CodePointIterator(unicodeSet);
+        }
+    }
+
+    /**
+     * Boxed {@code Iterator<Integer>} adapter around the primitive {@link CodePointIteratorInt}.
+     * Removal is not supported.
+     *
+     * <p>Declared {@code static} because the set is passed in explicitly; no hidden reference
+     * to an enclosing {@code UnicodeSet} instance is needed.
+     */
+    private static class CodePointIterator implements Iterator<Integer> {
+        private final CodePointIteratorInt cpi;
+
+        CodePointIterator(UnicodeSet unicodeSet) {
+            cpi = new CodePointIteratorInt(unicodeSet);
+        }
+
+        @Override
+        public boolean hasNext() {
+            return cpi.hasNext();
+        }
+
+        /** Returns the next code point, boxed from the primitive value of the delegate. */
+        @Override
+        public Integer next() {
+            return cpi.next();
+        }
+
+        @Override
+        public void remove() {
+            throw new UnsupportedOperationException();
+        }
+    }
+
+    /**
+     * Primitive {@code int} spliterator over the code points of a {@code UnicodeSet};
+     * it is the source of {@link UnicodeSet#codePointStream()}. It never splits.
+     */
+    private static class CodePointSpliterator implements Spliterator.OfInt {
+        private static final int CHARACTERISTICS =
+                Spliterator.SIZED | Spliterator.ORDERED | Spliterator.DISTINCT | Spliterator.NONNULL;
+
+        private final UnicodeSet unicodeSet;
+        private final CodePointIteratorInt cpi;
+
+        CodePointSpliterator(UnicodeSet unicodeSet) {
+            this.unicodeSet = unicodeSet;
+            this.cpi = new CodePointIteratorInt(unicodeSet);
+        }
+
+        @Override
+        public long estimateSize() {
+            // Only code points are traversed here, so the strings are taken out of the total
+            // (presumably size() counts them as well).
+            final int totalSize = unicodeSet.size();
+            final int stringCount = unicodeSet.strings.size();
+            return totalSize - stringCount;
+        }
+
+        @Override
+        public int characteristics() {
+            // A frozen set additionally reports IMMUTABLE.
+            if (unicodeSet.isFrozen()) {
+                return Spliterator.IMMUTABLE | CHARACTERISTICS;
+            }
+            return CHARACTERISTICS;
+        }
+
+        @Override
+        public Spliterator.OfInt trySplit() {
+            /* From the doc:
+             *   > This method may return null for any reason, including emptiness, inability to split after
+             *   > traversal has commenced, data structure constraints, and efficiency considerations.
+             */
+            return null;
+        }
+
+        @Override
+        public boolean tryAdvance(IntConsumer action) {
+            if (action == null) {
+                throw new NullPointerException();
+            }
+            if (!cpi.hasNext()) {
+                return false;
+            }
+            action.accept(cpi.next());
+            return true;
+        }
+    }
+
+    /**
+     * This class is optimized to iterate on code points and is used to implement both
+     * the {@code Iterator<Integer>} (Integer, boxed value) and the {@code Spliterator.OfInt}
+     * (int primitive).
+     * It looks exactly like an {@code Iterator<Integer>}, but works on the primitive {@code int},
+     * so it can't implement {@code Iterator}.
+     *
+     * <p>It reads the set's internal {@code list} array in place (no copy), consuming it two
+     * entries at a time as a range start (inclusive) and a range limit (exclusive).
+     */
+    private static class CodePointIteratorInt {
+        private final int[] list;
+        /** {@code unicodeSet.len - 1}; presumably the index of the trailing guard value. */
+        private final int lastRange;
+        /** Index in {@code list} of the next entry to read. */
+        private int currentRange;
+        /** Next code point to return from the current range. */
+        private int rangeStart;
+        /** Exclusive end of the current range. */
+        private int rangeLimit;
+
+        public CodePointIteratorInt(UnicodeSet unicodeSet) {
+            this.list = unicodeSet.list;
+            this.lastRange = unicodeSet.len - 1;
+            this.currentRange = 0;
+            this.rangeStart = list[currentRange++];
+            if (lastRange > 0) { // not an empty list
+                rangeLimit = list[currentRange++];
+            } else {
+                rangeLimit = rangeStart; // should be HIGH, the guard value
+            }
+        }
+
+        /** Returns true if the current range is not exhausted or another range follows. */
+        public boolean hasNext() {
+            return rangeStart < rangeLimit || currentRange < lastRange;
+        }
+
+        /**
+         * Returns the next code point, moving on to the next range when the current one is used up.
+         *
+         * @throws NoSuchElementException if there are no more code points
+         */
+        public int next() {
+            if (rangeStart >= rangeLimit) {
+                if (currentRange >= lastRange) {
+                    throw new NoSuchElementException();
+                }
+                rangeStart = list[currentRange++];
+                rangeLimit = list[currentRange++];
+            }
+            return rangeStart++;
+        }
+    }
 }
 //eof
--- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/lang/UnicodeSetTest.java
+++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/lang/UnicodeSetTest.java
@ -21,6 +21,7 @@ import java.util.Iterator;
 import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
+import java.util.NoSuchElementException;
 import java.util.Set;
 import java.util.SortedSet;
 import java.util.StringJoiner;
@ -61,6 +62,16 @@ public class UnicodeSetTest extends CoreTestFmwk {

    static final String NOT = "%%%%";

+    // Shared fixtures for the iterator and stream tests
+    static final UnicodeSet[] UNICODE_SETS_TO_ITERATE = {
+            UnicodeSet.EMPTY,
+            new UnicodeSet(0, 0),               // a single code point: zero
+            new UnicodeSet(0x0000, 0x0010),     // a range starting at zero
+            new UnicodeSet(0x10FFFF, 0x10FFFF), // a single code point: the maximum
+            new UnicodeSet(0x10FF00, 0x10FFFF), // a range ending at the maximum
+            populateUnicodeSet()
+    };
+
    private static final boolean isCccValue(int ccc) {
        switch (ccc) {
        case 0:
@ -841,7 +852,7 @@ public class UnicodeSetTest extends CoreTestFmwk {
            }
        }
    }
-    
+
    @Test
    public void TestSetRelation() {

@ -3198,4 +3209,156 @@ public class UnicodeSetTest extends CoreTestFmwk {
            assertFalse("[:Basic_Emoji:].complement() --> no bicycle", notBasic.contains("🚲"));
        }
    }
+
+    /**
+     * Builds the frozen set used by the iteration tests. It mixes property patterns, single code
+     * points, code point ranges (ASCII, Greek, above BMP) and strings.
+     */
+    private static UnicodeSet populateUnicodeSet() {
+        // Trying to cover the most interesting combinations:
+        final UnicodeSet unicodeSet = new UnicodeSet();
+        // Patterns. applyPattern() replaces the contents of the set instead of adding to them,
+        // so both properties go into one union pattern; two separate calls would keep only the last.
+        unicodeSet.applyPattern("[\\p{sc=Ethi}\\p{Number}]");
+        // Single code point
+        unicodeSet.add('X'); // adds a code point in the ASCII range
+        unicodeSet.add('Σ'); // adds a code point in the Greek block
+        unicodeSet.add(0x1F600); // adds a code point above BMP (😀)
+        // Code point ranges and from CharSequence
+        unicodeSet.add('A', 'F'); // adds a code point range in the ASCII range
+        unicodeSet.add('α', 'ζ'); // adds a code point range in the Greek block
+        unicodeSet.add(0x1F347, 0x1F353); // adds a code point range above BMP (🍇-🍓)
+        // Strings
+        unicodeSet.add("world"); // adds string
+        unicodeSet.addAll("one", "two", "three"); // adds strings
+        return (UnicodeSet) unicodeSet.freeze();
+    }
+
+    /** Checks that codePoints() and codePointStream() agree with the UnicodeSetIterator. */
+    @Test
+    public void testIterationMethodsCodepoints() {
+        for (UnicodeSet set : UNICODE_SETS_TO_ITERATE) {
+            // Reference result: the old UnicodeSetIterator, stopping at the first string entry
+            UnicodeSetIterator oldStyle = new UnicodeSetIterator(set);
+            StringJoiner reference = newResult();
+            while (oldStyle.next()) {
+                if (oldStyle.codepoint == UnicodeSetIterator.IS_STRING) {
+                    break;
+                }
+                reference.add(codePointToString(oldStyle.codepoint));
+            }
+            final String expected = reference.toString();
+
+            // Boxed iteration
+            StringJoiner boxed = newResult();
+            Iterator<Integer> boxedCodePoints = set.codePoints().iterator();
+            while (boxedCodePoints.hasNext()) {
+                boxed.add(codePointToString(boxedCodePoints.next()));
+            }
+            assertEquals("code points :: codePoints", expected, boxed.toString());
+
+            // Primitive stream
+            StringJoiner primitive = newResult();
+            set.codePointStream()
+                    .mapToObj(UnicodeSetTest::codePointToString)
+                    .forEach(primitive::add);
+            assertEquals("code points :: codePointStream", expected, primitive.toString());
+        }
+    }
+
+    /** Checks that rangeStream() yields the same ranges as ranges(). */
+    @Test
+    public void testIterationMethodsRanges() {
+        for (UnicodeSet set : UNICODE_SETS_TO_ITERATE) {
+            // Reference result, from the ranges() Iterable
+            StringJoiner fromIterable = newResult();
+            Iterator<EntryRange> rangeIter = set.ranges().iterator();
+            while (rangeIter.hasNext()) {
+                fromIterable.add(rangeToString(rangeIter.next()));
+            }
+
+            StringJoiner fromStream = newResult();
+            set.rangeStream()
+                    .map(UnicodeSetTest::rangeToString)
+                    .forEach(fromStream::add);
+            assertEquals("ranges :: rangeStream", fromIterable.toString(), fromStream.toString());
+        }
+    }
+
+    /** Checks that stringStream() yields the same strings as strings(). */
+    @Test
+    public void testIterationMethodsStrings() {
+        for (UnicodeSet set : UNICODE_SETS_TO_ITERATE) {
+            // Reference result, from the strings() collection
+            StringJoiner fromCollection = newResult();
+            Iterator<String> stringIter = set.strings().iterator();
+            while (stringIter.hasNext()) {
+                fromCollection.add(stringToString(stringIter.next()));
+            }
+
+            StringJoiner fromStream = newResult();
+            set.stringStream().forEach(str -> fromStream.add(stringToString(str)));
+            assertEquals("strings :: stringStream", fromCollection.toString(), fromStream.toString());
+        }
+    }
+
+    /** Iterates on strings AND on code points (converted to strings): stream() vs. Iterable. */
+    @Test
+    public void testIterationMethodsCodepointsAndStrings() {
+        for (UnicodeSet set : UNICODE_SETS_TO_ITERATE) {
+            // Reference result, from the set used as an Iterable<String>
+            StringJoiner fromIterable = newResult();
+            Iterator<String> everything = set.iterator();
+            while (everything.hasNext()) {
+                fromIterable.add(stringToString(everything.next()));
+            }
+
+            StringJoiner fromStream = newResult();
+            set.stream().forEach(str -> fromStream.add(stringToString(str)));
+
+            assertEquals("code points + strings :: stream", fromIterable.toString(), fromStream.toString());
+        }
+    }
+
+    /**
+     * Calling next() on an exhausted code point iterator must throw NoSuchElementException.
+     * The expectation is scoped to the final call only, so an exception thrown too early
+     * (while valid elements remain) fails the test instead of satisfying it.
+     */
+    @Test
+    public void testMisuseIterator() {
+        UnicodeSet us = new UnicodeSet(0x10FFFE, 0x10FFFF);
+        assertEquals("Even length, last range ends at HIGH", 2, us.size());
+        Iterator<Integer> cpIter = us.codePoints().iterator();
+        assertEquals("", (Integer) 0x10FFFE, cpIter.next());
+        assertEquals("", (Integer) 0x10FFFF, cpIter.next());
+
+        boolean missingException = true;
+        try {
+            cpIter.next(); // NoSuchElementException
+        } catch (NoSuchElementException expected) {
+            missingException = false;
+        }
+        assertFalse("next() past the last code point must throw NoSuchElementException",
+                missingException);
+    }
+
+    /*
+     * Helper methods for testing various iterations.
+     * The main goal is to make the results of any possible failures more readable
+     * by formatting code points to something like U+03A3(Σ) and wrapping strings in double quotes.
+     */
+
+    /** Formats a code point as its hex value followed by the character, e.g. U+03A3(Σ). */
+    private static String codePointToString(int cp) {
+        return String.format("U+%04X(%s)", cp, UTF16.valueOf(cp));
+    }
+
+    /** Formats an EntryRange using its first and last code points. */
+    private static String rangeToString(EntryRange range) {
+        final int first = range.codepoint;
+        final int last = range.codepointEnd;
+        return rangeToString(first, last);
+    }
+
+    /** Formats a range as the two formatted code points joined by a hyphen. */
+    private static String rangeToString(int cpStart, int cpEnd) {
+        final String first = codePointToString(cpStart);
+        final String last = codePointToString(cpEnd);
+        return String.format("%s-%s", first, last);
+    }
+
+    /** Wraps a string in double quotes so strings stand out from code points in failure output. */
+    private static String stringToString(String str) {
+        final String quoted = String.format("\"%s\"", str);
+        return quoted;
+    }
+
+    /** Creates the joiner used to collect formatted elements, separated by a comma and a space. */
+    private static StringJoiner newResult() {
+        final StringJoiner joiner = new StringJoiner(", ");
+        return joiner;
+    }
+
+    /**
+     * Checks that a parallel code point stream gives the same result as the sequential one,
+     * by comparing the (wrapping) int sums over all code points.
+     * The check always runs; the timing output is printed in verbose mode only.
+     */
+    @Test
+    public void testParallelStreams() {
+        UnicodeSet us = UnicodeSet.ALL_CODE_POINTS;
+
+        long start = System.nanoTime();
+        int sumNormal = us.codePointStream().sum();
+        long timeNormal = System.nanoTime() - start;
+
+        start = System.nanoTime();
+        int sumParallel = us.codePointStream().parallel().sum();
+        long timeParallel = System.nanoTime() - start;
+
+        if (isVerbose()) {
+            System.out.println("codePointStream.normal   : " + timeNormal);
+            System.out.println("codePointStream.parallel : " + timeParallel);
+            // On my machines this is about 1.4-1.5, so it is a bit faster.
+            // Unlikely to have any practical benefit, this is more to test that the
+            // parallel stream works the same as the normal one, by comparing the sums.
+            System.out.println("Speed (normal/parallel)  : " + (double) timeNormal / timeParallel);
+        }
+
+        assertEquals("normal and parallel give different sum", sumNormal, sumParallel);
+    }
 }