ICU-22789 Replace Stream of boundary Integer with efficient IntStream

This commit is contained in:
Elango Cheran 2025-03-28 16:02:38 -07:00
parent 465f620158
commit 3dbbdb4e4b
2 changed files with 60 additions and 30 deletions

View file

@ -2,7 +2,9 @@ package com.ibm.icu.text.segmenter;
import com.ibm.icu.text.BreakIterator;
import java.util.Iterator;
import java.util.Spliterator;
import java.util.function.Function;
import java.util.function.IntConsumer;
import java.util.stream.IntStream;
import java.util.stream.Stream;
@ -158,29 +160,12 @@ public interface Segments {
// Inner classes for BoundaryIterable and BoundaryIterator
//
/**
 * {@link Iterable} over segmentation boundaries, expressed as boxed {@code Integer} values.
 *
 * <p>The instance only stores the arguments it was constructed with; each call to
 * {@link #iterator()} passes those same stored values to a new {@code BoundaryIterator}.
 */
class BoundaryIterable implements Iterable<Integer> {
  BreakIterator breakIter;
  IterationDirection direction;
  int startIdx;

  /**
   * @param iterator the break iterator handed to every {@code BoundaryIterator} created here
   * @param iterationDirection the direction handed to every {@code BoundaryIterator} created here
   * @param startIndex the start index handed to every {@code BoundaryIterator} created here
   */
  BoundaryIterable(BreakIterator iterator, IterationDirection iterationDirection, int startIndex) {
    breakIter = iterator;
    direction = iterationDirection;
    startIdx = startIndex;
  }

  /**
   * Creates a new {@code BoundaryIterator} from the stored fields.
   *
   * <p>NOTE(review): every iterator returned here receives the same {@code BreakIterator}
   * instance; whether concurrent iterators can interfere with each other is not visible from
   * this class and should be confirmed against {@code BoundaryIterator}.
   */
  @Override
  public Iterator<Integer> iterator() {
    return new BoundaryIterator(breakIter, direction, startIdx);
  }
}
class BoundaryIterator implements Iterator<Integer> {
class BoundaryIteratorOfInts {
BreakIterator breakIter;
IterationDirection direction;
int currIdx;
BoundaryIterator(BreakIterator breakIter, IterationDirection direction, int startIdx) {
BoundaryIteratorOfInts(BreakIterator breakIter, IterationDirection direction, int startIdx) {
this.breakIter = breakIter;
this.direction = direction;
@ -198,12 +183,10 @@ public interface Segments {
}
}
@Override
public boolean hasNext() {
return this.currIdx != BreakIterator.DONE;
}
@Override
public Integer next() {
int result = this.currIdx;
@ -218,4 +201,55 @@ public interface Segments {
}
}
/**
 * A primitive-{@code int} {@link Spliterator} that hands out the values produced by a
 * {@code BoundaryIteratorOfInts}, one per {@link #tryAdvance(IntConsumer)} call.
 *
 * <p>The spliterator never splits and reports an unknown size.
 */
class SegmentSpliterator implements Spliterator.OfInt {

  private final BoundaryIteratorOfInts iter;

  /**
   * Builds the underlying {@code BoundaryIteratorOfInts} from the given arguments.
   *
   * @param breakIter the break iterator forwarded to the boundary iterator
   * @param direction the iteration direction forwarded to the boundary iterator
   * @param startIdx the start index forwarded to the boundary iterator
   */
  SegmentSpliterator(BreakIterator breakIter, IterationDirection direction, int startIdx) {
    this.iter = new BoundaryIteratorOfInts(breakIter, direction, startIdx);
  }

  /**
   * Always declines to split.
   *
   * @return {@code null}, meaning this spliterator cannot be partitioned
   */
  @Override
  public Spliterator.OfInt trySplit() {
    // The elements of the Stream represent an iteration through a string, and the iteration is
    // therefore inherently stateful, so splitting the Stream does not make sense. For example,
    // splitting would be tantamount to discarding the segment that lies between the last value
    // (an index into the input string) of one substream and the first value of the next one.
    return null;
  }

  /**
   * Reports that the number of remaining boundaries is unknown.
   *
   * @return {@code Long.MAX_VALUE}
   */
  @Override
  public long estimateSize() {
    // How many segments an input yields depends on the language, the script, and the content of
    // the input string, so it is hard to estimate without sacrificing performance. Per the
    // Spliterator API, `Long.MAX_VALUE` stands for "unknown, or too expensive to compute".
    return Long.MAX_VALUE;
  }

  /**
   * @return the bitwise OR of {@code ORDERED}, {@code DISTINCT}, {@code NONNULL}, and
   *     {@code IMMUTABLE}
   */
  @Override
  public int characteristics() {
    // ORDERED: BreakIterator always advances, and in a single direction.
    // DISTINCT: BreakIterator always advances.
    // NONNULL: a primitive int is never null.
    // IMMUTABLE: the Segmenter API is designed to provide an immutable view of segmentation by
    // preventing the input string from mutating in the underlying BreakIterator.
    return Spliterator.ORDERED
        | Spliterator.DISTINCT
        | Spliterator.NONNULL
        | Spliterator.IMMUTABLE;
  }

  /**
   * Feeds the next boundary, if there is one, to {@code action}.
   *
   * @param action the consumer that receives the next boundary value
   * @return {@code true} if a value was passed to {@code action}; {@code false} if the
   *     underlying iterator has no more values
   * @throws NullPointerException if {@code action} is {@code null}
   */
  @Override
  public boolean tryAdvance(IntConsumer action) {
    // The null check comes first so that a null consumer is rejected even when exhausted.
    if (action == null) {
      throw new NullPointerException();
    }
    if (!iter.hasNext()) {
      return false;
    }
    action.accept(iter.next());
    return true;
  }
}
}

View file

@ -1,10 +1,10 @@
package com.ibm.icu.text.segmenter;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.segmenter.Segments.BoundaryIterable;
import com.ibm.icu.text.segmenter.Segments.IterationDirection;
import com.ibm.icu.text.segmenter.Segments.Segment;
import com.ibm.icu.text.segmenter.Segments.SegmentIterable;
import com.ibm.icu.text.segmenter.Segments.SegmentSpliterator;
import java.util.function.Function;
import java.util.stream.IntStream;
import java.util.stream.Stream;
@ -86,6 +86,8 @@ public class SegmentsImplUtils {
return new Segment(start, limit, sourceSequence);
}
// TODO(ICU-22987): Remove unused segmentBeforeIndex / segmentAfterIndex after
// ensuring fix for preceding(int) to return `DONE` for negative inputs
public static Segment segmentBeforeIndex(BreakIterator breakIter, CharSequence sourceSequence, int i) {
breakIter.setText(sourceSequence);
@ -120,10 +122,7 @@ public class SegmentsImplUtils {
breakIter.setText(sourceSequence);
// create a Stream from a Spliterator of an Iterable so that the Stream can be lazy, not eager
// TODO: optimize IntStream creation to avoid autoboxing
BoundaryIterable iterable = new BoundaryIterable(breakIter, IterationDirection.FORWARDS, i);
Stream<Integer> boundariesAsIntegers = StreamSupport.stream(iterable.spliterator(), false);
return boundariesAsIntegers.mapToInt(Integer::intValue);
return StreamSupport.intStream(new SegmentSpliterator(breakIter, IterationDirection.FORWARDS, i), false);
}
public static IntStream boundariesBackFrom(BreakIterator breakIter, CharSequence sourceSequence, int i) {
@ -139,10 +138,7 @@ public class SegmentsImplUtils {
int backFromIdx = isOnBoundary ? i + 1 : i;
// create a Stream from a Spliterator of an Iterable so that the Stream can be lazy, not eager
// TODO: optimize IntStream creation to avoid autoboxing
BoundaryIterable iterable = new BoundaryIterable(breakIter, IterationDirection.BACKWARDS, backFromIdx);
Stream<Integer> boundariesAsIntegers = StreamSupport.stream(iterable.spliterator(), false);
return boundariesAsIntegers.mapToInt(Integer::intValue);
return StreamSupport.intStream(new SegmentSpliterator(breakIter, IterationDirection.BACKWARDS, backFromIdx), false);
}
}