mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-10 15:42:14 +00:00
Merge c1ca1f8877
into 770c4b8042
This commit is contained in:
commit
d3488fe543
9 changed files with 1108 additions and 2 deletions
|
@ -0,0 +1,153 @@
|
|||
package com.ibm.icu.text.segmenter;
|
||||
|
||||
import com.ibm.icu.text.BreakIterator;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.IntStream;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
public class LocalizedSegmenter implements Segmenter {
|
||||
|
||||
private ULocale locale;
|
||||
|
||||
private SegmentationType segmentationType;
|
||||
|
||||
@Override
|
||||
public Segments segment(CharSequence s) {
|
||||
return new LocalizedSegments(s, this);
|
||||
}
|
||||
|
||||
public static Builder builder() {
|
||||
return new Builder();
|
||||
}
|
||||
|
||||
LocalizedSegmenter(ULocale locale, SegmentationType segmentationType) {
|
||||
this.locale = locale;
|
||||
this.segmentationType = segmentationType;
|
||||
}
|
||||
|
||||
/**
|
||||
* @internal
|
||||
* @deprecated This API is ICU internal only.
|
||||
*/
|
||||
@Override
|
||||
@Deprecated
|
||||
public BreakIterator getNewBreakIterator() {
|
||||
BreakIterator breakIter;
|
||||
switch (this.segmentationType) {
|
||||
case LINE:
|
||||
breakIter = BreakIterator.getLineInstance(this.locale);
|
||||
break;
|
||||
case SENTENCE:
|
||||
breakIter = BreakIterator.getSentenceInstance(this.locale);
|
||||
break;
|
||||
case WORD:
|
||||
breakIter = BreakIterator.getWordInstance(this.locale);
|
||||
break;
|
||||
case GRAPHEME_CLUSTER:
|
||||
default:
|
||||
breakIter = BreakIterator.getCharacterInstance(this.locale);
|
||||
break;
|
||||
}
|
||||
return breakIter;
|
||||
}
|
||||
|
||||
public enum SegmentationType {
|
||||
GRAPHEME_CLUSTER,
|
||||
WORD,
|
||||
LINE,
|
||||
SENTENCE,
|
||||
}
|
||||
|
||||
public static class Builder {
|
||||
|
||||
private ULocale locale = ULocale.ROOT;
|
||||
|
||||
private SegmentationType segmentationType = SegmentationType.GRAPHEME_CLUSTER;
|
||||
|
||||
Builder() { }
|
||||
|
||||
public Builder setLocale(ULocale locale) {
|
||||
this.locale = locale;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder setSegmentationType(SegmentationType segmentationType) {
|
||||
this.segmentationType = segmentationType;
|
||||
return this;
|
||||
}
|
||||
|
||||
public LocalizedSegmenter build() {
|
||||
return new LocalizedSegmenter(this.locale, this.segmentationType);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public class LocalizedSegments implements Segments {
|
||||
|
||||
private CharSequence source;
|
||||
|
||||
private LocalizedSegmenter segmenter;
|
||||
|
||||
private BreakIterator breakIter;
|
||||
|
||||
private LocalizedSegments(CharSequence source, LocalizedSegmenter segmenter) {
|
||||
this.source = source;
|
||||
this.segmenter = segmenter;
|
||||
this.breakIter = this.segmenter.getNewBreakIterator();
|
||||
|
||||
this.breakIter.setText(source);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Stream<CharSequence> subSequences() {
|
||||
return SegmentsImplUtils.subSequences(this.breakIter, this.source);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Segment segmentAt(int i) {
|
||||
return SegmentsImplUtils.segmentAt(this.breakIter, this.source, i);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Stream<Segment> segments() {
|
||||
return SegmentsImplUtils.segments(this.breakIter, this.source);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isBoundary(int i) {
|
||||
return SegmentsImplUtils.isBoundary(this.breakIter, this.source, i);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Stream<Segment> segmentsFrom(int i) {
|
||||
return SegmentsImplUtils.segmentsFrom(this.breakIter, this.source, i);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Stream<Segment> segmentsBefore(int i) {
|
||||
return SegmentsImplUtils.segmentsBefore(this.breakIter, this.source, i);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Function<Segment, CharSequence> segmentToSequenceFn() {
|
||||
return SegmentsImplUtils.segmentToSequenceFn(this.source);
|
||||
}
|
||||
|
||||
@Override
|
||||
public IntStream boundaries() {
|
||||
return SegmentsImplUtils.boundaries(this.breakIter, this.source);
|
||||
}
|
||||
|
||||
@Override
|
||||
public IntStream boundariesAfter(int i) {
|
||||
return SegmentsImplUtils.boundariesAfter(this.breakIter, this.source, i);
|
||||
}
|
||||
|
||||
@Override
|
||||
public IntStream boundariesBackFrom(int i) {
|
||||
return SegmentsImplUtils.boundariesBackFrom(this.breakIter, this.source, i);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,117 @@
|
|||
package com.ibm.icu.text.segmenter;
|
||||
|
||||
import com.ibm.icu.text.BreakIterator;
|
||||
import com.ibm.icu.text.RuleBasedBreakIterator;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.IntStream;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
public class RuleBasedSegmenter implements Segmenter {
|
||||
|
||||
private String rules;
|
||||
|
||||
@Override
|
||||
public Segments segment(CharSequence s) {
|
||||
return new RuleBasedSegments(s, this);
|
||||
}
|
||||
|
||||
public static Builder builder() {
|
||||
return new Builder();
|
||||
}
|
||||
|
||||
RuleBasedSegmenter(String rules) {
|
||||
this.rules = rules;
|
||||
}
|
||||
|
||||
/**
|
||||
* @internal
|
||||
* @deprecated This API is ICU internal only.
|
||||
*/
|
||||
@Override
|
||||
@Deprecated
|
||||
public BreakIterator getNewBreakIterator() {
|
||||
return new RuleBasedBreakIterator(this.rules);
|
||||
}
|
||||
|
||||
public static class Builder {
|
||||
|
||||
String rules;
|
||||
|
||||
Builder() { }
|
||||
|
||||
public Builder setRules(String rules) {
|
||||
this.rules = rules;
|
||||
return this;
|
||||
}
|
||||
|
||||
public RuleBasedSegmenter build() {
|
||||
return new RuleBasedSegmenter(this.rules);
|
||||
}
|
||||
}
|
||||
|
||||
public static class RuleBasedSegments implements Segments {
|
||||
private CharSequence source;
|
||||
|
||||
private RuleBasedSegmenter segmenter;
|
||||
|
||||
private BreakIterator breakIter;
|
||||
|
||||
RuleBasedSegments(CharSequence source, RuleBasedSegmenter segmenter) {
|
||||
this.source = source;
|
||||
this.segmenter = segmenter;
|
||||
this.breakIter = this.segmenter.getNewBreakIterator();
|
||||
|
||||
this.breakIter.setText(source);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Stream<CharSequence> subSequences() {
|
||||
return SegmentsImplUtils.subSequences(this.breakIter, this.source);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Segment segmentAt(int i) {
|
||||
return SegmentsImplUtils.segmentAt(this.breakIter, this.source, i);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Stream<Segment> segments() {
|
||||
return SegmentsImplUtils.segments(this.breakIter, this.source);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isBoundary(int i) {
|
||||
return SegmentsImplUtils.isBoundary(this.breakIter, this.source, i);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Stream<Segment> segmentsFrom(int i) {
|
||||
return SegmentsImplUtils.segmentsFrom(this.breakIter, this.source, i);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Stream<Segment> segmentsBefore(int i) {
|
||||
return SegmentsImplUtils.segmentsBefore(this.breakIter, this.source, i);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Function<Segment, CharSequence> segmentToSequenceFn() {
|
||||
return SegmentsImplUtils.segmentToSequenceFn(this.source);
|
||||
}
|
||||
|
||||
@Override
|
||||
public IntStream boundaries() {
|
||||
return SegmentsImplUtils.boundaries(this.breakIter, this.source);
|
||||
}
|
||||
|
||||
@Override
|
||||
public IntStream boundariesAfter(int i) {
|
||||
return SegmentsImplUtils.boundariesAfter(this.breakIter, this.source, i);
|
||||
}
|
||||
|
||||
@Override
|
||||
public IntStream boundariesBackFrom(int i) {
|
||||
return SegmentsImplUtils.boundariesBackFrom(this.breakIter, this.source, i);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,15 @@
|
|||
package com.ibm.icu.text.segmenter;
|
||||
|
||||
import com.ibm.icu.text.BreakIterator;
|
||||
|
||||
public interface Segmenter {
|
||||
Segments segment(CharSequence s);
|
||||
|
||||
/**
|
||||
* @internal
|
||||
* @deprecated This API is ICU internal only.
|
||||
*/
|
||||
@Deprecated
|
||||
BreakIterator getNewBreakIterator();
|
||||
|
||||
}
|
|
@ -0,0 +1,257 @@
|
|||
package com.ibm.icu.text.segmenter;
|
||||
|
||||
import com.ibm.icu.text.BreakIterator;
|
||||
import java.util.Iterator;
|
||||
import java.util.Spliterator;
|
||||
import java.util.function.Function;
|
||||
import java.util.function.IntConsumer;
|
||||
import java.util.stream.IntStream;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
public interface Segments {
|
||||
|
||||
Stream<CharSequence> subSequences();
|
||||
|
||||
Segment segmentAt(int i);
|
||||
|
||||
Stream<Segment> segments();
|
||||
|
||||
Stream<Segment> segmentsFrom(int i);
|
||||
|
||||
Stream<Segment> segmentsBefore(int i);
|
||||
|
||||
Function<Segment, CharSequence> segmentToSequenceFn();
|
||||
|
||||
/**
|
||||
* Returns whether offset {@code i} is a segmentation boundary. Throws an exception when
|
||||
* {@code i} is not a valid boundary position for the source sequence.
|
||||
* @param i
|
||||
* @return
|
||||
*/
|
||||
boolean isBoundary(int i);
|
||||
|
||||
IntStream boundaries();
|
||||
|
||||
IntStream boundariesAfter(int i);
|
||||
|
||||
IntStream boundariesBackFrom(int i);
|
||||
|
||||
//
|
||||
// Inner enums/classes in common for other inner classes
|
||||
//
|
||||
|
||||
enum IterationDirection {
|
||||
FORWARDS,
|
||||
BACKWARDS,
|
||||
}
|
||||
|
||||
//
|
||||
// Inner classes for Segment, SegmentIterable, and SegmentIterator
|
||||
//
|
||||
|
||||
// TODO: consider options in design for potential memory usage optimization:
|
||||
// 1) keep simple class with public fields, but requires field per Segment to point to source
|
||||
// 2) make Segment an interface (getSource, getStart, getLimit, getRuleStatus, newSegment), and
|
||||
// maybe an abstract class that implements the interface, maybe with a default method impl
|
||||
// for convenience for getting (allocating & returning) the subsequence
|
||||
// 3) do not link the multiple Segment objects and the single Segments object via a field, and
|
||||
// instead provide a function on Segments that can convert each Segment into a CharSequence
|
||||
class Segment {
|
||||
public final int start;
|
||||
public final int limit;
|
||||
public final int ruleStatus = 0;
|
||||
public final CharSequence source;
|
||||
|
||||
public Segment(int start, int limit, CharSequence source) {
|
||||
this.start = start;
|
||||
this.limit = limit;
|
||||
this.source = source;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* This {@code Iterable} exists to enable the creation of a {@code Spliterator} that in turn
|
||||
* enables the creation of a lazy {@code Stream}.
|
||||
*/
|
||||
class SegmentIterable implements Iterable<Segment> {
|
||||
BreakIterator breakIter;
|
||||
final IterationDirection direction;
|
||||
int startIdx;
|
||||
final CharSequence source;
|
||||
|
||||
SegmentIterable(BreakIterator breakIter, IterationDirection direction, int startIdx, CharSequence source) {
|
||||
this.breakIter = breakIter;
|
||||
this.direction = direction;
|
||||
this.startIdx = startIdx;
|
||||
this.source = source;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterator<Segment> iterator() {
|
||||
return new SegmentIterator(this.breakIter, this.direction, this.startIdx, this.source);
|
||||
}
|
||||
}
|
||||
|
||||
class SegmentIterator implements Iterator<Segment> {
|
||||
BreakIterator breakIter;
|
||||
final IterationDirection direction;
|
||||
int start;
|
||||
int limit;
|
||||
final CharSequence source;
|
||||
|
||||
SegmentIterator(BreakIterator breakIter, IterationDirection direction, int startIdx, CharSequence source) {
|
||||
this.breakIter = breakIter;
|
||||
this.direction = direction;
|
||||
this.source = source;
|
||||
|
||||
Segment segmentAtIdx = SegmentsImplUtils.segmentAt(breakIter, source, startIdx);
|
||||
|
||||
if (segmentAtIdx == null) {
|
||||
this.start = BreakIterator.DONE;
|
||||
} else if (direction == IterationDirection.FORWARDS) {
|
||||
this.start = segmentAtIdx.start;
|
||||
this.limit = breakIter.following(this.start);
|
||||
} else {
|
||||
assert direction == IterationDirection.BACKWARDS;
|
||||
if (breakIter.isBoundary(startIdx)) {
|
||||
// Note: breakIter::isBoundary is a stateful operation. It resets the position in the
|
||||
// BreakIterator, which we want to ensure that the position is where we think it is.
|
||||
this.start = startIdx;
|
||||
} else {
|
||||
// Since we already called BreakIterator.isBoundary() which mutates the BreakIterator
|
||||
// position to increment forwards when the return value is false, we should call
|
||||
// BreakIterator.previous() to update the iterator position while getting the start value
|
||||
// of the segment at startIdx
|
||||
this.start = breakIter.previous();
|
||||
}
|
||||
this.limit = getDirectionBasedNextIdx();
|
||||
}
|
||||
}
|
||||
|
||||
int getDirectionBasedNextIdx() {
|
||||
if (direction == IterationDirection.FORWARDS) {
|
||||
return breakIter.next();
|
||||
} else {
|
||||
assert direction == IterationDirection.BACKWARDS;
|
||||
return breakIter.previous();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return this.limit != BreakIterator.DONE;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Segment next() {
|
||||
Segment result;
|
||||
if (this.limit < this.start) {
|
||||
result = new Segment(this.limit, this.start, this.source);
|
||||
} else {
|
||||
result = new Segment(this.start, this.limit, this.source);
|
||||
}
|
||||
|
||||
this.start = this.limit;
|
||||
this.limit = getDirectionBasedNextIdx();
|
||||
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Inner classes for BoundaryIterable and BoundaryIterator
|
||||
//
|
||||
|
||||
class BoundaryIteratorOfInts {
|
||||
BreakIterator breakIter;
|
||||
IterationDirection direction;
|
||||
int currIdx;
|
||||
|
||||
BoundaryIteratorOfInts(BreakIterator breakIter, IterationDirection direction, int startIdx) {
|
||||
this.breakIter = breakIter;
|
||||
this.direction = direction;
|
||||
|
||||
// TODO(ICU-22987): Remove after fixing preceding(int) to return `DONE` for negative inputs
|
||||
if (startIdx < 0 && direction == IterationDirection.BACKWARDS) {
|
||||
this.currIdx = BreakIterator.DONE;
|
||||
return;
|
||||
}
|
||||
|
||||
if (direction == IterationDirection.FORWARDS) {
|
||||
this.currIdx = breakIter.following(startIdx);
|
||||
} else {
|
||||
assert direction == IterationDirection.BACKWARDS;
|
||||
this.currIdx = breakIter.preceding(startIdx);
|
||||
}
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return this.currIdx != BreakIterator.DONE;
|
||||
}
|
||||
|
||||
public Integer next() {
|
||||
int result = this.currIdx;
|
||||
|
||||
if (direction == IterationDirection.FORWARDS) {
|
||||
this.currIdx = breakIter.next();
|
||||
} else {
|
||||
assert direction == IterationDirection.BACKWARDS;
|
||||
this.currIdx = breakIter.previous();
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
class SegmentSpliterator implements Spliterator.OfInt {
|
||||
|
||||
private final BoundaryIteratorOfInts iter;
|
||||
|
||||
SegmentSpliterator(BreakIterator breakIter, IterationDirection direction, int startIdx) {
|
||||
iter = new BoundaryIteratorOfInts(breakIter, direction, startIdx);
|
||||
}
|
||||
|
||||
@Override
|
||||
public OfInt trySplit() {
|
||||
// The elements of the Stream represent an iteration through a string, and is thus inherently
|
||||
// stateful. Therefore, splitting this Stream does not make sense. Ex: splitting the Stream
|
||||
// is tantamount to discarding the segment subtended by the end value (index into the input
|
||||
// string) of one substream and the beginning value of the next substream.
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long estimateSize() {
|
||||
// The number of segments per input size depends on language, script, and
|
||||
// the content of the input string, and thus is hard to estimate without
|
||||
// sacrificing performance. Thus, returning `Long.MAX_VALUE`, according
|
||||
// to the API, to mean "unknown, or too expensive to compute".
|
||||
return Long.MAX_VALUE;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int characteristics() {
|
||||
return Spliterator.DISTINCT // BreakIterator always advances
|
||||
| Spliterator.IMMUTABLE // design of Segmenter API is to provide an immutable view of
|
||||
// segmentation by preventing the input string from mutating
|
||||
// in the underlying BreakIterator
|
||||
| Spliterator.NONNULL // primtive int is non-null
|
||||
| Spliterator.ORDERED // BreakIterator always advances, and in a single direction
|
||||
;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean tryAdvance(IntConsumer action) {
|
||||
if (action == null) {
|
||||
throw new NullPointerException();
|
||||
}
|
||||
if (iter.hasNext()) {
|
||||
action.accept(iter.next());
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,95 @@
|
|||
package com.ibm.icu.text.segmenter;
|
||||
|
||||
import com.ibm.icu.text.BreakIterator;
|
||||
import com.ibm.icu.text.segmenter.Segments.IterationDirection;
|
||||
import com.ibm.icu.text.segmenter.Segments.Segment;
|
||||
import com.ibm.icu.text.segmenter.Segments.SegmentIterable;
|
||||
import com.ibm.icu.text.segmenter.Segments.SegmentSpliterator;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.IntStream;
|
||||
import java.util.stream.Stream;
|
||||
import java.util.stream.StreamSupport;
|
||||
|
||||
|
||||
public class SegmentsImplUtils {
|
||||
|
||||
public static boolean isBoundary(BreakIterator breakIter, CharSequence source, int i) {
|
||||
return breakIter.isBoundary(i);
|
||||
}
|
||||
|
||||
public static Stream<CharSequence> subSequences(BreakIterator breakIter, CharSequence sourceSequence) {
|
||||
return segments(breakIter, sourceSequence).map(segmentToSequenceFn(sourceSequence));
|
||||
}
|
||||
|
||||
public static Segment segmentAt(BreakIterator breakIter, CharSequence sourceSequence, int i) {
|
||||
int start;
|
||||
int limit;
|
||||
|
||||
boolean isBoundary = breakIter.isBoundary(i);
|
||||
|
||||
if (isBoundary) {
|
||||
start = i;
|
||||
limit = breakIter.next();
|
||||
} else {
|
||||
// BreakIterator::isBoundary(i) will advance forwards to the next boundary if the argument
|
||||
// is not a boundary.
|
||||
limit = breakIter.current();
|
||||
start = breakIter.previous();
|
||||
}
|
||||
|
||||
if (start != BreakIterator.DONE && limit != BreakIterator.DONE) {
|
||||
return new Segment(start, limit, sourceSequence);
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
public static Stream<Segment> segments(BreakIterator breakIter, CharSequence sourceSequence) {
|
||||
return segmentsFrom(breakIter, sourceSequence, 0);
|
||||
}
|
||||
|
||||
public static Stream<Segment> segmentsFrom(BreakIterator breakIter, CharSequence sourceSequence, int i) {
|
||||
breakIter.setText(sourceSequence);
|
||||
|
||||
// create a Stream from a Spliterator of an Iterable so that the Stream can be lazy, not eager
|
||||
SegmentIterable iterable = new SegmentIterable(breakIter, IterationDirection.FORWARDS, i, sourceSequence);
|
||||
return StreamSupport.stream(iterable.spliterator(), false);
|
||||
}
|
||||
|
||||
public static Stream<Segment> segmentsBefore(BreakIterator breakIter, CharSequence sourceSequence, int i) {
|
||||
breakIter.setText(sourceSequence);
|
||||
|
||||
// create a Stream from a Spliterator of an Iterable so that the Stream can be lazy, not eager
|
||||
SegmentIterable iterable = new SegmentIterable(breakIter, IterationDirection.BACKWARDS, i, sourceSequence);
|
||||
return StreamSupport.stream(iterable.spliterator(), false);
|
||||
}
|
||||
|
||||
public static Function<Segment, CharSequence> segmentToSequenceFn(CharSequence sourceSequence) {
|
||||
return segment -> sourceSequence.subSequence(segment.start, segment.limit);
|
||||
}
|
||||
|
||||
public static IntStream boundaries(BreakIterator breakIter, CharSequence sourceSequence) {
|
||||
return boundariesAfter(breakIter, sourceSequence, -1);
|
||||
}
|
||||
|
||||
public static IntStream boundariesAfter(BreakIterator breakIter, CharSequence sourceSequence, int i) {
|
||||
breakIter.setText(sourceSequence);
|
||||
|
||||
// create a Stream from a Spliterator of an Iterable so that the Stream can be lazy, not eager
|
||||
return StreamSupport.intStream(new SegmentSpliterator(breakIter, IterationDirection.FORWARDS, i), false);
|
||||
}
|
||||
|
||||
public static IntStream boundariesBackFrom(BreakIterator breakIter, CharSequence sourceSequence, int i) {
|
||||
int sourceLength = sourceSequence.length();
|
||||
if (i < 0) {
|
||||
return IntStream.empty();
|
||||
}
|
||||
|
||||
boolean isOnBoundary = i <= sourceLength && isBoundary(breakIter, sourceSequence, i);
|
||||
int backFromIdx = isOnBoundary ? i + 1 : i;
|
||||
|
||||
// create a Stream from a Spliterator of an Iterable so that the Stream can be lazy, not eager
|
||||
return StreamSupport.intStream(new SegmentSpliterator(breakIter, IterationDirection.BACKWARDS, backFromIdx), false);
|
||||
}
|
||||
|
||||
}
|
|
@ -46,7 +46,7 @@ public class BreakIteratorRules_en_US_TEST extends ListResourceBundle {
|
|||
// all of which should not influence the algorithm
|
||||
"$_ignore_=[[:Mn:][:Me:][:Cf:]];"
|
||||
|
||||
// lower and upper case Roman letters, apostrophy and dash are
|
||||
// lower and upper case Roman letters, apostrophe and dash are
|
||||
// in the English dictionary
|
||||
+"$_dictionary_=[a-zA-Z\\'\\-];"
|
||||
|
||||
|
@ -64,7 +64,7 @@ public class BreakIteratorRules_en_US_TEST extends ListResourceBundle {
|
|||
+"$mid_word=[[:Pd:]\u00ad\u2027\\\"\\\'];"
|
||||
|
||||
// punctuation that can occur in the middle of a number: currently
|
||||
// apostrophes, qoutation marks, periods, commas, and the Arabic
|
||||
// apostrophes, quotation marks, periods, commas, and the Arabic
|
||||
// decimal point
|
||||
+"$mid_num=[\\\"\\\'\\,\u066b\\.];"
|
||||
|
||||
|
|
|
@ -0,0 +1,47 @@
|
|||
package com.ibm.icu.dev.test.text.segmenter;
|
||||
|
||||
import static org.hamcrest.CoreMatchers.is;
|
||||
import static org.hamcrest.MatcherAssert.assertThat;
|
||||
|
||||
import com.ibm.icu.dev.test.CoreTestFmwk;
|
||||
import com.ibm.icu.text.segmenter.LocalizedSegmenter;
|
||||
import com.ibm.icu.text.segmenter.LocalizedSegmenter.SegmentationType;
|
||||
import com.ibm.icu.text.segmenter.Segmenter;
|
||||
import com.ibm.icu.text.segmenter.Segments;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
import org.junit.Test;
|
||||
import org.junit.runner.RunWith;
|
||||
import org.junit.runners.JUnit4;
|
||||
|
||||
@RunWith(JUnit4.class)
|
||||
public class LocalizedSegmenterTest extends CoreTestFmwk {
|
||||
|
||||
@Test
|
||||
public void testLocaleInLocalizedSegmenter() {
|
||||
String source = "Die 21en Jahrh. ist die Beste.";
|
||||
|
||||
Object[][] casesData = {
|
||||
{"de", Arrays.asList("Die 21en Jahrh. ist die Beste.")},
|
||||
};
|
||||
|
||||
for (Object[] caseDatum : casesData) {
|
||||
String localeTag = (String) caseDatum[0];
|
||||
ULocale locale = ULocale.forLanguageTag(localeTag);
|
||||
List<CharSequence> expWords = (List<CharSequence>) caseDatum[1];
|
||||
|
||||
Segmenter wordSeg =
|
||||
LocalizedSegmenter.builder()
|
||||
.setLocale(locale)
|
||||
.setSegmentationType(SegmentationType.SENTENCE)
|
||||
.build();
|
||||
Segments segments = wordSeg.segment(source);
|
||||
|
||||
List<CharSequence> actWords = segments.subSequences().collect(Collectors.toList());
|
||||
|
||||
assertThat(actWords, is(expWords));
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,49 @@
|
|||
package com.ibm.icu.dev.test.text.segmenter;
|
||||
|
||||
import static org.hamcrest.CoreMatchers.is;
|
||||
import static org.hamcrest.MatcherAssert.assertThat;
|
||||
|
||||
import com.ibm.icu.dev.test.CoreTestFmwk;
|
||||
import com.ibm.icu.text.segmenter.RuleBasedSegmenter;
|
||||
import com.ibm.icu.text.segmenter.Segmenter;
|
||||
import com.ibm.icu.text.segmenter.Segments;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
import org.junit.Test;
|
||||
import org.junit.runner.RunWith;
|
||||
import org.junit.runners.JUnit4;
|
||||
|
||||
@RunWith(JUnit4.class)
|
||||
public class RuleBasedSegmenterTest extends CoreTestFmwk {
|
||||
|
||||
@Test
|
||||
public void testRules() {
|
||||
String source = "hejsan k:a tack";
|
||||
|
||||
Object[][] casesData = {
|
||||
{"default", ".*;", Arrays.asList("hejsan k:a tack")},
|
||||
// TODO: add more cases once RBBI rule syntax is understood
|
||||
};
|
||||
|
||||
for (Object[] caseDatum : casesData) {
|
||||
String desc = (String) caseDatum[0];
|
||||
String subrule = (String) caseDatum[1];
|
||||
List<CharSequence> expWords = (List<CharSequence>) caseDatum[2];
|
||||
|
||||
// the following rule substring was taken as a subset from BreakIteratorRules_en_US_TEST.java:
|
||||
String rules = subrule;
|
||||
|
||||
Segmenter seg = RuleBasedSegmenter.builder()
|
||||
.setRules(rules)
|
||||
.build();
|
||||
Segments segments = seg.segment(source);
|
||||
|
||||
List<CharSequence> actWords = segments.subSequences().collect(Collectors.toList());
|
||||
|
||||
assertThat(desc, actWords, is(expWords));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,373 @@
|
|||
package com.ibm.icu.dev.test.text.segmenter;
|
||||
|
||||
import static org.hamcrest.CoreMatchers.is;
|
||||
import static org.hamcrest.MatcherAssert.assertThat;
|
||||
|
||||
import com.ibm.icu.dev.test.CoreTestFmwk;
|
||||
import com.ibm.icu.text.segmenter.LocalizedSegmenter;
|
||||
import com.ibm.icu.text.segmenter.LocalizedSegmenter.SegmentationType;
|
||||
import com.ibm.icu.text.segmenter.Segmenter;
|
||||
import com.ibm.icu.text.segmenter.Segments;
|
||||
import com.ibm.icu.text.segmenter.Segments.Segment;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
import org.junit.Test;
|
||||
import org.junit.runner.RunWith;
|
||||
import org.junit.runners.JUnit4;
|
||||
|
||||
@RunWith(JUnit4.class)
|
||||
public class SegmentsTest extends CoreTestFmwk {
|
||||
|
||||
@Test
|
||||
public void testSegments() {
|
||||
Segmenter enWordSegmenter =
|
||||
LocalizedSegmenter.builder()
|
||||
.setLocale(ULocale.ENGLISH)
|
||||
.setSegmentationType(SegmentationType.WORD)
|
||||
.build();
|
||||
|
||||
String source1 = "The quick brown fox jumped over the lazy dog.";
|
||||
|
||||
// Create new Segments for source1
|
||||
Segments segments1 = enWordSegmenter.segment(source1);
|
||||
|
||||
List<Segment> segments = segments1.segments().collect(Collectors.toList());
|
||||
|
||||
assertEquals("first range start", 0, segments.get(0).start);
|
||||
assertEquals("first range limit", 3, segments.get(0).limit);
|
||||
|
||||
assertEquals("second range start", 3, segments.get(1).start);
|
||||
assertEquals("second range limit", 4, segments.get(1).limit);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMultipleSegmentObjectsFromSegmenter() {
|
||||
Segmenter enWordSegmenter =
|
||||
LocalizedSegmenter.builder()
|
||||
.setLocale(ULocale.ENGLISH)
|
||||
.setSegmentationType(SegmentationType.WORD)
|
||||
.build();
|
||||
|
||||
String source1 = "The quick brown fox jumped over the lazy dog.";
|
||||
String source2 = "Sphinx of black quartz, judge my vow.";
|
||||
String source3 = "How vexingly quick daft zebras jump!";
|
||||
|
||||
List<CharSequence> exp1 = Arrays.asList("The", " ", "quick", " ", "brown", " ", "fox", " ",
|
||||
"jumped", " ", "over", " ", "the", " ", "lazy", " ", "dog", ".");
|
||||
List<CharSequence> exp2 = Arrays.asList("Sphinx", " ", "of", " ", "black", " ", "quartz", ",",
|
||||
" ", "judge", " ", "my", " ", "vow", ".");
|
||||
List<CharSequence> exp3 = Arrays.asList("How", " ", "vexingly", " ", "quick", " ", "daft", " ",
|
||||
"zebras", " ", "jump", "!");
|
||||
|
||||
// Create new Segments for source1
|
||||
Segments segments1 = enWordSegmenter.segment(source1);
|
||||
List<CharSequence> act1 = segments1.subSequences().collect(Collectors.toList());
|
||||
assertThat(act1, is(exp1));
|
||||
|
||||
// Create new Segments for source2
|
||||
Segments segments2 = enWordSegmenter.segment(source2);
|
||||
List<CharSequence> act2 = segments2.subSequences().collect(Collectors.toList());
|
||||
assertThat(act2, is(exp2));
|
||||
|
||||
// Check that Segments for source1 is unaffected
|
||||
act1 = segments1.subSequences().collect(Collectors.toList());
|
||||
assertThat(act1, is(exp1));
|
||||
|
||||
// Create new Segments for source3
|
||||
Segments segments3 = enWordSegmenter.segment(source3);
|
||||
List<CharSequence> act3 = segments3.subSequences().collect(Collectors.toList());
|
||||
assertThat(act3, is(exp3));
|
||||
|
||||
// Check that Segments for source1 is unaffected
|
||||
act1 = segments1.subSequences().collect(Collectors.toList());
|
||||
assertThat(act1, is(exp1));
|
||||
|
||||
// Check that Segments for source2 is unaffected
|
||||
act2 = segments2.subSequences().collect(Collectors.toList());
|
||||
assertThat(act2, is(exp2));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testIsBoundary() {
|
||||
Segmenter enWordSegmenter =
|
||||
LocalizedSegmenter.builder()
|
||||
.setLocale(ULocale.ENGLISH)
|
||||
.setSegmentationType(LocalizedSegmenter.SegmentationType.WORD)
|
||||
.build();
|
||||
|
||||
String source1 = "The quick brown fox jumped over the lazy dog.";
|
||||
|
||||
// Create new Segments for source1
|
||||
Segments segments1 = enWordSegmenter.segment(source1);
|
||||
|
||||
Object[][] casesData = {
|
||||
{"start of segment", 4, true},
|
||||
{"between start and limit of segment", 6, false},
|
||||
{"limit of segment", 9, true},
|
||||
{"beginning of string", 0, true},
|
||||
{"end of string", source1.length(), true},
|
||||
};
|
||||
|
||||
for (Object[] caseDatum : casesData) {
|
||||
String desc = (String) caseDatum[0];
|
||||
int idx = (int) caseDatum[1];
|
||||
boolean exp = (boolean) caseDatum[2];
|
||||
|
||||
assertThat(desc, segments1.isBoundary(idx) == exp);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSegmentsFrom_middleOfSegment() {
|
||||
Segmenter enWordSegmenter =
|
||||
LocalizedSegmenter.builder()
|
||||
.setLocale(ULocale.ENGLISH)
|
||||
.setSegmentationType(LocalizedSegmenter.SegmentationType.WORD)
|
||||
.build();
|
||||
|
||||
String source1 = "The quick brown fox jumped over the lazy dog.";
|
||||
int startIdx = 1;
|
||||
|
||||
// Create new Segments for source1
|
||||
Segments segments1 = enWordSegmenter.segment(source1);
|
||||
|
||||
List<Segment> segments = segments1.segmentsFrom(startIdx).collect(Collectors.toList());
|
||||
|
||||
assertEquals("first range start", 0, segments.get(0).start);
|
||||
assertEquals("first range limit", 3, segments.get(0).limit);
|
||||
|
||||
assertEquals("second range start", 3, segments.get(1).start);
|
||||
assertEquals("second range limit", 4, segments.get(1).limit);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSegmentsFrom_onBoundary() {
|
||||
Segmenter enWordSegmenter =
|
||||
LocalizedSegmenter.builder()
|
||||
.setLocale(ULocale.ENGLISH)
|
||||
.setSegmentationType(LocalizedSegmenter.SegmentationType.WORD)
|
||||
.build();
|
||||
|
||||
String source1 = "The quick brown fox jumped over the lazy dog.";
|
||||
int startIdx = 3;
|
||||
|
||||
// Create new Segments for source1
|
||||
Segments segments1 = enWordSegmenter.segment(source1);
|
||||
|
||||
List<Segment> segments = segments1.segmentsFrom(startIdx).collect(Collectors.toList());
|
||||
|
||||
assertEquals("first range start", 3, segments.get(0).start);
|
||||
assertEquals("first range limit", 4, segments.get(0).limit);
|
||||
|
||||
assertEquals("second range start", 4, segments.get(1).start);
|
||||
assertEquals("second range limit", 9, segments.get(1).limit);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSegmentsBefore_middleOfSegment() {
|
||||
Segmenter enWordSegmenter =
|
||||
LocalizedSegmenter.builder()
|
||||
.setLocale(ULocale.ENGLISH)
|
||||
.setSegmentationType(LocalizedSegmenter.SegmentationType.WORD)
|
||||
.build();
|
||||
|
||||
String source1 = "The quick brown fox jumped over the lazy dog.";
|
||||
int startIdx = 8;
|
||||
|
||||
// Create new Segments for source1
|
||||
Segments segments1 = enWordSegmenter.segment(source1);
|
||||
|
||||
List<Segment> segments = segments1.segmentsBefore(startIdx).collect(Collectors.toList());
|
||||
|
||||
assertEquals("first range start", 3, segments.get(0).start);
|
||||
assertEquals("first range limit", 4, segments.get(0).limit);
|
||||
|
||||
assertEquals("second range start", 0, segments.get(1).start);
|
||||
assertEquals("second range limit", 3, segments.get(1).limit);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSegmentsBefore_onBoundary() {
|
||||
Segmenter enWordSegmenter =
|
||||
LocalizedSegmenter.builder()
|
||||
.setLocale(ULocale.ENGLISH)
|
||||
.setSegmentationType(LocalizedSegmenter.SegmentationType.WORD)
|
||||
.build();
|
||||
|
||||
String source1 = "The quick brown fox jumped over the lazy dog.";
|
||||
int startIdx = 9;
|
||||
|
||||
// Create new Segments for source1
|
||||
Segments segments1 = enWordSegmenter.segment(source1);
|
||||
|
||||
List<Segment> segments = segments1.segmentsBefore(startIdx).collect(Collectors.toList());
|
||||
|
||||
assertEquals("first range start", 4, segments.get(0).start);
|
||||
assertEquals("first range limit", 9, segments.get(0).limit);
|
||||
|
||||
assertEquals("second range start", 3, segments.get(1).start);
|
||||
assertEquals("second range limit", 4, segments.get(1).limit);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSegmentToSequenceFn() {
|
||||
Segmenter enWordSegmenter =
|
||||
LocalizedSegmenter.builder()
|
||||
.setLocale(ULocale.ENGLISH)
|
||||
.setSegmentationType(LocalizedSegmenter.SegmentationType.WORD)
|
||||
.build();
|
||||
|
||||
String source1 = "The quick brown fox jumped over the lazy dog.";
|
||||
int startIdx = 10;
|
||||
|
||||
// Create new Segments for source1
|
||||
Segments segments1 = enWordSegmenter.segment(source1);
|
||||
|
||||
List<CharSequence> exp1 = Arrays.asList(" ", "quick", " ", "The");
|
||||
|
||||
List<CharSequence> act1 = segments1.segmentsBefore(startIdx)
|
||||
.map(segments1.segmentToSequenceFn())
|
||||
.collect(Collectors.toList());
|
||||
|
||||
assertThat(act1, is(exp1));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBoundaries() {
|
||||
Segmenter enWordSegmenter =
|
||||
LocalizedSegmenter.builder()
|
||||
.setLocale(ULocale.ENGLISH)
|
||||
.setSegmentationType(SegmentationType.WORD)
|
||||
.build();
|
||||
|
||||
String source = "The quick brown fox jumped over the lazy dog.";
|
||||
|
||||
// Create new Segments for source
|
||||
Segments segments = enWordSegmenter.segment(source);
|
||||
|
||||
int[] exp = {0, 3, 4, 9, 10, 15, 16, 19, 20, 26, 27, 31, 32, 35, 36, 40, 41, 44, 45};
|
||||
|
||||
int[] act = segments.boundaries().toArray();
|
||||
|
||||
assertThat(act, is(exp));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBoundariesAfter() {
|
||||
Segmenter enWordSegmenter =
|
||||
LocalizedSegmenter.builder()
|
||||
.setLocale(ULocale.ENGLISH)
|
||||
.setSegmentationType(SegmentationType.WORD)
|
||||
.build();
|
||||
|
||||
String source = "The quick brown fox jumped over the lazy dog.";
|
||||
int TAKE_LIMIT = 5;
|
||||
|
||||
// Create new Segments for source
|
||||
Segments segments = enWordSegmenter.segment(source);
|
||||
|
||||
Object[][] casesData = {
|
||||
{"first " + TAKE_LIMIT + " before beginning", -2, new int[]{0, 3, 4, 9, 10}},
|
||||
{"first " + TAKE_LIMIT + " in the middle of the third segment", 5, new int[]{9, 10, 15, 16, 19}},
|
||||
{"first " + TAKE_LIMIT + " on the limit of the third segment", 9, new int[]{10, 15, 16, 19, 20}},
|
||||
{"first " + TAKE_LIMIT + " at the end", source.length(), new int[0]},
|
||||
{"first " + TAKE_LIMIT + " after the end", source.length()+1, new int[0]},
|
||||
};
|
||||
|
||||
for (Object[] caseDatum : casesData) {
|
||||
String desc = (String) caseDatum[0];
|
||||
int startIdx = (int) caseDatum[1];
|
||||
int[] exp = (int[]) caseDatum[2];
|
||||
|
||||
int[] act = segments.boundariesAfter(startIdx).limit(TAKE_LIMIT).toArray();
|
||||
|
||||
assertThat(desc, act, is(exp));
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBoundariesBackFrom() {
|
||||
Segmenter enWordSegmenter =
|
||||
LocalizedSegmenter.builder()
|
||||
.setLocale(ULocale.ENGLISH)
|
||||
.setSegmentationType(SegmentationType.WORD)
|
||||
.build();
|
||||
|
||||
String source = "The quick brown fox jumped over the lazy dog.";
|
||||
int TAKE_LIMIT = 5;
|
||||
|
||||
// Create new Segments for source
|
||||
Segments segments = enWordSegmenter.segment(source);
|
||||
|
||||
Object[][] casesData = {
|
||||
{"first " + TAKE_LIMIT + " before beginning", -2, new int[0]},
|
||||
{"first " + TAKE_LIMIT + " at the beginning", 0, new int[]{0}},
|
||||
{"first " + TAKE_LIMIT + " from the start of the 2nd to last segment", 41, new int[]{41, 40, 36, 35, 32}},
|
||||
{"first " + TAKE_LIMIT + " in the middle of the 2nd to last segment", 42, new int[]{41, 40, 36, 35, 32}},
|
||||
{"first " + TAKE_LIMIT + " at the end", source.length(), new int[]{45, 44, 41, 40, 36}},
|
||||
{"first " + TAKE_LIMIT + " after the end", source.length()+1, new int[]{45, 44, 41, 40, 36}},
|
||||
};
|
||||
|
||||
for (Object[] caseDatum : casesData) {
|
||||
String desc = (String) caseDatum[0];
|
||||
int startIdx = (int) caseDatum[1];
|
||||
int[] exp = (int[]) caseDatum[2];
|
||||
|
||||
int[] act = segments.boundariesBackFrom(startIdx).limit(TAKE_LIMIT).toArray();
|
||||
|
||||
assertThat(desc, act, is(exp));
|
||||
|
||||
if (startIdx < 0) {
|
||||
logKnownIssue("ICU-22987", "BreakIterator.preceding(-2) should return DONE, not 0");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSegmentAt() {
|
||||
Segmenter enWordSegmenter =
|
||||
LocalizedSegmenter.builder()
|
||||
.setLocale(ULocale.ENGLISH)
|
||||
.setSegmentationType(SegmentationType.WORD)
|
||||
.build();
|
||||
|
||||
String source = "The quick brown fox jumped over the lazy dog.";
|
||||
|
||||
// Create new Segments for source
|
||||
Segments segments1 = enWordSegmenter.segment(source);
|
||||
|
||||
Object[][] casesData = {
|
||||
{"index before beginning", -2, null, null},
|
||||
{"index at beginning", 0, 0, 3},
|
||||
{"index in the middle of the first segment", 2, 0, 3},
|
||||
{"index in the middle of the third segment", 5, 4, 9},
|
||||
{"index at the end", source.length()-1, 44, 45},
|
||||
{"index after the end", source.length()+1, null, null},
|
||||
};
|
||||
|
||||
for (Object[] caseDatum : casesData) {
|
||||
String desc = (String) caseDatum[0];
|
||||
int startIdx = (int) caseDatum[1];
|
||||
Integer expStart = (Integer) caseDatum[2];
|
||||
Integer expLimit = (Integer) caseDatum[3];
|
||||
|
||||
if (startIdx < 0 ) {
|
||||
logKnownIssue("ICU-22987", "BreakIterator.preceding(-2) should return DONE, not 0");
|
||||
}
|
||||
|
||||
if (expStart == null) {
|
||||
assertThat("Out of bounds range should be null", expLimit == null);
|
||||
} else {
|
||||
Segment segment = segments1.segmentAt(startIdx);
|
||||
|
||||
assertEquals(desc + ", start", (long) expStart.intValue(), (long) segment.start);
|
||||
assertEquals(desc + ", limit", (long) expLimit.intValue(), (long) segment.limit);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
Loading…
Add table
Reference in a new issue