This commit is contained in:
Elango Cheran 2025-04-03 23:38:20 +01:00 committed by GitHub
commit d3488fe543
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 1108 additions and 2 deletions

View file

@ -0,0 +1,153 @@
package com.ibm.icu.text.segmenter;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.util.ULocale;
import java.util.function.Function;
import java.util.stream.IntStream;
import java.util.stream.Stream;
/**
 * A {@link Segmenter} that obtains its boundaries from the locale-sensitive
 * {@link BreakIterator} factory methods ({@code getCharacterInstance}, {@code getWordInstance},
 * {@code getLineInstance}, {@code getSentenceInstance}).
 *
 * <p>Instances are created through {@link #builder()}. The locale and segmentation type are fixed
 * at construction time; every call to {@link #segment(CharSequence)} creates a new
 * {@link BreakIterator}, so separate {@link Segments} objects do not share iterator state.
 */
public class LocalizedSegmenter implements Segmenter {

  private final ULocale locale;

  private final SegmentationType segmentationType;

  /** Use {@link #builder()} to create instances. */
  LocalizedSegmenter(ULocale locale, SegmentationType segmentationType) {
    this.locale = locale;
    this.segmentationType = segmentationType;
  }

  /**
   * Returns a builder whose defaults are {@link ULocale#ROOT} and
   * {@link SegmentationType#GRAPHEME_CLUSTER}.
   */
  public static Builder builder() {
    return new Builder();
  }

  /**
   * Creates a {@link Segments} view over {@code s} using this segmenter's locale and
   * segmentation type.
   *
   * @param s the text to segment
   * @return a new {@link Segments} object backed by its own {@link BreakIterator}
   */
  @Override
  public Segments segment(CharSequence s) {
    return new LocalizedSegments(s, this);
  }

  /**
   * Creates a new {@link BreakIterator} for the configured locale and segmentation type.
   *
   * @internal
   * @deprecated This API is ICU internal only.
   */
  @Override
  @Deprecated
  public BreakIterator getNewBreakIterator() {
    switch (segmentationType) {
      case LINE:
        return BreakIterator.getLineInstance(locale);
      case SENTENCE:
        return BreakIterator.getSentenceInstance(locale);
      case WORD:
        return BreakIterator.getWordInstance(locale);
      case GRAPHEME_CLUSTER:
      default:
        // Grapheme clusters are the fallback for any value without a dedicated case.
        return BreakIterator.getCharacterInstance(locale);
    }
  }

  /** The kind of unit that a {@link LocalizedSegmenter} splits text into. */
  public enum SegmentationType {
    GRAPHEME_CLUSTER,
    WORD,
    LINE,
    SENTENCE,
  }

  /** Fluent builder for {@link LocalizedSegmenter}. */
  public static class Builder {

    private ULocale locale = ULocale.ROOT;

    private SegmentationType segmentationType = SegmentationType.GRAPHEME_CLUSTER;

    Builder() { }

    /**
     * Sets the locale whose segmentation rules should be used.
     *
     * @param locale the locale handed to the {@link BreakIterator} factory method
     * @return this builder
     */
    public Builder setLocale(ULocale locale) {
      this.locale = locale;
      return this;
    }

    /**
     * Sets the kind of segment to produce.
     *
     * @param segmentationType the segmentation type
     * @return this builder
     */
    public Builder setSegmentationType(SegmentationType segmentationType) {
      this.segmentationType = segmentationType;
      return this;
    }

    /** Creates a {@link LocalizedSegmenter} from the values currently set on this builder. */
    public LocalizedSegmenter build() {
      return new LocalizedSegmenter(locale, segmentationType);
    }
  }

  /**
   * {@link Segments} implementation returned by {@link LocalizedSegmenter#segment(CharSequence)}.
   * All operations delegate to {@link SegmentsImplUtils} using the single {@link BreakIterator}
   * created in the constructor.
   *
   * <p>This is a static nested class: the owning segmenter is already passed in explicitly, so an
   * implicit enclosing-instance reference would only duplicate it. This also matches
   * {@code RuleBasedSegmenter.RuleBasedSegments}.
   */
  public static class LocalizedSegments implements Segments {

    private final CharSequence source;

    private final LocalizedSegmenter segmenter;

    private final BreakIterator breakIter;

    private LocalizedSegments(CharSequence source, LocalizedSegmenter segmenter) {
      this.source = source;
      this.segmenter = segmenter;
      this.breakIter = this.segmenter.getNewBreakIterator();
      this.breakIter.setText(source);
    }

    @Override
    public Stream<CharSequence> subSequences() {
      return SegmentsImplUtils.subSequences(breakIter, source);
    }

    @Override
    public Segment segmentAt(int i) {
      return SegmentsImplUtils.segmentAt(breakIter, source, i);
    }

    @Override
    public Stream<Segment> segments() {
      return SegmentsImplUtils.segments(breakIter, source);
    }

    @Override
    public boolean isBoundary(int i) {
      return SegmentsImplUtils.isBoundary(breakIter, source, i);
    }

    @Override
    public Stream<Segment> segmentsFrom(int i) {
      return SegmentsImplUtils.segmentsFrom(breakIter, source, i);
    }

    @Override
    public Stream<Segment> segmentsBefore(int i) {
      return SegmentsImplUtils.segmentsBefore(breakIter, source, i);
    }

    @Override
    public Function<Segment, CharSequence> segmentToSequenceFn() {
      return SegmentsImplUtils.segmentToSequenceFn(source);
    }

    @Override
    public IntStream boundaries() {
      return SegmentsImplUtils.boundaries(breakIter, source);
    }

    @Override
    public IntStream boundariesAfter(int i) {
      return SegmentsImplUtils.boundariesAfter(breakIter, source, i);
    }

    @Override
    public IntStream boundariesBackFrom(int i) {
      return SegmentsImplUtils.boundariesBackFrom(breakIter, source, i);
    }
  }
}

View file

@ -0,0 +1,117 @@
package com.ibm.icu.text.segmenter;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import java.util.function.Function;
import java.util.stream.IntStream;
import java.util.stream.Stream;
/**
 * A {@link Segmenter} whose boundaries are defined by a caller-supplied rule string that is
 * handed to {@link RuleBasedBreakIterator}.
 *
 * <p>Instances are created through {@link #builder()}. A new {@link RuleBasedBreakIterator} is
 * constructed from the rule string each time {@link #getNewBreakIterator()} is called, which
 * happens once per {@link #segment(CharSequence)} call.
 */
public class RuleBasedSegmenter implements Segmenter {

  private String rules;

  /** Use {@link #builder()} to create instances. */
  RuleBasedSegmenter(String ruleSource) {
    rules = ruleSource;
  }

  /** Returns a builder with no rules set. */
  public static Builder builder() {
    return new Builder();
  }

  /**
   * Creates a {@link Segments} view over {@code s} using this segmenter's rules.
   *
   * @param s the text to segment
   * @return a new {@link Segments} object backed by its own {@link BreakIterator}
   */
  @Override
  public Segments segment(CharSequence s) {
    return new RuleBasedSegments(s, this);
  }

  /**
   * Creates a new {@link RuleBasedBreakIterator} from the configured rule string.
   *
   * @internal
   * @deprecated This API is ICU internal only.
   */
  @Override
  @Deprecated
  public BreakIterator getNewBreakIterator() {
    return new RuleBasedBreakIterator(rules);
  }

  /** Fluent builder for {@link RuleBasedSegmenter}. */
  public static class Builder {

    String rules;

    Builder() { }

    /**
     * Sets the rule string used to build the break iterator.
     *
     * @param rules the rule source passed to {@link RuleBasedBreakIterator}
     * @return this builder
     */
    public Builder setRules(String rules) {
      this.rules = rules;
      return this;
    }

    /** Creates a {@link RuleBasedSegmenter} from the rules currently set on this builder. */
    public RuleBasedSegmenter build() {
      return new RuleBasedSegmenter(rules);
    }
  }

  /**
   * {@link Segments} implementation returned by {@link RuleBasedSegmenter#segment(CharSequence)}.
   * All operations delegate to {@link SegmentsImplUtils} using the single {@link BreakIterator}
   * created in the constructor.
   */
  public static class RuleBasedSegments implements Segments {

    private CharSequence text;

    private RuleBasedSegmenter owner;

    private BreakIterator boundaryFinder;

    RuleBasedSegments(CharSequence source, RuleBasedSegmenter segmenter) {
      text = source;
      owner = segmenter;
      // Each Segments object gets its own iterator, already pointed at the text.
      BreakIterator fresh = segmenter.getNewBreakIterator();
      boundaryFinder = fresh;
      fresh.setText(source);
    }

    @Override
    public Stream<CharSequence> subSequences() {
      return SegmentsImplUtils.subSequences(boundaryFinder, text);
    }

    @Override
    public Segment segmentAt(int i) {
      return SegmentsImplUtils.segmentAt(boundaryFinder, text, i);
    }

    @Override
    public Stream<Segment> segments() {
      return SegmentsImplUtils.segments(boundaryFinder, text);
    }

    @Override
    public boolean isBoundary(int i) {
      return SegmentsImplUtils.isBoundary(boundaryFinder, text, i);
    }

    @Override
    public Stream<Segment> segmentsFrom(int i) {
      return SegmentsImplUtils.segmentsFrom(boundaryFinder, text, i);
    }

    @Override
    public Stream<Segment> segmentsBefore(int i) {
      return SegmentsImplUtils.segmentsBefore(boundaryFinder, text, i);
    }

    @Override
    public Function<Segment, CharSequence> segmentToSequenceFn() {
      return SegmentsImplUtils.segmentToSequenceFn(text);
    }

    @Override
    public IntStream boundaries() {
      return SegmentsImplUtils.boundaries(boundaryFinder, text);
    }

    @Override
    public IntStream boundariesAfter(int i) {
      return SegmentsImplUtils.boundariesAfter(boundaryFinder, text, i);
    }

    @Override
    public IntStream boundariesBackFrom(int i) {
      return SegmentsImplUtils.boundariesBackFrom(boundaryFinder, text, i);
    }
  }
}

View file

@ -0,0 +1,15 @@
package com.ibm.icu.text.segmenter;
import com.ibm.icu.text.BreakIterator;
/**
 * Entry point of the segmentation API: turns a piece of text into a {@link Segments} object from
 * which segments and boundaries can be queried.
 */
public interface Segmenter {
/**
 * Creates a {@link Segments} object for the given text.
 *
 * @param s the text to segment
 * @return the segmentation view over {@code s}
 */
Segments segment(CharSequence s);
/**
 * Returns a {@link BreakIterator} for this segmenter. The implementations in this package
 * create a new iterator on every call.
 *
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
BreakIterator getNewBreakIterator();
}

View file

@ -0,0 +1,257 @@
package com.ibm.icu.text.segmenter;
import com.ibm.icu.text.BreakIterator;
import java.util.Iterator;
import java.util.Spliterator;
import java.util.function.Function;
import java.util.function.IntConsumer;
import java.util.stream.IntStream;
import java.util.stream.Stream;
/**
 * A view of the segmentation of one source {@link CharSequence}, as produced by
 * {@link Segmenter#segment(CharSequence)}. Segments and boundaries are exposed as lazy streams.
 *
 * <p>Offsets are indices into the source sequence. A {@link Segment} covers the half-open range
 * from {@code start} (inclusive) to {@code limit} (exclusive).
 */
public interface Segments {

  /** Returns every segment of the source, in order, as a sub-sequence of the source. */
  Stream<CharSequence> subSequences();

  /**
   * Returns the segment that contains offset {@code i}.
   *
   * @param i an offset into the source sequence
   * @return the segment containing {@code i}, or {@code null} when no complete segment could be
   *     found (one of its edges came back as {@link BreakIterator#DONE})
   */
  Segment segmentAt(int i);

  /** Returns every segment of the source, from the first to the last. */
  Stream<Segment> segments();

  /**
   * Returns the segments in forward order, beginning with the segment that contains {@code i}.
   *
   * @param i an offset into the source sequence
   */
  Stream<Segment> segmentsFrom(int i);

  /**
   * Returns the segments that end at or before {@code i}, nearest first (reverse order).
   *
   * @param i an offset into the source sequence
   */
  Stream<Segment> segmentsBefore(int i);

  /**
   * Returns a function mapping a {@link Segment} of this object to the corresponding
   * sub-sequence of the source.
   */
  Function<Segment, CharSequence> segmentToSequenceFn();

  /**
   * Returns whether offset {@code i} is a segmentation boundary.
   *
   * <p>The implementations in this package delegate to {@link BreakIterator#isBoundary(int)}.
   * NOTE(review): that call is expected to throw for an offset outside the source sequence
   * rather than return {@code false} -- confirm against the BreakIterator in use.
   *
   * @param i an offset into the source sequence
   * @return {@code true} if a boundary lies exactly at {@code i}
   */
  boolean isBoundary(int i);

  /** Returns all boundaries of the source in ascending order. */
  IntStream boundaries();

  /**
   * Returns, in ascending order, the boundaries that lie strictly after {@code i}.
   *
   * @param i an offset; may be negative to start before the first boundary
   */
  IntStream boundariesAfter(int i);

  /**
   * Returns, in descending order, the boundaries that lie at or before {@code i}.
   *
   * @param i an offset; a negative value produces an empty stream
   */
  IntStream boundariesBackFrom(int i);

  //
  // Inner enums/classes in common for other inner classes
  //

  /** Direction in which a {@link BreakIterator} is walked. */
  enum IterationDirection {
    FORWARDS,
    BACKWARDS,
  }

  //
  // Inner classes for Segment, SegmentIterable, and SegmentIterator
  //

  // TODO: consider options in design for potential memory usage optimization:
  //   1) keep simple class with public fields, but requires field per Segment to point to source
  //   2) make Segment an interface (getSource, getStart, getLimit, getRuleStatus, newSegment), and
  //      maybe an abstract class that implements the interface, maybe with a default method impl
  //      for convenience for getting (allocating & returning) the subsequence
  //   3) do not link the multiple Segment objects and the single Segments object via a field, and
  //      instead provide a function on Segments that can convert each Segment into a CharSequence

  /** One segment of the source: the half-open offset range {@code [start, limit)}. */
  class Segment {
    /** Offset of the first code unit of the segment. */
    public final int start;

    /** Offset just past the last code unit of the segment. */
    public final int limit;

    /** Rule status of the segment; always {@code 0} in this implementation. */
    public final int ruleStatus = 0;

    /** The sequence that {@link #start} and {@link #limit} index into. */
    public final CharSequence source;

    public Segment(int start, int limit, CharSequence source) {
      this.start = start;
      this.limit = limit;
      this.source = source;
    }
  }

  /**
   * This {@code Iterable} exists to enable the creation of a {@code Spliterator} that in turn
   * enables the creation of a lazy {@code Stream}.
   */
  class SegmentIterable implements Iterable<Segment> {
    BreakIterator breakIter;
    final IterationDirection direction;
    int startIdx;
    final CharSequence source;

    SegmentIterable(BreakIterator breakIter, IterationDirection direction, int startIdx, CharSequence source) {
      this.breakIter = breakIter;
      this.direction = direction;
      this.startIdx = startIdx;
      this.source = source;
    }

    @Override
    public Iterator<Segment> iterator() {
      return new SegmentIterator(this.breakIter, this.direction, this.startIdx, this.source);
    }
  }

  /**
   * Walks a {@link BreakIterator} in one direction and yields a {@link Segment} for each pair of
   * adjacent boundaries. {@code start} is the boundary reached most recently and {@code limit}
   * the next one in the direction of travel; iteration is exhausted once {@code limit} is
   * {@link BreakIterator#DONE}.
   */
  class SegmentIterator implements Iterator<Segment> {
    BreakIterator breakIter;
    final IterationDirection direction;
    int start;
    int limit;
    final CharSequence source;

    SegmentIterator(BreakIterator breakIter, IterationDirection direction, int startIdx, CharSequence source) {
      this.breakIter = breakIter;
      this.direction = direction;
      this.source = source;

      if (direction == IterationDirection.FORWARDS) {
        Segment segmentAtIdx = SegmentsImplUtils.segmentAt(breakIter, source, startIdx);
        if (segmentAtIdx == null) {
          // No segment begins at or contains startIdx. Both fields must be DONE: leaving limit
          // at its default of 0 would make hasNext() report a bogus Segment(-1, 0).
          this.start = BreakIterator.DONE;
          this.limit = BreakIterator.DONE;
        } else {
          this.start = segmentAtIdx.start;
          this.limit = breakIter.following(this.start);
        }
      } else {
        assert direction == IterationDirection.BACKWARDS;
        // Backward iteration only needs boundaries at or before startIdx, so it must not be
        // gated on segmentAt(), which also requires a boundary after startIdx and would
        // otherwise suppress the trailing segments when iterating back from the last boundary.
        if (breakIter.isBoundary(startIdx)) {
          // Note: breakIter::isBoundary is a stateful operation. It resets the position in the
          // BreakIterator, which we want to ensure that the position is where we think it is.
          this.start = startIdx;
        } else {
          // isBoundary() moved the iterator forwards to the next boundary because startIdx is not
          // one, so previous() both repositions the iterator and returns the start of the segment
          // that contains startIdx.
          this.start = breakIter.previous();
        }
        this.limit = getDirectionBasedNextIdx();
      }
    }

    /** Advances the underlying iterator one boundary in the iteration direction. */
    int getDirectionBasedNextIdx() {
      if (direction == IterationDirection.FORWARDS) {
        return breakIter.next();
      } else {
        assert direction == IterationDirection.BACKWARDS;
        return breakIter.previous();
      }
    }

    @Override
    public boolean hasNext() {
      return this.limit != BreakIterator.DONE;
    }

    /**
     * Returns the next segment, always with {@code start <= limit} regardless of direction.
     *
     * @throws java.util.NoSuchElementException if the iteration has no more segments
     */
    @Override
    public Segment next() {
      if (!hasNext()) {
        // Iterator contract: never fabricate a segment out of the DONE sentinel.
        throw new java.util.NoSuchElementException();
      }
      Segment result;
      if (this.limit < this.start) {
        // Backward iteration: the boundary reached later is the smaller offset.
        result = new Segment(this.limit, this.start, this.source);
      } else {
        result = new Segment(this.start, this.limit, this.source);
      }

      this.start = this.limit;
      this.limit = getDirectionBasedNextIdx();

      return result;
    }
  }

  //
  // Inner classes for BoundaryIterable and BoundaryIterator
  //

  /**
   * Iterator-like helper that yields boundary offsets in one direction, starting from the first
   * boundary after (forwards) or before (backwards) {@code startIdx}.
   */
  class BoundaryIteratorOfInts {
    BreakIterator breakIter;
    IterationDirection direction;
    int currIdx;

    BoundaryIteratorOfInts(BreakIterator breakIter, IterationDirection direction, int startIdx) {
      this.breakIter = breakIter;
      this.direction = direction;

      // TODO(ICU-22987): Remove after fixing preceding(int) to return `DONE` for negative inputs
      if (startIdx < 0 && direction == IterationDirection.BACKWARDS) {
        this.currIdx = BreakIterator.DONE;
        return;
      }

      if (direction == IterationDirection.FORWARDS) {
        this.currIdx = breakIter.following(startIdx);
      } else {
        assert direction == IterationDirection.BACKWARDS;
        this.currIdx = breakIter.preceding(startIdx);
      }
    }

    /** Returns whether another boundary is available. */
    public boolean hasNext() {
      return this.currIdx != BreakIterator.DONE;
    }

    /** Returns the current boundary and advances to the next one in the iteration direction. */
    public Integer next() {
      int result = this.currIdx;

      if (direction == IterationDirection.FORWARDS) {
        this.currIdx = breakIter.next();
      } else {
        assert direction == IterationDirection.BACKWARDS;
        this.currIdx = breakIter.previous();
      }

      return result;
    }
  }

  /** Non-splittable {@code Spliterator.OfInt} over boundary offsets, used to build lazy streams. */
  class SegmentSpliterator implements Spliterator.OfInt {

    private final BoundaryIteratorOfInts iter;

    SegmentSpliterator(BreakIterator breakIter, IterationDirection direction, int startIdx) {
      iter = new BoundaryIteratorOfInts(breakIter, direction, startIdx);
    }

    @Override
    public OfInt trySplit() {
      // The elements of the Stream represent an iteration through a string, and is thus inherently
      // stateful. Therefore, splitting this Stream does not make sense. Ex: splitting the Stream
      // is tantamount to discarding the segment subtended by the end value (index into the input
      // string) of one substream and the beginning value of the next substream.
      return null;
    }

    @Override
    public long estimateSize() {
      // The number of segments per input size depends on language, script, and
      // the content of the input string, and thus is hard to estimate without
      // sacrificing performance. Thus, returning `Long.MAX_VALUE`, according
      // to the API, to mean "unknown, or too expensive to compute".
      return Long.MAX_VALUE;
    }

    @Override
    public int characteristics() {
      return Spliterator.DISTINCT  // BreakIterator always advances
          | Spliterator.IMMUTABLE  // design of Segmenter API is to provide an immutable view of
                                   // segmentation by preventing the input string from mutating
                                   // in the underlying BreakIterator
          | Spliterator.NONNULL    // primitive int is non-null
          | Spliterator.ORDERED    // BreakIterator always advances, and in a single direction
          ;
    }

    @Override
    public boolean tryAdvance(IntConsumer action) {
      if (action == null) {
        throw new NullPointerException();
      }
      if (iter.hasNext()) {
        action.accept(iter.next());
        return true;
      } else {
        return false;
      }
    }
  }
}

View file

@ -0,0 +1,95 @@
package com.ibm.icu.text.segmenter;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.segmenter.Segments.IterationDirection;
import com.ibm.icu.text.segmenter.Segments.Segment;
import com.ibm.icu.text.segmenter.Segments.SegmentIterable;
import com.ibm.icu.text.segmenter.Segments.SegmentSpliterator;
import java.util.function.Function;
import java.util.stream.IntStream;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
/**
 * Shared implementation of the {@link Segments} operations in terms of a {@link BreakIterator}
 * and the sequence it was set up for.
 *
 * <p>The {@link BreakIterator} passed to these methods is stateful and is repositioned by every
 * call; the order of the iterator calls inside each method is therefore significant.
 */
public class SegmentsImplUtils {

  /** Returns whether offset {@code i} is a boundary according to {@code breakIter}. */
  public static boolean isBoundary(BreakIterator breakIter, CharSequence source, int i) {
    return breakIter.isBoundary(i);
  }

  /** Returns all segments of {@code sourceSequence} as sub-sequences, in forward order. */
  public static Stream<CharSequence> subSequences(BreakIterator breakIter, CharSequence sourceSequence) {
    Stream<Segment> allSegments = segments(breakIter, sourceSequence);
    Function<Segment, CharSequence> toText = segmentToSequenceFn(sourceSequence);
    return allSegments.map(toText);
  }

  /**
   * Finds the segment containing offset {@code i}.
   *
   * @return the segment, or {@code null} if either of its edges came back as
   *     {@link BreakIterator#DONE}
   */
  public static Segment segmentAt(BreakIterator breakIter, CharSequence sourceSequence, int i) {
    final int segStart;
    final int segLimit;

    if (breakIter.isBoundary(i)) {
      // i itself opens the segment; the next boundary closes it.
      segStart = i;
      segLimit = breakIter.next();
    } else {
      // A failed isBoundary(i) leaves the iterator on the boundary following i, so the segment
      // runs from the boundary before that one up to the current position.
      segLimit = breakIter.current();
      segStart = breakIter.previous();
    }

    boolean edgeMissing = segStart == BreakIterator.DONE || segLimit == BreakIterator.DONE;
    return edgeMissing ? null : new Segment(segStart, segLimit, sourceSequence);
  }

  /** Returns all segments of {@code sourceSequence} in forward order. */
  public static Stream<Segment> segments(BreakIterator breakIter, CharSequence sourceSequence) {
    return segmentsFrom(breakIter, sourceSequence, 0);
  }

  /** Returns the segments in forward order, beginning with the one located at offset {@code i}. */
  public static Stream<Segment> segmentsFrom(BreakIterator breakIter, CharSequence sourceSequence, int i) {
    return lazySegmentStream(breakIter, sourceSequence, IterationDirection.FORWARDS, i);
  }

  /** Returns the segments in backward order, walking back from offset {@code i}. */
  public static Stream<Segment> segmentsBefore(BreakIterator breakIter, CharSequence sourceSequence, int i) {
    return lazySegmentStream(breakIter, sourceSequence, IterationDirection.BACKWARDS, i);
  }

  /**
   * Resets {@code breakIter} onto {@code sourceSequence} and wraps a {@link SegmentIterable} in a
   * sequential stream. Going through the iterable's spliterator keeps the stream lazy: segments
   * are only computed as the stream is consumed.
   */
  private static Stream<Segment> lazySegmentStream(
      BreakIterator breakIter, CharSequence sourceSequence, IterationDirection direction, int i) {
    breakIter.setText(sourceSequence);
    SegmentIterable lazySegments = new SegmentIterable(breakIter, direction, i, sourceSequence);
    return StreamSupport.stream(lazySegments.spliterator(), false);
  }

  /** Returns a function that extracts a segment's text from {@code sourceSequence}. */
  public static Function<Segment, CharSequence> segmentToSequenceFn(CharSequence sourceSequence) {
    return seg -> sourceSequence.subSequence(seg.start, seg.limit);
  }

  /** Returns all boundaries of {@code sourceSequence} in ascending order. */
  public static IntStream boundaries(BreakIterator breakIter, CharSequence sourceSequence) {
    // Starting from -1 makes the boundary at offset 0 part of the result.
    return boundariesAfter(breakIter, sourceSequence, -1);
  }

  /** Returns, in ascending order, the boundaries found after offset {@code i}. */
  public static IntStream boundariesAfter(BreakIterator breakIter, CharSequence sourceSequence, int i) {
    breakIter.setText(sourceSequence);
    // A spliterator-backed stream produces boundaries on demand instead of eagerly.
    SegmentSpliterator forward = new SegmentSpliterator(breakIter, IterationDirection.FORWARDS, i);
    return StreamSupport.intStream(forward, false);
  }

  /**
   * Returns, in descending order, the boundaries at or before offset {@code i}; empty when
   * {@code i} is negative.
   */
  public static IntStream boundariesBackFrom(BreakIterator breakIter, CharSequence sourceSequence, int i) {
    int length = sourceSequence.length();
    if (i < 0) {
      return IntStream.empty();
    }

    // The spliterator starts from the boundary before its start index, so when i is itself a
    // boundary the start index is bumped by one to keep i in the result.
    int backFromIdx = i;
    if (i <= length && isBoundary(breakIter, sourceSequence, i)) {
      backFromIdx = i + 1;
    }

    // A spliterator-backed stream produces boundaries on demand instead of eagerly.
    SegmentSpliterator backward =
        new SegmentSpliterator(breakIter, IterationDirection.BACKWARDS, backFromIdx);
    return StreamSupport.intStream(backward, false);
  }
}

View file

@ -46,7 +46,7 @@ public class BreakIteratorRules_en_US_TEST extends ListResourceBundle {
// all of which should not influence the algorithm
"$_ignore_=[[:Mn:][:Me:][:Cf:]];"
// lower and upper case Roman letters, apostrophy and dash are
// lower and upper case Roman letters, apostrophe and dash are
// in the English dictionary
+"$_dictionary_=[a-zA-Z\\'\\-];"
@ -64,7 +64,7 @@ public class BreakIteratorRules_en_US_TEST extends ListResourceBundle {
+"$mid_word=[[:Pd:]\u00ad\u2027\\\"\\\'];"
// punctuation that can occur in the middle of a number: currently
// apostrophes, qoutation marks, periods, commas, and the Arabic
// apostrophes, quotation marks, periods, commas, and the Arabic
// decimal point
+"$mid_num=[\\\"\\\'\\,\u066b\\.];"

View file

@ -0,0 +1,47 @@
package com.ibm.icu.dev.test.text.segmenter;
import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.MatcherAssert.assertThat;
import com.ibm.icu.dev.test.CoreTestFmwk;
import com.ibm.icu.text.segmenter.LocalizedSegmenter;
import com.ibm.icu.text.segmenter.LocalizedSegmenter.SegmentationType;
import com.ibm.icu.text.segmenter.Segmenter;
import com.ibm.icu.text.segmenter.Segments;
import com.ibm.icu.util.ULocale;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
/** Tests that {@link LocalizedSegmenter} applies the locale it was built with. */
@RunWith(JUnit4.class)
public class LocalizedSegmenterTest extends CoreTestFmwk {

  /**
   * Segments one German sample text into sentences for each listed locale and compares the
   * resulting sub-sequences with the expected list.
   */
  @Test
  public void testLocaleInLocalizedSegmenter() {
    String source = "Die 21en Jahrh. ist die Beste.";

    // Each row: { language tag, expected sentence segments of `source` }.
    Object[][] casesData = {
        {"de", Arrays.asList("Die 21en Jahrh. ist die Beste.")},
    };

    for (Object[] row : casesData) {
      String tag = (String) row[0];
      ULocale locale = ULocale.forLanguageTag(tag);
      List<CharSequence> expected = (List<CharSequence>) row[1];

      Segmenter sentenceSegmenter =
          LocalizedSegmenter.builder()
              .setLocale(locale)
              .setSegmentationType(SegmentationType.SENTENCE)
              .build();
      Segments segmented = sentenceSegmenter.segment(source);

      List<CharSequence> actual = segmented.subSequences().collect(Collectors.toList());

      assertThat(actual, is(expected));
    }
  }
}

View file

@ -0,0 +1,49 @@
package com.ibm.icu.dev.test.text.segmenter;
import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.MatcherAssert.assertThat;
import com.ibm.icu.dev.test.CoreTestFmwk;
import com.ibm.icu.text.segmenter.RuleBasedSegmenter;
import com.ibm.icu.text.segmenter.Segmenter;
import com.ibm.icu.text.segmenter.Segments;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
/** Tests segmentation driven by caller-supplied rules via {@link RuleBasedSegmenter}. */
@RunWith(JUnit4.class)
public class RuleBasedSegmenterTest extends CoreTestFmwk {

  /**
   * Builds a segmenter from each rule string in the table, segments one sample text with it, and
   * compares the resulting sub-sequences with the expected list.
   */
  @Test
  public void testRules() {
    String source = "hejsan k:a tack";

    // Each row: { case description, rule string, expected segments of `source` }.
    Object[][] casesData = {
        {"default", ".*;", Arrays.asList("hejsan k:a tack")},
        // TODO: add more cases once RBBI rule syntax is understood
    };

    for (Object[] row : casesData) {
      String description = (String) row[0];
      String ruleText = (String) row[1];
      List<CharSequence> expected = (List<CharSequence>) row[2];

      // The row's rule string is used unchanged as the complete rule set.
      Segmenter ruleSegmenter = RuleBasedSegmenter.builder().setRules(ruleText).build();
      Segments segmented = ruleSegmenter.segment(source);

      List<CharSequence> actual = segmented.subSequences().collect(Collectors.toList());

      assertThat(description, actual, is(expected));
    }
  }
}

View file

@ -0,0 +1,373 @@
package com.ibm.icu.dev.test.text.segmenter;
import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.MatcherAssert.assertThat;
import com.ibm.icu.dev.test.CoreTestFmwk;
import com.ibm.icu.text.segmenter.LocalizedSegmenter;
import com.ibm.icu.text.segmenter.LocalizedSegmenter.SegmentationType;
import com.ibm.icu.text.segmenter.Segmenter;
import com.ibm.icu.text.segmenter.Segments;
import com.ibm.icu.text.segmenter.Segments.Segment;
import com.ibm.icu.util.ULocale;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
/**
 * Tests the {@link Segments} operations using an English word segmenter built with
 * {@link LocalizedSegmenter}.
 */
@RunWith(JUnit4.class)
public class SegmentsTest extends CoreTestFmwk {

  /** Sample sentence used by most tests; testBoundaries lists its expected word boundaries. */
  private static final String PANGRAM = "The quick brown fox jumped over the lazy dog.";

  /** Builds a new English word segmenter. */
  private Segmenter newEnglishWordSegmenter() {
    return LocalizedSegmenter.builder()
        .setLocale(ULocale.ENGLISH)
        .setSegmentationType(SegmentationType.WORD)
        .build();
  }

  /** Collects the text of every segment, in stream order. */
  private List<CharSequence> textsOf(Segments segments) {
    return segments.subSequences().collect(Collectors.toList());
  }

  /** Asserts the start and limit of the first two segments in {@code found}. */
  private void assertFirstTwoRanges(
      List<Segment> found, int firstStart, int firstLimit, int secondStart, int secondLimit) {
    assertEquals("first range start", firstStart, found.get(0).start);
    assertEquals("first range limit", firstLimit, found.get(0).limit);

    assertEquals("second range start", secondStart, found.get(1).start);
    assertEquals("second range limit", secondLimit, found.get(1).limit);
  }

  /** The full segment stream starts with "The" [0,3) followed by the space [3,4). */
  @Test
  public void testSegments() {
    Segments pangramSegments = newEnglishWordSegmenter().segment(PANGRAM);

    List<Segment> found = pangramSegments.segments().collect(Collectors.toList());

    assertFirstTwoRanges(found, 0, 3, 3, 4);
  }

  /**
   * Several Segments objects created from one Segmenter keep producing their own results after
   * further Segments objects have been created and consumed.
   */
  @Test
  public void testMultipleSegmentObjectsFromSegmenter() {
    Segmenter wordSegmenter = newEnglishWordSegmenter();

    String source2 = "Sphinx of black quartz, judge my vow.";
    String source3 = "How vexingly quick daft zebras jump!";

    List<CharSequence> expected1 = Arrays.asList("The", " ", "quick", " ", "brown", " ", "fox", " ",
        "jumped", " ", "over", " ", "the", " ", "lazy", " ", "dog", ".");
    List<CharSequence> expected2 = Arrays.asList("Sphinx", " ", "of", " ", "black", " ", "quartz", ",",
        " ", "judge", " ", "my", " ", "vow", ".");
    List<CharSequence> expected3 = Arrays.asList("How", " ", "vexingly", " ", "quick", " ", "daft", " ",
        "zebras", " ", "jump", "!");

    // First text on its own.
    Segments first = wordSegmenter.segment(PANGRAM);
    assertThat(textsOf(first), is(expected1));

    // Second text on its own.
    Segments second = wordSegmenter.segment(source2);
    assertThat(textsOf(second), is(expected2));

    // The first Segments object still yields its own result.
    assertThat(textsOf(first), is(expected1));

    // Third text on its own.
    Segments third = wordSegmenter.segment(source3);
    assertThat(textsOf(third), is(expected3));

    // Neither earlier Segments object was disturbed by the third.
    assertThat(textsOf(first), is(expected1));
    assertThat(textsOf(second), is(expected2));
  }

  /** isBoundary at segment edges, inside a segment, and at both ends of the text. */
  @Test
  public void testIsBoundary() {
    Segments pangramSegments = newEnglishWordSegmenter().segment(PANGRAM);

    // Each row: { description, offset, expected isBoundary result }.
    Object[][] casesData = {
        {"start of segment", 4, true},
        {"between start and limit of segment", 6, false},
        {"limit of segment", 9, true},
        {"beginning of string", 0, true},
        {"end of string", PANGRAM.length(), true},
    };

    for (Object[] row : casesData) {
      String description = (String) row[0];
      int offset = (int) row[1];
      boolean expected = (boolean) row[2];

      assertThat(description, pangramSegments.isBoundary(offset) == expected);
    }
  }

  /** Starting inside "The" yields that whole segment first. */
  @Test
  public void testSegmentsFrom_middleOfSegment() {
    Segments pangramSegments = newEnglishWordSegmenter().segment(PANGRAM);
    int startIdx = 1;

    List<Segment> found = pangramSegments.segmentsFrom(startIdx).collect(Collectors.toList());

    assertFirstTwoRanges(found, 0, 3, 3, 4);
  }

  /** Starting on a boundary yields the segment that begins there first. */
  @Test
  public void testSegmentsFrom_onBoundary() {
    Segments pangramSegments = newEnglishWordSegmenter().segment(PANGRAM);
    int startIdx = 3;

    List<Segment> found = pangramSegments.segmentsFrom(startIdx).collect(Collectors.toList());

    assertFirstTwoRanges(found, 3, 4, 4, 9);
  }

  /** Going back from inside "quick" skips that segment and starts with the one before it. */
  @Test
  public void testSegmentsBefore_middleOfSegment() {
    Segments pangramSegments = newEnglishWordSegmenter().segment(PANGRAM);
    int startIdx = 8;

    List<Segment> found = pangramSegments.segmentsBefore(startIdx).collect(Collectors.toList());

    assertFirstTwoRanges(found, 3, 4, 0, 3);
  }

  /** Going back from a boundary starts with the segment that ends there. */
  @Test
  public void testSegmentsBefore_onBoundary() {
    Segments pangramSegments = newEnglishWordSegmenter().segment(PANGRAM);
    int startIdx = 9;

    List<Segment> found = pangramSegments.segmentsBefore(startIdx).collect(Collectors.toList());

    assertFirstTwoRanges(found, 4, 9, 3, 4);
  }

  /** segmentToSequenceFn turns Segment objects back into their text. */
  @Test
  public void testSegmentToSequenceFn() {
    Segments pangramSegments = newEnglishWordSegmenter().segment(PANGRAM);
    int startIdx = 10;

    List<CharSequence> expected = Arrays.asList(" ", "quick", " ", "The");

    List<CharSequence> actual =
        pangramSegments.segmentsBefore(startIdx)
            .map(pangramSegments.segmentToSequenceFn())
            .collect(Collectors.toList());

    assertThat(actual, is(expected));
  }

  /** boundaries() lists every word boundary, including both ends of the text. */
  @Test
  public void testBoundaries() {
    Segments pangramSegments = newEnglishWordSegmenter().segment(PANGRAM);

    int[] expected = {0, 3, 4, 9, 10, 15, 16, 19, 20, 26, 27, 31, 32, 35, 36, 40, 41, 44, 45};

    int[] actual = pangramSegments.boundaries().toArray();

    assertThat(actual, is(expected));
  }

  /** boundariesAfter from before, inside, at the end of, and beyond the text. */
  @Test
  public void testBoundariesAfter() {
    Segments pangramSegments = newEnglishWordSegmenter().segment(PANGRAM);
    int takeLimit = 5;

    // Each row: { description, start offset, the first `takeLimit` boundaries expected }.
    Object[][] casesData = {
        {"first " + takeLimit + " before beginning", -2, new int[]{0, 3, 4, 9, 10}},
        {"first " + takeLimit + " in the middle of the third segment", 5, new int[]{9, 10, 15, 16, 19}},
        {"first " + takeLimit + " on the limit of the third segment", 9, new int[]{10, 15, 16, 19, 20}},
        {"first " + takeLimit + " at the end", PANGRAM.length(), new int[0]},
        {"first " + takeLimit + " after the end", PANGRAM.length() + 1, new int[0]},
    };

    for (Object[] row : casesData) {
      String description = (String) row[0];
      int startIdx = (int) row[1];
      int[] expected = (int[]) row[2];

      int[] actual = pangramSegments.boundariesAfter(startIdx).limit(takeLimit).toArray();

      assertThat(description, actual, is(expected));
    }
  }

  /** boundariesBackFrom from before, inside, at the end of, and beyond the text. */
  @Test
  public void testBoundariesBackFrom() {
    Segments pangramSegments = newEnglishWordSegmenter().segment(PANGRAM);
    int takeLimit = 5;

    // Each row: { description, start offset, the first `takeLimit` boundaries expected }.
    Object[][] casesData = {
        {"first " + takeLimit + " before beginning", -2, new int[0]},
        {"first " + takeLimit + " at the beginning", 0, new int[]{0}},
        {"first " + takeLimit + " from the start of the 2nd to last segment", 41, new int[]{41, 40, 36, 35, 32}},
        {"first " + takeLimit + " in the middle of the 2nd to last segment", 42, new int[]{41, 40, 36, 35, 32}},
        {"first " + takeLimit + " at the end", PANGRAM.length(), new int[]{45, 44, 41, 40, 36}},
        {"first " + takeLimit + " after the end", PANGRAM.length() + 1, new int[]{45, 44, 41, 40, 36}},
    };

    for (Object[] row : casesData) {
      String description = (String) row[0];
      int startIdx = (int) row[1];
      int[] expected = (int[]) row[2];

      int[] actual = pangramSegments.boundariesBackFrom(startIdx).limit(takeLimit).toArray();

      assertThat(description, actual, is(expected));

      if (startIdx < 0) {
        logKnownIssue("ICU-22987", "BreakIterator.preceding(-2) should return DONE, not 0");
      }
    }
  }

  /**
   * segmentAt for in-range offsets. Rows whose expected start is {@code null} only check that the
   * expected limit is {@code null} too; segmentAt is not invoked for them.
   */
  @Test
  public void testSegmentAt() {
    Segments pangramSegments = newEnglishWordSegmenter().segment(PANGRAM);

    // Each row: { description, offset, expected start, expected limit }.
    Object[][] casesData = {
        {"index before beginning", -2, null, null},
        {"index at beginning", 0, 0, 3},
        {"index in the middle of the first segment", 2, 0, 3},
        {"index in the middle of the third segment", 5, 4, 9},
        {"index at the end", PANGRAM.length() - 1, 44, 45},
        {"index after the end", PANGRAM.length() + 1, null, null},
    };

    for (Object[] row : casesData) {
      String description = (String) row[0];
      int startIdx = (int) row[1];
      Integer expectedStart = (Integer) row[2];
      Integer expectedLimit = (Integer) row[3];

      if (startIdx < 0) {
        logKnownIssue("ICU-22987", "BreakIterator.preceding(-2) should return DONE, not 0");
      }

      if (expectedStart == null) {
        assertThat("Out of bounds range should be null", expectedLimit == null);
      } else {
        Segment found = pangramSegments.segmentAt(startIdx);

        assertEquals(description + ", start", (long) expectedStart.intValue(), (long) found.start);
        assertEquals(description + ", limit", (long) expectedLimit.intValue(), (long) found.limit);
      }
    }
  }
}