ICU-22789 Fix boundary condition behavior for segmentsBefore API

This commit is contained in:
Elango Cheran 2025-01-03 20:42:16 -08:00
parent e500e421dc
commit f9f4d041de
2 changed files with 61 additions and 7 deletions
icu4j/main/core/src
main/java/com/ibm/icu/text/segmenter
test/java/com/ibm/icu/dev/test/text/segmenter

View file

@ -112,7 +112,17 @@ public interface Segments {
this.limit = breakIter.following(this.start);
} else {
assert direction == IterationDirection.BACKWARDS;
this.start = breakIter.preceding(segmentAtIdx.start);
if (breakIter.isBoundary(startIdx)) {
// Note: breakIter::isBoundary is a stateful operation. It resets the position in the
// BreakIterator, which is what we want: it ensures that the position is where we think it is.
this.start = startIdx;
} else {
// Since we already called BreakIterator.isBoundary() which mutates the BreakIterator
// position to increment forwards when the return value is false, we should call
// BreakIterator.previous() to update the iterator position while getting the start value
// of the segment at startIdx
this.start = breakIter.previous();
}
this.limit = getDirectionBasedNextIdx();
}
}

View file

@ -118,12 +118,10 @@ public class SegmentsTest extends CoreTestFmwk {
assertThat(desc, segments1.isBoundary(idx) == exp);
}
}
@Test
public void testSegmentsFrom() {
public void testSegmentsFrom_middleOfSegment() {
Segmenter enWordSegmenter =
new LocalizedSegmenterBuilder()
.setLocale(ULocale.ENGLISH)
@ -146,7 +144,7 @@ public class SegmentsTest extends CoreTestFmwk {
}
@Test
public void testSegmentsBefore() {
public void testSegmentsFrom_onBoundary() {
Segmenter enWordSegmenter =
new LocalizedSegmenterBuilder()
.setLocale(ULocale.ENGLISH)
@ -154,7 +152,53 @@ public class SegmentsTest extends CoreTestFmwk {
.build();
String source1 = "The quick brown fox jumped over the lazy dog.";
int startIdx = 10;
int startIdx = 3;
// Create new Segments for source1
Segments segments1 = enWordSegmenter.segment(source1);
List<Segment> segments = segments1.rangesAfterIndex(startIdx).collect(Collectors.toList());
assertEquals("first range start", 3, segments.get(0).start);
assertEquals("first range limit", 4, segments.get(0).limit);
assertEquals("second range start", 4, segments.get(1).start);
assertEquals("second range limit", 9, segments.get(1).limit);
}
@Test
public void testSegmentsBefore_middleOfSegment() {
    // Per the test name, index 8 is meant to fall inside a segment rather than on a boundary.
    final int startIdx = 8;
    final String text = "The quick brown fox jumped over the lazy dog.";

    // English word segmenter applied to the sample sentence.
    Segmenter wordSegmenter =
            new LocalizedSegmenterBuilder()
                    .setLocale(ULocale.ENGLISH)
                    .setSegmentationType(LocalizedSegmenter.SegmentationType.WORD)
                    .build();
    Segments segmented = wordSegmenter.segment(text);

    // Materialize the ranges produced when iterating backwards from startIdx.
    List<Segment> rangesBefore =
            segmented.rangesBeforeIndex(startIdx).collect(Collectors.toList());

    // The first range yielded is expected to span [3, 4).
    Segment firstRange = rangesBefore.get(0);
    assertEquals("first range start", 3, firstRange.start);
    assertEquals("first range limit", 4, firstRange.limit);

    // The next range yielded is expected to span [0, 3).
    Segment secondRange = rangesBefore.get(1);
    assertEquals("second range start", 0, secondRange.start);
    assertEquals("second range limit", 3, secondRange.limit);
}
@Test
public void testSegmentsBefore_onBoundary() {
Segmenter enWordSegmenter =
new LocalizedSegmenterBuilder()
.setLocale(ULocale.ENGLISH)
.setSegmentationType(LocalizedSegmenter.SegmentationType.WORD)
.build();
String source1 = "The quick brown fox jumped over the lazy dog.";
int startIdx = 9;
// Create new Segments for source1
Segments segments1 = enWordSegmenter.segment(source1);
@ -182,7 +226,7 @@ public class SegmentsTest extends CoreTestFmwk {
// Create new Segments for source1
Segments segments1 = enWordSegmenter.segment(source1);
List<CharSequence> exp1 = Arrays.asList("quick", " ", "The");
List<CharSequence> exp1 = Arrays.asList(" ", "quick", " ", "The");
List<CharSequence> act1 = segments1.rangesBeforeIndex(startIdx)
.map(segments1.rangeToSequenceFn())