From f9f4d041de677c34934247c2370d46561abc21c7 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Fri, 3 Jan 2025 20:42:16 -0800 Subject: [PATCH] ICU-22789 Fix boundary condition behavior for segmentsBefore API --- .../com/ibm/icu/text/segmenter/Segments.java | 12 +++- .../dev/test/text/segmenter/SegmentsTest.java | 56 +++++++++++++++++-- 2 files changed, 61 insertions(+), 7 deletions(-) diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java index dcd7dda17de..f4004904db3 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java @@ -112,7 +112,17 @@ public interface Segments { this.limit = breakIter.following(this.start); } else { assert direction == IterationDirection.BACKWARDS; - this.start = breakIter.preceding(segmentAtIdx.start); + if (breakIter.isBoundary(startIdx)) { + // Note: breakIter::isBoundary is a stateful operation. It resets the position in the + // BreakIterator, which we rely on to ensure that the position is where we think it is. 
+ this.start = startIdx; + } else { + // Since we already called BreakIterator.isBoundary() which mutates the BreakIterator + // position to increment forwards when the return value is false, we should call + // BreakIterator.previous() to update the iterator position while getting the start value + // of the segment at startIdx + this.start = breakIter.previous(); + } this.limit = getDirectionBasedNextIdx(); } } diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java index 259f9b53f78..6f6b3da956e 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java @@ -118,12 +118,10 @@ public class SegmentsTest extends CoreTestFmwk { assertThat(desc, segments1.isBoundary(idx) == exp); } - - } @Test - public void testSegmentsFrom() { + public void testSegmentsFrom_middleOfSegment() { Segmenter enWordSegmenter = new LocalizedSegmenterBuilder() .setLocale(ULocale.ENGLISH) @@ -146,7 +144,7 @@ public class SegmentsTest extends CoreTestFmwk { } @Test - public void testSegmentsBefore() { + public void testSegmentsFrom_onBoundary() { Segmenter enWordSegmenter = new LocalizedSegmenterBuilder() .setLocale(ULocale.ENGLISH) @@ -154,7 +152,53 @@ public class SegmentsTest extends CoreTestFmwk { .build(); String source1 = "The quick brown fox jumped over the lazy dog."; - int startIdx = 10; + int startIdx = 3; + + // Create new Segments for source1 + Segments segments1 = enWordSegmenter.segment(source1); + + List segments = segments1.rangesAfterIndex(startIdx).collect(Collectors.toList()); + + assertEquals("first range start", 3, segments.get(0).start); + assertEquals("first range limit", 4, segments.get(0).limit); + + assertEquals("second range start", 4, segments.get(1).start); + assertEquals("second range limit", 9, 
segments.get(1).limit); + } + + @Test + public void testSegmentsBefore_middleOfSegment() { + Segmenter enWordSegmenter = + new LocalizedSegmenterBuilder() + .setLocale(ULocale.ENGLISH) + .setSegmentationType(LocalizedSegmenter.SegmentationType.WORD) + .build(); + + String source1 = "The quick brown fox jumped over the lazy dog."; + int startIdx = 8; + + // Create new Segments for source1 + Segments segments1 = enWordSegmenter.segment(source1); + + List segments = segments1.rangesBeforeIndex(startIdx).collect(Collectors.toList()); + + assertEquals("first range start", 3, segments.get(0).start); + assertEquals("first range limit", 4, segments.get(0).limit); + + assertEquals("second range start", 0, segments.get(1).start); + assertEquals("second range limit", 3, segments.get(1).limit); + } + + @Test + public void testSegmentsBefore_onBoundary() { + Segmenter enWordSegmenter = + new LocalizedSegmenterBuilder() + .setLocale(ULocale.ENGLISH) + .setSegmentationType(LocalizedSegmenter.SegmentationType.WORD) + .build(); + + String source1 = "The quick brown fox jumped over the lazy dog."; + int startIdx = 9; // Create new Segments for source1 Segments segments1 = enWordSegmenter.segment(source1); @@ -182,7 +226,7 @@ public class SegmentsTest extends CoreTestFmwk { // Create new Segments for source1 Segments segments1 = enWordSegmenter.segment(source1); - List exp1 = Arrays.asList("quick", " ", "The"); + List exp1 = Arrays.asList(" ", "quick", " ", "The"); List act1 = segments1.rangesBeforeIndex(startIdx) .map(segments1.rangeToSequenceFn())