From 47ffdd8fa9c15ff97c24fd318e6e66477c0a5d63 Mon Sep 17 00:00:00 2001
From: Elango Cheran
Date: Fri, 3 Jan 2025 12:20:54 -0800
Subject: [PATCH] ICU-22789 Add segmentAt API for Segments interface

---
Review notes (this section is not part of the commit message):
- SegmentsImplUtils.segmentAt() returns null for an index outside [0, length]
  instead of passing it to BreakIterator.isBoundary().
- testSegmentAt() calls segmentAt() for the out-of-bounds cases as well, so the
  null result is actually asserted.
- The post-image blob ids on the "index" lines of Segments.java,
  SegmentsImplUtils.java and SegmentsTest.java predate these review changes.

 .../text/segmenter/LocalizedSegmenter.java    |  5 ++
 .../text/segmenter/RuleBasedSegmenter.java    |  5 ++
 .../com/ibm/icu/text/segmenter/Segments.java  |  9 ++++
 .../icu/text/segmenter/SegmentsImplUtils.java | 43 ++++++++++++++++++
 .../dev/test/text/segmenter/SegmentsTest.java | 44 ++++++++++++++++++-
 5 files changed, 105 insertions(+), 1 deletion(-)

diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java
index 323081d372f..9e3d34e75b1 100644
--- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java
+++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java
@@ -74,6 +74,11 @@ public class LocalizedSegmenter implements Segmenter {
       return SegmentsImplUtils.subSequences(this.breakIter, this.source);
     }
 
+    @Override
+    public Segment segmentAt(int i) {
+      return SegmentsImplUtils.segmentAt(this.breakIter, this.source, i);
+    }
+
     @Override
     public Stream ranges() {
       return SegmentsImplUtils.ranges(this.breakIter, this.source);
diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java
index 6a30516b70c..13a1846fc84 100644
--- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java
+++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java
@@ -47,6 +47,11 @@ public class RuleBasedSegmenter implements Segmenter {
       return SegmentsImplUtils.subSequences(this.breakIter, this.source);
     }
 
+    @Override
+    public Segment segmentAt(int i) {
+      return SegmentsImplUtils.segmentAt(this.breakIter, this.source, i);
+    }
+
     @Override
     public Stream ranges() {
       return SegmentsImplUtils.ranges(this.breakIter, this.source);
diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java
index 6b5aba6ab4b..1e3dd9bacd5 100644
--- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java
+++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java
@@ -9,6 +9,15 @@ import java.util.stream.Stream;
 public interface Segments {
   Stream subSequences();
 
+  /**
+   * Returns the segment that contains index {@code i} of the source text.
+   *
+   * @param i an index into the source text
+   * @return the segment whose range {@code [start, limit)} contains {@code i}, or {@code null} if
+   *     there is no such segment
+   */
+  Segment segmentAt(int i);
+
   Stream ranges();
 
   Stream rangesAfterIndex(int i);
diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java
index 0623a789524..7b025999401 100644
--- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java
+++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java
@@ -24,6 +24,49 @@ public class SegmentsImplUtils {
     return ranges(breakIter, sourceSequence).map(rangeToSequenceFn(sourceSequence));
   }
 
+  /**
+   * Finds the segment of {@code sourceSequence} that contains index {@code i}.
+   *
+   * <p>Calls {@code breakIter.setText(sourceSequence)} and then moves the iterator, so the
+   * iterator's earlier text and position are not preserved.
+   *
+   * @param breakIter the break iterator that defines the segment boundaries
+   * @param sourceSequence the text to segment
+   * @param i an index into {@code sourceSequence}
+   * @return the segment whose range {@code [start, limit)} contains {@code i}, or {@code null} if
+   *     there is no such segment (for example, {@code i} is negative or is not less than the
+   *     length of {@code sourceSequence})
+   */
+  public static Segment segmentAt(BreakIterator breakIter, CharSequence sourceSequence, int i) {
+    // TODO: make initialization of breakIterator a prerequisite
+    breakIter.setText(sourceSequence);
+
+    // An index outside [0, length] cannot belong to any segment. Answer null here instead of
+    // passing it to BreakIterator.isBoundary(), whose handling of out-of-range offsets is
+    // implementation-specific and may be an exception rather than a DONE result.
+    if (i < 0 || i > sourceSequence.length()) {
+      return null;
+    }
+
+    int start;
+    int limit;
+
+    if (breakIter.isBoundary(i)) {
+      start = i;
+      limit = breakIter.next();
+    } else {
+      // BreakIterator::isBoundary(i) will advance forwards to the next boundary if the argument
+      // is not a boundary.
+      limit = breakIter.current();
+      start = breakIter.previous();
+    }
+
+    if (start == BreakIterator.DONE || limit == BreakIterator.DONE) {
+      return null;
+    }
+    return new Segment(start, limit, sourceSequence);
+  }
+
   public static Stream ranges(BreakIterator breakIter, CharSequence sourceSequence) {
     return rangesAfterIndex(breakIter, sourceSequence, -1);
   }
diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java
index 46088fe66f0..3d02520dd7b 100644
--- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java
+++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java
@@ -261,7 +261,7 @@ public class SegmentsTest extends CoreTestFmwk {
 
       Segment segment = segments.rangeBeforeIndex(startIdx);
 
-      if (startIdx == -2) {
+      if (startIdx < 0) {
         logKnownIssue("ICU-22987", "BreakIterator.preceding(-2) should return DONE, not 0");
       }
 
@@ -366,4 +366,46 @@ public class SegmentsTest extends CoreTestFmwk {
     }
   }
 
+  @Test
+  public void testSegmentAt() {
+    Segmenter enWordSegmenter =
+        new LocalizedSegmenterBuilder()
+            .setLocale(ULocale.ENGLISH)
+            .setSegmentationType(SegmentationType.WORD)
+            .build();
+
+    String source = "The quick brown fox jumped over the lazy dog.";
+
+    // Create new Segments for source
+    Segments segments1 = enWordSegmenter.segment(source);
+
+    // Each case: description, index to look up, expected start, expected limit.
+    // A null expected start/limit means that segmentAt is expected to return null.
+    Object[][] casesData = {
+      {"index before beginning", -2, null, null},
+      {"index at beginning", 0, 0, 3},
+      {"index in the middle of the first segment", 2, 0, 3},
+      {"index in the middle of the third segment", 5, 4, 9},
+      {"index at the end", source.length() - 1, 44, 45},
+      {"index at the limit of the text", source.length(), null, null},
+      {"index after the end", source.length() + 1, null, null},
+    };
+
+    for (Object[] caseDatum : casesData) {
+      String desc = (String) caseDatum[0];
+      int startIdx = (int) caseDatum[1];
+      Integer expStart = (Integer) caseDatum[2];
+      Integer expLimit = (Integer) caseDatum[3];
+
+      Segment segment = segments1.segmentAt(startIdx);
+
+      if (expStart == null) {
+        assertThat(desc + ", out of bounds range should be null", segment == null);
+      } else {
+        assertEquals(desc + ", start", (long) expStart.intValue(), (long) segment.start);
+        assertEquals(desc + ", limit", (long) expLimit.intValue(), (long) segment.limit);
+      }
+    }
+  }
+
 }