mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-13 08:53:20 +00:00
ICU-22789 Fix boundary condition behavior for segmentsBefore API
This commit is contained in:
parent
e500e421dc
commit
f9f4d041de
2 changed files with 61 additions and 7 deletions
icu4j/main/core/src
main/java/com/ibm/icu/text/segmenter
test/java/com/ibm/icu/dev/test/text/segmenter
|
@ -112,7 +112,17 @@ public interface Segments {
|
|||
this.limit = breakIter.following(this.start);
|
||||
} else {
|
||||
assert direction == IterationDirection.BACKWARDS;
|
||||
this.start = breakIter.preceding(segmentAtIdx.start);
|
||||
if (breakIter.isBoundary(startIdx)) {
|
||||
// Note: breakIter::isBoundary is a stateful operation. It resets the position in the
|
||||
// BreakIterator, which is what we want, to ensure that the position is where we think it is.
|
||||
this.start = startIdx;
|
||||
} else {
|
||||
// Since we already called BreakIterator.isBoundary(), which moves the BreakIterator
|
||||
// position forward to the following boundary when the return value is false, we should call
|
||||
// BreakIterator.previous() to update the iterator position while getting the start value
|
||||
// of the segment at startIdx
|
||||
this.start = breakIter.previous();
|
||||
}
|
||||
this.limit = getDirectionBasedNextIdx();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -118,12 +118,10 @@ public class SegmentsTest extends CoreTestFmwk {
|
|||
|
||||
assertThat(desc, segments1.isBoundary(idx) == exp);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSegmentsFrom() {
|
||||
public void testSegmentsFrom_middleOfSegment() {
|
||||
Segmenter enWordSegmenter =
|
||||
new LocalizedSegmenterBuilder()
|
||||
.setLocale(ULocale.ENGLISH)
|
||||
|
@ -146,7 +144,7 @@ public class SegmentsTest extends CoreTestFmwk {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void testSegmentsBefore() {
|
||||
public void testSegmentsFrom_onBoundary() {
|
||||
Segmenter enWordSegmenter =
|
||||
new LocalizedSegmenterBuilder()
|
||||
.setLocale(ULocale.ENGLISH)
|
||||
|
@ -154,7 +152,53 @@ public class SegmentsTest extends CoreTestFmwk {
|
|||
.build();
|
||||
|
||||
String source1 = "The quick brown fox jumped over the lazy dog.";
|
||||
int startIdx = 10;
|
||||
int startIdx = 3;
|
||||
|
||||
// Create new Segments for source1
|
||||
Segments segments1 = enWordSegmenter.segment(source1);
|
||||
|
||||
List<Segment> segments = segments1.rangesAfterIndex(startIdx).collect(Collectors.toList());
|
||||
|
||||
assertEquals("first range start", 3, segments.get(0).start);
|
||||
assertEquals("first range limit", 4, segments.get(0).limit);
|
||||
|
||||
assertEquals("second range start", 4, segments.get(1).start);
|
||||
assertEquals("second range limit", 9, segments.get(1).limit);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSegmentsBefore_middleOfSegment() {
|
||||
Segmenter enWordSegmenter =
|
||||
new LocalizedSegmenterBuilder()
|
||||
.setLocale(ULocale.ENGLISH)
|
||||
.setSegmentationType(LocalizedSegmenter.SegmentationType.WORD)
|
||||
.build();
|
||||
|
||||
String source1 = "The quick brown fox jumped over the lazy dog.";
|
||||
int startIdx = 8;
|
||||
|
||||
// Create new Segments for source1
|
||||
Segments segments1 = enWordSegmenter.segment(source1);
|
||||
|
||||
List<Segment> segments = segments1.rangesBeforeIndex(startIdx).collect(Collectors.toList());
|
||||
|
||||
assertEquals("first range start", 3, segments.get(0).start);
|
||||
assertEquals("first range limit", 4, segments.get(0).limit);
|
||||
|
||||
assertEquals("second range start", 0, segments.get(1).start);
|
||||
assertEquals("second range limit", 3, segments.get(1).limit);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSegmentsBefore_onBoundary() {
|
||||
Segmenter enWordSegmenter =
|
||||
new LocalizedSegmenterBuilder()
|
||||
.setLocale(ULocale.ENGLISH)
|
||||
.setSegmentationType(LocalizedSegmenter.SegmentationType.WORD)
|
||||
.build();
|
||||
|
||||
String source1 = "The quick brown fox jumped over the lazy dog.";
|
||||
int startIdx = 9;
|
||||
|
||||
// Create new Segments for source1
|
||||
Segments segments1 = enWordSegmenter.segment(source1);
|
||||
|
@ -182,7 +226,7 @@ public class SegmentsTest extends CoreTestFmwk {
|
|||
// Create new Segments for source1
|
||||
Segments segments1 = enWordSegmenter.segment(source1);
|
||||
|
||||
List<CharSequence> exp1 = Arrays.asList("quick", " ", "The");
|
||||
List<CharSequence> exp1 = Arrays.asList(" ", "quick", " ", "The");
|
||||
|
||||
List<CharSequence> act1 = segments1.rangesBeforeIndex(startIdx)
|
||||
.map(segments1.rangeToSequenceFn())
|
||||
|
|
Loading…
Add table
Reference in a new issue