ICU-12515 fix errors in J filtered brk, enable @ss=

* fix errors in the filtered break implementation, sync with C
* sync rbbitst.txt with C
* enable 'en@ss=standard' syntax to enable filtered break

X-SVN-Rev: 39213
This commit is contained in:
Steven R. Loomis 2016-09-13 20:08:10 +00:00
parent dea458fef7
commit 12b103f98a
4 changed files with 295 additions and 147 deletions

View file

@ -11,6 +11,7 @@ package com.ibm.icu.impl;
import java.text.CharacterIterator;
import java.util.HashSet;
import com.ibm.icu.impl.ICUResourceBundle.OpenType;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.FilteredBreakIteratorBuilder;
import com.ibm.icu.text.UCharacterIterator;
@ -19,7 +20,6 @@ import com.ibm.icu.util.CharsTrie;
import com.ibm.icu.util.CharsTrieBuilder;
import com.ibm.icu.util.StringTrieBuilder;
import com.ibm.icu.util.ULocale;
import com.ibm.icu.util.UResourceBundle;
/**
* @author tomzhang
@ -46,84 +46,132 @@ public class SimpleFilteredSentenceBreakIterator extends BreakIterator {
this.backwardsTrie = backwardsTrie;
}
@Override
public int next() {
int n = delegate.next();
/**
 * Re-synchronizes the filter's working text with the delegate: the delegate's
 * current text is cloned and wrapped in a fresh {@code UCharacterIterator},
 * which replaces the {@code text} field.
 */
private final void resetState() {
    final CharacterIterator delegateTextCopy = (CharacterIterator) delegate.getText().clone();
    text = UCharacterIterator.getInstance(delegateTextCopy);
}
/**
 * Decides whether a break reported at offset {@code n} should be suppressed
 * because the text ending there matches one of the filter's break exceptions.
 *
 * <p>The text is scanned backwards from {@code n} through {@code backwardsTrie}.
 * A value of {@code Builder.MATCH} means the exception is confirmed immediately.
 * A value of {@code Builder.PARTIAL} (the source's example: the "Ph." in "Ph.D.")
 * is confirmed only if {@code forwardsPartialTrie} exists and the text read
 * forwards from the recorded position matches it.
 *
 * <p>This method reads the {@code text} field and repositions it; the visible
 * callers invoke {@code resetState()} before calling it.
 *
 * @param n text offset of the candidate break reported by the delegate
 * @return {@code true} if an exception applies at {@code n} (the break should
 *         be suppressed), {@code false} otherwise
 */
private final boolean breakExceptionAt(int n) {
// Note: the C++ version of this function is SimpleFilteredSentenceBreakIterator::breakExceptionAt()
// Index and trie value of the best backwards match found so far; -1 means none.
int bestPosn = -1;
int bestValue = -1;
// loops while 'n' points to an exception
text.setIndex(n);
backwardsTrie.reset();
int uch;
// Assume a space is following the '.' (so we handle the case: "Mr. /Brown")
// If the code point just before n is a space it stays consumed (skipped);
// otherwise the iterator is moved forward again so the scan starts at n.
if ((uch = text.previousCodePoint()) == ' ') { // TODO: skip a class of chars here??
// TODO only do this the 1st time?
} else {
uch = text.nextCodePoint();
}
BytesTrie.Result r = BytesTrie.Result.INTERMEDIATE_VALUE;
// Walk backwards, feeding each code point to the trie, until the text runs out
// or the trie reports that no further input can follow.
while ((uch = text.previousCodePoint()) != UCharacterIterator.DONE && // more to consume backwards and..
((r = backwardsTrie.nextForCodePoint(uch)).hasNext())) {// more in the trie
if (r.hasValue()) { // remember the best match so far
bestPosn = text.getIndex();
bestValue = backwardsTrie.getValue();
}
}
// The loop exits without recording when the trie stops on its last result,
// so a match reported by that last result is picked up here.
if (r.matches()) { // exact match?
bestValue = backwardsTrie.getValue();
bestPosn = text.getIndex();
}
if (bestPosn >= 0) {
if (bestValue == Builder.MATCH) { // exact match!
return true; // Exception here.
} else if (bestValue == Builder.PARTIAL && forwardsPartialTrie != null) {
// make sure there's a forward trie
// We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie
// to see if it matches something going forward.
forwardsPartialTrie.reset();
BytesTrie.Result rfwd = BytesTrie.Result.INTERMEDIATE_VALUE;
text.setIndex(bestPosn); // hope that's close ..
// NOTE(review): this loop tests against BreakIterator.DONE while the backwards
// loop uses UCharacterIterator.DONE - presumably the same value; confirm.
while ((uch = text.nextCodePoint()) != BreakIterator.DONE
&& ((rfwd = forwardsPartialTrie.nextForCodePoint(uch)).hasNext())) {
}
if (rfwd.matches()) {
// Exception here
return true;
} // else fall through
} // else fall through
} // else fall through
return false; // No exception here.
}
/**
* Given that the delegate has already given its "initial" answer,
* find the NEXT actual (non-excepted) break.
* @param n initial position from delegate
* @return new break position or BreakIterator.DONE
*/
private final int internalNext(int n) {
if (n == BreakIterator.DONE || // at end or
backwardsTrie == null) { // .. no backwards table loaded == no exceptions
return n;
}
// UCharacterIterator text;
text = UCharacterIterator.getInstance((CharacterIterator) delegate.getText().clone());
do { // outer loop runs once per underlying break (from fDelegate).
// loops while 'n' points to an exception.
text.setIndex(n);
backwardsTrie.reset();
int uch;
resetState();
// Assume a space is following the '.' (so we handle the case: "Mr. /Brown")
if ((uch = text.previousCodePoint()) == ' ') { // TODO: skip a class of chars here??
// TODO only do this the 1st time?
final int textLen = text.getLength();
while (n != BreakIterator.DONE && n != textLen) {
// outer loop runs once per underlying break (from fDelegate).
// loops while 'n' points to an exception.
if (breakExceptionAt(n)) {
// n points to a break exception
n = delegate.next();
} else {
uch = text.nextCodePoint();
// no exception at this spot
return n;
}
}
return n; //hit underlying DONE or break at end of text
}
BytesTrie.Result r = BytesTrie.Result.INTERMEDIATE_VALUE;
/**
* Given that the delegate has already given its "initial" answer,
* find the PREVIOUS actual (non-excepted) break.
* @param n initial position from delegate
* @return new break position or BreakIterator.DONE
*/
private final int internalPrev(int n) {
if (n == 0 || n == BreakIterator.DONE || // at end or
backwardsTrie == null) { // .. no backwards table loaded == no exceptions
return n;
}
resetState();
int bestPosn = -1;
int bestValue = -1;
while (n != BreakIterator.DONE && n != 0) {
// outer loop runs once per underlying break (from fDelegate).
// loops while 'n' points to an exception.
while ((uch = text.previousCodePoint()) != BreakIterator.DONE && // more to consume backwards and..
((r = backwardsTrie.nextForCodePoint(uch)).hasNext())) {// more in the trie
if (r.hasValue()) { // remember the best match so far
bestPosn = text.getIndex();
bestValue = backwardsTrie.getValue();
}
}
if (r.matches()) { // exact match?
bestValue = backwardsTrie.getValue();
bestPosn = text.getIndex();
}
if (bestPosn >= 0) {
if (bestValue == Builder.MATCH) { // exact match!
n = delegate.next(); // skip this one. Find the next lowerlevel break.
if (n == BreakIterator.DONE) {
break;
}
continue; // See if the next is another exception.
} else if (bestValue == Builder.PARTIAL && forwardsPartialTrie != null) {
// make sure there's a forward trie
// We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie
// to see if it matches something going forward.
forwardsPartialTrie.reset();
BytesTrie.Result rfwd = BytesTrie.Result.INTERMEDIATE_VALUE;
text.setIndex(bestPosn); // hope that's close ..
while ((uch = text.nextCodePoint()) != BreakIterator.DONE
&& ((rfwd = forwardsPartialTrie.nextForCodePoint(uch)).hasNext())) {
}
if (rfwd.matches()) {
// only full matches here, nothing to check
// skip the next:
n = delegate.next();
if (n == BreakIterator.DONE) {
break;
}
continue;
} else {
// no match (no exception) -return the 'underlying' break
break;
}
} else {
break; // internal error and/or no forwards trie
}
if (breakExceptionAt(n)) {
// n points to a break exception
n = delegate.previous();
} else {
break; // No match - so exit. Not an exception.
// no exception at this spot
return n;
}
} while (n != BreakIterator.DONE);
return n;
}
return n; //hit underlying DONE or break at end of text
}
@Override
@ -150,32 +198,20 @@ public class SimpleFilteredSentenceBreakIterator extends BreakIterator {
return other;
}
@Override
public int first() {
return delegate.first();
return internalNext(delegate.first());
}
@Override
public int last() {
return delegate.last();
}
@Override
public int next(int n) {
// TODO
throw new UnsupportedOperationException("next(int) is not yet implemented");
public int preceding(int offset) {
return internalPrev(delegate.preceding(offset));
}
@Override
public int previous() {
// TODO
throw new UnsupportedOperationException("previous() is not yet implemented");
}
@Override
public int following(int offset) {
// TODO
throw new UnsupportedOperationException("following(int) is not yet implemented");
return internalPrev(delegate.previous());
}
@Override
@ -184,9 +220,39 @@ public class SimpleFilteredSentenceBreakIterator extends BreakIterator {
}
@Override
public int preceding(int offset) {
// TODO
throw new UnsupportedOperationException("preceding(int) is not yet implemented");
public boolean isBoundary(int offset) {
    final boolean delegateBreaksHere = delegate.isBoundary(offset);
    if (delegateBreaksHere && backwardsTrie != null) {
        // The delegate reports a break and exception data is loaded:
        // the break stands only if no exception applies at this offset.
        resetState();
        return !breakExceptionAt(offset);
    }
    // Either there is no underlying break to suppress (false),
    // or there is no exception data to suppress it with (true).
    return delegateBreaksHere;
}
/**
 * Advances the delegate to its next boundary and hands that position to
 * {@code internalNext}, whose result is returned.
 */
@Override
public int next() {
    final int delegateBreak = delegate.next();
    return internalNext(delegateBreak);
}
/**
 * Asks the delegate for {@code next(n)} and hands the position it reports
 * to {@code internalNext}, whose result is returned.
 */
@Override
public int next(int n) {
    final int delegateBreak = delegate.next(n);
    return internalNext(delegateBreak);
}
/**
 * Asks the delegate for the boundary following {@code offset} and hands that
 * position to {@code internalNext}, whose result is returned.
 */
@Override
public int following(int offset) {
    final int delegateBreak = delegate.following(offset);
    return internalNext(delegateBreak);
}
/**
 * Returns the delegate's last boundary unfiltered: a break opportunity
 * at the end of text is never suppressed.
 */
@Override
public int last() {
    final int endBoundary = delegate.last();
    return endBoundary;
}
@Override
@ -215,20 +281,18 @@ public class SimpleFilteredSentenceBreakIterator extends BreakIterator {
* @param loc the locale to get filtered iterators
*/
public Builder(ULocale loc) {
ICUResourceBundle rb = (ICUResourceBundle) UResourceBundle.getBundleInstance(
ICUData.ICU_BRKITR_BASE_NAME, loc);
ICUResourceBundle exceptions = rb.findWithFallback("exceptions");
if (exceptions != null) {
ICUResourceBundle breaks = exceptions.findWithFallback("SentenceBreak");
if (breaks != null) {
for (int index = 0, size = breaks.getSize(); index < size; ++index) {
ICUResourceBundle b = (ICUResourceBundle) breaks.get(index);
String br = b.getString();
filterSet.add(br);
}
ICUResourceBundle rb = ICUResourceBundle.getBundleInstance(
ICUData.ICU_BRKITR_BASE_NAME, loc, OpenType.LOCALE_ROOT);
ICUResourceBundle breaks = rb.findWithFallback("exceptions/SentenceBreak");
if (breaks != null) {
for (int index = 0, size = breaks.getSize(); index < size; ++index) {
ICUResourceBundle b = (ICUResourceBundle) breaks.get(index);
String br = b.getString();
filterSet.add(br);
}
} // else - no exceptions.
}
}
/**
@ -261,7 +325,7 @@ public class SimpleFilteredSentenceBreakIterator extends BreakIterator {
// Short circuit - nothing to except.
return adoptBreakIterator;
}
CharsTrieBuilder builder = new CharsTrieBuilder();
CharsTrieBuilder builder2 = new CharsTrieBuilder();

View file

@ -136,6 +136,7 @@ final class BreakIteratorFactory extends BreakIterator.BreakIteratorServiceShim
typeKeyExt = "_" + lbKeyValue;
}
}
try {
String typeKey = (typeKeyExt == null)? KIND_NAMES[kind]: KIND_NAMES[kind] + typeKeyExt;
String brkfname = rb.getStringWithFallback("boundaries/" + typeKey);
@ -162,6 +163,15 @@ final class BreakIteratorFactory extends BreakIterator.BreakIteratorServiceShim
iter.setLocale(uloc, uloc);
iter.setBreakType(kind);
// filtered break
if (kind == BreakIterator.KIND_SENTENCE) {
final String ssKeyword = locale.getKeywordValue("ss");
if (ssKeyword != null && ssKeyword.equals("standard")) {
final ULocale base = new ULocale(locale.getBaseName());
return FilteredBreakIteratorBuilder.createInstance(base).build(iter);
}
}
return iter;
}

View file

@ -33,7 +33,7 @@ public class BreakIteratorTest extends TestFmwk
{
}
@Before
public void init(){
characterBreak = BreakIterator.getCharacterInstance();
@ -301,8 +301,8 @@ public class BreakIteratorTest extends TestFmwk
for (int k = tb.first(); k != BreakIterator.DONE; k = tb.next())
if (k == 2) {
errln("Break between CR and LF in string U+" + Integer.toHexString(
(int)(work.charAt(0))) + ", U+d U+a U+" + Integer.toHexString(
(int)(work.charAt(3))));
(work.charAt(0))) + ", U+d U+a U+" + Integer.toHexString(
(work.charAt(3))));
errorCount++;
if (errorCount >= 75)
return;
@ -328,8 +328,8 @@ public class BreakIteratorTest extends TestFmwk
tb.setText(work.toString());
for (int k = tb.first(); k != BreakIterator.DONE; k = tb.next())
if (k == 2) {
errln("Break between U+" + Integer.toHexString((int)(work.charAt(1)))
+ " and U+" + Integer.toHexString((int)(work.charAt(2))));
errln("Break between U+" + Integer.toHexString((work.charAt(1)))
+ " and U+" + Integer.toHexString((work.charAt(2))));
errorCount++;
if (errorCount >= 75)
return;
@ -348,7 +348,7 @@ public class BreakIteratorTest extends TestFmwk
out.append(c);
else {
out.append("\\u");
temp = Integer.toHexString((int)c);
temp = Integer.toHexString(c);
out.append(zeros.substring(0, 4 - temp.length()));
out.append(temp);
}
@ -568,7 +568,7 @@ public class BreakIteratorTest extends TestFmwk
generalIteratorTest(lineBreak, lineSelectionData);
}
/**
* @bug 4117554
@ -709,7 +709,7 @@ public class BreakIteratorTest extends TestFmwk
int begin = 3;
int end = str.length() - 3;
// not used boolean gotException = false;
iter.setText(new StringCharacterIterator(str, begin, end, begin));
for (int index = -1; index < begin + 1; ++index) {
@ -772,16 +772,16 @@ public class BreakIteratorTest extends TestFmwk
if (locList.length == 0)
errln("getAvailableLocales() returned an empty list!");
// I have no idea how to test this function...
com.ibm.icu.util.ULocale[] ulocList = BreakIterator.getAvailableULocales();
if (ulocList.length == 0) {
errln("getAvailableULocales() returned an empty list!");
errln("getAvailableULocales() returned an empty list!");
} else {
logln("getAvailableULocales() returned " + ulocList.length + " locales");
}
}
/**
* @bug 4068137
*/
@ -838,7 +838,7 @@ public class BreakIteratorTest extends TestFmwk
}
}
/**
* Bug 4450804
*/
@ -893,7 +893,7 @@ public class BreakIteratorTest extends TestFmwk
assertEquals("Next point", 5, brk.next());
assertEquals("Last point", BreakIterator.DONE, brk.next());
}
/*
* Test case for Ticket#10721. BreakIterator factory method should throw NPE
* when specified locale is null.
@ -956,7 +956,7 @@ public class BreakIteratorTest extends TestFmwk
errln("getWordInstance((ULocale)null) did not throw NPE.");
} catch (NullPointerException e) { /* OK */ }
}
/**
* Test FilteredBreakIteratorBuilder newly introduced
*/
@ -980,14 +980,7 @@ public class BreakIteratorTest extends TestFmwk
logln("Building new BI\n");
filteredBI = builder.build(baseBI);
logln("Testing:");
filteredBI.setText(text);
assertEquals("1st next", 20, filteredBI.next());
assertEquals("1st next", 84, filteredBI.next());
assertEquals("1st next", 90, filteredBI.next());
assertEquals("1st next", 181, filteredBI.next());
assertEquals("1st next", 278, filteredBI.next());
filteredBI.first();
assertDefaultBreakBehavior(filteredBI, text);
}
{
@ -1015,7 +1008,7 @@ public class BreakIteratorTest extends TestFmwk
assertEquals("2nd next", 278, filteredBI.next());
filteredBI.first();
}
{
logln("Constructing empty builder\n");
@ -1072,15 +1065,39 @@ public class BreakIteratorTest extends TestFmwk
filteredBI = builder.build(baseBI);
if(filteredBI != null) {
logln("Testing:");
filteredBI.setText(text);
assertEquals("5th next", 84, filteredBI.next());
assertEquals("5th next", 278, filteredBI.next());
filteredBI.first();
assertEnglishBreakBehavior(filteredBI, text);
}
}
{
logln("Constructing English @ss=standard\n");
filteredBI = BreakIterator.getSentenceInstance(ULocale.forLanguageTag("en-US-u-ss-standard"));
if(filteredBI != null) {
assertEnglishBreakBehavior(filteredBI, text);
}
}
{
logln("Constructing Afrikaans @ss=standard - should be == default\n");
filteredBI = BreakIterator.getSentenceInstance(ULocale.forLanguageTag("af-u-ss-standard"));
assertDefaultBreakBehavior(filteredBI, text);
}
{
logln("Constructing Japanese @ss=standard - should be == default\n");
filteredBI = BreakIterator.getSentenceInstance(ULocale.forLanguageTag("ja-u-ss-standard"));
assertDefaultBreakBehavior(filteredBI, text);
}
{
logln("Constructing tfg @ss=standard - should be == default\n");
filteredBI = BreakIterator.getSentenceInstance(ULocale.forLanguageTag("tfg-u-ss-standard"));
assertDefaultBreakBehavior(filteredBI, text);
}
{
logln("Constructing French builder");
builder = FilteredBreakIteratorBuilder.createInstance(ULocale.FRENCH);
@ -1092,12 +1109,48 @@ public class BreakIteratorTest extends TestFmwk
filteredBI = builder.build(baseBI);
if(filteredBI != null) {
logln("Testing:");
filteredBI.setText(text);
assertEquals("6th next", 20, filteredBI.next());
assertEquals("6th next", 84, filteredBI.next());
filteredBI.first();
assertFrenchBreakBehavior(filteredBI, text);
}
}
}
/**
 * Sets {@code text} on the iterator and checks that its first two
 * {@code next()} calls return 20 and 84, then rewinds with {@code first()}.
 *
 * @param filteredBI the break iterator under test
 * @param text the text to iterate over
 */
private void assertFrenchBreakBehavior(BreakIterator filteredBI, String text) {
    logln("Testing French behavior:");
    filteredBI.setText(text);
    final int[] expectedBreaks = { 20, 84 };
    for (final int expected : expectedBreaks) {
        assertEquals("6th next", expected, filteredBI.next());
    }
    filteredBI.first();
}
/**
 * Sets {@code text} on the iterator and checks that its first two
 * {@code next()} calls return 84 and 278, then rewinds with {@code first()}.
 *
 * @param filteredBI the break iterator under test
 * @param text the text to iterate over
 */
private void assertEnglishBreakBehavior(BreakIterator filteredBI, String text) {
    logln("Testing English filtered behavior:");
    filteredBI.setText(text);
    final int[] expectedBreaks = { 84, 278 };
    for (final int expected : expectedBreaks) {
        assertEquals("5th next", expected, filteredBI.next());
    }
    filteredBI.first();
}
/**
 * Sets {@code text} on the iterator and checks that its first five
 * {@code next()} calls return 20, 84, 90, 181 and 278, then rewinds
 * with {@code first()}.
 *
 * @param filteredBI the break iterator under test
 * @param text the text to iterate over
 */
private void assertDefaultBreakBehavior(BreakIterator filteredBI, String text) {
    logln("Testing Default Behavior:");
    filteredBI.setText(text);
    final int[] expectedBreaks = { 20, 84, 90, 181, 278 };
    for (final int expected : expectedBreaks) {
        assertEquals("1st next", expected, filteredBI.next());
    }
    filteredBI.first();
}
}

View file

@ -1,4 +1,6 @@
# Copyright (c) 2001-2016 International Business Machines
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2001-2016 International Business Machines
# Corporation and others. All Rights Reserved.
#
# RBBI Test Data
@ -31,6 +33,7 @@
# are merged back into ICU4C's copy of the file, lest they get overwritten later.
# TODO: figure out how to have a single copy of the file for use by both C and Java.
# Temp debugging tests
<locale en>
<word>
@ -40,18 +43,37 @@
## FILTERED BREAK TESTS
# (William Bradford, public domain. http://catalog.hathitrust.org/Record/008651224 ) - edited.
#<locale en>
#<sent>
#<data>\
#•In the meantime Mr. •Weston arrived with his small ship, which he had now recovered. •Capt. •Gorges, who informed the Sgt. here that one purpose of his going east was to meet with Mr. •Weston, took this opportunity to call him to account for some abuses he had to lay to his charge.•</data>
#
#<locale en@ss=standard>
#<sent>
#<data>\
#•In the meantime Mr. Weston arrived with his small ship, which he had now recovered. •Capt. Gorges, who informed the Sgt. here that one purpose of his going east was to meet with Mr. Weston, took this opportunity to call him to account for some abuses he had to lay to his charge.•</data>
#
<locale en>
<sent>
<data>\
•In the meantime Mr. •Weston arrived with his small ship, which he had now recovered. •Capt. •Gorges, who informed the Sgt. here that one purpose of his going east was to meet with Mr. •Weston, took this opportunity to call him to account for some abuses he had to lay to his charge.•</data>
<locale en@ss=standard>
<sent>
<data>\
•In the meantime Mr. Weston arrived with his small ship, which he had now recovered. •Capt. Gorges, who informed the Sgt. here that one purpose of his going east was to meet with Mr. Weston, took this opportunity to call him to account for some abuses he had to lay to his charge.•</data>
# This hits the case where "D." would match the end of "Ph.D.".
<locale en@ss=standard>
<sent>
<data>\
•Doctor with a D. •As in, Ph.D., you know.•</data>
# same as root (unless some exceptions are added!)
<locale tfg@ss=standard>
<sent>
<data>\
•In the meantime Mr. •Weston arrived with his small ship, which he had now recovered. •Capt. •Gorges, who informed the Sgt. here that one purpose of his going east was to meet with Mr. •Weston, took this opportunity to call him to account for some abuses he had to lay to his charge.•</data>
# same as root (unless some exceptions are added!)
<locale ja@ss=standard>
<sent>
<data>\
•In the meantime Mr. •Weston arrived with his small ship, which he had now recovered. •Capt. •Gorges, who informed the Sgt. here that one purpose of his going east was to meet with Mr. •Weston, took this opportunity to call him to account for some abuses he had to lay to his charge.•</data>
## END FILTERED BREAK TESTS
########################################################################################
#
#
@ -283,7 +305,6 @@
<data>•栃木<400>県<400>足利<400>市<400>で<400>の<400>撮影<400>が<400>公開<400></data>
<data>•栃木<400>県<400>足利<400>市<400>で<400>の<400>撮影<400>が<400>公開<400>さ<400>れ<400>た<400></data>
# Ticket #11999
# Unhandled Break Engine was consuming all characters, not just unhandled.
# \U00011700 is AHOM LETTER KA. There is no dictionary for AHOM, triggering the unhandled engine,