ICU-21699 Phrase based breaking(Java)

See #1955
This commit is contained in:
allenwtsu 2021-12-27 04:20:19 +00:00 committed by Frank Yung-Fong Tang
parent 406d90000f
commit 8528bef596
16 changed files with 177 additions and 31 deletions

View file

@ -77,7 +77,7 @@ public class BurmeseBreakEngine extends DictionaryBreakEngine {
@Override
public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
DequeI foundBreaks) {
DequeI foundBreaks, boolean isPhraseBreaking) {
if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD) {

View file

@ -14,18 +14,31 @@ import static com.ibm.icu.impl.CharacterIteration.next32;
import java.io.IOException;
import java.text.CharacterIterator;
import java.util.HashSet;
import com.ibm.icu.impl.Assert;
import com.ibm.icu.impl.ICUData;
import com.ibm.icu.text.Normalizer;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
import com.ibm.icu.util.UResourceBundle;
import com.ibm.icu.util.UResourceBundleIterator;
public class CjkBreakEngine extends DictionaryBreakEngine {
private UnicodeSet fHangulWordSet;
private UnicodeSet fNumberOrOpenPunctuationSet;
private UnicodeSet fClosePunctuationSet;
private DictionaryMatcher fDictionary = null;
private HashSet<String> fSkipSet;
public CjkBreakEngine(boolean korean) throws IOException {
fHangulWordSet = new UnicodeSet("[\\uac00-\\ud7a3]");
fHangulWordSet.freeze();
fNumberOrOpenPunctuationSet = new UnicodeSet("[[:Nd:][:Pi:][:Ps:]]");
fNumberOrOpenPunctuationSet.freeze();
fClosePunctuationSet = new UnicodeSet("[[:Pc:][:Pd:][:Pe:][:Pf:][:Po:]]");
fClosePunctuationSet.freeze();
fSkipSet = new HashSet<String>();
fDictionary = DictionaryData.loadDictionaryFor("Hira");
if (korean) {
@ -33,6 +46,33 @@ public class CjkBreakEngine extends DictionaryBreakEngine {
} else { //Chinese and Japanese
UnicodeSet cjSet = new UnicodeSet("[[:Han:][:Hiragana:][:Katakana:]\\u30fc\\uff70\\uff9e\\uff9f]");
setCharacters(cjSet);
initializeJapanesePhraseParamater();
}
}
/**
 * Fills {@code fSkipSet} with the data used for Japanese phrase breaking: the particle and
 * auxiliary-verb strings are loaded first, then the Hiragana set members.
 */
private void initializeJapanesePhraseParamater() {
    // Resource-bundle entries ("particles", "auxVerbs") go in first.
    loadJapaneseParticleAndAuxVerbs();
    // Then every member of [:Hiragana:].
    loadHiragana();
}
/**
 * Adds every string stored under the "particles" and "auxVerbs" tags of the "ja"
 * break-iterator resource bundle to {@code fSkipSet}.
 */
private void loadJapaneseParticleAndAuxVerbs() {
    final UResourceBundle jaBundle =
            UResourceBundle.getBundleInstance(ICUData.ICU_BRKITR_BASE_NAME, "ja");
    for (String tag : new String[] {"particles", "auxVerbs"}) {
        // Walk the tagged resource and collect each of its string entries.
        for (UResourceBundleIterator entries = jaBundle.get(tag).getIterator();
                entries.hasNext(); ) {
            fSkipSet.add(entries.nextString());
        }
    }
}
/**
 * Adds each member of the {@code [:Hiragana:]} set to {@code fSkipSet}, as a string.
 */
private void loadHiragana() {
    final UnicodeSet hiragana = new UnicodeSet("[:Hiragana:]").freeze();
    for (UnicodeSetIterator members = new UnicodeSetIterator(hiragana); members.next(); ) {
        fSkipSet.add(members.getString());
    }
}
@ -66,7 +106,7 @@ public class CjkBreakEngine extends DictionaryBreakEngine {
@Override
public int divideUpDictionaryRange(CharacterIterator inText, int startPos, int endPos,
DequeI foundBreaks) {
DequeI foundBreaks, boolean isPhraseBreaking) {
if (startPos >= endPos) {
return 0;
}
@ -196,6 +236,25 @@ public class CjkBreakEngine extends DictionaryBreakEngine {
if (bestSnlp[numCodePts] == kint32max) {
t_boundary[numBreaks] = numCodePts;
numBreaks++;
} else if (isPhraseBreaking) {
t_boundary[numBreaks] = numCodePts;
numBreaks++;
int prevIdx = numCodePts;
int codeUnitIdx = 0, length = 0;
for (int i = prev[numCodePts]; i > 0; i = prev[i]) {
codeUnitIdx = prenormstr.offsetByCodePoints(0, i);
length = prevIdx - i;
prevIdx = i;
String pattern = getPatternFromText(text, s, codeUnitIdx, length);
// Keep the breakpoint if the pattern is not in the fSkipSet and continuous Katakana
// characters don't occur.
text.setIndex(codeUnitIdx - 1);
if (!fSkipSet.contains(pattern)
&& (!isKatakana(current32(text)) || !isKatakana(next32(text)))) {
t_boundary[numBreaks] = i;
numBreaks++;
}
}
} else {
for (int i = numCodePts; i > 0; i = prev[i]) {
t_boundary[numBreaks] = i;
@ -212,19 +271,50 @@ public class CjkBreakEngine extends DictionaryBreakEngine {
int previous = -1;
for (int i = numBreaks - 1; i >= 0; i--) {
int pos = charPositions[t_boundary[i]] + startPos;
if (pos > previous && pos != startPos) {
foundBreaks.push(pos);
correctedNumBreaks++;
// In phrase breaking, there has to be a breakpoint between Cj character and close
// punctuation.
// E.g. ［携帯電話］正しい選択 -> ［携帯•電話］•正しい•選択 -> breakpoint between ］ and 正
if (pos > previous) {
if (pos != startPos
|| (isPhraseBreaking && pos > 0
&& fClosePunctuationSet.contains(inText.setIndex(pos - 1)))) {
foundBreaks.push(charPositions[t_boundary[i]] + startPos);
correctedNumBreaks++;
}
}
previous = pos;
}
if (!foundBreaks.isEmpty() && foundBreaks.peek() == endPos) {
foundBreaks.pop();
correctedNumBreaks--;
// In phrase breaking, there has to be a breakpoint between Cj character and
// the number/open punctuation.
// E.g. る文字「そうだ、京都」 -> る•文字•「そうだ、•京都」 -> breakpoint between 字 and 「
// E.g. 乗車率９０％程度だろうか -> 乗車•率•９０％•程度だ•ろうか -> breakpoint between 率 and ９
if (isPhraseBreaking) {
if (!fNumberOrOpenPunctuationSet.contains(inText.setIndex(endPos))) {
foundBreaks.pop();
correctedNumBreaks--;
}
} else {
foundBreaks.pop();
correctedNumBreaks--;
}
}
if (!foundBreaks.isEmpty())
inText.setIndex(foundBreaks.peek());
return correctedNumBreaks;
}
/**
 * Reads {@code length} code points from {@code text}, beginning at index {@code start},
 * and returns them as a string.
 *
 * @param text   iterator to read from; its index is moved only when {@code length > 0}.
 * @param sb     scratch buffer; cleared on entry and left holding the returned characters.
 * @param start  index handed to {@code text.setIndex} before reading.
 * @param length number of code points to read; nothing is read when it is not positive.
 * @return the collected characters, or an empty string when {@code length <= 0}.
 */
private String getPatternFromText(CharacterIterator text, StringBuffer sb, int start,
        int length) {
    sb.setLength(0);
    if (length <= 0) {
        return sb.toString();
    }
    text.setIndex(start);
    // The first code point is the one at the current index; the rest are read by advancing.
    sb.appendCodePoint(current32(text));
    int remaining = length - 1;
    while (remaining-- > 0) {
        sb.appendCodePoint(next32(text));
    }
    return sb.toString();
}
}

View file

@ -183,7 +183,7 @@ public abstract class DictionaryBreakEngine implements LanguageBreakEngine {
@Override
public int findBreaks(CharacterIterator text, int startPos, int endPos,
DequeI foundBreaks) {
DequeI foundBreaks, boolean isPhraseBreaking) {
int result = 0;
// Find the span of characters included in the set.
@ -202,7 +202,7 @@ public abstract class DictionaryBreakEngine implements LanguageBreakEngine {
rangeStart = start;
rangeEnd = current;
result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks, isPhraseBreaking);
text.setIndex(current);
return result;
@ -226,5 +226,6 @@ public abstract class DictionaryBreakEngine implements LanguageBreakEngine {
abstract int divideUpDictionaryRange(CharacterIterator text,
int rangeStart,
int rangeEnd,
DequeI foundBreaks );
DequeI foundBreaks,
boolean isPhraseBreaking);
}

View file

@ -85,7 +85,7 @@ public class KhmerBreakEngine extends DictionaryBreakEngine {
@Override
public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
DequeI foundBreaks) {
DequeI foundBreaks, boolean isPhraseBreaking) {
if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
return 0; // Not enough characters for word

View file

@ -343,7 +343,7 @@ public class LSTMBreakEngine extends DictionaryBreakEngine {
@Override
public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
DequeI foundBreaks) {
DequeI foundBreaks, boolean isPhraseBreaking) {
int beginSize = foundBreaks.size();
if ((rangeEnd - rangeStart) < MIN_WORD_SPAN) {

View file

@ -32,7 +32,7 @@ public interface LanguageBreakEngine {
* @return the number of breaks found
*/
int findBreaks(CharacterIterator text, int startPos, int endPos,
DictionaryBreakEngine.DequeI foundBreaks);
DictionaryBreakEngine.DequeI foundBreaks, boolean isPhraseBreaking);
}

View file

@ -85,7 +85,7 @@ public class LaoBreakEngine extends DictionaryBreakEngine {
@Override
public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
DequeI foundBreaks) {
DequeI foundBreaks, boolean isPhraseBreaking) {
if ((rangeEnd - rangeStart) < LAO_MIN_WORD) {

View file

@ -96,7 +96,7 @@ public class ThaiBreakEngine extends DictionaryBreakEngine {
@Override
public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
DequeI foundBreaks) {
DequeI foundBreaks, boolean isPhraseBreaking) {
if ((rangeEnd - rangeStart) < THAI_MIN_WORD_SPAN) {
return 0; // Not enough characters for word

View file

@ -44,7 +44,7 @@ public final class UnhandledBreakEngine implements LanguageBreakEngine {
@Override
public int findBreaks(CharacterIterator text, int startPos, int endPos,
DictionaryBreakEngine.DequeI foundBreaks) {
DictionaryBreakEngine.DequeI foundBreaks, boolean isPhraseBreaking) {
UnicodeSet uniset = fHandled;
int c = CharacterIteration.current32(text);

View file

@ -129,17 +129,25 @@ final class BreakIteratorFactory extends BreakIterator.BreakIteratorServiceShim
// Get the binary rules.
//
ByteBuffer bytes = null;
String typeKeyExt = null;
String typeKeyExt = "";
if (kind == BreakIterator.KIND_LINE) {
String lbKeyValue = locale.getKeywordValue("lb");
if ( lbKeyValue != null && (lbKeyValue.equals("strict") || lbKeyValue.equals("normal") || lbKeyValue.equals("loose")) ) {
typeKeyExt = "_" + lbKeyValue;
String keyValue = locale.getKeywordValue("lb");
if ( keyValue != null && (keyValue.equals("strict") || keyValue.equals("normal") || keyValue.equals("loose")) ) {
typeKeyExt = "_" + keyValue;
}
String language = locale.getLanguage();
if (language != null && language.equals("ja")) {
keyValue = locale.getKeywordValue("lw");
if (keyValue != null && keyValue.equals("phrase")) {
typeKeyExt += "_" + keyValue;
}
}
}
String brkfname;
try {
String typeKey = (typeKeyExt == null)? KIND_NAMES[kind]: KIND_NAMES[kind] + typeKeyExt;
String brkfname = rb.getStringWithFallback("boundaries/" + typeKey);
String typeKey = typeKeyExt.isEmpty() ? KIND_NAMES[kind] : KIND_NAMES[kind] + typeKeyExt;
brkfname = rb.getStringWithFallback("boundaries/" + typeKey);
String rulesFileName = ICUData.ICU_BRKITR_NAME+ '/' + brkfname;
bytes = ICUBinary.getData(rulesFileName);
}
@ -151,7 +159,8 @@ final class BreakIteratorFactory extends BreakIterator.BreakIteratorServiceShim
// Create a normal RuleBasedBreakIterator.
//
try {
iter = RuleBasedBreakIterator.getInstanceFromCompiledRules(bytes);
boolean isPhraseBreaking = (brkfname != null) && brkfname.contains("phrase");
iter = RuleBasedBreakIterator.getInstanceFromCompiledRules(bytes, isPhraseBreaking);
}
catch (IOException e) {
// Shouldn't be possible to get here.

View file

@ -84,6 +84,32 @@ public class RuleBasedBreakIterator extends BreakIterator {
return This;
}
/**
 * Create a break iterator from a precompiled set of break rules, recording whether phrase
 * breaking is required.
 *
 * This factory method is declared without an access modifier, so it can only be called from
 * within the same package.
 *
 * Creating a break iterator from the binary rules is much faster than
 * creating one from source rules.
 *
 * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
 * Binary break iterator rules are not guaranteed to be compatible between
 * different versions of ICU.
 *
 * @param bytes a buffer supplying the compiled binary rules.
 * @param phraseBreaking a flag indicating if phrase breaking is required.
 * @return the break iterator built from {@code bytes}, with its phrase-breaking flag set to
 *         {@code phraseBreaking}.
 * @throws IOException if there is an error while reading the rules from the buffer.
 * @see #compileRules(String, OutputStream)
 * @internal
 */
/* package-private */ static RuleBasedBreakIterator getInstanceFromCompiledRules(
        ByteBuffer bytes, boolean phraseBreaking) throws IOException {
    // Build the iterator with the single-argument factory, then store the flag on it.
    RuleBasedBreakIterator iterator = getInstanceFromCompiledRules(bytes);
    iterator.fPhraseBreaking = phraseBreaking;
    return iterator;
}
/**
* Create a break iterator from a precompiled set of break rules.
*
@ -274,6 +300,11 @@ public class RuleBasedBreakIterator extends BreakIterator {
*/
private BreakCache fBreakCache = new BreakCache();
/**
* Flag used to indicate if phrase breaking is required.
*/
private boolean fPhraseBreaking = false;
/**
* Counter for the number of characters encountered with the "dictionary"
@ -1205,7 +1236,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
// Ask the language object if there are any breaks. It will add them to the cache and
// leave the text pointer on the other side of its range, ready to search for the next one.
if (lbe != null) {
foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreaks);
foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreaks, fPhraseBreaking);
}
// Reload the loop variables for the next go-round

View file

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:44951f88294c06e433a3b61238d9bb5f59ba01f091fcfb8fe4966f98f0748ef7
size 13627084
oid sha256:65125c8b8176c083a7597fed4c895fa263a185593bda5309753b95e8a5ec0dda
size 13650605

View file

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d13d3b8e7c58f0e41e4b6ff6f2bfa43529de382ecf2c1e3944429b1c1a761361
size 96439
oid sha256:31a470c8a209305fd98faf5ed0f20bf79cf57cfcb2281041b20d98ad742c7b5e
size 96440

View file

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cf33f21346eea88c0282a4960f19f27e475554449f52ef4f25889e2b8a34a1c0
size 826063
oid sha256:2c951a44c5d9726ea4532cb840309d8503c380094b7fd0e56b96094187ce0a24
size 826064

View file

@ -80,7 +80,7 @@ public class LSTMBreakEngineTest extends TestFmwk {
int length = fields[1].length();
CharacterIterator input = new StringCharacterIterator(fields[1]);
DictionaryBreakEngine.DequeI foundBreaks = new DictionaryBreakEngine.DequeI();
int ret = engine.findBreaks(input, 0, length, foundBreaks);
int ret = engine.findBreaks(input, 0, length, foundBreaks, false);
StringBuilder sb = new StringBuilder();
sb.append('{');
for (int i = 0; i < foundBreaks.size(); i++) {

View file

@ -1884,6 +1884,21 @@ Bangkok)•</data>
# woman astronaut, woman astronaut / fitz4
<data>•\U0001F469\u200D\U0001F680•\U0001F469\U0001F3FD\u200D\U0001F680\u0020•</data>
<locale ja@lw=phrase>
<line>
#[京都観光]時雨殿に行った。-> [京都•観光]•時雨•殿に•行った。•
<data>•\uff3b\u4eac\u90fd•\u89b3\u5149\uff3d•\u6642\u96e8•\u6bbf\u306b•\u884c\u3063\u305f\u3002•</data>
#9月に東京から友達が遊びに来た -> 9月に•東京から•友達が•遊びに•来た
<data>•\uff19\u6708\u306b•\u6771\u4eac\u304b\u3089•\u53cb\u9054\u304c•\u904a\u3073\u306b•\u6765\u305f•</data>
#る文字「そうだ、京都」-> る•文字•「そうだ、•京都」•
<data>•\u308b•\u6587\u5b57•\u300c\u305d\u3046\u3060\u3001•\u4eac\u90fd\u300d•</data>
#乗車率90%程度だろうか。 -> 乗車•率•90%•程度だ•ろうか。•
<data>•\u4e57\u8eca•\u7387•\uff19\uff10\uff05•\u7a0b\u5ea6\u3060•\u308d\u3046\u304b\u3002•</data>
#[携帯電話]正しい選択 -> [携帯•電話]•正しい•選択•
<data>•\uff3b\u643a\u5e2f•\u96fb\u8a71\uff3d•\u6b63\u3057\u3044•\u9078\u629e•</data>
#純金製百人一首にサッカーボール -> 純金•製•百人一首に•サッカーボール
<data>•\u7D14\u91D1•\u88FD•\u767E\u4EBA\u4E00\u9996\u306B•\u30B5\u30C3\u30AB\u30FC\u30DC\u30FC\u30EB•</data>
####################################################################################
#