mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 14:05:32 +00:00
parent
406d90000f
commit
8528bef596
16 changed files with 177 additions and 31 deletions
|
@ -77,7 +77,7 @@ public class BurmeseBreakEngine extends DictionaryBreakEngine {
|
|||
|
||||
@Override
|
||||
public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
|
||||
DequeI foundBreaks) {
|
||||
DequeI foundBreaks, boolean isPhraseBreaking) {
|
||||
|
||||
|
||||
if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD) {
|
||||
|
|
|
@ -14,18 +14,31 @@ import static com.ibm.icu.impl.CharacterIteration.next32;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.text.CharacterIterator;
|
||||
import java.util.HashSet;
|
||||
|
||||
import com.ibm.icu.impl.Assert;
|
||||
import com.ibm.icu.impl.ICUData;
|
||||
import com.ibm.icu.text.Normalizer;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.text.UnicodeSetIterator;
|
||||
import com.ibm.icu.util.UResourceBundle;
|
||||
import com.ibm.icu.util.UResourceBundleIterator;
|
||||
|
||||
public class CjkBreakEngine extends DictionaryBreakEngine {
|
||||
private UnicodeSet fHangulWordSet;
|
||||
private UnicodeSet fNumberOrOpenPunctuationSet;
|
||||
private UnicodeSet fClosePunctuationSet;
|
||||
private DictionaryMatcher fDictionary = null;
|
||||
private HashSet<String> fSkipSet;
|
||||
|
||||
public CjkBreakEngine(boolean korean) throws IOException {
|
||||
fHangulWordSet = new UnicodeSet("[\\uac00-\\ud7a3]");
|
||||
fHangulWordSet.freeze();
|
||||
fNumberOrOpenPunctuationSet = new UnicodeSet("[[:Nd:][:Pi:][:Ps:]]");
|
||||
fNumberOrOpenPunctuationSet.freeze();
|
||||
fClosePunctuationSet = new UnicodeSet("[[:Pc:][:Pd:][:Pe:][:Pf:][:Po:]]");
|
||||
fClosePunctuationSet.freeze();
|
||||
fSkipSet = new HashSet<String>();
|
||||
|
||||
fDictionary = DictionaryData.loadDictionaryFor("Hira");
|
||||
if (korean) {
|
||||
|
@ -33,6 +46,33 @@ public class CjkBreakEngine extends DictionaryBreakEngine {
|
|||
} else { //Chinese and Japanese
|
||||
UnicodeSet cjSet = new UnicodeSet("[[:Han:][:Hiragana:][:Katakana:]\\u30fc\\uff70\\uff9e\\uff9f]");
|
||||
setCharacters(cjSet);
|
||||
initializeJapanesePhraseParamater();
|
||||
}
|
||||
}
|
||||
|
||||
private void initializeJapanesePhraseParamater() {
|
||||
loadJapaneseParticleAndAuxVerbs();
|
||||
loadHiragana();
|
||||
}
|
||||
|
||||
private void loadJapaneseParticleAndAuxVerbs() {
|
||||
UResourceBundle rb = UResourceBundle.getBundleInstance(ICUData.ICU_BRKITR_BASE_NAME, "ja");
|
||||
final String[] tags = {"particles", "auxVerbs"};
|
||||
for (String tag : tags) {
|
||||
UResourceBundle bundle = rb.get(tag);
|
||||
UResourceBundleIterator iterator = bundle.getIterator();
|
||||
while (iterator.hasNext()) {
|
||||
fSkipSet.add(iterator.nextString());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void loadHiragana() {
|
||||
UnicodeSet hiraganaWordSet = new UnicodeSet("[:Hiragana:]");
|
||||
hiraganaWordSet.freeze();
|
||||
UnicodeSetIterator iterator = new UnicodeSetIterator(hiraganaWordSet);
|
||||
while (iterator.next()) {
|
||||
fSkipSet.add(iterator.getString());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -66,7 +106,7 @@ public class CjkBreakEngine extends DictionaryBreakEngine {
|
|||
|
||||
@Override
|
||||
public int divideUpDictionaryRange(CharacterIterator inText, int startPos, int endPos,
|
||||
DequeI foundBreaks) {
|
||||
DequeI foundBreaks, boolean isPhraseBreaking) {
|
||||
if (startPos >= endPos) {
|
||||
return 0;
|
||||
}
|
||||
|
@ -196,6 +236,25 @@ public class CjkBreakEngine extends DictionaryBreakEngine {
|
|||
if (bestSnlp[numCodePts] == kint32max) {
|
||||
t_boundary[numBreaks] = numCodePts;
|
||||
numBreaks++;
|
||||
} else if (isPhraseBreaking) {
|
||||
t_boundary[numBreaks] = numCodePts;
|
||||
numBreaks++;
|
||||
int prevIdx = numCodePts;
|
||||
int codeUnitIdx = 0, length = 0;
|
||||
for (int i = prev[numCodePts]; i > 0; i = prev[i]) {
|
||||
codeUnitIdx = prenormstr.offsetByCodePoints(0, i);
|
||||
length = prevIdx - i;
|
||||
prevIdx = i;
|
||||
String pattern = getPatternFromText(text, s, codeUnitIdx, length);
|
||||
// Keep the breakpoint if the pattern is not in the fSkipSet and continuous Katakana
|
||||
// characters don't occur.
|
||||
text.setIndex(codeUnitIdx - 1);
|
||||
if (!fSkipSet.contains(pattern)
|
||||
&& (!isKatakana(current32(text)) || !isKatakana(next32(text)))) {
|
||||
t_boundary[numBreaks] = i;
|
||||
numBreaks++;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int i = numCodePts; i > 0; i = prev[i]) {
|
||||
t_boundary[numBreaks] = i;
|
||||
|
@ -212,19 +271,50 @@ public class CjkBreakEngine extends DictionaryBreakEngine {
|
|||
int previous = -1;
|
||||
for (int i = numBreaks - 1; i >= 0; i--) {
|
||||
int pos = charPositions[t_boundary[i]] + startPos;
|
||||
if (pos > previous && pos != startPos) {
|
||||
foundBreaks.push(pos);
|
||||
correctedNumBreaks++;
|
||||
// In phrase breaking, there has to be a breakpoint between Cj character and close
|
||||
// punctuation.
|
||||
// E.g.[携帯電話]正しい選択 -> [携帯▁電話]▁正しい▁選択 -> breakpoint between ] and 正
|
||||
if (pos > previous) {
|
||||
if (pos != startPos
|
||||
|| (isPhraseBreaking && pos > 0
|
||||
&& fClosePunctuationSet.contains(inText.setIndex(pos - 1)))) {
|
||||
foundBreaks.push(charPositions[t_boundary[i]] + startPos);
|
||||
correctedNumBreaks++;
|
||||
}
|
||||
}
|
||||
previous = pos;
|
||||
}
|
||||
|
||||
if (!foundBreaks.isEmpty() && foundBreaks.peek() == endPos) {
|
||||
foundBreaks.pop();
|
||||
correctedNumBreaks--;
|
||||
// In phrase breaking, there has to be a breakpoint between Cj character and
|
||||
// the number/open punctuation.
|
||||
// E.g. る文字「そうだ、京都」->る▁文字▁「そうだ、▁京都」-> breakpoint between 字 and「
|
||||
// E.g. 乗車率90%程度だろうか -> 乗車▁率▁90%▁程度だ▁ろうか -> breakpoint between 率 and 9
|
||||
if (isPhraseBreaking) {
|
||||
if (!fNumberOrOpenPunctuationSet.contains(inText.setIndex(endPos))) {
|
||||
foundBreaks.pop();
|
||||
correctedNumBreaks--;
|
||||
}
|
||||
} else {
|
||||
foundBreaks.pop();
|
||||
correctedNumBreaks--;
|
||||
}
|
||||
}
|
||||
if (!foundBreaks.isEmpty())
|
||||
inText.setIndex(foundBreaks.peek());
|
||||
return correctedNumBreaks;
|
||||
}
|
||||
|
||||
private String getPatternFromText(CharacterIterator text, StringBuffer sb, int start,
|
||||
int length) {
|
||||
sb.setLength(0);
|
||||
if(length > 0) {
|
||||
text.setIndex(start);
|
||||
sb.appendCodePoint(current32(text));
|
||||
for (int j = 1; j < length; j++) {
|
||||
sb.appendCodePoint(next32(text));
|
||||
}
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -183,7 +183,7 @@ public abstract class DictionaryBreakEngine implements LanguageBreakEngine {
|
|||
|
||||
@Override
|
||||
public int findBreaks(CharacterIterator text, int startPos, int endPos,
|
||||
DequeI foundBreaks) {
|
||||
DequeI foundBreaks, boolean isPhraseBreaking) {
|
||||
int result = 0;
|
||||
|
||||
// Find the span of characters included in the set.
|
||||
|
@ -202,7 +202,7 @@ public abstract class DictionaryBreakEngine implements LanguageBreakEngine {
|
|||
rangeStart = start;
|
||||
rangeEnd = current;
|
||||
|
||||
result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
|
||||
result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks, isPhraseBreaking);
|
||||
text.setIndex(current);
|
||||
|
||||
return result;
|
||||
|
@ -226,5 +226,6 @@ public abstract class DictionaryBreakEngine implements LanguageBreakEngine {
|
|||
abstract int divideUpDictionaryRange(CharacterIterator text,
|
||||
int rangeStart,
|
||||
int rangeEnd,
|
||||
DequeI foundBreaks );
|
||||
DequeI foundBreaks,
|
||||
boolean isPhraseBreaking);
|
||||
}
|
||||
|
|
|
@ -85,7 +85,7 @@ public class KhmerBreakEngine extends DictionaryBreakEngine {
|
|||
|
||||
@Override
|
||||
public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
|
||||
DequeI foundBreaks) {
|
||||
DequeI foundBreaks, boolean isPhraseBreaking) {
|
||||
|
||||
if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
|
||||
return 0; // Not enough characters for word
|
||||
|
|
|
@ -343,7 +343,7 @@ public class LSTMBreakEngine extends DictionaryBreakEngine {
|
|||
|
||||
@Override
|
||||
public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
|
||||
DequeI foundBreaks) {
|
||||
DequeI foundBreaks, boolean isPhraseBreaking) {
|
||||
int beginSize = foundBreaks.size();
|
||||
|
||||
if ((rangeEnd - rangeStart) < MIN_WORD_SPAN) {
|
||||
|
|
|
@ -32,7 +32,7 @@ public interface LanguageBreakEngine {
|
|||
* @return the number of breaks found
|
||||
*/
|
||||
int findBreaks(CharacterIterator text, int startPos, int endPos,
|
||||
DictionaryBreakEngine.DequeI foundBreaks);
|
||||
DictionaryBreakEngine.DequeI foundBreaks, boolean isPhraseBreaking);
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -85,7 +85,7 @@ public class LaoBreakEngine extends DictionaryBreakEngine {
|
|||
|
||||
@Override
|
||||
public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
|
||||
DequeI foundBreaks) {
|
||||
DequeI foundBreaks, boolean isPhraseBreaking) {
|
||||
|
||||
|
||||
if ((rangeEnd - rangeStart) < LAO_MIN_WORD) {
|
||||
|
|
|
@ -96,7 +96,7 @@ public class ThaiBreakEngine extends DictionaryBreakEngine {
|
|||
|
||||
@Override
|
||||
public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
|
||||
DequeI foundBreaks) {
|
||||
DequeI foundBreaks, boolean isPhraseBreaking) {
|
||||
|
||||
if ((rangeEnd - rangeStart) < THAI_MIN_WORD_SPAN) {
|
||||
return 0; // Not enough characters for word
|
||||
|
|
|
@ -44,7 +44,7 @@ public final class UnhandledBreakEngine implements LanguageBreakEngine {
|
|||
|
||||
@Override
|
||||
public int findBreaks(CharacterIterator text, int startPos, int endPos,
|
||||
DictionaryBreakEngine.DequeI foundBreaks) {
|
||||
DictionaryBreakEngine.DequeI foundBreaks, boolean isPhraseBreaking) {
|
||||
|
||||
UnicodeSet uniset = fHandled;
|
||||
int c = CharacterIteration.current32(text);
|
||||
|
|
|
@ -129,17 +129,25 @@ final class BreakIteratorFactory extends BreakIterator.BreakIteratorServiceShim
|
|||
// Get the binary rules.
|
||||
//
|
||||
ByteBuffer bytes = null;
|
||||
String typeKeyExt = null;
|
||||
String typeKeyExt = "";
|
||||
if (kind == BreakIterator.KIND_LINE) {
|
||||
String lbKeyValue = locale.getKeywordValue("lb");
|
||||
if ( lbKeyValue != null && (lbKeyValue.equals("strict") || lbKeyValue.equals("normal") || lbKeyValue.equals("loose")) ) {
|
||||
typeKeyExt = "_" + lbKeyValue;
|
||||
String keyValue = locale.getKeywordValue("lb");
|
||||
if ( keyValue != null && (keyValue.equals("strict") || keyValue.equals("normal") || keyValue.equals("loose")) ) {
|
||||
typeKeyExt = "_" + keyValue;
|
||||
}
|
||||
String language = locale.getLanguage();
|
||||
if (language != null && language.equals("ja")) {
|
||||
keyValue = locale.getKeywordValue("lw");
|
||||
if (keyValue != null && keyValue.equals("phrase")) {
|
||||
typeKeyExt += "_" + keyValue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
String brkfname;
|
||||
try {
|
||||
String typeKey = (typeKeyExt == null)? KIND_NAMES[kind]: KIND_NAMES[kind] + typeKeyExt;
|
||||
String brkfname = rb.getStringWithFallback("boundaries/" + typeKey);
|
||||
String typeKey = typeKeyExt.isEmpty() ? KIND_NAMES[kind] : KIND_NAMES[kind] + typeKeyExt;
|
||||
brkfname = rb.getStringWithFallback("boundaries/" + typeKey);
|
||||
String rulesFileName = ICUData.ICU_BRKITR_NAME+ '/' + brkfname;
|
||||
bytes = ICUBinary.getData(rulesFileName);
|
||||
}
|
||||
|
@ -151,7 +159,8 @@ final class BreakIteratorFactory extends BreakIterator.BreakIteratorServiceShim
|
|||
// Create a normal RuleBasedBreakIterator.
|
||||
//
|
||||
try {
|
||||
iter = RuleBasedBreakIterator.getInstanceFromCompiledRules(bytes);
|
||||
boolean isPhraseBreaking = (brkfname != null) && brkfname.contains("phrase");
|
||||
iter = RuleBasedBreakIterator.getInstanceFromCompiledRules(bytes, isPhraseBreaking);
|
||||
}
|
||||
catch (IOException e) {
|
||||
// Shouldn't be possible to get here.
|
||||
|
|
|
@ -84,6 +84,32 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
|||
return This;
|
||||
}
|
||||
|
||||
/**
|
||||
* This factory method doesn't have an access modifier; it is only accessible in the same
|
||||
* package.
|
||||
*
|
||||
* Create a break iterator from a precompiled set of break rules.
|
||||
*
|
||||
* Creating a break iterator from the binary rules is much faster than
|
||||
* creating one from source rules.
|
||||
*
|
||||
* The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
|
||||
* Binary break iterator rules are not guaranteed to be compatible between
|
||||
* different versions of ICU.
|
||||
*
|
||||
* @param bytes a buffer supplying the compiled binary rules.
|
||||
* @param phraseBreaking a flag indicating if phrase breaking is required.
|
||||
* @throws IOException if there is an error while reading the rules from the buffer.
|
||||
* @see #compileRules(String, OutputStream)
|
||||
* @internal
|
||||
*/
|
||||
/* package-potected */ static RuleBasedBreakIterator getInstanceFromCompiledRules(
|
||||
ByteBuffer bytes, boolean phraseBreaking) throws IOException {
|
||||
RuleBasedBreakIterator instance = getInstanceFromCompiledRules(bytes);
|
||||
instance.fPhraseBreaking = phraseBreaking;
|
||||
return instance;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a break iterator from a precompiled set of break rules.
|
||||
*
|
||||
|
@ -274,6 +300,11 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
|||
*/
|
||||
private BreakCache fBreakCache = new BreakCache();
|
||||
|
||||
/**
|
||||
* Flag used to indicate if phrase breaking is required.
|
||||
*/
|
||||
private boolean fPhraseBreaking = false;
|
||||
|
||||
|
||||
/**
|
||||
* Counter for the number of characters encountered with the "dictionary"
|
||||
|
@ -1205,7 +1236,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
|||
// Ask the language object if there are any breaks. It will add them to the cache and
|
||||
// leave the text pointer on the other side of its range, ready to search for the next one.
|
||||
if (lbe != null) {
|
||||
foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreaks);
|
||||
foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreaks, fPhraseBreaking);
|
||||
}
|
||||
|
||||
// Reload the loop variables for the next go-round
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:44951f88294c06e433a3b61238d9bb5f59ba01f091fcfb8fe4966f98f0748ef7
|
||||
size 13627084
|
||||
oid sha256:65125c8b8176c083a7597fed4c895fa263a185593bda5309753b95e8a5ec0dda
|
||||
size 13650605
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:d13d3b8e7c58f0e41e4b6ff6f2bfa43529de382ecf2c1e3944429b1c1a761361
|
||||
size 96439
|
||||
oid sha256:31a470c8a209305fd98faf5ed0f20bf79cf57cfcb2281041b20d98ad742c7b5e
|
||||
size 96440
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:cf33f21346eea88c0282a4960f19f27e475554449f52ef4f25889e2b8a34a1c0
|
||||
size 826063
|
||||
oid sha256:2c951a44c5d9726ea4532cb840309d8503c380094b7fd0e56b96094187ce0a24
|
||||
size 826064
|
||||
|
|
|
@ -80,7 +80,7 @@ public class LSTMBreakEngineTest extends TestFmwk {
|
|||
int length = fields[1].length();
|
||||
CharacterIterator input = new StringCharacterIterator(fields[1]);
|
||||
DictionaryBreakEngine.DequeI foundBreaks = new DictionaryBreakEngine.DequeI();
|
||||
int ret = engine.findBreaks(input, 0, length, foundBreaks);
|
||||
int ret = engine.findBreaks(input, 0, length, foundBreaks, false);
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append('{');
|
||||
for (int i = 0; i < foundBreaks.size(); i++) {
|
||||
|
|
|
@ -1884,6 +1884,21 @@ Bangkok)•</data>
|
|||
# woman astronaut, woman astronaut / fitz4
|
||||
<data>•\U0001F469\u200D\U0001F680•\U0001F469\U0001F3FD\u200D\U0001F680\u0020•</data>
|
||||
|
||||
<locale ja@lw=phrase>
|
||||
<line>
|
||||
#[京都観光]時雨殿に行った。-> [京都•観光]•時雨•殿に•行った。•
|
||||
<data>•\uff3b\u4eac\u90fd•\u89b3\u5149\uff3d•\u6642\u96e8•\u6bbf\u306b•\u884c\u3063\u305f\u3002•</data>
|
||||
#9月に東京から友達が遊びに来た -> 9月に•東京から•友達が•遊びに•来た
|
||||
<data>•\uff19\u6708\u306b•\u6771\u4eac\u304b\u3089•\u53cb\u9054\u304c•\u904a\u3073\u306b•\u6765\u305f•</data>
|
||||
#る文字「そうだ、京都」-> る•文字•「そうだ、•京都」•
|
||||
<data>•\u308b•\u6587\u5b57•\u300c\u305d\u3046\u3060\u3001•\u4eac\u90fd\u300d•</data>
|
||||
#乗車率90%程度だろうか。 -> 乗車•率•90%•程度だ•ろうか。•
|
||||
<data>•\u4e57\u8eca•\u7387•\uff19\uff10\uff05•\u7a0b\u5ea6\u3060•\u308d\u3046\u304b\u3002•</data>
|
||||
#[携帯電話]正しい選択 -> [携帯•電話]•正しい•選択•
|
||||
<data>•\uff3b\u643a\u5e2f•\u96fb\u8a71\uff3d•\u6b63\u3057\u3044•\u9078\u629e•</data>
|
||||
#純金製百人一首にサッカーボール -> 純金•製•百人一首に•サッカーボール
|
||||
<data>•\u7D14\u91D1•\u88FD•\u767E\u4EBA\u4E00\u9996\u306B•\u30B5\u30C3\u30AB\u30FC\u30DC\u30FC\u30EB•</data>
|
||||
|
||||
|
||||
####################################################################################
|
||||
#
|
||||
|
|
Loading…
Add table
Reference in a new issue