diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/BurmeseBreakEngine.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/BurmeseBreakEngine.java index e9f0299c765..12352e4dc47 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/BurmeseBreakEngine.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/BurmeseBreakEngine.java @@ -77,7 +77,7 @@ public class BurmeseBreakEngine extends DictionaryBreakEngine { @Override public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd, - DequeI foundBreaks) { + DequeI foundBreaks, boolean isPhraseBreaking) { if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD) { diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/CjkBreakEngine.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/CjkBreakEngine.java index 0404e031cc2..06b93683771 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/CjkBreakEngine.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/CjkBreakEngine.java @@ -14,18 +14,31 @@ import static com.ibm.icu.impl.CharacterIteration.next32; import java.io.IOException; import java.text.CharacterIterator; +import java.util.HashSet; import com.ibm.icu.impl.Assert; +import com.ibm.icu.impl.ICUData; import com.ibm.icu.text.Normalizer; import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; +import com.ibm.icu.util.UResourceBundle; +import com.ibm.icu.util.UResourceBundleIterator; public class CjkBreakEngine extends DictionaryBreakEngine { private UnicodeSet fHangulWordSet; + private UnicodeSet fNumberOrOpenPunctuationSet; + private UnicodeSet fClosePunctuationSet; private DictionaryMatcher fDictionary = null; + private HashSet fSkipSet; public CjkBreakEngine(boolean korean) throws IOException { fHangulWordSet = new UnicodeSet("[\\uac00-\\ud7a3]"); fHangulWordSet.freeze(); + fNumberOrOpenPunctuationSet = new UnicodeSet("[[:Nd:][:Pi:][:Ps:]]"); + fNumberOrOpenPunctuationSet.freeze(); + fClosePunctuationSet = new UnicodeSet("[[:Pc:][:Pd:][:Pe:][:Pf:][:Po:]]"); + fClosePunctuationSet.freeze(); + fSkipSet = new HashSet(); fDictionary = DictionaryData.loadDictionaryFor("Hira"); if (korean) { @@ -33,6 +46,33 @@ public class CjkBreakEngine extends DictionaryBreakEngine { } else { //Chinese and Japanese UnicodeSet cjSet = new UnicodeSet("[[:Han:][:Hiragana:][:Katakana:]\\u30fc\\uff70\\uff9e\\uff9f]"); setCharacters(cjSet); + initializeJapanesePhraseParamater(); + } + } + + private void initializeJapanesePhraseParamater() { + loadJapaneseParticleAndAuxVerbs(); + loadHiragana(); + } + + private void loadJapaneseParticleAndAuxVerbs() { + UResourceBundle rb = UResourceBundle.getBundleInstance(ICUData.ICU_BRKITR_BASE_NAME, "ja"); + final String[] tags = {"particles", "auxVerbs"}; + for (String tag : tags) { + UResourceBundle bundle = rb.get(tag); + UResourceBundleIterator iterator = bundle.getIterator(); + while (iterator.hasNext()) { + fSkipSet.add(iterator.nextString()); + } + } + } + + private void loadHiragana() { + UnicodeSet hiraganaWordSet = new UnicodeSet("[:Hiragana:]"); + hiraganaWordSet.freeze(); + UnicodeSetIterator iterator = new UnicodeSetIterator(hiraganaWordSet); + while (iterator.next()) { + fSkipSet.add(iterator.getString()); } } @@ -66,7 +106,7 @@ public class CjkBreakEngine extends DictionaryBreakEngine { @Override public int divideUpDictionaryRange(CharacterIterator inText, int startPos, int endPos, - DequeI foundBreaks) { + DequeI foundBreaks, boolean isPhraseBreaking) { if (startPos >= endPos) { return 0; } @@ -196,6 +236,25 @@ public class CjkBreakEngine extends DictionaryBreakEngine { if (bestSnlp[numCodePts] == kint32max) { t_boundary[numBreaks] = numCodePts; numBreaks++; + } else if (isPhraseBreaking) { + t_boundary[numBreaks] = numCodePts; + numBreaks++; + int prevIdx = numCodePts; + int codeUnitIdx = 0, length = 0; + for (int i = prev[numCodePts]; i > 0; i = prev[i]) { + codeUnitIdx = prenormstr.offsetByCodePoints(0, i); + length = prevIdx - i; + prevIdx = i; + String pattern = getPatternFromText(text, s, codeUnitIdx, length); + // Keep the breakpoint if the pattern is not in the fSkipSet and continuous Katakana + // characters don't occur. + text.setIndex(codeUnitIdx - 1); + if (!fSkipSet.contains(pattern) + && (!isKatakana(current32(text)) || !isKatakana(next32(text)))) { + t_boundary[numBreaks] = i; + numBreaks++; + } + } } else { for (int i = numCodePts; i > 0; i = prev[i]) { t_boundary[numBreaks] = i; @@ -212,19 +271,50 @@ public class CjkBreakEngine extends DictionaryBreakEngine { int previous = -1; for (int i = numBreaks - 1; i >= 0; i--) { int pos = charPositions[t_boundary[i]] + startPos; - if (pos > previous && pos != startPos) { - foundBreaks.push(pos); - correctedNumBreaks++; + // In phrase breaking, there has to be a breakpoint between Cj character and close + // punctuation. + // E.g.[携帯電話]正しい選択 -> [携帯▁電話]▁正しい▁選択 -> breakpoint between ] and 正 + if (pos > previous) { + if (pos != startPos + || (isPhraseBreaking && pos > 0 + && fClosePunctuationSet.contains(inText.setIndex(pos - 1)))) { + foundBreaks.push(charPositions[t_boundary[i]] + startPos); + correctedNumBreaks++; + } } previous = pos; } if (!foundBreaks.isEmpty() && foundBreaks.peek() == endPos) { - foundBreaks.pop(); - correctedNumBreaks--; + // In phrase breaking, there has to be a breakpoint between Cj character and + // the number/open punctuation. + // E.g. る文字「そうだ、京都」->る▁文字▁「そうだ、▁京都」-> breakpoint between 字 and「 + // E.g. 乗車率90%程度だろうか -> 乗車▁率▁90%▁程度だ▁ろうか -> breakpoint between 率 and 9 + if (isPhraseBreaking) { + if (!fNumberOrOpenPunctuationSet.contains(inText.setIndex(endPos))) { + foundBreaks.pop(); + correctedNumBreaks--; + } + } else { + foundBreaks.pop(); + correctedNumBreaks--; + } } if (!foundBreaks.isEmpty()) inText.setIndex(foundBreaks.peek()); return correctedNumBreaks; } + + private String getPatternFromText(CharacterIterator text, StringBuffer sb, int start, + int length) { + sb.setLength(0); + if(length > 0) { + text.setIndex(start); + sb.appendCodePoint(current32(text)); + for (int j = 1; j < length; j++) { + sb.appendCodePoint(next32(text)); + } + } + return sb.toString(); + } } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/DictionaryBreakEngine.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/DictionaryBreakEngine.java index 208d1de44c3..443badcdb72 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/DictionaryBreakEngine.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/DictionaryBreakEngine.java @@ -183,7 +183,7 @@ public abstract class DictionaryBreakEngine implements LanguageBreakEngine { @Override public int findBreaks(CharacterIterator text, int startPos, int endPos, - DequeI foundBreaks) { + DequeI foundBreaks, boolean isPhraseBreaking) { int result = 0; // Find the span of characters included in the set. @@ -202,7 +202,7 @@ public abstract class DictionaryBreakEngine implements LanguageBreakEngine { rangeStart = start; rangeEnd = current; - result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks); + result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks, isPhraseBreaking); text.setIndex(current); return result; @@ -226,5 +226,6 @@ public abstract class DictionaryBreakEngine implements LanguageBreakEngine { abstract int divideUpDictionaryRange(CharacterIterator text, int rangeStart, int rangeEnd, - DequeI foundBreaks ); + DequeI foundBreaks, + boolean isPhraseBreaking); } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/KhmerBreakEngine.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/KhmerBreakEngine.java index 02401f8ed67..bd3fa9f2ded 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/KhmerBreakEngine.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/KhmerBreakEngine.java @@ -85,7 +85,7 @@ public class KhmerBreakEngine extends DictionaryBreakEngine { @Override public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd, - DequeI foundBreaks) { + DequeI foundBreaks, boolean isPhraseBreaking) { if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) { return 0; // Not enough characters for word diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/LSTMBreakEngine.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/LSTMBreakEngine.java index 7028a865598..267ada824ac 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/LSTMBreakEngine.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/LSTMBreakEngine.java @@ -343,7 +343,7 @@ public class LSTMBreakEngine extends DictionaryBreakEngine { @Override public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd, - DequeI foundBreaks) { + DequeI foundBreaks, boolean isPhraseBreaking) { int beginSize = foundBreaks.size(); if ((rangeEnd - rangeStart) < MIN_WORD_SPAN) { diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/LanguageBreakEngine.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/LanguageBreakEngine.java index ede94b76dad..ca8e8e6c57e 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/LanguageBreakEngine.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/LanguageBreakEngine.java @@ -32,7 +32,7 @@ public interface LanguageBreakEngine { * @return the number of breaks found */ int findBreaks(CharacterIterator text, int startPos, int endPos, - DictionaryBreakEngine.DequeI foundBreaks); + DictionaryBreakEngine.DequeI foundBreaks, boolean isPhraseBreaking); } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/LaoBreakEngine.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/LaoBreakEngine.java index 95a8ef3762e..e60271aa5a4 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/LaoBreakEngine.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/LaoBreakEngine.java @@ -85,7 +85,7 @@ public class LaoBreakEngine extends DictionaryBreakEngine { @Override public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd, - DequeI foundBreaks) { + DequeI foundBreaks, boolean isPhraseBreaking) { if ((rangeEnd - rangeStart) < LAO_MIN_WORD) { diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/ThaiBreakEngine.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/ThaiBreakEngine.java index 71ba5096e5a..3f96705109b 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/ThaiBreakEngine.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/ThaiBreakEngine.java @@ -96,7 +96,7 @@ public class ThaiBreakEngine extends DictionaryBreakEngine { @Override public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd, - DequeI foundBreaks) { + DequeI foundBreaks, boolean isPhraseBreaking) { if ((rangeEnd - rangeStart) < THAI_MIN_WORD_SPAN) { return 0; // Not enough characters for word diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/UnhandledBreakEngine.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/UnhandledBreakEngine.java index b00cca0815b..cd5fb092e45 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/UnhandledBreakEngine.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/UnhandledBreakEngine.java @@ -44,7 +44,7 @@ public final class UnhandledBreakEngine implements LanguageBreakEngine { @Override public int findBreaks(CharacterIterator text, int startPos, int endPos, - DictionaryBreakEngine.DequeI foundBreaks) { + DictionaryBreakEngine.DequeI foundBreaks, boolean isPhraseBreaking) { UnicodeSet uniset = fHandled; int c = CharacterIteration.current32(text); diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/BreakIteratorFactory.java b/icu4j/main/classes/core/src/com/ibm/icu/text/BreakIteratorFactory.java index 2594c1b1347..3de520597aa 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/BreakIteratorFactory.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/BreakIteratorFactory.java @@ -129,17 +129,25 @@ final class BreakIteratorFactory extends BreakIterator.BreakIteratorServiceShim // Get the binary rules. // ByteBuffer bytes = null; - String typeKeyExt = null; + String typeKeyExt = ""; if (kind == BreakIterator.KIND_LINE) { - String lbKeyValue = locale.getKeywordValue("lb"); - if ( lbKeyValue != null && (lbKeyValue.equals("strict") || lbKeyValue.equals("normal") || lbKeyValue.equals("loose")) ) { - typeKeyExt = "_" + lbKeyValue; + String keyValue = locale.getKeywordValue("lb"); + if ( keyValue != null && (keyValue.equals("strict") || keyValue.equals("normal") || keyValue.equals("loose")) ) { + typeKeyExt = "_" + keyValue; + } + String language = locale.getLanguage(); + if (language != null && language.equals("ja")) { + keyValue = locale.getKeywordValue("lw"); + if (keyValue != null && keyValue.equals("phrase")) { + typeKeyExt += "_" + keyValue; + } } } + String brkfname; try { - String typeKey = (typeKeyExt == null)? KIND_NAMES[kind]: KIND_NAMES[kind] + typeKeyExt; - String brkfname = rb.getStringWithFallback("boundaries/" + typeKey); + String typeKey = typeKeyExt.isEmpty() ? KIND_NAMES[kind] : KIND_NAMES[kind] + typeKeyExt; + brkfname = rb.getStringWithFallback("boundaries/" + typeKey); String rulesFileName = ICUData.ICU_BRKITR_NAME+ '/' + brkfname; bytes = ICUBinary.getData(rulesFileName); } @@ -151,7 +159,8 @@ final class BreakIteratorFactory extends BreakIterator.BreakIteratorServiceShim // Create a normal RuleBasedBreakIterator. // try { - iter = RuleBasedBreakIterator.getInstanceFromCompiledRules(bytes); + boolean isPhraseBreaking = (brkfname != null) && brkfname.contains("phrase"); + iter = RuleBasedBreakIterator.getInstanceFromCompiledRules(bytes, isPhraseBreaking); } catch (IOException e) { // Shouldn't be possible to get here. diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java index 6bf2a264136..507677579fc 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java @@ -84,6 +84,32 @@ public class RuleBasedBreakIterator extends BreakIterator { return This; } + /** + * This factory method doesn't have an access modifier; it is only accessible in the same + * package. + * + * Create a break iterator from a precompiled set of break rules. + * + * Creating a break iterator from the binary rules is much faster than + * creating one from source rules. + * + * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function. + * Binary break iterator rules are not guaranteed to be compatible between + * different versions of ICU. + * + * @param bytes a buffer supplying the compiled binary rules. + * @param phraseBreaking a flag indicating if phrase breaking is required. + * @throws IOException if there is an error while reading the rules from the buffer. + * @see #compileRules(String, OutputStream) + * @internal + */ + /* package-potected */ static RuleBasedBreakIterator getInstanceFromCompiledRules( + ByteBuffer bytes, boolean phraseBreaking) throws IOException { + RuleBasedBreakIterator instance = getInstanceFromCompiledRules(bytes); + instance.fPhraseBreaking = phraseBreaking; + return instance; + } + /** * Create a break iterator from a precompiled set of break rules. * @@ -274,6 +300,11 @@ public class RuleBasedBreakIterator extends BreakIterator { */ private BreakCache fBreakCache = new BreakCache(); + /** + * Flag used to indicate if phrase breaking is required. + */ + private boolean fPhraseBreaking = false; + /** * Counter for the number of characters encountered with the "dictionary" @@ -1205,7 +1236,7 @@ public class RuleBasedBreakIterator extends BreakIterator { // Ask the language object if there are any breaks. It will add them to the cache and // leave the text pointer on the other side of its range, ready to search for the next one. if (lbe != null) { - foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreaks); + foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreaks, fPhraseBreaking); } // Reload the loop variables for the next go-round diff --git a/icu4j/main/shared/data/icudata.jar b/icu4j/main/shared/data/icudata.jar index 84083a72a13..2840f866456 100644 --- a/icu4j/main/shared/data/icudata.jar +++ b/icu4j/main/shared/data/icudata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:44951f88294c06e433a3b61238d9bb5f59ba01f091fcfb8fe4966f98f0748ef7 -size 13627084 +oid sha256:65125c8b8176c083a7597fed4c895fa263a185593bda5309753b95e8a5ec0dda +size 13650605 diff --git a/icu4j/main/shared/data/icutzdata.jar b/icu4j/main/shared/data/icutzdata.jar index b4c58beb7ab..69bf00a16b8 100644 --- a/icu4j/main/shared/data/icutzdata.jar +++ b/icu4j/main/shared/data/icutzdata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d13d3b8e7c58f0e41e4b6ff6f2bfa43529de382ecf2c1e3944429b1c1a761361 -size 96439 +oid sha256:31a470c8a209305fd98faf5ed0f20bf79cf57cfcb2281041b20d98ad742c7b5e +size 96440 diff --git a/icu4j/main/shared/data/testdata.jar b/icu4j/main/shared/data/testdata.jar index 8dc53bbbf05..4728fd9d4c2 100644 --- a/icu4j/main/shared/data/testdata.jar +++ b/icu4j/main/shared/data/testdata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cf33f21346eea88c0282a4960f19f27e475554449f52ef4f25889e2b8a34a1c0 -size 826063 +oid sha256:2c951a44c5d9726ea4532cb840309d8503c380094b7fd0e56b96094187ce0a24 +size 826064 diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/LSTMBreakEngineTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/LSTMBreakEngineTest.java index 8d248a551a2..1cedfd8e91d 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/LSTMBreakEngineTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/LSTMBreakEngineTest.java @@ -80,7 +80,7 @@ public class LSTMBreakEngineTest extends TestFmwk { int length = fields[1].length(); CharacterIterator input = new StringCharacterIterator(fields[1]); DictionaryBreakEngine.DequeI foundBreaks = new DictionaryBreakEngine.DequeI(); - int ret = engine.findBreaks(input, 0, length, foundBreaks); + int ret = engine.findBreaks(input, 0, length, foundBreaks, false); StringBuilder sb = new StringBuilder(); sb.append('{'); for (int i = 0; i < foundBreaks.size(); i++) { diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt index 1948360277d..346da988d7d 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt @@ -1884,6 +1884,21 @@ Bangkok)• # woman astronaut, woman astronaut / fitz4 •\U0001F469\u200D\U0001F680•\U0001F469\U0001F3FD\u200D\U0001F680\u0020• + + +#[京都観光]時雨殿に行った。-> [京都•観光]•時雨•殿に•行った。• +•\uff3b\u4eac\u90fd•\u89b3\u5149\uff3d•\u6642\u96e8•\u6bbf\u306b•\u884c\u3063\u305f\u3002• +#9月に東京から友達が遊びに来た -> 9月に•東京から•友達が•遊びに•来た +•\uff19\u6708\u306b•\u6771\u4eac\u304b\u3089•\u53cb\u9054\u304c•\u904a\u3073\u306b•\u6765\u305f• +#る文字「そうだ、京都」-> る•文字•「そうだ、•京都」• +•\u308b•\u6587\u5b57•\u300c\u305d\u3046\u3060\u3001•\u4eac\u90fd\u300d• +#乗車率90%程度だろうか。 -> 乗車•率•90%•程度だ•ろうか。• +•\u4e57\u8eca•\u7387•\uff19\uff10\uff05•\u7a0b\u5ea6\u3060•\u308d\u3046\u304b\u3002• +#[携帯電話]正しい選択 -> [携帯•電話]•正しい•選択• +•\uff3b\u643a\u5e2f•\u96fb\u8a71\uff3d•\u6b63\u3057\u3044•\u9078\u629e• +#純金製百人一首にサッカーボール -> 純金•製•百人一首に•サッカーボール +•\u7D14\u91D1•\u88FD•\u767E\u4EBA\u4E00\u9996\u306B•\u30B5\u30C3\u30AB\u30FC\u30DC\u30FC\u30EB• + #################################################################################### #