ICU-22100 Incorporate BudouX into ICU (Java)

See #2214
This commit is contained in:
allenwtsu 2022-12-20 16:34:42 +00:00 committed by Markus Scherer
parent 44480c4ba4
commit 90caafbcd4
8 changed files with 554 additions and 7 deletions

View file

@ -190,6 +190,38 @@ jobs:
[ -d icu4j/out/junit-results ] && cd icu4j && cat `find out/junit-results -name "*.txt" -exec grep -l FAILED {} \;`;
if: ${{ failure() }}
# ICU4J build and unit test under adaboost
adaboost-icu4j-build-and-test:
runs-on: ubuntu-latest
steps:
# Check out the repository with Git LFS enabled, then pull the LFS objects explicitly.
- name: Checkout and setup
uses: actions/checkout@v2
with:
lfs: true
- name: Checkout lfs objects
run: git lfs pull
# Install a Temurin JDK 11 for the Ant targets run below.
- uses: actions/setup-java@v3
with:
distribution: 'temurin'
java-version: '11'
# Configure ICU4C with UCONFIG_USE_ML_PHRASE_BREAKING=1 and the adaboost.json data filter,
# then run the icu4j-data-install target so the rebuilt data lands in the icu4j tree.
- name: Config Adaboost and Rebuild data jar
run: |
cd icu4c/source;
ICU_DATA_BUILDTOOL_OPTS=--include_uni_core_data ICU_DATA_FILTER_FILE=../../.github/adaboost.json CPPFLAGS=-DUCONFIG_USE_ML_PHRASE_BREAKING=1 ./runConfigureICU --enable-debug --disable-release Linux -disable-layoutex;
make clean;
make -j2 ICU4J_ROOT=../../../icu4j icu4j-data-install;
cd ../..
# Run the ICU4J "check" target with com.ibm.icu.impl.breakiter.useMLPhraseBreaking=true,
# followed by the localespiCheck target.
- name: ICU4J
run: |
cd icu4j;
ant init;
ant -Dcom.ibm.icu.impl.breakiter.useMLPhraseBreaking=true check;
ant localespiCheck
# Runs only when an earlier step failed: print every JUnit result file containing "FAILED".
- name: List failures (if any)
run: |
[ -d icu4j/out/junit-results ] && cd icu4j && cat `find out/junit-results -name "*.txt" -exec grep -l FAILED {} \;`;
if: ${{ failure() }}
# gcc debug build.
# Includes dependency checker.
# Note - the dependency checker needs to be run on both a debug and an optimized build.

View file

@ -338,11 +338,13 @@
<!--set the property - if it was set before it won't override-->
<property name="user-jvm-options" value=""/>
<property name="internal-jvm-options" value=""/>
<property name="com.ibm.icu.impl.breakiter.useMLPhraseBreaking" value=""/>
<delete dir="${junit.out.dir}/@{test-name}"/>
<mkdir dir="${junit.out.dir}/@{test-name}"/>
<junit fork="yes" forkmode="once" printsummary="yes" haltonfailure="no"
failureproperty="@{failure-status}" tempdir="${junit.out.dir}">
<sysproperty key="com.ibm.icu.impl.breakiter.useMLPhraseBreaking" value="${com.ibm.icu.impl.breakiter.useMLPhraseBreaking}" />
<jvmarg value="-Xss4m"/>
<jvmarg value="-ea"/>
<jvmarg value="-Djava.awt.headless=true"/>

View file

@ -63,3 +63,9 @@ com.ibm.icu.impl.ICUResourceBundle.skipRuntimeLocaleResourceScan = false
# LocaleDisplayNames implementation class
# @internal
# com.ibm.icu.text.LocaleDisplayNames.impl = com.ibm.icu.impl.LocaleDisplayNamesImpl
#
# [Internal Use Only]
# Enable ML phrase breaking
# @internal
com.ibm.icu.impl.breakiter.useMLPhraseBreaking = false

View file

@ -76,7 +76,7 @@ public class ICUConfig {
val = System.getProperty(name);
}
if (val == null) {
if (val == null || val.equals("")) {
val = CONFIG_PROPS.getProperty(name, def);
}
return val;

View file

@ -18,6 +18,7 @@ import java.text.CharacterIterator;
import java.util.HashSet;
import com.ibm.icu.impl.Assert;
import com.ibm.icu.impl.ICUConfig;
import com.ibm.icu.impl.ICUData;
import com.ibm.icu.text.Normalizer;
import com.ibm.icu.text.UnicodeSet;
@ -31,6 +32,8 @@ public class CjkBreakEngine extends DictionaryBreakEngine {
private UnicodeSet fClosePunctuationSet;
private DictionaryMatcher fDictionary = null;
private HashSet<String> fSkipSet;
private MlBreakEngine fMlBreakEngine;
private boolean isCj = false;
public CjkBreakEngine(boolean korean) throws IOException {
fHangulWordSet = new UnicodeSet("[\\uac00-\\ud7a3]");
@ -47,9 +50,16 @@ public class CjkBreakEngine extends DictionaryBreakEngine {
if (korean) {
setCharacters(fHangulWordSet);
} else { //Chinese and Japanese
isCj = true;
UnicodeSet cjSet = new UnicodeSet("[[:Han:][:Hiragana:][:Katakana:]\\u30fc\\uff70\\uff9e\\uff9f]");
setCharacters(cjSet);
initializeJapanesePhraseParamater();
if (Boolean.parseBoolean(
ICUConfig.get("com.ibm.icu.impl.breakiter.useMLPhraseBreaking", "false"))) {
fMlBreakEngine = new MlBreakEngine(fDigitOrOpenPunctuationOrAlphabetSet,
fClosePunctuationSet);
} else {
initializeJapanesePhraseParamater();
}
}
}
@ -151,6 +161,15 @@ public class CjkBreakEngine extends DictionaryBreakEngine {
charPositions[numCodePts] = index;
}
}
// Use ML phrase breaking
if (Boolean.parseBoolean(
ICUConfig.get("com.ibm.icu.impl.breakiter.useMLPhraseBreaking", "false"))) {
// PhraseBreaking is supported in ja and ko; MlBreakEngine only supports ja.
if (isPhraseBreaking && isCj) {
return fMlBreakEngine.divideUpRange(inText, startPos, endPos, text,
numCodePts, charPositions, foundBreaks);
}
}
// From here on out, do the algorithm. Note that our indices
// refer to indices within the normalized string.
@ -276,10 +295,11 @@ public class CjkBreakEngine extends DictionaryBreakEngine {
// In phrase breaking, there has to be a breakpoint between Cj character and close
// punctuation.
// E.g.［携帯電話］正しい選択 -> ［携帯▁電話］▁正しい▁選択 -> breakpoint between ］ and 正
inText.setIndex(pos);
if (pos > previous) {
if (pos != startPos
|| (isPhraseBreaking && pos > 0
&& fClosePunctuationSet.contains(inText.setIndex(pos - 1)))) {
&& fClosePunctuationSet.contains(previous32(inText)))) {
foundBreaks.push(charPositions[t_boundary[i]] + startPos);
correctedNumBreaks++;
}
@ -294,7 +314,9 @@ public class CjkBreakEngine extends DictionaryBreakEngine {
// E.g. 乗車率９０％程度だろうか -> 乗車▁率▁９０％▁程度だろうか -> breakpoint between 率 and ９
// E.g. しかもロゴがＵｎｉｃｏｄｅ！ -> しかも▁ロゴが▁Ｕｎｉｃｏｄｅ！ -> breakpoint between が and Ｕ
if (isPhraseBreaking) {
if (!fDigitOrOpenPunctuationOrAlphabetSet.contains(inText.setIndex(endPos))) {
inText.setIndex(endPos);
int current = current32(inText);
if (current != DONE32 && !fDigitOrOpenPunctuationOrAlphabetSet.contains(current)) {
foundBreaks.pop();
correctedNumBreaks--;
}

View file

@ -0,0 +1,436 @@
// © 2022 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package com.ibm.icu.impl.breakiter;
import static com.ibm.icu.impl.CharacterIteration.DONE32;
import static com.ibm.icu.impl.CharacterIteration.current32;
import static com.ibm.icu.impl.CharacterIteration.next32;
import static com.ibm.icu.impl.CharacterIteration.previous32;
import com.ibm.icu.impl.Assert;
import com.ibm.icu.impl.ICUData;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.UResourceBundle;
import com.ibm.icu.util.UResourceBundleIterator;
import java.lang.System;
import java.text.CharacterIterator;
import java.util.ArrayList;
import java.util.HashMap;
/**
 * Phrase break engine for Chinese/Japanese text that scores candidate break positions with a
 * weighted feature model instead of a dictionary.
 *
 * <p>The model is read from the {@code "jaml"} break-iterator resource bundle as a map from
 * feature string to integer weight. For every code point boundary a window of six code points
 * (three on each side, roughly: slots 0-2 before and slots 3-5 from the candidate onward) is
 * turned into character n-gram and Unicode-block n-gram features. The boundary is accepted when
 * {@code -(sum of all weights) + 2 * (sum of weights of the features present)} is positive.
 */
public class MlBreakEngine {
    /** Sentinel code point stored in a window slot that holds no character. */
    private static final int INVALID = '|';
    /** Sentinel Unicode-block string for a slot that has no usable block. */
    private static final String INVALID_STRING = "|";
    /** Number of code points in the sliding window. */
    private static final int WINDOW_SIZE = 6;

    private final UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet;
    private final UnicodeSet fClosePunctuationSet;
    /** Feature string -> weight, as loaded from the resource bundle. */
    private final HashMap<String, Integer> fModel;
    /** Negated sum of every weight in {@link #fModel}; the starting score of each evaluation. */
    private int fNegativeSum;

    /** One slot of the sliding window: a code point and its zero-padded Unicode block code. */
    static class Element {
        private int character;
        private String ublock;

        /**
         * Default constructor.
         */
        public Element() {
            character = 0;
            ublock = null;
        }

        /**
         * Set the character and its unicode block.
         *
         * @param ch A unicode character.
         * @param str The unicode block of the character; at most three chars long.
         */
        public void setCharAndUblock(int ch, String str) {
            Assert.assrt(str.length() <= 3);
            this.character = ch;
            ublock = str;
        }

        /**
         * Get the unicode character.
         *
         * @return The unicode character.
         */
        public int getCharacter() {
            return character;
        }

        /**
         * Get the unicode character's unicode block.
         *
         * @return The unicode block.
         */
        public String getUblock() {
            return ublock;
        }
    }

    /**
     * Tell whether the element carries a usable Unicode block, i.e. its block string is not the
     * one-character {@code "|"} sentinel.
     */
    private static boolean isValid(Element element) {
        String ublock = element.getUblock();
        return ublock.length() != 1 || (int) ublock.charAt(0) != INVALID;
    }

    /**
     * Constructor for Chinese and Japanese phrase breaking.
     *
     * @param digitOrOpenPunctuationOrAlphabetSet A Unicode set with the digit and open punctuation
     *                                            and alphabet.
     * @param closePunctuationSet                 A Unicode set with the close punctuation.
     */
    public MlBreakEngine(UnicodeSet digitOrOpenPunctuationOrAlphabetSet,
            UnicodeSet closePunctuationSet) {
        fDigitOrOpenPunctuationOrAlphabetSet = digitOrOpenPunctuationOrAlphabetSet;
        fClosePunctuationSet = closePunctuationSet;
        fModel = new HashMap<String, Integer>();
        fNegativeSum = 0;
        loadMLModel();
    }

    /**
     * Divide up a range of characters handled by this break engine.
     *
     * @param inText        A input text.
     * @param startPos      The start index of the input text.
     * @param endPos        The end index of the input text.
     * @param inString      A input string normalized from inText from startPos to endPos
     * @param numCodePts    The number of code points of inString
     * @param charPositions A map that transforms inString's code point index to code unit index.
     * @param foundBreaks   A list to store the breakpoint.
     * @return The number of breakpoints
     */
    public int divideUpRange(CharacterIterator inText, int startPos, int endPos,
            CharacterIterator inString, int numCodePts, int[] charPositions,
            DictionaryBreakEngine.DequeI foundBreaks) {
        if (startPos >= endPos) {
            return 0;
        }
        ArrayList<Integer> boundary = new ArrayList<Integer>(numCodePts);
        // The ML model groups six char to evaluate if the 4th char is a breakpoint.
        // Like a sliding window, the elementList removes the first char and appends the new char
        // from inString in each iteration so that its size always remains at six.
        Element[] elementList = new Element[WINDOW_SIZE];
        initElementList(inString, elementList, numCodePts);

        // Add a break for the start.
        boundary.add(0);
        for (int i = 1; i < numCodePts; i++) {
            evaluateBreakpoint(elementList, i, boundary);
            // Nothing is evaluated after the last code point, so do not slide the window again.
            if (i + 1 >= numCodePts) {
                break;
            }
            shiftLeftOne(elementList);
            // initElementList already consumed code points 0..3, so the code point entering the
            // window for the next evaluation is i + 3; past the end the slot becomes a sentinel.
            int ch = (i + 3) < numCodePts ? next32(inString) : INVALID;
            String ublock = (ch != INVALID) ? getUnicodeBlock(ch) : INVALID_STRING;
            elementList[WINDOW_SIZE - 1].setCharAndUblock(ch, ublock);
        }

        // Add a break for the end if there is not one there already.
        if (boundary.get(boundary.size() - 1) != numCodePts) {
            boundary.add(numCodePts);
        }

        int correctedNumBreaks = 0;
        int previous = -1;
        int numBreaks = boundary.size();
        for (int i = 0; i < numBreaks; i++) {
            // Map the code point index in inString back to a code unit index in inText.
            int pos = charPositions[boundary.get(i)] + startPos;
            // In phrase breaking, there has to be a breakpoint between Cj character and close
            // punctuation.
            // E.g.［携帯電話］正しい選択 -> ［携帯▁電話］▁正しい▁選択 -> breakpoint between ］ and 正
            inText.setIndex(pos);
            if (pos > previous) {
                if (pos != startPos
                        || (pos > 0
                        && fClosePunctuationSet.contains(previous32(inText)))) {
                    foundBreaks.push(pos);
                    correctedNumBreaks++;
                }
            }
            previous = pos;
        }

        if (!foundBreaks.isEmpty() && foundBreaks.peek() == endPos) {
            // In phrase breaking, there has to be a breakpoint between Cj character and
            // the number/open punctuation.
            // E.g. る文字「そうだ、京都」 -> る▁文字▁「そうだ、▁京都」 -> breakpoint between 字 and 「
            // E.g. 乗車率９０％程度だろうか -> 乗車▁率▁９０％▁程度だろうか -> breakpoint between 率 and ９
            // E.g. しかもロゴがＵｎｉｃｏｄｅ！ -> しかも▁ロゴが▁Ｕｎｉｃｏｄｅ！ -> breakpoint between が and Ｕ
            inText.setIndex(endPos);
            int current = current32(inText);
            // Drop the trailing break unless the text continues with a digit, open punctuation
            // or alphabetic character (or there is no following character at all).
            if (current != DONE32 && !fDigitOrOpenPunctuationOrAlphabetSet.contains(current)) {
                foundBreaks.pop();
                correctedNumBreaks--;
            }
        }
        if (!foundBreaks.isEmpty()) {
            inText.setIndex(foundBreaks.peek());
        }
        return correctedNumBreaks;
    }

    /**
     * Slide the window one slot to the left by copying each slot's contents into its left
     * neighbour. The last slot keeps its old contents until the caller overwrites it.
     */
    private void shiftLeftOne(Element[] elementList) {
        int length = elementList.length;
        for (int i = 1; i < length; i++) {
            elementList[i - 1].character = elementList[i].character;
            elementList[i - 1].ublock = elementList[i].ublock;
        }
    }

    /**
     * Evaluate whether the index is a potential breakpoint.
     *
     * <p>Feature keys have the form {@code <U|B|T><W|B><n>:<payload>}: uni-, bi- and trigrams
     * over either the code points (W) or the Unicode block codes (B) of consecutive window
     * slots. A feature is only generated when every slot it covers is populated.
     *
     * @param elementList A list including six elements for the breakpoint evaluation.
     * @param index       The breakpoint index to be evaluated.
     * @param boundary    A list including the index of the breakpoint; {@code index} is appended
     *                    when the score is positive.
     */
    private void evaluateBreakpoint(Element[] elementList, int index, ArrayList<Integer> boundary) {
        StringBuilder key = new StringBuilder();
        int score = fNegativeSum;
        // Unigrams UW1..UW6 / UB1..UB6: one per window slot.
        for (int n = 1; n <= 6; n++) {
            score += characterFeatureWeight(key, "UW", n, elementList, n - 1, 1);
            score += blockFeatureWeight(key, "UB", n, elementList, n - 1, 1);
        }
        // Bigrams BW1..BW3 / BB1..BB3: slots (1,2), (2,3), (3,4).
        for (int n = 1; n <= 3; n++) {
            score += characterFeatureWeight(key, "BW", n, elementList, n, 2);
            score += blockFeatureWeight(key, "BB", n, elementList, n, 2);
        }
        // Trigrams TW1..TW4 / TB1..TB4: slots (0,1,2) through (3,4,5).
        for (int n = 1; n <= 4; n++) {
            score += characterFeatureWeight(key, "TW", n, elementList, n - 1, 3);
            score += blockFeatureWeight(key, "TB", n, elementList, n - 1, 3);
        }
        if (score > 0) {
            boundary.add(index);
        }
    }

    /**
     * Build the code point n-gram feature {@code type + ordinal + ":" + code points} over
     * {@code length} slots starting at {@code first} and return its score contribution, or 0 if
     * any covered slot holds the {@link #INVALID} sentinel.
     */
    private int characterFeatureWeight(StringBuilder key, String type, int ordinal,
            Element[] elementList, int first, int length) {
        key.setLength(0);
        key.append(type).append(ordinal).append(':');
        for (int i = first; i < first + length; i++) {
            int ch = elementList[i].getCharacter();
            if (ch == INVALID) {
                return 0;
            }
            key.appendCodePoint(ch);
        }
        return modelContribution(key.toString());
    }

    /**
     * Build the Unicode block n-gram feature {@code type + ordinal + ":" + block codes} over
     * {@code length} slots starting at {@code first} and return its score contribution, or 0 if
     * any covered slot has no usable block.
     */
    private int blockFeatureWeight(StringBuilder key, String type, int ordinal,
            Element[] elementList, int first, int length) {
        key.setLength(0);
        key.append(type).append(ordinal).append(':');
        for (int i = first; i < first + length; i++) {
            if (!isValid(elementList[i])) {
                return 0;
            }
            key.append(elementList[i].getUblock());
        }
        return modelContribution(key.toString());
    }

    /** Return twice the model weight of the feature, or 0 when the model does not contain it. */
    private int modelContribution(String feature) {
        Integer weight = fModel.get(feature);
        return weight == null ? 0 : 2 * weight;
    }

    /**
     * Initialize the element list from the input string.
     *
     * <p>Slots 0 and 1 start empty; slots 2..5 receive the first (up to) four code points of
     * {@code inString}. The iterator is left on the last code point that was read.
     *
     * @param inString    A input string to be segmented.
     * @param elementList A list to store the window's characters and their unicode block codes.
     * @param numCodePts  The number of code points of input string
     * @return The number of code units occupied by the code points that were read (at most
     *         four code points).
     */
    private int initElementList(CharacterIterator inString, Element[] elementList,
            int numCodePts) {
        int[] window = new int[WINDOW_SIZE];
        for (int i = 0; i < WINDOW_SIZE; i++) {
            window[i] = INVALID;
        }
        int index = 0;
        inString.setIndex(index);
        for (int i = 0; i < 4 && i < numCodePts; i++) {
            int ch = (i == 0) ? current32(inString) : next32(inString);
            window[i + 2] = ch;
            // Count the width of the code point just read.
            index += Character.charCount(ch);
        }
        for (int i = 0; i < WINDOW_SIZE; i++) {
            // NOTE(review): for slots 2..5 the block is looked up even when the slot still holds
            // the '|' sentinel (inputs shorter than four code points), so such a slot gets the
            // block code of U+007C rather than INVALID_STRING. This differs from how
            // divideUpRange fills trailing slots; kept as-is because changing it would alter
            // break results -- confirm the intended behaviour.
            String ublock = (i < 2) ? INVALID_STRING : getUnicodeBlock(window[i]);
            elementList[i] = new Element();
            elementList[i].setCharAndUblock(window[i], ublock);
        }
        return index;
    }

    /**
     * Get the character's unicode block code defined in UBlockCode.
     *
     * @param ch A char.
     * @return The unicode block code which is 3 digits with '0' added in the beginning if the code
     * is less than 3 digits, or {@code "|"} when the character has no block.
     */
    private String getUnicodeBlock(int ch) {
        int blockId = UCharacter.UnicodeBlock.of(ch).getID();
        if (blockId == UCharacter.UnicodeBlock.NO_BLOCK.getID()
                || blockId == UCharacter.UnicodeBlock.INVALID_CODE_ID) {
            return INVALID_STRING;
        }
        // Locale.ROOT keeps the digits ASCII; the default-locale overload may emit localized
        // digits, which would never match the model's keys.
        return String.format(java.util.Locale.ROOT, "%03d", blockId);
    }

    /**
     * Load the machine learning's model file.
     *
     * <p>Reads the parallel {@code modelKeys} / {@code modelValues} entries of the {@code "jaml"}
     * bundle into {@link #fModel} and accumulates the negated sum of all weights.
     */
    private void loadMLModel() {
        int index = 0;
        UResourceBundle rb = UResourceBundle.getBundleInstance(ICUData.ICU_BRKITR_BASE_NAME,
                "jaml");
        UResourceBundle keyBundle = rb.get("modelKeys");
        UResourceBundle valueBundle = rb.get("modelValues");
        int[] value = valueBundle.getIntVector();
        UResourceBundleIterator iterator = keyBundle.getIterator();
        while (iterator.hasNext()) {
            fNegativeSum -= value[index];
            fModel.put(iterator.nextString(), value[index++]);
        }
    }
}

View file

@ -20,6 +20,7 @@ import org.junit.runners.JUnit4;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.dev.test.TestUtil;
import com.ibm.icu.impl.ICUConfig;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.BreakIterator;
@ -124,6 +125,7 @@ public void TestExtended() {
int rulesFirstLine = 0; // Line number of the start of current <rules> block
int len = testString.length();
boolean skipTest = false;
for (charIdx = 0; charIdx < len; ) {
int c = testString.codePointAt(charIdx);
@ -157,6 +159,7 @@ public void TestExtended() {
break;
}
if (testString.startsWith("<word>", charIdx-1)) {
skipTest = false;
tp.bi = BreakIterator.getWordInstance(tp.currentLocale);
charIdx += 5;
break;
@ -167,22 +170,46 @@ public void TestExtended() {
break;
}
if (testString.startsWith("<line>", charIdx-1)) {
skipTest = false;
tp.bi = BreakIterator.getLineInstance(tp.currentLocale);
if (Boolean.parseBoolean(
ICUConfig.get("com.ibm.icu.impl.breakiter.useMLPhraseBreaking", "false"))) {
if (tp.currentLocale.getName().equals("ja@lw=phrase")) {
// skip <line> test cases of JP's phrase breaking when ML is enabled.
skipTest = true;
}
}
charIdx += 5;
break;
}
if (testString.startsWith("<lineML>", charIdx-1)) {
skipTest = false;
tp.bi = BreakIterator.getLineInstance(tp.currentLocale);
if (!Boolean.parseBoolean(
ICUConfig.get("com.ibm.icu.impl.breakiter.useMLPhraseBreaking", "false"))) {
if (tp.currentLocale.getName().equals("ja@lw=phrase")) {
// skip <lineML> test cases of JP's phrase breaking when ML is disabled.
skipTest = true;
}
}
charIdx += 7;
break;
}
if (testString.startsWith("<sent>", charIdx-1)) {
skipTest = false;
tp.bi = BreakIterator.getSentenceInstance(tp.currentLocale);
charIdx += 5;
break;
}
if (testString.startsWith("<title>", charIdx-1)) {
skipTest = false;
tp.bi = BreakIterator.getTitleInstance(tp.currentLocale);
charIdx += 6;
break;
}
if (testString.startsWith("<rules>", charIdx-1) ||
testString.startsWith("<badrules>", charIdx-1)) {
skipTest = false;
charIdx = testString.indexOf('>', charIdx) + 1;
parseState = PARSE_RULES;
rules.setLength(0);
@ -272,7 +299,9 @@ public void TestExtended() {
charIdx += 6;
// RUN THE TEST!
executeTest(tp);
if (!skipTest) {
executeTest(tp);
}
break;
}

View file

@ -1913,6 +1913,26 @@ Bangkok)•</data>
<data>•\U0001F469\u200D\U0001F680•\U0001F469\U0001F3FD\u200D\U0001F680\u0020•</data>
<locale ja@lw=phrase>
#phrase breaking test cases for the ML solution
<lineML>
#9月に東京から友達が遊びに来た -> 9月に•東京から•友達が•遊びに•来た•
<data>•\uff19\u6708\u306b•\u6771\u4eac\u304b\u3089•\u53cb\u9054\u304c•\u904a\u3073\u306b•\u6765\u305f•</data>
#る文字「そうだ、京都」-> る•文字•「そうだ、•京都」•
<data>•\u308b•\u6587\u5b57•\u300c\u305d\u3046\u3060\u3001•\u4eac\u90fd\u300d•</data>
#Kana supplement: 𛁈(U+1B048) -> \uD82C\uDC48, 𛀸(U+1B038) -> \uD82C\uDC38, 𛀙(U+1B019)-> \uD82C\uDC19
#𛁈る𛀸(しるこ)、あ𛀙よろし(あかよろし) -> 𛁈る𛀸•(しるこ)、•あ𛀙よろし•(あかよろし)
<data>•\U0001B048\u308B\U0001B038•\uFF08\u3057\u308B\u3053\uFF09\u3001•\u3042\U0001B019\u3088\u308D\u3057•\uFF08\u3042\u304B\u3088\u308D\u3057\uFF09•</data>
#中国の携帯は約500元から5000元です -> 中国の▁携帯は▁約▁500元から▁5000元です
<data>•\u4E2D\u56FD\u306E•\u643A\u5E2F\u306F•\u7D04•\uFF15\uFF10\uFF10\u5143\u304B\u3089•\uFF15\uFF10\uFF10\uFF10\u5143\u3067\u3059•</data>
#しかもロゴがUnicode!! -> しかも▁ロゴが▁Unicode!!
<data>•\u3057\u304B\u3082•\u30ED\u30B4\u304C•\uFF35\uFF4E\uFF49\uFF43\uFF4F\uFF44\uFF45\uFF01\uFF01•</data>
#バッテリーを長持ちさせ、充電を最適化します -> バッテリーを▁長持ちさせ、▁充電を▁最適化します
<data>•\u30D0\u30C3\u30C6\u30EA\u30FC\u3092•\u9577\u6301\u3061\u3055\u305B\u3001•\u5145\u96FB\u3092•\u6700\u9069\u5316\u3057\u307E\u3059•</data>
#データのコピー、スマートフォンでのお支払いなど -> データの▁コピー、▁スマートフォンでの▁お支払いなど
<data>•\u30C7\u30FC\u30BF\u306E•\u30B3\u30D4\u30FC\u3001•\u30B9\u30DE\u30FC\u30C8\u30D5\u30A9\u30F3\u3067\u306E•\u304A\u652F\u6255\u3044\u306A\u3069•</data>
<locale ja@lw=phrase>
#phrase breaking test cases for the dictionary based solution
<line>
#[京都観光]時雨殿に行った。-> [京都•観光]•時雨•殿に•行った。•
<data>•\uff3b\u4eac\u90fd•\u89b3\u5149\uff3d•\u6642\u96e8•\u6bbf\u306b•\u884c\u3063\u305f\u3002•</data>
@ -2005,8 +2025,8 @@ Bangkok)•</data>
#大韓民國은 民主共和國이다
#<data>•大韓民國은 •民主•共和國이다•</data>
# All the tests for ja@lw=phrase should also work in Korean.
#[京都観光]時雨殿に行った。-> [京都•観光]•時雨•殿に•行った。
<data>•\uff3b\u4eac\u90fd•\u89b3\u5149\uff3d•\u6642\u96e8•\u6bbf\u306b•\u884c\u3063\u305f\u3002•</data>
#る文字「そうだ、京都」-> る•文字•「そうだ、•京都」
<data>•\u308b•\u6587\u5b57•\u300c\u305d\u3046\u3060\u3001•\u4eac\u90fd\u300d•</data>
#9月に東京から友達が遊びに来た -> 9月に•東京から•友達が•遊びに•来た•
<data>•\uff19\u6708\u306b•\u6771\u4eac\u304b\u3089•\u53cb\u9054\u304c•\u904a\u3073\u306b•\u6765\u305f•</data>