ICU-22100 Incorporate BudouX into ICU (C++)

This commit is contained in:
Shuhei Iitsuka 2022-07-29 12:08:01 +08:00 committed by Markus Scherer
parent d02b30fc3f
commit b6b7b045e9
18 changed files with 1690 additions and 9 deletions

14
.github/adaboost.json vendored Normal file
View file

@ -0,0 +1,14 @@
// © 2022 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
// Generated using tools/cldr/cldr-to-icu/build-icu-data.xml
//
// Include Japanese adaboost model.
{
"featureFilters": {
"brkitr_adaboost": {
"includelist": [
"jaml"
]
}
}
}

View file

@ -334,6 +334,17 @@ jobs:
make clean;
make -j2 check
# Test adaboost
adaboost-test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- run: |
cd icu4c/source;
ICU_DATA_FILTER_FILE=../../.github/adaboost.json CPPFLAGS=-DUCONFIG_USE_ML_PHRASE_BREAKING=1 ./runConfigureICU --enable-debug --disable-release Linux -disable-layoutex;
make clean;
make -j2 check
# Build and run testmap
testmap:
runs-on: ubuntu-latest

View file

@ -342,6 +342,7 @@ cc_library(
"dictionarydata.cpp",
"filteredbrk.cpp",
"lstmbe.cpp",
"mlbe.cpp",
"rbbi.cpp",
"rbbi_cache.cpp",
"rbbidata.cpp",

View file

@ -88,6 +88,7 @@
<ClCompile Include="brkiter.cpp" />
<ClCompile Include="dictbe.cpp" />
<ClCompile Include="lstmbe.cpp" />
<ClCompile Include="mlbe.cpp" />
<ClCompile Include="pluralmap.cpp" />
<ClCompile Include="rbbi.cpp" />
<ClCompile Include="rbbidata.cpp" />
@ -282,6 +283,7 @@
<ClInclude Include="brkeng.h" />
<ClInclude Include="dictbe.h" />
<ClInclude Include="lstmbe.h" />
<ClInclude Include="mlbe.h" />
<ClInclude Include="rbbidata.h" />
<ClInclude Include="rbbinode.h" />
<ClInclude Include="rbbirb.h" />

View file

@ -76,6 +76,9 @@
<ClCompile Include="lstmbe.cpp">
<Filter>break iteration</Filter>
</ClCompile>
<ClCompile Include="mlbe.cpp">
<Filter>break iteration</Filter>
</ClCompile>
<ClCompile Include="rbbi.cpp">
<Filter>break iteration</Filter>
</ClCompile>
@ -660,6 +663,9 @@
<ClInclude Include="lstmbe.h">
<Filter>break iteration</Filter>
</ClInclude>
<ClInclude Include="mlbe.h">
<Filter>break iteration</Filter>
</ClInclude>
<ClInclude Include="rbbidata.h">
<Filter>break iteration</Filter>
</ClInclude>

View file

@ -222,6 +222,7 @@
<ClCompile Include="brkiter.cpp" />
<ClCompile Include="dictbe.cpp" />
<ClCompile Include="lstmbe.cpp" />
<ClCompile Include="mlbe.cpp" />
<ClCompile Include="pluralmap.cpp" />
<ClCompile Include="rbbi.cpp" />
<ClCompile Include="rbbidata.cpp" />
@ -417,6 +418,7 @@
<ClInclude Include="brkeng.h" />
<ClInclude Include="dictbe.h" />
<ClInclude Include="lstmbe.h" />
<ClInclude Include="mlbe.h" />
<ClInclude Include="rbbidata.h" />
<ClInclude Include="rbbinode.h" />
<ClInclude Include="rbbirb.h" />

View file

@ -1054,9 +1054,10 @@ foundBest:
*/
static const uint32_t kuint32max = 0xFFFFFFFF;
CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
: DictionaryBreakEngine(), fDictionary(adoptDictionary) {
: DictionaryBreakEngine(), fDictionary(adoptDictionary), isCj(false) {
UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Hani");
fMlBreakEngine = nullptr;
nfkcNorm2 = Normalizer2::getNFKCInstance(status);
// Korean dictionary only includes Hangul syllables
fHangulWordSet.applyPattern(UnicodeString(u"[\\uac00-\\ud7a3]"), status);
@ -1073,11 +1074,20 @@ CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType
if (U_SUCCESS(status)) {
setCharacters(fHangulWordSet);
}
} else { //Chinese and Japanese
} else { // Chinese and Japanese
UnicodeSet cjSet(UnicodeString(u"[[:Han:][:Hiragana:][:Katakana:]\\u30fc\\uff70\\uff9e\\uff9f]"), status);
isCj = true;
if (U_SUCCESS(status)) {
setCharacters(cjSet);
#if UCONFIG_USE_ML_PHRASE_BREAKING
fMlBreakEngine = new MlBreakEngine(fDigitOrOpenPunctuationOrAlphabetSet,
fClosePunctuationSet, status);
if (fMlBreakEngine == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR;
}
#else
initJapanesePhraseParameter(status);
#endif
}
}
UTRACE_EXIT_STATUS(status);
@ -1085,6 +1095,7 @@ CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType
/**
 * Destructor. Releases the dictionary matcher handed to the constructor and the
 * ML break engine (which is nullptr unless the constructor created one).
 */
CjkBreakEngine::~CjkBreakEngine()
{
    delete fDictionary;
    delete fMlBreakEngine;
}
// The katakanaCost values below are based on the length frequencies of all
@ -1251,7 +1262,15 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
}
}
}
#if UCONFIG_USE_ML_PHRASE_BREAKING
// PhraseBreaking is supported in ja and ko; MlBreakEngine only supports ja.
if (isPhraseBreaking && isCj) {
return fMlBreakEngine->divideUpRange(inText, rangeStart, rangeEnd, foundBreaks, inString,
inputMap, status);
}
#endif
// bestSnlp[i] is the snlp of the best segmentation of the first i
// code points in the range to be matched.
UVector32 bestSnlp(numCodePts + 1, status);

View file

@ -16,11 +16,13 @@
#include "brkeng.h"
#include "hash.h"
#include "mlbe.h"
#include "uvectr32.h"
U_NAMESPACE_BEGIN
class DictionaryMatcher;
class MlBreakEngine;
class Normalizer2;
/*******************************************************************
@ -374,6 +376,8 @@ class CjkBreakEngine : public DictionaryBreakEngine {
DictionaryMatcher *fDictionary;
const Normalizer2 *nfkcNorm2;
MlBreakEngine *fMlBreakEngine;
bool isCj;
private:
// Load Japanese extensions.

View file

@ -0,0 +1,452 @@
// © 2022 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "cmemory.h"
#include "mlbe.h"
#include "uassert.h"
#include "ubrkimpl.h"
#include "unicode/resbund.h"
#include "unicode/udata.h"
#include "unicode/utf16.h"
#include "uresimp.h"
#include "util.h"
#include "uvectr32.h"
U_NAMESPACE_BEGIN
Element::Element() : length(0) {}
// Stores a code point together with its unicode block string.
// The block string must be at most 3 code units long; it is copied into the
// element's fixed buffer and NUL-terminated.
void Element::setCharAndUblock(UChar32 ch, const UnicodeString &idx) {
    const int32_t blockLength = idx.length();
    U_ASSERT(blockLength <= 3);
    length = static_cast<uint16_t>(blockLength);
    idx.extract(0, length, ublock);
    ublock[length] = u'\0';
    character = ch;
}
// Returns the code point last stored by setCharAndUblock().
UChar32 Element::getCharacter() const
{
    return this->character;
}
// Returns the NUL-terminated unicode block string stored by setCharAndUblock().
// The declared return type is a non-const pointer although this method is const, so
// constness has to be removed; a named cast makes that explicit and greppable.
char16_t* Element::getUblock() const {
    return const_cast<char16_t*>(ublock);
}
// Returns the number of code units in the stored unicode block string.
uint16_t Element::getLength() const
{
    return this->length;
}
// Copies the two character sets used to post-process breakpoints, creates the
// (initially empty) model table, and loads the model unless an error is already set.
MlBreakEngine::MlBreakEngine(const UnicodeSet &digitOrOpenPunctuationOrAlphabetSet,
                             const UnicodeSet &closePunctuationSet, UErrorCode &status)
    : fDigitOrOpenPunctuationOrAlphabetSet(digitOrOpenPunctuationOrAlphabetSet),
      fClosePunctuationSet(closePunctuationSet),
      fModel(status),
      fNegativeSum(0) {
    if (U_SUCCESS(status)) {
        loadMLModel(status);
    }
}
// Nothing to release explicitly: all members clean up after themselves.
MlBreakEngine::~MlBreakEngine()
{
}
namespace {
const char16_t INVALID = u'|';
const int32_t MAX_FEATURE = 26;
const int32_t MAX_FEATURE_LENGTH = 14;
bool isValid(const Element& element) {
return element.getLength() != 1 || element.getUblock()[0] != INVALID;
}
void concatChar(const char16_t *str, const UChar32 *arr, int32_t length, char16_t *feature, UErrorCode &status) {
if (U_FAILURE(status)) {
return;
}
UnicodeString result(str);
for (int i = 0; i < length; i++) {
result.append(arr[i]);
}
U_ASSERT(result.length() < MAX_FEATURE_LENGTH);
result.extract(feature, MAX_FEATURE_LENGTH, status); // NUL-terminates
}
void writeString(const UnicodeString &str, char16_t *feature, UErrorCode &status) {
U_ASSERT(str.length() < MAX_FEATURE_LENGTH);
str.extract(feature, MAX_FEATURE_LENGTH, status); // NUL-terminates
}
}
/**
 * Finds phrase breakpoints in inString with the ML model and appends them to foundBreaks
 * as indexes into inText.
 *
 * @param inText      The original text.
 * @param rangeStart  Start of the range in inText; must be less than rangeEnd.
 * @param rangeEnd    End of the range in inText.
 * @param foundBreaks Receives the breakpoints in ascending order.
 * @param inString    The string to segment, covering rangeStart..rangeEnd.
 * @param inputMap    If valid, maps a code point index of inString to an index of inText;
 *                    otherwise code point index + rangeStart is used.
 * @param status      Set to U_ILLEGAL_ARGUMENT_ERROR for an empty or reversed range.
 * @return The number of breakpoints this call left in foundBreaks.
 */
int32_t MlBreakEngine::divideUpRange(UText *inText, int32_t rangeStart, int32_t rangeEnd,
                                     UVector32 &foundBreaks, const UnicodeString &inString,
                                     const LocalPointer<UVector32> &inputMap,
                                     UErrorCode &status) const {
    if (U_FAILURE(status)) {
        return 0;
    }
    if (rangeStart >= rangeEnd) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    // Counting code points walks the whole string, so do it once rather than per iteration.
    const int32_t length = inString.countChar32();
    const int32_t numCodeUnits = inString.length();

    UVector32 boundary(length + 1, status);
    if (U_FAILURE(status)) {
        return 0;
    }
    int32_t numBreaks = 0;
    // The ML model groups six char to evaluate if the 4th char is a breakpoint.
    // Like a sliding window, the elementList removes the first char and appends the new char from
    // inString in each iteration so that its size always remains at six.
    Element elementList[6];
    int32_t codeUts = initElementList(inString, elementList, status);

    // Add a break for the start.
    boundary.addElement(0, status);
    numBreaks++;
    if (U_FAILURE(status)) return 0;

    for (int32_t i = 1; i < length && U_SUCCESS(status); i++) {
        evaluateBreakpoint(elementList, i, numBreaks, boundary, status);
        if (i + 1 >= length) break;
        // Remove the first element and append a new element
        uprv_memmove(elementList, elementList + 1, 5 * sizeof(Element));
        // codeUts always sits on a code point boundary, so comparing code units is
        // equivalent to comparing code point counts.
        const bool hasNext = codeUts < numCodeUnits;
        const UChar32 ch = hasNext ? inString.char32At(codeUts) : INVALID;
        const UnicodeString index =
            (ch != INVALID) ? getUnicodeBlock(ch, status) : UnicodeString(INVALID);
        elementList[5].setCharAndUblock(ch, index);
        // Advance whenever a character was consumed, even if it happens to equal the
        // INVALID placeholder; otherwise the cursor would never move past it.
        if (hasNext) {
            codeUts += U16_LENGTH(ch);
        }
    }
    if (U_FAILURE(status)) return 0;

    // Add a break for the end if there is not one there already.
    if (boundary.lastElementi() != length) {
        boundary.addElement(length, status);
        numBreaks++;
    }

    int32_t prevCPPos = -1;
    int32_t prevUTextPos = -1;
    int32_t correctedNumBreaks = 0;
    // Iterate over a fixed count: the bound must not shrink when a duplicate is skipped,
    // or the boundaries at the end of the list would never be visited.
    const int32_t boundaryCount = boundary.size();
    for (int32_t i = 0; i < boundaryCount; i++) {
        const int32_t cpPos = boundary.elementAti(i);
        const int32_t utextPos =
            inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
        U_ASSERT(cpPos > prevCPPos);
        U_ASSERT(utextPos >= prevUTextPos);

        if (utextPos > prevUTextPos) {
            // The boundary at rangeStart is only reported when it follows close punctuation.
            if (utextPos != rangeStart ||
                (utextPos > 0 &&
                 fClosePunctuationSet.contains(utext_char32At(inText, utextPos - 1)))) {
                foundBreaks.push(utextPos, status);
                correctedNumBreaks++;
            }
        }
        // Otherwise: normalization expanded the input text and a boundary was found
        // within the expansion, giving two boundaries with the same index in the
        // original text. Ignore the second. See ticket #12918.
        prevCPPos = cpPos;
        prevUTextPos = utextPos;
    }
    (void)prevCPPos; // suppress compiler warnings about unused variable

    UChar32 nextChar = utext_char32At(inText, rangeEnd);
    if (!foundBreaks.isEmpty() && foundBreaks.peeki() == rangeEnd) {
        // In phrase breaking, there has to be a breakpoint between Cj character and
        // the number/open punctuation.
        // E.g. る文字「そうだ、京都」->る▁文字▁「そうだ、▁京都」-> breakpoint between 字 and「
        // E.g. 乗車率90%程度だろうか -> 乗車▁率▁90%▁程度だろうか -> breakpoint between 率 and 9
        // E.g. しかもロゴがUnicode! -> しかも▁ロゴが▁Unicode!-> breakpoint between が and U
        // Any other break at rangeEnd is dropped again.
        if (!fDigitOrOpenPunctuationOrAlphabetSet.contains(nextChar)) {
            foundBreaks.popi();
            correctedNumBreaks--;
        }
    }

    return correctedNumBreaks;
}
/**
 * Scores the candidate breakpoint in front of the 4th element of the six-element window
 * and records `index` in `boundary` when the score is positive.
 *
 * Each feature is a 4-character tag followed by the characters (UW/BW/TW) or the
 * unicode block strings (UB/BB/TB) of one, two or three consecutive window slots.
 * A feature is only built when none of its slots is the INVALID placeholder.
 *
 * @param elementList The six window elements.
 * @param index       The code point index being evaluated.
 * @param numBreaks   Incremented when a breakpoint is recorded.
 * @param boundary    Receives `index` when a breakpoint is recorded.
 * @param status      Information on any errors encountered.
 */
void MlBreakEngine::evaluateBreakpoint(Element* elementList, int32_t index, int32_t &numBreaks,
                                       UVector32 &boundary, UErrorCode &status) const {
    // One feature template: the tag plus the run of window slots [first, first + span).
    struct FeatureSpec {
        const char16_t *tag;
        int32_t first;
        int32_t span;
    };
    // Character features, in the order they are written to the feature list.
    static const FeatureSpec kCharFeatures[] = {
        {u"UW1:", 0, 1}, {u"UW2:", 1, 1}, {u"UW3:", 2, 1},
        {u"UW4:", 3, 1}, {u"UW5:", 4, 1}, {u"UW6:", 5, 1},
        {u"BW1:", 1, 2}, {u"BW2:", 2, 2}, {u"BW3:", 3, 2},
        {u"TW1:", 0, 3}, {u"TW2:", 1, 3}, {u"TW3:", 2, 3}, {u"TW4:", 3, 3},
    };
    // Unicode block features, in the order they are written to the feature list.
    static const FeatureSpec kBlockFeatures[] = {
        {u"UB1:", 0, 1}, {u"UB2:", 1, 1}, {u"UB3:", 2, 1},
        {u"UB4:", 3, 1}, {u"UB5:", 4, 1}, {u"UB6:", 5, 1},
        {u"BB1:", 1, 2}, {u"BB2:", 2, 2}, {u"BB3:", 3, 2},
        {u"TB1:", 0, 3}, {u"TB2:", 1, 3}, {u"TB3:", 2, 3}, {u"TB4:", 3, 3},
    };

    char16_t featureList[MAX_FEATURE][MAX_FEATURE_LENGTH];
    if (U_FAILURE(status)) {
        return;
    }

    // Characters of the six window slots, laid out contiguously so that a feature's
    // slots can be handed to concatChar() as one run.
    UChar32 windowChars[6];
    for (int32_t slot = 0; slot < 6; slot++) {
        windowChars[slot] = elementList[slot].getCharacter();
    }

    int32_t featureCount = 0;

    for (const FeatureSpec &spec : kCharFeatures) {
        bool usable = true;
        for (int32_t k = 0; k < spec.span && usable; k++) {
            usable = windowChars[spec.first + k] != INVALID;
        }
        if (usable) {
            concatChar(spec.tag, windowChars + spec.first, spec.span,
                       featureList[featureCount++], status);
        }
    }

    for (const FeatureSpec &spec : kBlockFeatures) {
        bool usable = true;
        for (int32_t k = 0; k < spec.span && usable; k++) {
            usable = isValid(elementList[spec.first + k]);
        }
        if (usable) {
            UnicodeString feature(spec.tag);
            for (int32_t k = 0; k < spec.span; k++) {
                const Element &element = elementList[spec.first + k];
                feature.append(element.getUblock(), 0, element.getLength());
            }
            writeString(feature, featureList[featureCount++], status);
        }
    }

    if (U_FAILURE(status)) {
        return;
    }

    // Start from the stored negative sum and add twice the score of every feature
    // that is present in the model.
    int32_t total = fNegativeSum;
    for (int32_t f = 0; f < featureCount; f++) {
        const UnicodeString featureKey(featureList[f]);
        if (fModel.containsKey(featureKey)) {
            total += (2 * fModel.geti(featureKey));
        }
    }
    if (total > 0) {
        boundary.addElement(index, status);
        numBreaks++;
    }
}
/**
 * Fills the six-element window for the first breakpoint evaluation.
 *
 * Slots 0 and 1 lie before the start of the text and are set to the INVALID
 * placeholder. Slots 2..5 receive the first four code points of inString; slots for
 * which the string is too short are set to the INVALID placeholder as well, with the
 * placeholder block string (not the block code of the placeholder character), which
 * is the same rule divideUpRange() applies when it refills the window.
 *
 * @param inString    The string to be segmented.
 * @param elementList Array of six elements to initialize.
 * @param status      Information on any errors encountered.
 * @return The number of code units of inString consumed (0 on failure).
 */
int32_t MlBreakEngine::initElementList(const UnicodeString &inString, Element* elementList,
                                       UErrorCode &status) const {
    if (U_FAILURE(status)) {
        return 0;
    }
    const UnicodeString invalidBlock(INVALID);

    elementList[0].setCharAndUblock(INVALID, invalidBlock);
    elementList[1].setCharAndUblock(INVALID, invalidBlock);

    const int32_t numCodeUnits = inString.length();
    int32_t index = 0;
    for (int32_t slot = 2; slot < 6; slot++) {
        if (index >= numCodeUnits) {
            // Past the end of the text.
            elementList[slot].setCharAndUblock(INVALID, invalidBlock);
            continue;
        }
        const UChar32 ch = inString.char32At(index);
        index += U16_LENGTH(ch);
        if (ch != INVALID) {
            elementList[slot].setCharAndUblock(ch, getUnicodeBlock(ch, status));
        } else {
            // A character equal to the placeholder is treated like a missing one.
            elementList[slot].setCharAndUblock(ch, invalidBlock);
        }
    }
    return index;
}
// Returns the UBlockCode of `ch` as a decimal string, zero-padded to at least three
// digits. Returns the one-character INVALID placeholder string when `status` already
// indicates a failure or when the character has no (valid) block.
UnicodeString MlBreakEngine::getUnicodeBlock(UChar32 ch, UErrorCode &status) const {
    if (U_SUCCESS(status)) {
        const UBlockCode block = ublock_getCode(ch);
        const bool hasBlock = block != UBLOCK_NO_BLOCK && block != UBLOCK_INVALID_CODE;
        if (hasBlock) {
            UnicodeString digits;
            // Same as sprintf("%03d", block)
            return ICU_Utility::appendNumber(digits, static_cast<int32_t>(block), 10, 3);
        }
    }
    return UnicodeString(INVALID);
}
/**
 * Loads the model from the "jaml" break-iteration resource into fModel and
 * accumulates the negated sum of all scores in fNegativeSum.
 *
 * BudouX's model consists of pairs of the feature and its score.
 * As integrating it into jaml.txt, modelKeys denotes the ML feature; modelValues means the
 * corresponding feature's score. The two lists are matched up by position.
 *
 * @param error Information on any errors encountered. Set to U_INVALID_FORMAT_ERROR
 *              when there are more keys than values.
 */
void MlBreakEngine::loadMLModel(UErrorCode &error) {
    if (U_FAILURE(error)) return;

    int32_t keySize = 0;
    int32_t valueSize = 0;
    int32_t stringLength = 0;
    UnicodeString key;
    StackUResourceBundle stackTempBundle;
    ResourceDataValue modelKey;

    // The local pointer keeps ownership, so the bundle is closed on every return path;
    // rb is only a borrowed alias.
    LocalUResourceBundlePointer rbp(ures_openDirect(U_ICUDATA_BRKITR, "jaml", &error));
    UResourceBundle* rb = rbp.getAlias();
    if (U_FAILURE(error)) return;

    // get modelValues
    LocalUResourceBundlePointer modelValue(ures_getByKey(rb, "modelValues", nullptr, &error));
    const int32_t* value = ures_getIntVector(modelValue.getAlias(), &valueSize, &error);
    if (U_FAILURE(error)) return;

    // get modelKeys
    ures_getValueWithFallback(rb, "modelKeys", stackTempBundle.getAlias(), modelKey, error);
    ResourceArray stringArray = modelKey.getArray(error);
    keySize = stringArray.getSize();
    if (U_FAILURE(error)) return;

    // Every key needs a score at the same position; refuse to read past the end of
    // the value vector instead of relying on a debug-only assertion.
    if (keySize > valueSize) {
        error = U_INVALID_FORMAT_ERROR;
        return;
    }

    for (int32_t idx = 0; idx < keySize; idx++) {
        stringArray.getValue(idx, modelKey);
        key = UnicodeString(modelKey.getString(stringLength, error));
        if (U_SUCCESS(error)) {
            U_ASSERT(idx < valueSize);
            fNegativeSum -= value[idx];
            fModel.puti(key, value[idx], error);
        }
    }
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

152
icu4c/source/common/mlbe.h Normal file
View file

@ -0,0 +1,152 @@
// © 2022 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
#ifndef MLBREAKENGINE_H
#define MLBREAKENGINE_H
#include "hash.h"
#include "unicode/uniset.h"
#include "unicode/utext.h"
#include "uvectr32.h"
U_NAMESPACE_BEGIN
#if !UCONFIG_NO_BREAK_ITERATION
/**
* A class used to encapsulate a character and its unicode block index
*/
/**
 * One slot of the ML break engine's sliding window: a code point together with its
 * unicode block, kept as a short NUL-terminated string.
 */
class Element : public UMemory {
public:
/**
 * Default constructor. Creates an element whose block string has length 0.
 */
Element();
/**
 * Set the character and its unicode block.
 *
 * @param ch A unicode character.
 * @param ublock The unicode block of the character, as a string of at most 3 code units.
 */
void setCharAndUblock(UChar32 ch, const UnicodeString& ublock);
/**
 * Get the unicode character.
 *
 * @return The unicode character.
 */
UChar32 getCharacter() const;
/**
 * Get the unicode character's unicode block.
 *
 * @return Pointer to the element's internal, NUL-terminated block string.
 */
char16_t* getUblock() const;
/**
 * Get the length of the unicode block.
 *
 * @return The unicode block length in code units, not counting the terminating NUL.
 */
uint16_t getLength() const;
private:
// The code point held by this slot.
UChar32 character;
// Block string: up to 3 code units plus the terminating NUL.
char16_t ublock[4];
// Number of code units used in ublock.
uint16_t length;
};
/**
* A machine learning break engine for the phrase breaking in Japanese.
*/
class MlBreakEngine : public UMemory {
public:
/**
 * Constructor. Copies the two sets and loads the model.
 *
 * @param digitOrOpenPunctuationOrAlphabetSet A UnicodeSet with the digit, open punctuation and
 * alphabet.
 * @param closePunctuationSet A UnicodeSet with close punctuation.
 * @param status Information on any errors encountered.
 */
MlBreakEngine(const UnicodeSet &digitOrOpenPunctuationOrAlphabetSet,
const UnicodeSet &closePunctuationSet, UErrorCode &status);
/**
 * Virtual destructor.
 */
virtual ~MlBreakEngine();
public:
/**
 * Divide up a range of characters handled by this break engine.
 *
 * @param inText A UText representing the text
 * @param rangeStart The start of the range of the characters
 * @param rangeEnd The end of the range of the characters
 * @param foundBreaks Output vector; the break positions found are pushed onto it
 * @param inString The normalized string of text ranging from rangeStart to rangeEnd
 * @param inputMap The vector storing the native index of inText for each code point
 * index of inString; if it holds no vector, code point index + rangeStart is used
 * @param status Information on any errors encountered.
 * @return The number of breaks found
 */
int32_t divideUpRange(UText *inText, int32_t rangeStart, int32_t rangeEnd,
UVector32 &foundBreaks, const UnicodeString &inString,
const LocalPointer<UVector32> &inputMap, UErrorCode &status) const;
private:
/**
 * Load the machine learning's model file.
 *
 * @param error Information on any errors encountered.
 */
void loadMLModel(UErrorCode &error);
/**
 * Get the character's unicode block code defined in UBlockCode.
 *
 * @param ch A character.
 * @param status Information on any errors encountered.
 * @return The unicode block code which is 3 digits with '0' added in the beginning if the code
 * is less than 3 digits. A one-character placeholder string is returned instead when
 * status already indicates a failure or the character has no valid block.
 *
 */
UnicodeString getUnicodeBlock(UChar32 ch, UErrorCode &status) const;
/**
 * Initialize the element list from the input string.
 *
 * @param inString A input string to be segmented.
 * @param elementList A list of six elements: two leading placeholders for the positions
 * before the start of the text, followed by up to the first four characters of
 * inString and their unicode block codes.
 * @param status Information on any errors encountered.
 * @return The number of code units of inString consumed for those characters.
 */
int32_t initElementList(const UnicodeString &inString, Element* elementList,
UErrorCode &status) const;
/**
 * Evaluate whether the index is a potential breakpoint.
 *
 * @param elementList A list including 6 elements for the breakpoint evaluation.
 * @param index The breakpoint index to be evaluated.
 * @param numBreaks The accumulated number of breakpoints.
 * @param boundary A vector including the index of the breakpoint.
 * @param status Information on any errors encountered.
 */
void evaluateBreakpoint(Element* elementList, int32_t index, int32_t &numBreaks,
UVector32 &boundary, UErrorCode &status) const;
// Copies of the sets passed to the constructor.
UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet;
UnicodeSet fClosePunctuationSet;
// The model: maps a feature string to its integer score.
Hashtable fModel;
// Negated sum of all model scores; the starting value of every breakpoint score.
int32_t fNegativeSum;
};
#endif
U_NAMESPACE_END
/* MLBREAKENGINE_H */
#endif

View file

@ -43,6 +43,7 @@ locutil.cpp
lsr.cpp
lstmbe.cpp
messagepattern.cpp
mlbe.cpp
normalizer2.cpp
normalizer2impl.cpp
normlzr.cpp

View file

@ -323,6 +323,16 @@
# define UCONFIG_NO_NORMALIZATION 0
#endif
/**
* \def UCONFIG_USE_ML_PHRASE_BREAKING
* This switch turns on BudouX ML phrase-based line breaking, rather than using the dictionary.
*
* @internal
*/
#ifndef UCONFIG_USE_ML_PHRASE_BREAKING
# define UCONFIG_USE_ML_PHRASE_BREAKING 0
#endif
#if UCONFIG_NO_NORMALIZATION
/* common library */
/* ICU 50 CJK dictionary BreakIterator uses normalization */

View file

@ -27,6 +27,7 @@ def generate(config, io, common_vars):
requests += generate_conversion_mappings(config, io, common_vars)
requests += generate_brkitr_brk(config, io, common_vars)
requests += generate_brkitr_lstm(config, io, common_vars)
requests += generate_brkitr_adaboost(config, io, common_vars)
requests += generate_stringprep(config, io, common_vars)
requests += generate_brkitr_dictionaries(config, io, common_vars)
requests += generate_normalization(config, io, common_vars)
@ -184,7 +185,7 @@ def generate_brkitr_brk(config, io, common_vars):
category = "brkitr_rules",
dep_targets =
[DepTarget("cnvalias"),
DepTarget("ulayout"), DepTarget("uemoji"), DepTarget("lstm_res")],
DepTarget("ulayout"), DepTarget("uemoji"), DepTarget("lstm_res"), DepTarget("adaboost_res")],
input_files = input_files,
output_files = output_files,
tool = IcuTool("genbrk"),
@ -506,6 +507,32 @@ def generate_brkitr_lstm(config, io, common_vars):
)
]
def generate_brkitr_adaboost(config, io, common_vars):
    """Build the genrb request that compiles every brkitr/adaboost/*.txt model.

    Each input file brkitr/adaboost/<name>.txt produces brkitr/<name>.res.
    Returns a one-element list holding the request named "adaboost_res".
    """
    source_dir = "brkitr/adaboost/"
    input_files = [InFile(path) for path in io.glob(source_dir + "*.txt")]

    # File names relative to the source directory, e.g. "<name>.txt".
    input_basenames = []
    for in_file in input_files:
        input_basenames.append(in_file.filename[len(source_dir):])

    # Same names with the ".txt" suffix replaced by ".res", under brkitr/.
    output_files = []
    for basename in input_basenames:
        stem = basename[:-4]
        output_files.append(OutFile("brkitr/%s.res" % stem))

    request = RepeatedOrSingleExecutionRequest(
        name = "adaboost_res",
        category = "brkitr_adaboost",
        dep_targets = [],
        input_files = input_files,
        output_files = output_files,
        tool = IcuTool("genrb"),
        args = "-s {IN_DIR}/brkitr/adaboost -d {OUT_DIR}/brkitr -i {OUT_DIR} "
            "-k "
            "{INPUT_BASENAME}",
        format_with = {},
        repeat_with = {
            "INPUT_BASENAME": utils.SpaceSeparatedList(input_basenames)
        }
    )
    return [request]
def generate_tree(
config,
io,

View file

@ -0,0 +1,940 @@
// © 2022 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
jaml {
modelKeys {
"BB2:062071",
"UB3:061",
"UB3:071",
"TB2:062062062",
"TB4:062062062",
"UB3:063",
"UB4:071",
"BB3:062062",
"UB4:062",
"BB1:062071",
"BB1:062061",
"UB4:061",
"TB1:071071062",
"TB3:062063063",
"UB2:061",
"TB1:062071062",
"TB3:062062062",
"BB2:063063",
"UW3:は",
"UW3:に",
"TB3:062071062",
"UW3:が",
"UW4:こ",
"UB5:061",
"UW3:と",
"TB4:063063063",
"UW4:て",
"TB2:062062061",
"UW3:。",
"UW4:お",
"UW3:の",
"BB3:071071",
"BB3:062071",
"UW3:お",
"UW3:し",
"UW4:、",
"UW4:の",
"UW3:を",
"UW4:。",
"UW3:、",
"UW5:で",
"UW4:あ",
"BB2:062062",
"UW4:っ",
"UW5:っ",
"UW3:も",
"UW5:う",
"UW3:「",
"UW5:な",
"UW4:そ",
"UW4:る",
"UW3:っ",
"UW4:「",
"UW4:い",
"BB2:087087",
"UB4:087",
"UW5:に",
"BW3:もの",
"UW5:し",
"UW6:う",
"BW2:とい",
"UW4:に",
"UW3:る",
"TB2:071062071",
"UW4:で",
"UW5:が",
"BB1:071071",
"UW5:は",
"UW4:は",
"UW4:れ",
"UW5:き",
"BB2:071062",
"BB2:071071",
"UW3:・",
"BB2:071087",
"BB2:061062",
"TB1:062061062",
"UW3:れ",
"BB2:087062",
"TB2:087087087",
"UW4:ら",
"TB1:071071071",
"UB2:071",
"TB1:062062087",
"UW5:す",
"UW5:ん",
"UW3:で",
"UW4:が",
"UW3:こ",
"TB4:071062062",
"UW3:ら",
"UW6:に",
"UW6:。",
"UW3:た",
"TB1:061071071",
"UW5:く",
"UB1:063",
"UW1:そ",
"UW3:う",
"BW3:とい",
"BW3:とこ",
"UW3:ま",
"BW3:こと",
"UW2:っ",
"UW5:・",
"TB3:062062061",
"UW3:き",
"UW4:ん",
"UB3:062",
"UW3:く",
"UW3:」",
"UW5:あ",
"BB2:062087",
"BW3:いう",
"UW5:れ",
"UW2:一",
"UW3:",
"UW1:に",
"UW2:と",
"TB2:071071062",
"TB2:071071071",
"UW5:を",
"UW4:り",
"BW1:から",
"UW3:ち",
"BW3:いい",
"UW2:は",
"UW6:た",
"TB1:063063062",
"UW4:",
"UW4:や",
"UW2:ん",
"UW3:",
"UW4:ほ",
"TB3:062087087",
"BW2:であ",
"UW4:だ",
"BB3:071062",
"TB1:087087087",
"BW3:・・",
"BW3:とき",
"UW4:を",
"UW3:て",
"UW4:か",
"UW2:そ",
"TB4:071071062",
"TB2:062061071",
"UW2:を",
"UW4:ご",
"UW2:で",
"TB3:071071071",
"BB1:087087",
"UW2:し",
"UW4:出",
"UW2:ま",
"UW4:",
"UW5:と",
"UW4:ど",
"BW3:して",
"UW1:で",
"BB2:061071",
"BW3:ため",
"BW2:とし",
"BW2:ない",
"BW2:てい",
"UW3:間",
"UW3:",
"UW5:ー",
"UW4:す",
"UW4:",
"BW1:とが",
"UW5:の",
"TB4:062062071",
"TB2:061071071",
"UW6:・",
"UW3:",
"UW2:て",
"UW3:笑",
"UW2:こ",
"UW5:も",
"BW3:よう",
"UW3:人",
"UW2:の",
"UW3:か",
"UW3:日",
"UW1:い",
"BW2:とこ",
"UW4:私",
"UW3:…",
"UW2:に",
"UW3:今",
"BB3:087062",
"UB3:055",
"UW4:",
"BB1:087071",
"UW1:な",
"BB3:063063",
"UW5:来",
"UW3:",
"TW3:ている",
"UW4:」",
"UW4:前",
"BW1:いう",
"UW4:つ",
"UW3:",
"BW1:では",
"UW2:る",
"UW5:そ",
"UW4:ー",
"TW2:気に入",
"UW4:笑",
"UW4:ひ",
"TB4:087087087",
"UW4:け",
"UW2:も",
"BW3:ちょ",
"BW3:出来",
"TB2:062071062",
"UW4:『",
"UW3:",
"UW4:",
"UW5:つ",
"TB1:061071062",
"UW3:",
"BW3:から",
"UB5:071",
"UW4:ま",
"UW3:ば",
"UW3:り",
"BW3:その",
"UW3:ご",
"UW4:わ",
"BW2:てお",
"TB2:071062062",
"BW1:ない",
"UW2:よ",
"UB2:087",
"UW6:の",
"UW2:毎",
"UW2:結",
"TW4:の京都",
"UW3:さ",
"UW2:最",
"BW2:です",
"UW2:」",
"UW5:え",
"UW3:だ",
"TW4:ところ",
"UW4:",
"UB1:062",
"UW6:て",
"UW1:が",
"BW2:、と",
"UW3:",
"UW3:ん",
"UW3:中",
"UW4:よ",
"BW3:この",
"UW2:が",
"UW3:み",
"TW2:ではな",
"UW6:と",
"UW4:",
"TW3:、ある",
"BW3:ころ",
"UW4:",
"UW6:、",
"UW4:電",
"BB1:062040",
"UW3:後",
"UW5:い",
"UW2:、",
"UW5:て",
"BB2:062040",
"UW3:真",
"UW3:そ",
"UW5:さ",
"UB5:087",
"TW3:という",
"UW3:分",
"UB6:071",
"BW3:なっ",
"UW4:ろ",
"BB2:061061",
"TW3:ところ",
"UB1:071",
"UW1:、",
"BW1:とか",
"UW3:な",
"UW6:り",
"UW4:間",
"UW3:べ",
"UW5:べ",
"TB4:062071062",
"UW4:",
"BW2:には",
"UW5:々",
"BW1:。・",
"BW1:その",
"UW1:す",
"UW4:",
"UW6:っ",
"TB3:063063063",
"TB3:062071071",
"UB5:063",
"BW1:かも",
"UW6:る",
"TB4:062063063",
"UW3:ど",
"TW3:である",
"TW4:くらい",
"BW1:最近",
"BW1:しい",
"BW1:とも",
"BW2:と同",
"TW1:という",
"UW2:さ",
"BW2:帯電",
"TB1:071062062",
"BW3:そし",
"UW2:。",
"UW5:か",
"UW5:こ",
"BW3:ない",
"BW1:んな",
"BW2:でき",
"UW4:",
"UW3:け",
"TW4:ことが",
"BW1:こと",
"UB3:087",
"UW3:電",
"UW3:よ",
"BW1:たと",
"UW5:ま",
"UW5:た",
"UW5:ち",
"UW2:け",
"UW5:だ",
"UW3:度",
"BW1:たい",
"UW4:使",
"UW2:き",
"TW4:かなり",
"UB6:063",
"BB1:062062",
"UW4:込",
"TW3:と言っ",
"UW6:だ",
"UW5:り",
"UW5:よ",
"BW3:どう",
"UW4:…",
"UW3:や",
"BW1:かし",
"BW3:かっ",
"UW4:今",
"UW3:『",
"UW4:思",
"UB2:063",
"UW4:く",
"UW3:京",
"UW6:ー",
"UW1:ん",
"BW1:うな",
"TB2:062061061",
"UW1:と",
"TB4:062063062",
"TB2:061062062",
"BW1:この",
"BW2:ので",
"UW4:み",
"UW5:わ",
"UW6:や",
"BW1:れて",
"UW2:や",
"UW6:こ",
"UW4:な",
"UW5:め",
"BW1:もう",
"TB4:071062071",
"BW1:より",
"UW4:合",
"UW6:け",
"BW1:少し",
"BW2:でし",
"UW4:と",
"TB1:063063063",
"UW3:ー",
"BW2:くな",
"UW2:く",
"UW2:我",
"BW2:いも",
"BW3:わか",
"TB2:071063071",
"UW4:も",
"UW1:あ",
"UW4:最",
"BW1:るの",
"UW2:全",
"UW6:",
"UW4:放",
"UW4:京",
"BW3:かけ",
"UW2:少",
"BW3:もう",
"UW2:多",
"UW2:う",
"TB1:062062040",
"UW1:を",
"UW3:光",
"BW1:",
"UW2:ャ",
"BW3:すぐ",
"UW4:帯",
"UW6:し",
"BW3:でも",
"BW2:、そ",
"TB3:071087087",
"TB2:063062071",
"UW3:わ",
"UB4:063",
"TB4:071071071",
"UW5:都",
"UW5:ず",
"UW2:バ",
"UW2:京",
"UW3:ゃ",
"BW1:い、",
"BW3:よく",
"BW1:たら",
"BW2:のよ",
"UW2:思",
"BW1:うに",
"BW1:の間",
"UW6:ん",
"UW6:ず",
"BW1:った",
"TW3:ること",
"BW3:とて",
"TW1:ような",
"UW6:ぱ",
"TB3:063071062",
"TW4:って、",
"TW4:なんて",
"TW2:その後",
"UW6:ら",
"TW4:ことに",
"UW3:",
"TW3:てしま",
"UW3:い",
"TB4:071062061",
"UW2:ひ",
"UW6:め",
"UW6:で",
"BW3:なる",
"UW5:ご",
"BW2:りし",
"UW6:電",
"UW1:は",
"BW1:いも",
"BW3:すご",
"UW4:通",
"BW3:おり",
"BW3:かか",
"BW1:思い",
}
modelValues:intvector {
1800,
271,
-857,
-417,
285,
-583,
388,
828,
-853,
-820,
502,
-708,
358,
1341,
-586,
-451,
257,
-1876,
2052,
1698,
-458,
2048,
1182,
-551,
980,
773,
-1453,
-152,
3201,
2865,
1203,
144,
-369,
-2539,
-613,
-3574,
-1111,
3110,
-3022,
2039,
-1091,
1241,
-560,
-1412,
625,
1350,
297,
-2404,
-595,
1007,
-1829,
-1662,
3213,
270,
-911,
178,
-727,
2716,
-484,
-344,
929,
-1236,
760,
-299,
-419,
-728,
122,
-704,
-605,
-1507,
545,
-68,
-320,
1498,
953,
-323,
-575,
-673,
520,
-450,
-1767,
-247,
56,
231,
-764,
536,
794,
-703,
-566,
51,
390,
52,
-182,
466,
133,
354,
107,
492,
488,
-1194,
1145,
-847,
812,
151,
-517,
-314,
-553,
-783,
-117,
736,
-88,
-598,
569,
606,
287,
744,
1739,
-217,
-219,
-144,
234,
-649,
-757,
834,
-819,
869,
-275,
-267,
154,
653,
594,
255,
1018,
1124,
284,
-1624,
-372,
440,
-184,
-1936,
1318,
-1124,
453,
-92,
-343,
175,
182,
-886,
930,
-223,
-57,
-113,
103,
-200,
510,
-2099,
-498,
385,
80,
-156,
360,
1289,
771,
-1114,
-399,
870,
1230,
79,
472,
-1596,
-1092,
-572,
55,
-151,
-124,
1316,
-248,
1280,
-125,
-284,
-1023,
862,
84,
417,
568,
-88,
-528,
910,
674,
-212,
894,
-121,
1108,
762,
260,
-197,
91,
-53,
1117,
-645,
-868,
-611,
220,
422,
1431,
-532,
-157,
-476,
-846,
-1309,
-1614,
1225,
302,
-738,
-260,
892,
-778,
-193,
1221,
-779,
489,
420,
-85,
-525,
-830,
26,
270,
439,
-120,
1263,
-795,
291,
-1310,
-23,
347,
312,
-107,
-114,
701,
830,
1309,
-451,
260,
-1080,
536,
188,
-60,
643,
-1184,
31,
-194,
-51,
-514,
-442,
-120,
649,
410,
882,
-75,
-341,
-718,
-128,
340,
-1245,
-164,
-1052,
70,
-256,
279,
786,
40,
-177,
97,
-411,
222,
-89,
-277,
-146,
414,
483,
21,
-339,
-406,
-360,
-450,
-14,
-36,
513,
252,
54,
-501,
-478,
450,
-36,
-644,
-392,
714,
643,
-341,
91,
-1018,
34,
-177,
123,
80,
-695,
-44,
-357,
253,
-389,
613,
515,
418,
-396,
-553,
193,
298,
-334,
-57,
-315,
-77,
33,
88,
137,
280,
-448,
196,
-136,
-295,
-329,
-92,
-360,
-132,
-288,
-45,
-43,
174,
75,
-60,
330,
360,
217,
130,
473,
-41,
-23,
-340,
-530,
-69,
-71,
-115,
297,
-240,
229,
507,
-348,
171,
-320,
239,
16,
-195,
-277,
-41,
69,
280,
-264,
30,
249,
-97,
-163,
-221,
96,
83,
82,
-218,
-93,
-53,
40,
28,
285,
27,
283,
-211,
-92,
214,
-225,
-54,
53,
105,
-198,
-53,
-277,
198,
184,
-264,
-106,
14,
185,
-155,
185,
106,
-119,
53,
208,
92,
262,
106,
-52,
105,
-25,
-79,
104,
141,
129,
-114,
26,
64,
-113,
26,
77,
-64,
13,
13,
26,
89,
115,
-49,
89,
-114,
51,
64,
-64,
-51,
-38,
89,
13,
-64,
13,
-48,
76,
63,
62,
13,
112,
-76,
-50,
-13,
-49,
63,
-50,
13,
13,
-50,
24,
-12,
24,
12,
24,
12,
-12,
-24,
12,
-12,
-12,
12,
-12,
}
}

View file

@ -273,8 +273,8 @@ def _preprocess_file_filters(requests, config, io):
default_filter_json = "exclude" if config.strategy == "additive" else "include"
for category in all_categories:
filter_json = default_filter_json
# Special default for category "brkitr_lstm" as "exclude" for now.
if "brkitr_lstm" == category:
# Special default of "exclude" for the categories "brkitr_lstm" and "brkitr_adaboost" for now.
if "brkitr_lstm" == category or "brkitr_adaboost" == category:
filter_json = "exclude"
# Figure out the correct filter to create for now.
if "featureFilters" in json_data and category in json_data["featureFilters"]:

View file

@ -211,7 +211,7 @@ group: breakiterator
brkiter.o brkeng.o ubrk.o
rbbi.o rbbinode.o rbbiscan.o rbbisetb.o rbbistbl.o rbbitblb.o
rbbidata.o rbbirb.o rbbi_cache.o
dictionarydata.o dictbe.o lstmbe.o
dictionarydata.o dictbe.o lstmbe.o mlbe.o
# BreakIterator::makeInstance() factory implementation makes for circular dependency
# between BreakIterator base and FilteredBreakIteratorBuilder.
filteredbrk.o

View file

@ -42,6 +42,7 @@
#include "charstr.h"
#include "cmemory.h"
#include "cstr.h"
#include "cstring.h"
#include "intltest.h"
#include "lstmbe.h"
#include "rbbitst.h"
@ -835,9 +836,28 @@ void RBBITest::TestExtended() {
delete tp.bi;
tp.bi = BreakIterator::createLineInstance(locale, status);
skipTest = false;
#if UCONFIG_USE_ML_PHRASE_BREAKING
if(uprv_strcmp(locale.getName(), "ja@lw=phrase") == 0) {
// skip <line> test cases of JP's phrase breaking when ML is enabled.
skipTest = true;
}
#endif
charIdx += 5;
break;
}
if (testString.compare(charIdx-1, 8, u"<lineML>") == 0) {
delete tp.bi;
tp.bi = BreakIterator::createLineInstance(locale, status);
skipTest = false;
#if !UCONFIG_USE_ML_PHRASE_BREAKING
if(uprv_strcmp(locale.getName(), "ja@lw=phrase") == 0) {
// skip <lineML> test cases of JP's phrase breaking when ML is disabled.
skipTest = true;
}
#endif
charIdx += 7;
break;
}
if (testString.compare(charIdx-1, 6, u"<sent>") == 0) {
delete tp.bi;
tp.bi = BreakIterator::createSentenceInstance(locale, status);

View file

@ -1913,6 +1913,26 @@ Bangkok)•</data>
<data>•\U0001F469\u200D\U0001F680•\U0001F469\U0001F3FD\u200D\U0001F680\u0020•</data>
<locale ja@lw=phrase>
#phrase breaking test cases for the ML solution
<lineML>
#9月に東京から友達が遊びに来た -> 9月に•東京から•友達が•遊びに•来た•
<data>•\uff19\u6708\u306b•\u6771\u4eac\u304b\u3089•\u53cb\u9054\u304c•\u904a\u3073\u306b•\u6765\u305f•</data>
#る文字「そうだ、京都」-> る•文字•「そうだ、•京都」•
<data>•\u308b•\u6587\u5b57•\u300c\u305d\u3046\u3060\u3001•\u4eac\u90fd\u300d•</data>
#Kana supplement: 𛁈(U+1B048) -> \uD82C\uDC48, 𛀸(U+1B038) -> \uD82C\uDC38, 𛀙(U+1B019) -> \uD82C\uDC19
#𛁈る𛀸(しるこ)、あ𛀙よろし(あかよろし) -> 𛁈る𛀸•(しるこ)、•あ𛀙よろし•(あかよろし)
<data>•\U0001B048\u308B\U0001B038•\uFF08\u3057\u308B\u3053\uFF09\u3001•\u3042\U0001B019\u3088\u308D\u3057•\uFF08\u3042\u304B\u3088\u308D\u3057\uFF09•</data>
#中国の携帯は約500元から5000元です -> 中国の▁携帯は▁約▁500元から▁5000元です
<data>•\u4E2D\u56FD\u306E•\u643A\u5E2F\u306F•\u7D04•\uFF15\uFF10\uFF10\u5143\u304B\u3089•\uFF15\uFF10\uFF10\uFF10\u5143\u3067\u3059•</data>
#しかもロゴがUnicode!! -> しかも▁ロゴが▁Unicode!!
<data>•\u3057\u304B\u3082•\u30ED\u30B4\u304C•\uFF35\uFF4E\uFF49\uFF43\uFF4F\uFF44\uFF45\uFF01\uFF01•</data>
#バッテリーを長持ちさせ、充電を最適化します -> バッテリーを▁長持ちさせ、▁充電を▁最適化します
<data>•\u30D0\u30C3\u30C6\u30EA\u30FC\u3092•\u9577\u6301\u3061\u3055\u305B\u3001•\u5145\u96FB\u3092•\u6700\u9069\u5316\u3057\u307E\u3059•</data>
#データのコピー、スマートフォンでのお支払いなど -> データの▁コピー、▁スマートフォンでの▁お支払いなど
<data>•\u30C7\u30FC\u30BF\u306E•\u30B3\u30D4\u30FC\u3001•\u30B9\u30DE\u30FC\u30C8\u30D5\u30A9\u30F3\u3067\u306E•\u304A\u652F\u6255\u3044\u306A\u3069•</data>
<locale ja@lw=phrase>
#phrase breaking test cases for the dictionary based solution
<line>
#[京都観光]時雨殿に行った。-> [京都•観光]•時雨•殿に•行った。•
<data>•\uff3b\u4eac\u90fd•\u89b3\u5149\uff3d•\u6642\u96e8•\u6bbf\u306b•\u884c\u3063\u305f\u3002•</data>
@ -2005,8 +2025,8 @@ Bangkok)•</data>
#大韓民國은 民主共和國이다
#<data>•大韓民國은 •民主•共和國이다•</data>
# All the tests for ja@lw=phrase should also work in Korean.
#[京都観光]時雨殿に行った。-> [京都•観光]•時雨•殿に•行った。
<data>•\uff3b\u4eac\u90fd•\u89b3\u5149\uff3d•\u6642\u96e8•\u6bbf\u306b•\u884c\u3063\u305f\u3002•</data>
#る文字「そうだ、京都」-> る•文字•「そうだ、•京都」
<data>•\u308b•\u6587\u5b57•\u300c\u305d\u3046\u3060\u3001•\u4eac\u90fd\u300d•</data>
#9月に東京から友達が遊びに来た -> 9月に•東京から•友達が•遊びに•来た•
<data>•\uff19\u6708\u306b•\u6771\u4eac\u304b\u3089•\u53cb\u9054\u304c•\u904a\u3073\u306b•\u6765\u305f•</data>