mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-10 15:42:14 +00:00
ICU-22100 Incorporate BudouX into ICU (C++)
This commit is contained in:
parent
d02b30fc3f
commit
b6b7b045e9
18 changed files with 1690 additions and 9 deletions
14
.github/adaboost.json
vendored
Normal file
14
.github/adaboost.json
vendored
Normal file
|
@ -0,0 +1,14 @@
|
|||
// © 2022 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
// Generated using tools/cldr/cldr-to-icu/build-icu-data.xml
|
||||
//
|
||||
// Include Japanese adaboost model.
|
||||
{
|
||||
"featureFilters": {
|
||||
"brkitr_adaboost": {
|
||||
"includelist": [
|
||||
"jaml"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
11
.github/workflows/icu_ci.yml
vendored
11
.github/workflows/icu_ci.yml
vendored
|
@ -334,6 +334,17 @@ jobs:
|
|||
make clean;
|
||||
make -j2 check
|
||||
|
||||
# Test adaboost
|
||||
adaboost-test:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- run: |
|
||||
cd icu4c/source;
|
||||
ICU_DATA_FILTER_FILE=../../.github/adaboost.json CPPFLAGS=-DUCONFIG_USE_ML_PHRASE_BREAKING=1 ./runConfigureICU --enable-debug --disable-release Linux -disable-layoutex;
|
||||
make clean;
|
||||
make -j2 check
|
||||
|
||||
# Build and run testmap
|
||||
testmap:
|
||||
runs-on: ubuntu-latest
|
||||
|
|
|
@ -342,6 +342,7 @@ cc_library(
|
|||
"dictionarydata.cpp",
|
||||
"filteredbrk.cpp",
|
||||
"lstmbe.cpp",
|
||||
"mlbe.cpp",
|
||||
"rbbi.cpp",
|
||||
"rbbi_cache.cpp",
|
||||
"rbbidata.cpp",
|
||||
|
|
|
@ -88,6 +88,7 @@
|
|||
<ClCompile Include="brkiter.cpp" />
|
||||
<ClCompile Include="dictbe.cpp" />
|
||||
<ClCompile Include="lstmbe.cpp" />
|
||||
<ClCompile Include="mlbe.cpp" />
|
||||
<ClCompile Include="pluralmap.cpp" />
|
||||
<ClCompile Include="rbbi.cpp" />
|
||||
<ClCompile Include="rbbidata.cpp" />
|
||||
|
@ -282,6 +283,7 @@
|
|||
<ClInclude Include="brkeng.h" />
|
||||
<ClInclude Include="dictbe.h" />
|
||||
<ClInclude Include="lstmbe.h" />
|
||||
<ClInclude Include="mlbe.h" />
|
||||
<ClInclude Include="rbbidata.h" />
|
||||
<ClInclude Include="rbbinode.h" />
|
||||
<ClInclude Include="rbbirb.h" />
|
||||
|
|
|
@ -76,6 +76,9 @@
|
|||
<ClCompile Include="lstmbe.cpp">
|
||||
<Filter>break iteration</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="mlbe.cpp">
|
||||
<Filter>break iteration</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="rbbi.cpp">
|
||||
<Filter>break iteration</Filter>
|
||||
</ClCompile>
|
||||
|
@ -660,6 +663,9 @@
|
|||
<ClInclude Include="lstmbe.h">
|
||||
<Filter>break iteration</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="mlbe.h">
|
||||
<Filter>break iteration</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="rbbidata.h">
|
||||
<Filter>break iteration</Filter>
|
||||
</ClInclude>
|
||||
|
|
|
@ -222,6 +222,7 @@
|
|||
<ClCompile Include="brkiter.cpp" />
|
||||
<ClCompile Include="dictbe.cpp" />
|
||||
<ClCompile Include="lstmbe.cpp" />
|
||||
<ClCompile Include="mlbe.cpp" />
|
||||
<ClCompile Include="pluralmap.cpp" />
|
||||
<ClCompile Include="rbbi.cpp" />
|
||||
<ClCompile Include="rbbidata.cpp" />
|
||||
|
@ -417,6 +418,7 @@
|
|||
<ClInclude Include="brkeng.h" />
|
||||
<ClInclude Include="dictbe.h" />
|
||||
<ClInclude Include="lstmbe.h" />
|
||||
<ClInclude Include="mlbe.h" />
|
||||
<ClInclude Include="rbbidata.h" />
|
||||
<ClInclude Include="rbbinode.h" />
|
||||
<ClInclude Include="rbbirb.h" />
|
||||
|
|
|
@ -1054,9 +1054,10 @@ foundBest:
|
|||
*/
|
||||
static const uint32_t kuint32max = 0xFFFFFFFF;
|
||||
CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
|
||||
: DictionaryBreakEngine(), fDictionary(adoptDictionary) {
|
||||
: DictionaryBreakEngine(), fDictionary(adoptDictionary), isCj(false) {
|
||||
UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
|
||||
UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Hani");
|
||||
fMlBreakEngine = nullptr;
|
||||
nfkcNorm2 = Normalizer2::getNFKCInstance(status);
|
||||
// Korean dictionary only includes Hangul syllables
|
||||
fHangulWordSet.applyPattern(UnicodeString(u"[\\uac00-\\ud7a3]"), status);
|
||||
|
@ -1073,11 +1074,20 @@ CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType
|
|||
if (U_SUCCESS(status)) {
|
||||
setCharacters(fHangulWordSet);
|
||||
}
|
||||
} else { //Chinese and Japanese
|
||||
} else { // Chinese and Japanese
|
||||
UnicodeSet cjSet(UnicodeString(u"[[:Han:][:Hiragana:][:Katakana:]\\u30fc\\uff70\\uff9e\\uff9f]"), status);
|
||||
isCj = true;
|
||||
if (U_SUCCESS(status)) {
|
||||
setCharacters(cjSet);
|
||||
#if UCONFIG_USE_ML_PHRASE_BREAKING
|
||||
fMlBreakEngine = new MlBreakEngine(fDigitOrOpenPunctuationOrAlphabetSet,
|
||||
fClosePunctuationSet, status);
|
||||
if (fMlBreakEngine == nullptr) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
}
|
||||
#else
|
||||
initJapanesePhraseParameter(status);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
UTRACE_EXIT_STATUS(status);
|
||||
|
@ -1085,6 +1095,7 @@ CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType
|
|||
|
||||
CjkBreakEngine::~CjkBreakEngine(){
|
||||
delete fDictionary;
|
||||
delete fMlBreakEngine;
|
||||
}
|
||||
|
||||
// The katakanaCost values below are based on the length frequencies of all
|
||||
|
@ -1251,7 +1262,15 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#if UCONFIG_USE_ML_PHRASE_BREAKING
|
||||
// PhraseBreaking is supported in ja and ko; MlBreakEngine only supports ja.
|
||||
if (isPhraseBreaking && isCj) {
|
||||
return fMlBreakEngine->divideUpRange(inText, rangeStart, rangeEnd, foundBreaks, inString,
|
||||
inputMap, status);
|
||||
}
|
||||
#endif
|
||||
|
||||
// bestSnlp[i] is the snlp of the best segmentation of the first i
|
||||
// code points in the range to be matched.
|
||||
UVector32 bestSnlp(numCodePts + 1, status);
|
||||
|
|
|
@ -16,11 +16,13 @@
|
|||
|
||||
#include "brkeng.h"
|
||||
#include "hash.h"
|
||||
#include "mlbe.h"
|
||||
#include "uvectr32.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class DictionaryMatcher;
|
||||
class MlBreakEngine;
|
||||
class Normalizer2;
|
||||
|
||||
/*******************************************************************
|
||||
|
@ -374,6 +376,8 @@ class CjkBreakEngine : public DictionaryBreakEngine {
|
|||
|
||||
DictionaryMatcher *fDictionary;
|
||||
const Normalizer2 *nfkcNorm2;
|
||||
MlBreakEngine *fMlBreakEngine;
|
||||
bool isCj;
|
||||
|
||||
private:
|
||||
// Load Japanese extensions.
|
||||
|
|
452
icu4c/source/common/mlbe.cpp
Normal file
452
icu4c/source/common/mlbe.cpp
Normal file
|
@ -0,0 +1,452 @@
|
|||
// © 2022 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
#include "cmemory.h"
|
||||
#include "mlbe.h"
|
||||
#include "uassert.h"
|
||||
#include "ubrkimpl.h"
|
||||
#include "unicode/resbund.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "unicode/utf16.h"
|
||||
#include "uresimp.h"
|
||||
#include "util.h"
|
||||
#include "uvectr32.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/**
 * Default constructor: produces an empty element.
 *
 * All members are given defined values so that an Element which has not yet
 * been filled by setCharAndUblock() can still be copied or inspected safely.
 */
Element::Element() : character(0), length(0) {
    uprv_memset(ublock, 0, sizeof(ublock));
}
|
||||
|
||||
/**
 * Store a character together with the textual form of its Unicode block.
 *
 * @param ch  The code point to store.
 * @param idx The block text for ch; expected to be at most 3 code units.
 */
void Element::setCharAndUblock(UChar32 ch, const UnicodeString &idx) {
    character = ch;
    U_ASSERT(idx.length() <= 3);
    // ublock has room for 3 code units plus the terminating NUL. Clamp the
    // copy so that an over-long idx cannot write past the buffer in builds
    // where U_ASSERT does not stop execution.
    const int32_t capacity = UPRV_LENGTHOF(ublock) - 1;
    const int32_t copied = idx.length() < capacity ? idx.length() : capacity;
    length = static_cast<uint16_t>(copied);
    idx.extract(0, copied, ublock);
    ublock[copied] = u'\0';
}
|
||||
|
||||
UChar32 Element::getCharacter() const {
|
||||
return character;
|
||||
}
|
||||
|
||||
char16_t* Element::getUblock() const {
|
||||
return (char16_t*)ublock;
|
||||
}
|
||||
|
||||
uint16_t Element::getLength() const {
|
||||
return length;
|
||||
}
|
||||
|
||||
/**
 * Copies the two character sets supplied by the caller, creates the (empty)
 * model table and, provided nothing has failed so far, fills the table from
 * the resource data via loadMLModel().
 */
MlBreakEngine::MlBreakEngine(const UnicodeSet &digitOrOpenPunctuationOrAlphabetSet,
                             const UnicodeSet &closePunctuationSet, UErrorCode &status)
    : fDigitOrOpenPunctuationOrAlphabetSet(digitOrOpenPunctuationOrAlphabetSet),
      fClosePunctuationSet(closePunctuationSet),
      fModel(status),
      fNegativeSum(0) {
    // Constructing fModel may already have reported a failure through status.
    if (U_SUCCESS(status)) {
        loadMLModel(status);
    }
}

/** Nothing to release explicitly; every member cleans up after itself. */
MlBreakEngine::~MlBreakEngine() {}
|
||||
|
||||
namespace {
|
||||
const char16_t INVALID = u'|';
|
||||
const int32_t MAX_FEATURE = 26;
|
||||
const int32_t MAX_FEATURE_LENGTH = 14;
|
||||
|
||||
bool isValid(const Element& element) {
|
||||
return element.getLength() != 1 || element.getUblock()[0] != INVALID;
|
||||
}
|
||||
|
||||
void concatChar(const char16_t *str, const UChar32 *arr, int32_t length, char16_t *feature, UErrorCode &status) {
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
UnicodeString result(str);
|
||||
for (int i = 0; i < length; i++) {
|
||||
result.append(arr[i]);
|
||||
}
|
||||
U_ASSERT(result.length() < MAX_FEATURE_LENGTH);
|
||||
result.extract(feature, MAX_FEATURE_LENGTH, status); // NUL-terminates
|
||||
}
|
||||
|
||||
void writeString(const UnicodeString &str, char16_t *feature, UErrorCode &status) {
|
||||
U_ASSERT(str.length() < MAX_FEATURE_LENGTH);
|
||||
str.extract(feature, MAX_FEATURE_LENGTH, status); // NUL-terminates
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Find phrase boundaries in inString and push the corresponding positions of
 * inText onto foundBreaks.
 *
 * @param inText      The text being segmented.
 * @param rangeStart  Start of the range in inText.
 * @param rangeEnd    End of the range in inText; must be greater than rangeStart.
 * @param foundBreaks Receives the break positions (as inText indexes).
 * @param inString    The string the model is evaluated on.
 * @param inputMap    If valid, maps a code point index of inString to an index
 *                    of inText; otherwise the index is offset by rangeStart.
 * @param status      Error code; U_ILLEGAL_ARGUMENT_ERROR for an empty range.
 * @return The number of positions left on foundBreaks by this call.
 */
int32_t MlBreakEngine::divideUpRange(UText *inText, int32_t rangeStart, int32_t rangeEnd,
                                     UVector32 &foundBreaks, const UnicodeString &inString,
                                     const LocalPointer<UVector32> &inputMap,
                                     UErrorCode &status) const {
    if (U_FAILURE(status)) {
        return 0;
    }
    if (rangeStart >= rangeEnd) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    // Both lengths are loop invariants. Counting code points walks the string,
    // so it is done once here instead of on every iteration.
    const int32_t length = inString.countChar32();
    const int32_t numCodeUnits = inString.length();

    UVector32 boundary(length + 1, status);
    if (U_FAILURE(status)) {
        return 0;
    }
    int32_t numBreaks = 0;
    UnicodeString index;
    // The ML model looks at a group of six characters to decide whether there
    // is a break in front of the 4th one. elementList acts as a sliding
    // window: each iteration drops the first element and appends the next
    // character of inString, so it always holds six elements.
    Element elementList[6];

    // codeUts is the code unit offset of the next character to enter the window.
    int32_t codeUts = initElementList(inString, elementList, status);

    // Add a break for the start.
    boundary.addElement(0, status);
    numBreaks++;
    if (U_FAILURE(status)) return 0;

    for (int32_t i = 1; i < length && U_SUCCESS(status); i++) {
        evaluateBreakpoint(elementList, i, numBreaks, boundary, status);
        if (i + 1 >= length) break;
        // Remove the first element and append a new element.
        uprv_memmove(elementList, elementList + 1, 5 * sizeof(Element));
        UChar32 ch = INVALID;
        if (codeUts < numCodeUnits) {
            // codeUts always sits on a code point boundary, so comparing it
            // with the code unit length tells whether any character is left.
            ch = inString.char32At(codeUts);
            // Advance even if ch happens to equal the INVALID marker, so the
            // window cannot get stuck on that character.
            codeUts += U16_LENGTH(ch);
        }
        index = (ch != INVALID) ? getUnicodeBlock(ch, status) : UnicodeString(INVALID);
        elementList[5].setCharAndUblock(ch, index);
    }
    if (U_FAILURE(status)) return 0;

    // Add a break for the end if there is not one there already.
    if (boundary.lastElementi() != length) {
        boundary.addElement(length, status);
        numBreaks++;
    }
    if (U_FAILURE(status)) return 0;

    int32_t prevCPPos = -1;
    int32_t prevUTextPos = -1;
    int32_t correctedNumBreaks = 0;
    // Iterate over a fixed count. The bound must not shrink when a duplicate
    // is skipped, otherwise the trailing boundaries would never be visited.
    const int32_t boundaryCount = boundary.size();
    for (int32_t i = 0; i < boundaryCount; i++) {
        int32_t cpPos = boundary.elementAti(i);
        int32_t utextPos = inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
        U_ASSERT(cpPos > prevCPPos);
        U_ASSERT(utextPos >= prevUTextPos);

        if (utextPos > prevUTextPos) {
            if (utextPos != rangeStart ||
                (utextPos > 0 &&
                 fClosePunctuationSet.contains(utext_char32At(inText, utextPos - 1)))) {
                foundBreaks.push(utextPos, status);
                correctedNumBreaks++;
            }
        }
        // Otherwise: normalization expanded the input text and a boundary was
        // found within the expansion, giving two boundaries with the same
        // index in the original text. Ignore the second. See ticket #12918.
        prevCPPos = cpPos;
        prevUTextPos = utextPos;
    }
    (void)prevCPPos;  // suppress compiler warnings about unused variable

    UChar32 nextChar = utext_char32At(inText, rangeEnd);
    if (!foundBreaks.isEmpty() && foundBreaks.peeki() == rangeEnd) {
        // In phrase breaking, there has to be a breakpoint between Cj character and
        // the number/open punctuation.
        // E.g. る文字「そうだ、京都」->る▁文字▁「そうだ、▁京都」-> breakpoint between 字 and「
        // E.g. 乗車率90%程度だろうか -> 乗車▁率▁90%▁程度だろうか -> breakpoint between 率 and 9
        // E.g. しかもロゴがUnicode! -> しかも▁ロゴが▁Unicode!-> breakpoint between が and U
        if (!fDigitOrOpenPunctuationOrAlphabetSet.contains(nextChar)) {
            foundBreaks.popi();
            correctedNumBreaks--;
        }
    }

    return correctedNumBreaks;
}
|
||||
|
||||
/**
 * Score the candidate break described by the six-element window and, when the
 * score is positive, record `index` in `boundary` and bump `numBreaks`.
 *
 * Every feature is a name prefix followed by the characters (UW/BW/TW) or the
 * block texts (UB/BB/TB) of one, two or three consecutive window slots. A
 * feature is generated only if all of its slots hold usable data.
 */
void MlBreakEngine::evaluateBreakpoint(Element* elementList, int32_t index, int32_t &numBreaks,
                                       UVector32 &boundary, UErrorCode &status) const {
    char16_t featureList[MAX_FEATURE][MAX_FEATURE_LENGTH];
    if (U_FAILURE(status)) {
        return;
    }

    // One feature template: its prefix and the slots [first, first + count).
    struct NGram {
        const char16_t *prefix;
        int32_t first;
        int32_t count;
    };
    static const NGram kCharGrams[] = {
        {u"UW1:", 0, 1}, {u"UW2:", 1, 1}, {u"UW3:", 2, 1},
        {u"UW4:", 3, 1}, {u"UW5:", 4, 1}, {u"UW6:", 5, 1},
        {u"BW1:", 1, 2}, {u"BW2:", 2, 2}, {u"BW3:", 3, 2},
        {u"TW1:", 0, 3}, {u"TW2:", 1, 3}, {u"TW3:", 2, 3}, {u"TW4:", 3, 3},
    };
    static const NGram kBlockGrams[] = {
        {u"UB1:", 0, 1}, {u"UB2:", 1, 1}, {u"UB3:", 2, 1},
        {u"UB4:", 3, 1}, {u"UB5:", 4, 1}, {u"UB6:", 5, 1},
        {u"BB1:", 1, 2}, {u"BB2:", 2, 2}, {u"BB3:", 3, 2},
        {u"TB1:", 0, 3}, {u"TB2:", 1, 3}, {u"TB3:", 2, 3}, {u"TB4:", 3, 3},
    };

    int32_t listLength = 0;

    // Character n-grams: skipped when any slot holds the INVALID marker.
    for (int32_t g = 0; g < UPRV_LENGTHOF(kCharGrams); g++) {
        const NGram &gram = kCharGrams[g];
        UChar32 chars[4] = {-1, -1, -1, -1};
        bool usable = true;
        for (int32_t k = 0; k < gram.count && usable; k++) {
            chars[k] = elementList[gram.first + k].getCharacter();
            usable = (chars[k] != INVALID);
        }
        if (usable) {
            concatChar(gram.prefix, chars, gram.count, featureList[listLength++], status);
        }
    }

    // Block n-grams: skipped when any slot fails isValid().
    for (int32_t g = 0; g < UPRV_LENGTHOF(kBlockGrams); g++) {
        const NGram &gram = kBlockGrams[g];
        bool usable = true;
        for (int32_t k = 0; k < gram.count && usable; k++) {
            usable = isValid(elementList[gram.first + k]);
        }
        if (!usable) {
            continue;
        }
        UnicodeString text(gram.prefix);
        for (int32_t k = 0; k < gram.count; k++) {
            const Element &slot = elementList[gram.first + k];
            text.append(slot.getUblock(), 0, slot.getLength());
        }
        writeString(text, featureList[listLength++], status);
    }

    if (U_FAILURE(status)) {
        return;
    }

    // Start from fNegativeSum and add twice the stored value of every
    // generated feature that is present in the model.
    int32_t score = fNegativeSum;
    for (int32_t j = 0; j < listLength; j++) {
        UnicodeString key(featureList[j]);
        if (fModel.containsKey(key)) {
            score += (2 * fModel.geti(key));
        }
    }
    if (score > 0) {
        boundary.addElement(index, status);
        numBreaks++;
    }
}
|
||||
|
||||
/**
 * Fill the six-slot window for the first evaluation.
 *
 * Slots 0 and 1 lie before the start of the text and always hold the INVALID
 * marker. Slots 2..5 receive the first four code points of inString; slots
 * for which the string is too short are marked INVALID as well, for both the
 * character and the block text, which is the same rule divideUpRange() uses
 * when it slides the window past the end of the string.
 *
 * @param inString    The string to read from.
 * @param elementList Array of six elements to fill.
 * @param status      Error code; nothing is written if it already failed.
 * @return The number of code units consumed from inString.
 */
int32_t MlBreakEngine::initElementList(const UnicodeString &inString, Element* elementList,
                                       UErrorCode &status) const {
    if (U_FAILURE(status)) {
        return 0;
    }
    const UnicodeString invalidBlock(INVALID);
    const int32_t length = inString.countChar32();
    int32_t index = 0;

    elementList[0].setCharAndUblock(INVALID, invalidBlock);
    elementList[1].setCharAndUblock(INVALID, invalidBlock);

    for (int32_t slot = 2; slot < 6; slot++) {
        UChar32 ch = INVALID;
        if (slot - 2 < length) {
            ch = inString.char32At(index);
            index += U16_LENGTH(ch);
        }
        // Only look up a block for a real character; asking for the block of
        // the marker itself would yield an ordinary block code and make the
        // padding slot look like text.
        if (ch != INVALID) {
            elementList[slot].setCharAndUblock(ch, getUnicodeBlock(ch, status));
        } else {
            elementList[slot].setCharAndUblock(ch, invalidBlock);
        }
    }

    return index;
}
|
||||
|
||||
/**
 * Produce the block text used in feature names for a character.
 *
 * @param ch     The character to classify.
 * @param status Error code; if it already indicates failure the INVALID
 *               marker is returned.
 * @return The UBlockCode of ch as a decimal number padded to at least three
 *         digits, or the INVALID marker when ch has no block.
 */
UnicodeString MlBreakEngine::getUnicodeBlock(UChar32 ch, UErrorCode &status) const {
    if (U_SUCCESS(status)) {
        const UBlockCode block = ublock_getCode(ch);
        if (block != UBLOCK_NO_BLOCK && block != UBLOCK_INVALID_CODE) {
            UnicodeString digits;
            // Same as sprintf("%03d", block)
            ICU_Utility::appendNumber(digits, static_cast<int32_t>(block), 10, 3);
            return digits;
        }
    }
    return UnicodeString(INVALID);
}
|
||||
|
||||
/**
 * Load the model from the "jaml" break-iterator resource into fModel.
 *
 * BudouX's model consists of pairs of a feature and its score. In jaml.txt
 * the features are stored in the "modelKeys" array and the scores, in the
 * same order, in the "modelValues" int vector. Every score is also
 * subtracted from fNegativeSum.
 *
 * @param error Error code. Set to U_INVALID_FORMAT_ERROR when there are more
 *              keys than values.
 */
void MlBreakEngine::loadMLModel(UErrorCode &error) {
    if (U_FAILURE(error)) return;

    int32_t keySize = 0;
    int32_t valueSize = 0;
    int32_t stringLength = 0;
    UnicodeString key;
    StackUResourceBundle stackTempBundle;
    ResourceDataValue modelKey;

    // rbp stays the owner of the bundle so that it is closed on every return
    // path; rb is only a borrowed alias for the calls below.
    LocalUResourceBundlePointer rbp(ures_openDirect(U_ICUDATA_BRKITR, "jaml", &error));
    if (U_FAILURE(error)) return;
    UResourceBundle* rb = rbp.getAlias();

    // get modelValues
    LocalUResourceBundlePointer modelValue(ures_getByKey(rb, "modelValues", nullptr, &error));
    const int32_t* value = ures_getIntVector(modelValue.getAlias(), &valueSize, &error);
    if (U_FAILURE(error)) return;

    // get modelKeys
    ures_getValueWithFallback(rb, "modelKeys", stackTempBundle.getAlias(), modelKey, error);
    ResourceArray stringArray = modelKey.getArray(error);
    keySize = stringArray.getSize();
    if (U_FAILURE(error)) return;

    // Every key needs a score. Refuse mismatched data instead of reading past
    // the end of the value vector.
    if (keySize > valueSize) {
        error = U_INVALID_FORMAT_ERROR;
        return;
    }

    for (int32_t idx = 0; idx < keySize; idx++) {
        stringArray.getValue(idx, modelKey);
        key = UnicodeString(modelKey.getString(stringLength, error));
        if (U_SUCCESS(error)) {
            fNegativeSum -= value[idx];
            fModel.puti(key, value[idx], error);
        }
    }
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|
152
icu4c/source/common/mlbe.h
Normal file
152
icu4c/source/common/mlbe.h
Normal file
|
@ -0,0 +1,152 @@
|
|||
// © 2022 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
#ifndef MLBREAKENGINE_H
|
||||
#define MLBREAKENGINE_H
|
||||
|
||||
#include "hash.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/utext.h"
|
||||
#include "uvectr32.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
/**
|
||||
* A class used to encapsulate a character and its unicode block index
|
||||
*/
|
||||
class Element : public UMemory {
|
||||
public:
|
||||
/**
|
||||
* Default constructor.
|
||||
*/
|
||||
Element();
|
||||
|
||||
/**
|
||||
* Set the character and its unicode block.
|
||||
*
|
||||
* @param ch A unicode character.
|
||||
* @param ublock The unicode block of the character.
|
||||
*/
|
||||
void setCharAndUblock(UChar32 ch, const UnicodeString& ublock);
|
||||
|
||||
/**
|
||||
* Get the unicode character.
|
||||
*
|
||||
* @return The unicode character.
|
||||
*/
|
||||
UChar32 getCharacter() const;
|
||||
|
||||
/**
|
||||
* Get the unicode character's unicode block.
|
||||
*
|
||||
* @return The unicode block.
|
||||
*/
|
||||
char16_t* getUblock() const;
|
||||
|
||||
/**
|
||||
* Get the length of the unicode block.
|
||||
*
|
||||
* @return The unicode block length.
|
||||
*/
|
||||
uint16_t getLength() const;
|
||||
|
||||
private:
|
||||
UChar32 character;
|
||||
char16_t ublock[4];
|
||||
uint16_t length;
|
||||
};
|
||||
|
||||
/**
|
||||
* A machine learning break engine for the phrase breaking in Japanese.
|
||||
*/
|
||||
class MlBreakEngine : public UMemory {
|
||||
public:
|
||||
/**
|
||||
* Constructor.
|
||||
*
|
||||
* @param digitOrOpenPunctuationOrAlphabetSet A UnicodeSet with the digit, open punctuation and
|
||||
* alphabet.
|
||||
* @param closePunctuationSet A UnicodeSet with close punctuation.
|
||||
* @param status Information on any errors encountered.
|
||||
*/
|
||||
MlBreakEngine(const UnicodeSet &digitOrOpenPunctuationOrAlphabetSet,
|
||||
const UnicodeSet &closePunctuationSet, UErrorCode &status);
|
||||
|
||||
/**
|
||||
* Virtual destructor.
|
||||
*/
|
||||
virtual ~MlBreakEngine();
|
||||
|
||||
public:
|
||||
/**
|
||||
* Divide up a range of characters handled by this break engine.
|
||||
*
|
||||
* @param inText A UText representing the text
|
||||
* @param rangeStart The start of the range of the characters
|
||||
* @param rangeEnd The end of the range of the characters
|
||||
* @param foundBreaks Output vector onto which the break positions are pushed
|
||||
* @param inString The normalized string of text ranging from rangeStart to rangeEnd
|
||||
* @param inputMap The vector storing the native index of inText
|
||||
* @param status Information on any errors encountered.
|
||||
* @return The number of breaks found
|
||||
*/
|
||||
int32_t divideUpRange(UText *inText, int32_t rangeStart, int32_t rangeEnd,
|
||||
UVector32 &foundBreaks, const UnicodeString &inString,
|
||||
const LocalPointer<UVector32> &inputMap, UErrorCode &status) const;
|
||||
|
||||
private:
|
||||
/**
|
||||
* Load the machine learning's model file.
|
||||
*
|
||||
* @param error Information on any errors encountered.
|
||||
*/
|
||||
void loadMLModel(UErrorCode &error);
|
||||
|
||||
/**
|
||||
* Get the character's unicode block code defined in UBlockCode.
|
||||
*
|
||||
* @param ch A character.
|
||||
* @param status Information on any errors encountered.
|
||||
* @return The unicode block code which is 3 digits with '0' added in the beginning if the code
|
||||
* is less than 3 digits.
|
||||
*
|
||||
*/
|
||||
UnicodeString getUnicodeBlock(UChar32 ch, UErrorCode &status) const;
|
||||
|
||||
/**
|
||||
* Initialize the element list from the input string.
|
||||
*
|
||||
* @param inString An input string to be segmented.
|
||||
* @param elementList A list of six elements: two leading placeholders followed by the first four characters and their unicode block codes.
|
||||
* @param status Information on any errors encountered.
|
||||
* @return The number of code units of the characters read from inString (at most the first four).
|
||||
*/
|
||||
int32_t initElementList(const UnicodeString &inString, Element* elementList,
|
||||
UErrorCode &status) const;
|
||||
|
||||
/**
|
||||
* Evaluate whether the index is a potential breakpoint.
|
||||
*
|
||||
* @param elementList A list including 6 elements for the breakpoint evaluation.
|
||||
* @param index The breakpoint index to be evaluated.
|
||||
* @param numBreaks The accumulated number of breakpoints.
|
||||
* @param boundary A vector including the index of the breakpoint.
|
||||
* @param status Information on any errors encountered.
|
||||
*/
|
||||
void evaluateBreakpoint(Element* elementList, int32_t index, int32_t &numBreaks,
|
||||
UVector32 &boundary, UErrorCode &status) const;
|
||||
|
||||
UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet;
|
||||
UnicodeSet fClosePunctuationSet;
|
||||
Hashtable fModel;
|
||||
int32_t fNegativeSum;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
/* MLBREAKENGINE_H */
|
||||
#endif
|
|
@ -43,6 +43,7 @@ locutil.cpp
|
|||
lsr.cpp
|
||||
lstmbe.cpp
|
||||
messagepattern.cpp
|
||||
mlbe.cpp
|
||||
normalizer2.cpp
|
||||
normalizer2impl.cpp
|
||||
normlzr.cpp
|
||||
|
|
|
@ -323,6 +323,16 @@
|
|||
# define UCONFIG_NO_NORMALIZATION 0
|
||||
#endif
|
||||
|
||||
/**
|
||||
* \def UCONFIG_USE_ML_PHRASE_BREAKING
|
||||
* This switch turns on BudouX ML phrase-based line breaking, rather than using the dictionary.
|
||||
*
|
||||
* @internal
|
||||
*/
|
||||
#ifndef UCONFIG_USE_ML_PHRASE_BREAKING
|
||||
# define UCONFIG_USE_ML_PHRASE_BREAKING 0
|
||||
#endif
|
||||
|
||||
#if UCONFIG_NO_NORMALIZATION
|
||||
/* common library */
|
||||
/* ICU 50 CJK dictionary BreakIterator uses normalization */
|
||||
|
|
|
@ -27,6 +27,7 @@ def generate(config, io, common_vars):
|
|||
requests += generate_conversion_mappings(config, io, common_vars)
|
||||
requests += generate_brkitr_brk(config, io, common_vars)
|
||||
requests += generate_brkitr_lstm(config, io, common_vars)
|
||||
requests += generate_brkitr_adaboost(config, io, common_vars)
|
||||
requests += generate_stringprep(config, io, common_vars)
|
||||
requests += generate_brkitr_dictionaries(config, io, common_vars)
|
||||
requests += generate_normalization(config, io, common_vars)
|
||||
|
@ -184,7 +185,7 @@ def generate_brkitr_brk(config, io, common_vars):
|
|||
category = "brkitr_rules",
|
||||
dep_targets =
|
||||
[DepTarget("cnvalias"),
|
||||
DepTarget("ulayout"), DepTarget("uemoji"), DepTarget("lstm_res")],
|
||||
DepTarget("ulayout"), DepTarget("uemoji"), DepTarget("lstm_res"), DepTarget("adaboost_res")],
|
||||
input_files = input_files,
|
||||
output_files = output_files,
|
||||
tool = IcuTool("genbrk"),
|
||||
|
@ -506,6 +507,32 @@ def generate_brkitr_lstm(config, io, common_vars):
|
|||
)
|
||||
]
|
||||
|
||||
def generate_brkitr_adaboost(config, io, common_vars):
    # Build one genrb request that turns every brkitr/adaboost/*.txt source
    # into brkitr/<name>.res.
    src_prefix = "brkitr/adaboost/"
    src_suffix = ".txt"

    input_files = []
    input_basenames = []
    output_files = []
    for path in io.glob(src_prefix + "*" + src_suffix):
        in_file = InFile(path)
        # Strip the directory prefix to get e.g. "jaml.txt" ...
        basename = in_file.filename[len(src_prefix):]
        # ... and the extension to get the resource name.
        stem = basename[:-len(src_suffix)]
        input_files.append(in_file)
        input_basenames.append(basename)
        output_files.append(OutFile("brkitr/%s.res" % stem))

    request = RepeatedOrSingleExecutionRequest(
        name = "adaboost_res",
        category = "brkitr_adaboost",
        dep_targets = [],
        input_files = input_files,
        output_files = output_files,
        tool = IcuTool("genrb"),
        args = "-s {IN_DIR}/brkitr/adaboost -d {OUT_DIR}/brkitr -i {OUT_DIR} "
            "-k {INPUT_BASENAME}",
        format_with = {},
        repeat_with = {
            "INPUT_BASENAME": utils.SpaceSeparatedList(input_basenames)
        }
    )
    return [request]
|
||||
|
||||
def generate_tree(
|
||||
config,
|
||||
io,
|
||||
|
|
940
icu4c/source/data/brkitr/adaboost/jaml.txt
Normal file
940
icu4c/source/data/brkitr/adaboost/jaml.txt
Normal file
|
@ -0,0 +1,940 @@
|
|||
// © 2022 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
jaml {
|
||||
modelKeys {
|
||||
"BB2:062071",
|
||||
"UB3:061",
|
||||
"UB3:071",
|
||||
"TB2:062062062",
|
||||
"TB4:062062062",
|
||||
"UB3:063",
|
||||
"UB4:071",
|
||||
"BB3:062062",
|
||||
"UB4:062",
|
||||
"BB1:062071",
|
||||
"BB1:062061",
|
||||
"UB4:061",
|
||||
"TB1:071071062",
|
||||
"TB3:062063063",
|
||||
"UB2:061",
|
||||
"TB1:062071062",
|
||||
"TB3:062062062",
|
||||
"BB2:063063",
|
||||
"UW3:は",
|
||||
"UW3:に",
|
||||
"TB3:062071062",
|
||||
"UW3:が",
|
||||
"UW4:こ",
|
||||
"UB5:061",
|
||||
"UW3:と",
|
||||
"TB4:063063063",
|
||||
"UW4:て",
|
||||
"TB2:062062061",
|
||||
"UW3:。",
|
||||
"UW4:お",
|
||||
"UW3:の",
|
||||
"BB3:071071",
|
||||
"BB3:062071",
|
||||
"UW3:お",
|
||||
"UW3:し",
|
||||
"UW4:、",
|
||||
"UW4:の",
|
||||
"UW3:を",
|
||||
"UW4:。",
|
||||
"UW3:、",
|
||||
"UW5:で",
|
||||
"UW4:あ",
|
||||
"BB2:062062",
|
||||
"UW4:っ",
|
||||
"UW5:っ",
|
||||
"UW3:も",
|
||||
"UW5:う",
|
||||
"UW3:「",
|
||||
"UW5:な",
|
||||
"UW4:そ",
|
||||
"UW4:る",
|
||||
"UW3:っ",
|
||||
"UW4:「",
|
||||
"UW4:い",
|
||||
"BB2:087087",
|
||||
"UB4:087",
|
||||
"UW5:に",
|
||||
"BW3:もの",
|
||||
"UW5:し",
|
||||
"UW6:う",
|
||||
"BW2:とい",
|
||||
"UW4:に",
|
||||
"UW3:る",
|
||||
"TB2:071062071",
|
||||
"UW4:で",
|
||||
"UW5:が",
|
||||
"BB1:071071",
|
||||
"UW5:は",
|
||||
"UW4:は",
|
||||
"UW4:れ",
|
||||
"UW5:き",
|
||||
"BB2:071062",
|
||||
"BB2:071071",
|
||||
"UW3:・",
|
||||
"BB2:071087",
|
||||
"BB2:061062",
|
||||
"TB1:062061062",
|
||||
"UW3:れ",
|
||||
"BB2:087062",
|
||||
"TB2:087087087",
|
||||
"UW4:ら",
|
||||
"TB1:071071071",
|
||||
"UB2:071",
|
||||
"TB1:062062087",
|
||||
"UW5:す",
|
||||
"UW5:ん",
|
||||
"UW3:で",
|
||||
"UW4:が",
|
||||
"UW3:こ",
|
||||
"TB4:071062062",
|
||||
"UW3:ら",
|
||||
"UW6:に",
|
||||
"UW6:。",
|
||||
"UW3:た",
|
||||
"TB1:061071071",
|
||||
"UW5:く",
|
||||
"UB1:063",
|
||||
"UW1:そ",
|
||||
"UW3:う",
|
||||
"BW3:とい",
|
||||
"BW3:とこ",
|
||||
"UW3:ま",
|
||||
"BW3:こと",
|
||||
"UW2:っ",
|
||||
"UW5:・",
|
||||
"TB3:062062061",
|
||||
"UW3:き",
|
||||
"UW4:ん",
|
||||
"UB3:062",
|
||||
"UW3:く",
|
||||
"UW3:」",
|
||||
"UW5:あ",
|
||||
"BB2:062087",
|
||||
"BW3:いう",
|
||||
"UW5:れ",
|
||||
"UW2:一",
|
||||
"UW3:,",
|
||||
"UW1:に",
|
||||
"UW2:と",
|
||||
"TB2:071071062",
|
||||
"TB2:071071071",
|
||||
"UW5:を",
|
||||
"UW4:り",
|
||||
"BW1:から",
|
||||
"UW3:ち",
|
||||
"BW3:いい",
|
||||
"UW2:は",
|
||||
"UW6:た",
|
||||
"TB1:063063062",
|
||||
"UW4:1",
|
||||
"UW4:や",
|
||||
"UW2:ん",
|
||||
"UW3:]",
|
||||
"UW4:ほ",
|
||||
"TB3:062087087",
|
||||
"BW2:であ",
|
||||
"UW4:だ",
|
||||
"BB3:071062",
|
||||
"TB1:087087087",
|
||||
"BW3:・・",
|
||||
"BW3:とき",
|
||||
"UW4:を",
|
||||
"UW3:て",
|
||||
"UW4:か",
|
||||
"UW2:そ",
|
||||
"TB4:071071062",
|
||||
"TB2:062061071",
|
||||
"UW2:を",
|
||||
"UW4:ご",
|
||||
"UW2:で",
|
||||
"TB3:071071071",
|
||||
"BB1:087087",
|
||||
"UW2:し",
|
||||
"UW4:出",
|
||||
"UW2:ま",
|
||||
"UW4:,",
|
||||
"UW5:と",
|
||||
"UW4:ど",
|
||||
"BW3:して",
|
||||
"UW1:で",
|
||||
"BB2:061071",
|
||||
"BW3:ため",
|
||||
"BW2:とし",
|
||||
"BW2:ない",
|
||||
"BW2:てい",
|
||||
"UW3:間",
|
||||
"UW3:!",
|
||||
"UW5:ー",
|
||||
"UW4:す",
|
||||
"UW4:!",
|
||||
"BW1:とが",
|
||||
"UW5:の",
|
||||
"TB4:062062071",
|
||||
"TB2:061071071",
|
||||
"UW6:・",
|
||||
"UW3:.",
|
||||
"UW2:て",
|
||||
"UW3:笑",
|
||||
"UW2:こ",
|
||||
"UW5:も",
|
||||
"BW3:よう",
|
||||
"UW3:人",
|
||||
"UW2:の",
|
||||
"UW3:か",
|
||||
"UW3:日",
|
||||
"UW1:い",
|
||||
"BW2:とこ",
|
||||
"UW4:私",
|
||||
"UW3:…",
|
||||
"UW2:に",
|
||||
"UW3:今",
|
||||
"BB3:087062",
|
||||
"UB3:055",
|
||||
"UW4:(",
|
||||
"BB1:087071",
|
||||
"UW1:な",
|
||||
"BB3:063063",
|
||||
"UW5:来",
|
||||
"UW3:?",
|
||||
"TW3:ている",
|
||||
"UW4:」",
|
||||
"UW4:前",
|
||||
"BW1:いう",
|
||||
"UW4:つ",
|
||||
"UW3:)",
|
||||
"BW1:では",
|
||||
"UW2:る",
|
||||
"UW5:そ",
|
||||
"UW4:ー",
|
||||
"TW2:気に入",
|
||||
"UW4:笑",
|
||||
"UW4:ひ",
|
||||
"TB4:087087087",
|
||||
"UW4:け",
|
||||
"UW2:も",
|
||||
"BW3:ちょ",
|
||||
"BW3:出来",
|
||||
"TB2:062071062",
|
||||
"UW4:『",
|
||||
"UW3:[",
|
||||
"UW4:2",
|
||||
"UW5:つ",
|
||||
"TB1:061071062",
|
||||
"UW3:1",
|
||||
"BW3:から",
|
||||
"UB5:071",
|
||||
"UW4:ま",
|
||||
"UW3:ば",
|
||||
"UW3:り",
|
||||
"BW3:その",
|
||||
"UW3:ご",
|
||||
"UW4:わ",
|
||||
"BW2:てお",
|
||||
"TB2:071062062",
|
||||
"BW1:ない",
|
||||
"UW2:よ",
|
||||
"UB2:087",
|
||||
"UW6:の",
|
||||
"UW2:毎",
|
||||
"UW2:結",
|
||||
"TW4:の京都",
|
||||
"UW3:さ",
|
||||
"UW2:最",
|
||||
"BW2:です",
|
||||
"UW2:」",
|
||||
"UW5:え",
|
||||
"UW3:だ",
|
||||
"TW4:ところ",
|
||||
"UW4:.",
|
||||
"UB1:062",
|
||||
"UW6:て",
|
||||
"UW1:が",
|
||||
"BW2:、と",
|
||||
"UW3:0",
|
||||
"UW3:ん",
|
||||
"UW3:中",
|
||||
"UW4:よ",
|
||||
"BW3:この",
|
||||
"UW2:が",
|
||||
"UW3:み",
|
||||
"TW2:ではな",
|
||||
"UW6:と",
|
||||
"UW4:[",
|
||||
"TW3:、ある",
|
||||
"BW3:ころ",
|
||||
"UW4:?",
|
||||
"UW6:、",
|
||||
"UW4:電",
|
||||
"BB1:062040",
|
||||
"UW3:後",
|
||||
"UW5:い",
|
||||
"UW2:、",
|
||||
"UW5:て",
|
||||
"BB2:062040",
|
||||
"UW3:真",
|
||||
"UW3:そ",
|
||||
"UW5:さ",
|
||||
"UB5:087",
|
||||
"TW3:という",
|
||||
"UW3:分",
|
||||
"UB6:071",
|
||||
"BW3:なっ",
|
||||
"UW4:ろ",
|
||||
"BB2:061061",
|
||||
"TW3:ところ",
|
||||
"UB1:071",
|
||||
"UW1:、",
|
||||
"BW1:とか",
|
||||
"UW3:な",
|
||||
"UW6:り",
|
||||
"UW4:間",
|
||||
"UW3:べ",
|
||||
"UW5:べ",
|
||||
"TB4:062071062",
|
||||
"UW4:]",
|
||||
"BW2:には",
|
||||
"UW5:々",
|
||||
"BW1:。・",
|
||||
"BW1:その",
|
||||
"UW1:す",
|
||||
"UW4:)",
|
||||
"UW6:っ",
|
||||
"TB3:063063063",
|
||||
"TB3:062071071",
|
||||
"UB5:063",
|
||||
"BW1:かも",
|
||||
"UW6:る",
|
||||
"TB4:062063063",
|
||||
"UW3:ど",
|
||||
"TW3:である",
|
||||
"TW4:くらい",
|
||||
"BW1:最近",
|
||||
"BW1:しい",
|
||||
"BW1:とも",
|
||||
"BW2:と同",
|
||||
"TW1:という",
|
||||
"UW2:さ",
|
||||
"BW2:帯電",
|
||||
"TB1:071062062",
|
||||
"BW3:そし",
|
||||
"UW2:。",
|
||||
"UW5:か",
|
||||
"UW5:こ",
|
||||
"BW3:ない",
|
||||
"BW1:んな",
|
||||
"BW2:でき",
|
||||
"UW4:3",
|
||||
"UW3:け",
|
||||
"TW4:ことが",
|
||||
"BW1:こと",
|
||||
"UB3:087",
|
||||
"UW3:電",
|
||||
"UW3:よ",
|
||||
"BW1:たと",
|
||||
"UW5:ま",
|
||||
"UW5:た",
|
||||
"UW5:ち",
|
||||
"UW2:け",
|
||||
"UW5:だ",
|
||||
"UW3:度",
|
||||
"BW1:たい",
|
||||
"UW4:使",
|
||||
"UW2:き",
|
||||
"TW4:かなり",
|
||||
"UB6:063",
|
||||
"BB1:062062",
|
||||
"UW4:込",
|
||||
"TW3:と言っ",
|
||||
"UW6:だ",
|
||||
"UW5:り",
|
||||
"UW5:よ",
|
||||
"BW3:どう",
|
||||
"UW4:…",
|
||||
"UW3:や",
|
||||
"BW1:かし",
|
||||
"BW3:かっ",
|
||||
"UW4:今",
|
||||
"UW3:『",
|
||||
"UW4:思",
|
||||
"UB2:063",
|
||||
"UW4:く",
|
||||
"UW3:京",
|
||||
"UW6:ー",
|
||||
"UW1:ん",
|
||||
"BW1:うな",
|
||||
"TB2:062061061",
|
||||
"UW1:と",
|
||||
"TB4:062063062",
|
||||
"TB2:061062062",
|
||||
"BW1:この",
|
||||
"BW2:ので",
|
||||
"UW4:み",
|
||||
"UW5:わ",
|
||||
"UW6:や",
|
||||
"BW1:れて",
|
||||
"UW2:や",
|
||||
"UW6:こ",
|
||||
"UW4:な",
|
||||
"UW5:め",
|
||||
"BW1:もう",
|
||||
"TB4:071062071",
|
||||
"BW1:より",
|
||||
"UW4:合",
|
||||
"UW6:け",
|
||||
"BW1:少し",
|
||||
"BW2:でし",
|
||||
"UW4:と",
|
||||
"TB1:063063063",
|
||||
"UW3:ー",
|
||||
"BW2:くな",
|
||||
"UW2:く",
|
||||
"UW2:我",
|
||||
"BW2:いも",
|
||||
"BW3:わか",
|
||||
"TB2:071063071",
|
||||
"UW4:も",
|
||||
"UW1:あ",
|
||||
"UW4:最",
|
||||
"BW1:るの",
|
||||
"UW2:全",
|
||||
"UW6:0",
|
||||
"UW4:放",
|
||||
"UW4:京",
|
||||
"BW3:かけ",
|
||||
"UW2:少",
|
||||
"BW3:もう",
|
||||
"UW2:多",
|
||||
"UW2:う",
|
||||
"TB1:062062040",
|
||||
"UW1:を",
|
||||
"UW3:光",
|
||||
"BW1:!!",
|
||||
"UW2:ャ",
|
||||
"BW3:すぐ",
|
||||
"UW4:帯",
|
||||
"UW6:し",
|
||||
"BW3:でも",
|
||||
"BW2:、そ",
|
||||
"TB3:071087087",
|
||||
"TB2:063062071",
|
||||
"UW3:わ",
|
||||
"UB4:063",
|
||||
"TB4:071071071",
|
||||
"UW5:都",
|
||||
"UW5:ず",
|
||||
"UW2:バ",
|
||||
"UW2:京",
|
||||
"UW3:ゃ",
|
||||
"BW1:い、",
|
||||
"BW3:よく",
|
||||
"BW1:たら",
|
||||
"BW2:のよ",
|
||||
"UW2:思",
|
||||
"BW1:うに",
|
||||
"BW1:の間",
|
||||
"UW6:ん",
|
||||
"UW6:ず",
|
||||
"BW1:った",
|
||||
"TW3:ること",
|
||||
"BW3:とて",
|
||||
"TW1:ような",
|
||||
"UW6:ぱ",
|
||||
"TB3:063071062",
|
||||
"TW4:って、",
|
||||
"TW4:なんて",
|
||||
"TW2:その後",
|
||||
"UW6:ら",
|
||||
"TW4:ことに",
|
||||
"UW3:>",
|
||||
"TW3:てしま",
|
||||
"UW3:い",
|
||||
"TB4:071062061",
|
||||
"UW2:ひ",
|
||||
"UW6:め",
|
||||
"UW6:で",
|
||||
"BW3:なる",
|
||||
"UW5:ご",
|
||||
"BW2:りし",
|
||||
"UW6:電",
|
||||
"UW1:は",
|
||||
"BW1:いも",
|
||||
"BW3:すご",
|
||||
"UW4:通",
|
||||
"BW3:おり",
|
||||
"BW3:かか",
|
||||
"BW1:思い",
|
||||
}
|
||||
modelValues:intvector {
|
||||
1800,
|
||||
271,
|
||||
-857,
|
||||
-417,
|
||||
285,
|
||||
-583,
|
||||
388,
|
||||
828,
|
||||
-853,
|
||||
-820,
|
||||
502,
|
||||
-708,
|
||||
358,
|
||||
1341,
|
||||
-586,
|
||||
-451,
|
||||
257,
|
||||
-1876,
|
||||
2052,
|
||||
1698,
|
||||
-458,
|
||||
2048,
|
||||
1182,
|
||||
-551,
|
||||
980,
|
||||
773,
|
||||
-1453,
|
||||
-152,
|
||||
3201,
|
||||
2865,
|
||||
1203,
|
||||
144,
|
||||
-369,
|
||||
-2539,
|
||||
-613,
|
||||
-3574,
|
||||
-1111,
|
||||
3110,
|
||||
-3022,
|
||||
2039,
|
||||
-1091,
|
||||
1241,
|
||||
-560,
|
||||
-1412,
|
||||
625,
|
||||
1350,
|
||||
297,
|
||||
-2404,
|
||||
-595,
|
||||
1007,
|
||||
-1829,
|
||||
-1662,
|
||||
3213,
|
||||
270,
|
||||
-911,
|
||||
178,
|
||||
-727,
|
||||
2716,
|
||||
-484,
|
||||
-344,
|
||||
929,
|
||||
-1236,
|
||||
760,
|
||||
-299,
|
||||
-419,
|
||||
-728,
|
||||
122,
|
||||
-704,
|
||||
-605,
|
||||
-1507,
|
||||
545,
|
||||
-68,
|
||||
-320,
|
||||
1498,
|
||||
953,
|
||||
-323,
|
||||
-575,
|
||||
-673,
|
||||
520,
|
||||
-450,
|
||||
-1767,
|
||||
-247,
|
||||
56,
|
||||
231,
|
||||
-764,
|
||||
536,
|
||||
794,
|
||||
-703,
|
||||
-566,
|
||||
51,
|
||||
390,
|
||||
52,
|
||||
-182,
|
||||
466,
|
||||
133,
|
||||
354,
|
||||
107,
|
||||
492,
|
||||
488,
|
||||
-1194,
|
||||
1145,
|
||||
-847,
|
||||
812,
|
||||
151,
|
||||
-517,
|
||||
-314,
|
||||
-553,
|
||||
-783,
|
||||
-117,
|
||||
736,
|
||||
-88,
|
||||
-598,
|
||||
569,
|
||||
606,
|
||||
287,
|
||||
744,
|
||||
1739,
|
||||
-217,
|
||||
-219,
|
||||
-144,
|
||||
234,
|
||||
-649,
|
||||
-757,
|
||||
834,
|
||||
-819,
|
||||
869,
|
||||
-275,
|
||||
-267,
|
||||
154,
|
||||
653,
|
||||
594,
|
||||
255,
|
||||
1018,
|
||||
1124,
|
||||
284,
|
||||
-1624,
|
||||
-372,
|
||||
440,
|
||||
-184,
|
||||
-1936,
|
||||
1318,
|
||||
-1124,
|
||||
453,
|
||||
-92,
|
||||
-343,
|
||||
175,
|
||||
182,
|
||||
-886,
|
||||
930,
|
||||
-223,
|
||||
-57,
|
||||
-113,
|
||||
103,
|
||||
-200,
|
||||
510,
|
||||
-2099,
|
||||
-498,
|
||||
385,
|
||||
80,
|
||||
-156,
|
||||
360,
|
||||
1289,
|
||||
771,
|
||||
-1114,
|
||||
-399,
|
||||
870,
|
||||
1230,
|
||||
79,
|
||||
472,
|
||||
-1596,
|
||||
-1092,
|
||||
-572,
|
||||
55,
|
||||
-151,
|
||||
-124,
|
||||
1316,
|
||||
-248,
|
||||
1280,
|
||||
-125,
|
||||
-284,
|
||||
-1023,
|
||||
862,
|
||||
84,
|
||||
417,
|
||||
568,
|
||||
-88,
|
||||
-528,
|
||||
910,
|
||||
674,
|
||||
-212,
|
||||
894,
|
||||
-121,
|
||||
1108,
|
||||
762,
|
||||
260,
|
||||
-197,
|
||||
91,
|
||||
-53,
|
||||
1117,
|
||||
-645,
|
||||
-868,
|
||||
-611,
|
||||
220,
|
||||
422,
|
||||
1431,
|
||||
-532,
|
||||
-157,
|
||||
-476,
|
||||
-846,
|
||||
-1309,
|
||||
-1614,
|
||||
1225,
|
||||
302,
|
||||
-738,
|
||||
-260,
|
||||
892,
|
||||
-778,
|
||||
-193,
|
||||
1221,
|
||||
-779,
|
||||
489,
|
||||
420,
|
||||
-85,
|
||||
-525,
|
||||
-830,
|
||||
26,
|
||||
270,
|
||||
439,
|
||||
-120,
|
||||
1263,
|
||||
-795,
|
||||
291,
|
||||
-1310,
|
||||
-23,
|
||||
347,
|
||||
312,
|
||||
-107,
|
||||
-114,
|
||||
701,
|
||||
830,
|
||||
1309,
|
||||
-451,
|
||||
260,
|
||||
-1080,
|
||||
536,
|
||||
188,
|
||||
-60,
|
||||
643,
|
||||
-1184,
|
||||
31,
|
||||
-194,
|
||||
-51,
|
||||
-514,
|
||||
-442,
|
||||
-120,
|
||||
649,
|
||||
410,
|
||||
882,
|
||||
-75,
|
||||
-341,
|
||||
-718,
|
||||
-128,
|
||||
340,
|
||||
-1245,
|
||||
-164,
|
||||
-1052,
|
||||
70,
|
||||
-256,
|
||||
279,
|
||||
786,
|
||||
40,
|
||||
-177,
|
||||
97,
|
||||
-411,
|
||||
222,
|
||||
-89,
|
||||
-277,
|
||||
-146,
|
||||
414,
|
||||
483,
|
||||
21,
|
||||
-339,
|
||||
-406,
|
||||
-360,
|
||||
-450,
|
||||
-14,
|
||||
-36,
|
||||
513,
|
||||
252,
|
||||
54,
|
||||
-501,
|
||||
-478,
|
||||
450,
|
||||
-36,
|
||||
-644,
|
||||
-392,
|
||||
714,
|
||||
643,
|
||||
-341,
|
||||
91,
|
||||
-1018,
|
||||
34,
|
||||
-177,
|
||||
123,
|
||||
80,
|
||||
-695,
|
||||
-44,
|
||||
-357,
|
||||
253,
|
||||
-389,
|
||||
613,
|
||||
515,
|
||||
418,
|
||||
-396,
|
||||
-553,
|
||||
193,
|
||||
298,
|
||||
-334,
|
||||
-57,
|
||||
-315,
|
||||
-77,
|
||||
33,
|
||||
88,
|
||||
137,
|
||||
280,
|
||||
-448,
|
||||
196,
|
||||
-136,
|
||||
-295,
|
||||
-329,
|
||||
-92,
|
||||
-360,
|
||||
-132,
|
||||
-288,
|
||||
-45,
|
||||
-43,
|
||||
174,
|
||||
75,
|
||||
-60,
|
||||
330,
|
||||
360,
|
||||
217,
|
||||
130,
|
||||
473,
|
||||
-41,
|
||||
-23,
|
||||
-340,
|
||||
-530,
|
||||
-69,
|
||||
-71,
|
||||
-115,
|
||||
297,
|
||||
-240,
|
||||
229,
|
||||
507,
|
||||
-348,
|
||||
171,
|
||||
-320,
|
||||
239,
|
||||
16,
|
||||
-195,
|
||||
-277,
|
||||
-41,
|
||||
69,
|
||||
280,
|
||||
-264,
|
||||
30,
|
||||
249,
|
||||
-97,
|
||||
-163,
|
||||
-221,
|
||||
96,
|
||||
83,
|
||||
82,
|
||||
-218,
|
||||
-93,
|
||||
-53,
|
||||
40,
|
||||
28,
|
||||
285,
|
||||
27,
|
||||
283,
|
||||
-211,
|
||||
-92,
|
||||
214,
|
||||
-225,
|
||||
-54,
|
||||
53,
|
||||
105,
|
||||
-198,
|
||||
-53,
|
||||
-277,
|
||||
198,
|
||||
184,
|
||||
-264,
|
||||
-106,
|
||||
14,
|
||||
185,
|
||||
-155,
|
||||
185,
|
||||
106,
|
||||
-119,
|
||||
53,
|
||||
208,
|
||||
92,
|
||||
262,
|
||||
106,
|
||||
-52,
|
||||
105,
|
||||
-25,
|
||||
-79,
|
||||
104,
|
||||
141,
|
||||
129,
|
||||
-114,
|
||||
26,
|
||||
64,
|
||||
-113,
|
||||
26,
|
||||
77,
|
||||
-64,
|
||||
13,
|
||||
13,
|
||||
26,
|
||||
89,
|
||||
115,
|
||||
-49,
|
||||
89,
|
||||
-114,
|
||||
51,
|
||||
64,
|
||||
-64,
|
||||
-51,
|
||||
-38,
|
||||
89,
|
||||
13,
|
||||
-64,
|
||||
13,
|
||||
-48,
|
||||
76,
|
||||
63,
|
||||
62,
|
||||
13,
|
||||
112,
|
||||
-76,
|
||||
-50,
|
||||
-13,
|
||||
-49,
|
||||
63,
|
||||
-50,
|
||||
13,
|
||||
13,
|
||||
-50,
|
||||
24,
|
||||
-12,
|
||||
24,
|
||||
12,
|
||||
24,
|
||||
12,
|
||||
-12,
|
||||
-24,
|
||||
12,
|
||||
-12,
|
||||
-12,
|
||||
12,
|
||||
-12,
|
||||
}
|
||||
}
|
|
@ -273,8 +273,8 @@ def _preprocess_file_filters(requests, config, io):
|
|||
default_filter_json = "exclude" if config.strategy == "additive" else "include"
|
||||
for category in all_categories:
|
||||
filter_json = default_filter_json
|
||||
# Special default for category "brkitr_lstm" as "exclude" for now.
|
||||
if "brkitr_lstm" == category:
|
||||
# Special default for category "brkitr_lstm" and "brkitr_adaboost" as "exclude" for now.
|
||||
if "brkitr_lstm" == category or "brkitr_adaboost" == category:
|
||||
filter_json = "exclude"
|
||||
# Figure out the correct filter to create for now.
|
||||
if "featureFilters" in json_data and category in json_data["featureFilters"]:
|
||||
|
|
|
@ -211,7 +211,7 @@ group: breakiterator
|
|||
brkiter.o brkeng.o ubrk.o
|
||||
rbbi.o rbbinode.o rbbiscan.o rbbisetb.o rbbistbl.o rbbitblb.o
|
||||
rbbidata.o rbbirb.o rbbi_cache.o
|
||||
dictionarydata.o dictbe.o lstmbe.o
|
||||
dictionarydata.o dictbe.o lstmbe.o mlbe.o
|
||||
# BreakIterator::makeInstance() factory implementation makes for circular dependency
|
||||
# between BreakIterator base and FilteredBreakIteratorBuilder.
|
||||
filteredbrk.o
|
||||
|
|
|
@ -42,6 +42,7 @@
|
|||
#include "charstr.h"
|
||||
#include "cmemory.h"
|
||||
#include "cstr.h"
|
||||
#include "cstring.h"
|
||||
#include "intltest.h"
|
||||
#include "lstmbe.h"
|
||||
#include "rbbitst.h"
|
||||
|
@ -835,9 +836,28 @@ void RBBITest::TestExtended() {
|
|||
delete tp.bi;
|
||||
tp.bi = BreakIterator::createLineInstance(locale, status);
|
||||
skipTest = false;
|
||||
#if UCONFIG_USE_ML_PHRASE_BREAKING
|
||||
if(uprv_strcmp(locale.getName(), "ja@lw=phrase") == 0) {
|
||||
// skip <line> test cases of JP's phrase breaking when ML is enabled.
|
||||
skipTest = true;
|
||||
}
|
||||
#endif
|
||||
charIdx += 5;
|
||||
break;
|
||||
}
|
||||
if (testString.compare(charIdx-1, 8, u"<lineML>") == 0) {
|
||||
delete tp.bi;
|
||||
tp.bi = BreakIterator::createLineInstance(locale, status);
|
||||
skipTest = false;
|
||||
#if !UCONFIG_USE_ML_PHRASE_BREAKING
|
||||
if(uprv_strcmp(locale.getName(), "ja@lw=phrase") == 0) {
|
||||
// skip <lineML> test cases of JP's phrase breaking when ML is disabled.
|
||||
skipTest = true;
|
||||
}
|
||||
#endif
|
||||
charIdx += 7;
|
||||
break;
|
||||
}
|
||||
if (testString.compare(charIdx-1, 6, u"<sent>") == 0) {
|
||||
delete tp.bi;
|
||||
tp.bi = BreakIterator::createSentenceInstance(locale, status);
|
||||
|
|
24
icu4c/source/test/testdata/rbbitst.txt
vendored
24
icu4c/source/test/testdata/rbbitst.txt
vendored
|
@ -1913,6 +1913,26 @@ Bangkok)•</data>
|
|||
<data>•\U0001F469\u200D\U0001F680•\U0001F469\U0001F3FD\u200D\U0001F680\u0020•</data>
|
||||
|
||||
<locale ja@lw=phrase>
|
||||
#phrase breaking test cases for the ML solution
|
||||
<lineML>
|
||||
#9月に東京から友達が遊びに来た -> 9月に•東京から•友達が•遊びに•来た•
|
||||
<data>•\uff19\u6708\u306b•\u6771\u4eac\u304b\u3089•\u53cb\u9054\u304c•\u904a\u3073\u306b•\u6765\u305f•</data>
|
||||
#る文字「そうだ、京都」-> る•文字•「そうだ、•京都」•
|
||||
<data>•\u308b•\u6587\u5b57•\u300c\u305d\u3046\u3060\u3001•\u4eac\u90fd\u300d•</data>
|
||||
#Kana supplement: 𛁈(U+1B048) -> \uD82C\uDC48, 𛀸(U+1B038) -> \uD82C\uDC38, 𛀙(U+1B019) -> \uD82C\uDC19
|
||||
#𛁈る𛀸(しるこ)、あ𛀙よろし(あかよろし) -> 𛁈る𛀸•(しるこ)、•あ𛀙よろし•(あかよろし)
|
||||
<data>•\U0001B048\u308B\U0001B038•\uFF08\u3057\u308B\u3053\uFF09\u3001•\u3042\U0001B019\u3088\u308D\u3057•\uFF08\u3042\u304B\u3088\u308D\u3057\uFF09•</data>
|
||||
#中国の携帯は約500元から5000元です -> 中国の▁携帯は▁約▁500元から▁5000元です
|
||||
<data>•\u4E2D\u56FD\u306E•\u643A\u5E2F\u306F•\u7D04•\uFF15\uFF10\uFF10\u5143\u304B\u3089•\uFF15\uFF10\uFF10\uFF10\u5143\u3067\u3059•</data>
|
||||
#しかもロゴがUnicode!! -> しかも▁ロゴが▁Unicode!!
|
||||
<data>•\u3057\u304B\u3082•\u30ED\u30B4\u304C•\uFF35\uFF4E\uFF49\uFF43\uFF4F\uFF44\uFF45\uFF01\uFF01•</data>
|
||||
#バッテリーを長持ちさせ、充電を最適化します -> バッテリーを▁長持ちさせ、▁充電を▁最適化します
|
||||
<data>•\u30D0\u30C3\u30C6\u30EA\u30FC\u3092•\u9577\u6301\u3061\u3055\u305B\u3001•\u5145\u96FB\u3092•\u6700\u9069\u5316\u3057\u307E\u3059•</data>
|
||||
#データのコピー、スマートフォンでのお支払いなど -> データの▁コピー、▁スマートフォンでの▁お支払いなど
|
||||
<data>•\u30C7\u30FC\u30BF\u306E•\u30B3\u30D4\u30FC\u3001•\u30B9\u30DE\u30FC\u30C8\u30D5\u30A9\u30F3\u3067\u306E•\u304A\u652F\u6255\u3044\u306A\u3069•</data>
|
||||
|
||||
<locale ja@lw=phrase>
|
||||
#phrase breaking test cases for the dictionary based solution
|
||||
<line>
|
||||
#[京都観光]時雨殿に行った。-> [京都•観光]•時雨•殿に•行った。•
|
||||
<data>•\uff3b\u4eac\u90fd•\u89b3\u5149\uff3d•\u6642\u96e8•\u6bbf\u306b•\u884c\u3063\u305f\u3002•</data>
|
||||
|
@ -2005,8 +2025,8 @@ Bangkok)•</data>
|
|||
#大韓民國은 民主共和國이다
|
||||
#<data>•大韓民國은 •民主•共和國이다•</data>
|
||||
# All the tests for ja@lw=phrase should also work in Korean.
|
||||
#[京都観光]時雨殿に行った。-> [京都•観光]•時雨•殿に•行った。•
|
||||
<data>•\uff3b\u4eac\u90fd•\u89b3\u5149\uff3d•\u6642\u96e8•\u6bbf\u306b•\u884c\u3063\u305f\u3002•</data>
|
||||
#る文字「そうだ、京都」-> る•文字•「そうだ、•京都」•
|
||||
<data>•\u308b•\u6587\u5b57•\u300c\u305d\u3046\u3060\u3001•\u4eac\u90fd\u300d•</data>
|
||||
#9月に東京から友達が遊びに来た -> 9月に•東京から•友達が•遊びに•来た•
|
||||
<data>•\uff19\u6708\u306b•\u6771\u4eac\u304b\u3089•\u53cb\u9054\u304c•\u904a\u3073\u306b•\u6765\u305f•</data>
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue