From 80fb309c8a5f865767ef72f85ea1bf70c29e2b39 Mon Sep 17 00:00:00 2001 From: allenwtsu Date: Mon, 9 Jan 2023 05:56:18 +0000 Subject: [PATCH] ICU-22100 Remove unicode blocks from Japanese ML phrase breaking See #2278 --- icu4c/source/common/mlbe.cpp | 199 +- icu4c/source/common/mlbe.h | 64 +- icu4c/source/data/brkitr/adaboost/jaml.txt | 1620 +++++++---------- icu4c/source/test/testdata/rbbitst.txt | 6 +- .../ibm/icu/impl/breakiter/MlBreakEngine.java | 222 +-- .../src/com/ibm/icu/dev/test/rbbi/rbbitst.txt | 6 +- 6 files changed, 780 insertions(+), 1337 deletions(-) diff --git a/icu4c/source/common/mlbe.cpp b/icu4c/source/common/mlbe.cpp index 3ccf470e5b1..79b163299eb 100644 --- a/icu4c/source/common/mlbe.cpp +++ b/icu4c/source/common/mlbe.cpp @@ -18,28 +18,6 @@ U_NAMESPACE_BEGIN -Element::Element() : length(0) {} - -void Element::setCharAndUblock(UChar32 ch, const UnicodeString &idx) { - character = ch; - U_ASSERT(idx.length() <= 3); - length = idx.length(); - idx.extract(0, length, ublock); - ublock[length] = '\0'; -} - -UChar32 Element::getCharacter() const { - return character; -} - -char16_t* Element::getUblock() const { - return (char16_t*)ublock; -} - -uint16_t Element::getLength() const { - return length; -} - MlBreakEngine::MlBreakEngine(const UnicodeSet &digitOrOpenPunctuationOrAlphabetSet, const UnicodeSet &closePunctuationSet, UErrorCode &status) : fDigitOrOpenPunctuationOrAlphabetSet(digitOrOpenPunctuationOrAlphabetSet), @@ -56,12 +34,8 @@ MlBreakEngine::~MlBreakEngine() {} namespace { const char16_t INVALID = u'|'; - const int32_t MAX_FEATURE = 26; - const int32_t MAX_FEATURE_LENGTH = 14; - - bool isValid(const Element& element) { - return element.getLength() != 1 || element.getUblock()[0] != INVALID; - } + const int32_t MAX_FEATURE = 13; + const int32_t MAX_FEATURE_LENGTH = 11; void concatChar(const char16_t *str, const UChar32 *arr, int32_t length, char16_t *feature, UErrorCode &status) { if (U_FAILURE(status)) { @@ -74,11 +48,6 @@ namespace { 
U_ASSERT(result.length() < MAX_FEATURE_LENGTH); result.extract(feature, MAX_FEATURE_LENGTH, status); // NUL-terminates } - - void writeString(const UnicodeString &str, char16_t *feature, UErrorCode &status) { - U_ASSERT(str.length() < MAX_FEATURE_LENGTH); - str.extract(feature, MAX_FEATURE_LENGTH, status); // NUL-terminates - } } int32_t MlBreakEngine::divideUpRange(UText *inText, int32_t rangeStart, int32_t rangeEnd, @@ -98,12 +67,11 @@ int32_t MlBreakEngine::divideUpRange(UText *inText, int32_t rangeStart, int32_t return 0; } int32_t numBreaks = 0; - UChar32 ch; UnicodeString index; // The ML model groups six char to evaluate if the 4th char is a breakpoint. // Like a sliding window, the elementList removes the first char and appends the new char from // inString in each iteration so that its size always remains at six. - Element elementList[6]; + UChar32 elementList[6]; int32_t codeUts = initElementList(inString, elementList, status); int32_t length = inString.countChar32(); @@ -117,12 +85,10 @@ int32_t MlBreakEngine::divideUpRange(UText *inText, int32_t rangeStart, int32_t evaluateBreakpoint(elementList, i, numBreaks, boundary, status); if (i + 1 >= inString.countChar32()) break; // Remove the first element and append a new element - uprv_memmove(elementList, elementList + 1, 5 * sizeof(Element)); - ch = inString.countChar32(0, codeUts) < length ? inString.char32At(codeUts) : INVALID; - index = (ch != INVALID) ? getUnicodeBlock(ch, status) : UnicodeString(INVALID); - elementList[5].setCharAndUblock(ch, index); - if (ch != INVALID) { - codeUts += U16_LENGTH(ch); + uprv_memmove(elementList, elementList + 1, 5 * sizeof(UChar32)); + elementList[5] = inString.countChar32(0, codeUts) < length ? 
inString.char32At(codeUts) : INVALID; + if (elementList[5] != INVALID) { + codeUts += U16_LENGTH(elementList[5]); } } if (U_FAILURE(status)) return 0; @@ -176,7 +142,7 @@ int32_t MlBreakEngine::divideUpRange(UText *inText, int32_t rangeStart, int32_t return correctedNumBreaks; } -void MlBreakEngine::evaluateBreakpoint(Element* elementList, int32_t index, int32_t &numBreaks, +void MlBreakEngine::evaluateBreakpoint(UChar32* elementList, int32_t index, int32_t &numBreaks, UVector32 &boundary, UErrorCode &status) const { char16_t featureList[MAX_FEATURE][MAX_FEATURE_LENGTH]; if (U_FAILURE(status)) { @@ -186,12 +152,12 @@ void MlBreakEngine::evaluateBreakpoint(Element* elementList, int32_t index, int3 UChar32 arr[4] = {-1, -1, -1, -1}; int32_t length = 0, listLength = 0; - const UChar32 w1 = elementList[0].getCharacter(); - const UChar32 w2 = elementList[1].getCharacter(); - const UChar32 w3 = elementList[2].getCharacter(); - const UChar32 w4 = elementList[3].getCharacter(); - const UChar32 w5 = elementList[4].getCharacter(); - const UChar32 w6 = elementList[5].getCharacter(); + const UChar32 w1 = elementList[0]; + const UChar32 w2 = elementList[1]; + const UChar32 w3 = elementList[2]; + const UChar32 w4 = elementList[3]; + const UChar32 w5 = elementList[4]; + const UChar32 w6 = elementList[5]; length = 1; if (w1 != INVALID) { @@ -259,82 +225,6 @@ void MlBreakEngine::evaluateBreakpoint(Element* elementList, int32_t index, int3 arr[2] = w6; concatChar(u"TW4:", arr, length, featureList[listLength++], status); } - if (isValid(elementList[0])) { - writeString(UnicodeString(u"UB1:").append(elementList[0].getUblock(), 0, - elementList[0].getLength()), - featureList[listLength++], status); - } - if (isValid(elementList[1])) { - writeString(UnicodeString(u"UB2:").append(elementList[1].getUblock(), 0, - elementList[1].getLength()), - featureList[listLength++], status); - } - if (isValid(elementList[2])) { - writeString(UnicodeString(u"UB3:").append(elementList[2].getUblock(), 0, 
- elementList[2].getLength()), - featureList[listLength++], status); - } - if (isValid(elementList[3])) { - writeString(UnicodeString(u"UB4:").append(elementList[3].getUblock(), 0, - elementList[3].getLength()), - featureList[listLength++], status); - } - if (isValid(elementList[4])) { - writeString(UnicodeString(u"UB5:").append(elementList[4].getUblock(), 0, - elementList[4].getLength()), - featureList[listLength++], status); - } - if (isValid(elementList[5])) { - writeString(UnicodeString(u"UB6:").append(elementList[5].getUblock(), 0, - elementList[5].getLength()), - featureList[listLength++], status); - } - if (isValid(elementList[1]) && isValid(elementList[2])) { - writeString(UnicodeString(u"BB1:") - .append(elementList[1].getUblock(), 0, elementList[1].getLength()) - .append(elementList[2].getUblock(), 0, elementList[2].getLength()), - featureList[listLength++], status); - } - if (isValid(elementList[2]) && isValid(elementList[3])) { - writeString(UnicodeString(u"BB2:") - .append(elementList[2].getUblock(), 0, elementList[2].getLength()) - .append(elementList[3].getUblock(), 0, elementList[3].getLength()), - featureList[listLength++], status); - } - if (isValid(elementList[3]) && isValid(elementList[4])) { - writeString(UnicodeString(u"BB3:") - .append(elementList[3].getUblock(), 0, elementList[3].getLength()) - .append(elementList[4].getUblock(), 0, elementList[4].getLength()), - featureList[listLength++], status); - } - if (isValid(elementList[0]) && isValid(elementList[1]) && isValid(elementList[2])) { - writeString(UnicodeString(u"TB1:") - .append(elementList[0].getUblock(), 0, elementList[0].getLength()) - .append(elementList[1].getUblock(), 0, elementList[1].getLength()) - .append(elementList[2].getUblock(), 0, elementList[2].getLength()), - featureList[listLength++], status); - } - if (isValid(elementList[1]) && isValid(elementList[2]) && isValid(elementList[3])) { - writeString(UnicodeString(u"TB2:") - .append(elementList[1].getUblock(), 0, 
elementList[1].getLength()) - .append(elementList[2].getUblock(), 0, elementList[2].getLength()) - .append(elementList[3].getUblock(), 0, elementList[3].getLength()), - featureList[listLength++], status); - } - if (isValid(elementList[2]) && isValid(elementList[3]) && isValid(elementList[4])) { - writeString(UnicodeString(u"TB3:") - .append(elementList[2].getUblock(), 0, elementList[2].getLength()) - .append(elementList[3].getUblock(), 0, elementList[3].getLength()) - .append(elementList[4].getUblock(), 0, elementList[4].getLength()), - featureList[listLength++], status); - } - if (isValid(elementList[3]) && isValid(elementList[4]) && isValid(elementList[5])) { - writeString(UnicodeString(u"TB4:") - .append(elementList[3].getUblock(), 0, elementList[3].getLength()) - .append(elementList[4].getUblock(), 0, elementList[4].getLength()) - .append(elementList[5].getUblock(), 0, elementList[5].getLength()), - featureList[listLength++], status); - } if (U_FAILURE(status)) { return; } @@ -351,7 +241,7 @@ void MlBreakEngine::evaluateBreakpoint(Element* elementList, int32_t index, int3 } } -int32_t MlBreakEngine::initElementList(const UnicodeString &inString, Element* elementList, +int32_t MlBreakEngine::initElementList(const UnicodeString &inString, UChar32* elementList, UErrorCode &status) const { if (U_FAILURE(status)) { return 0; @@ -363,52 +253,29 @@ int32_t MlBreakEngine::initElementList(const UnicodeString &inString, Element* e if (length > 0) { w3 = inString.char32At(0); index += U16_LENGTH(w3); + if (length > 1) { + w4 = inString.char32At(index); + index += U16_LENGTH(w4); + if (length > 2) { + w5 = inString.char32At(index); + index += U16_LENGTH(w5); + if (length > 3) { + w6 = inString.char32At(index); + index += U16_LENGTH(w6); + } + } + } } - if (length > 1) { - w4 = inString.char32At(index); - index += U16_LENGTH(w4); - } - if (length > 2) { - w5 = inString.char32At(index); - index += U16_LENGTH(w5); - } - if (length > 3) { - w6 = inString.char32At(index); - 
index += U16_LENGTH(w6); - } - - const UnicodeString b1(INVALID); - const UnicodeString b2(b1); - const UnicodeString b3(getUnicodeBlock(w3, status)); - const UnicodeString b4(getUnicodeBlock(w4, status)); - const UnicodeString b5(getUnicodeBlock(w5, status)); - const UnicodeString b6(getUnicodeBlock(w6, status)); - - elementList[0].setCharAndUblock(w1, b1); - elementList[1].setCharAndUblock(w2, b2); - elementList[2].setCharAndUblock(w3, b3); - elementList[3].setCharAndUblock(w4, b4); - elementList[4].setCharAndUblock(w5, b5); - elementList[5].setCharAndUblock(w6, b6); + elementList[0] = w1; + elementList[1] = w2; + elementList[2] = w3; + elementList[3] = w4; + elementList[4] = w5; + elementList[5] = w6; return index; } -UnicodeString MlBreakEngine::getUnicodeBlock(UChar32 ch, UErrorCode &status) const { - if (U_FAILURE(status)) { - return UnicodeString(INVALID); - } - - UBlockCode block = ublock_getCode(ch); - if (block == UBLOCK_NO_BLOCK || block == UBLOCK_INVALID_CODE) { - return UnicodeString(INVALID); - } else { - UnicodeString empty; - // Same as sprintf("%03d", block) - return ICU_Utility::appendNumber(empty, (int32_t)block, 10, 3); - } -} - void MlBreakEngine::loadMLModel(UErrorCode &error) { // BudouX's model consists of pairs of the feature and its score. // As integrating it into jaml.txt, modelKeys denotes the ML feature; modelValues means the diff --git a/icu4c/source/common/mlbe.h b/icu4c/source/common/mlbe.h index 8943fa3414f..2f0edd6c4f2 100644 --- a/icu4c/source/common/mlbe.h +++ b/icu4c/source/common/mlbe.h @@ -13,51 +13,6 @@ U_NAMESPACE_BEGIN #if !UCONFIG_NO_BREAK_ITERATION -/** - * A class used to encapsulate a character and its unicode block index - */ -class Element : public UMemory { - public: - /** - * Default constructor. - */ - Element(); - - /** - * Set the character and its unicode block. - * - * @param ch A unicode character. - * @param ublock The unicode block of the character. 
- */ - void setCharAndUblock(UChar32 ch, const UnicodeString& ublock); - - /** - * Get the unicode character. - * - * @return The unicode character. - */ - UChar32 getCharacter() const; - - /** - * Get the unicode character's unicode block. - * - * @return The unicode block. - */ - char16_t* getUblock() const; - - /** - * Get the length of the unicode block. - * - * @return The unicode block length. - */ - uint16_t getLength() const; - - private: - UChar32 character; - char16_t ublock[4]; - uint16_t length; -}; - /** * A machine learning break engine for the phrase breaking in Japanese. */ @@ -104,38 +59,27 @@ class MlBreakEngine : public UMemory { */ void loadMLModel(UErrorCode &error); - /** - * Get the character's unicode block code defined in UBlockCode. - * - * @param ch A character. - * @param error Information on any errors encountered. - * @return The unicode block code which is 3 digits with '0' added in the beginning if the code - * is less than 3 digits. - * - */ - UnicodeString getUnicodeBlock(UChar32 ch, UErrorCode &status) const; - /** * Initialize the element list from the input string. * * @param inString A input string to be segmented. - * @param elementList A list to store the first six characters and their unicode block codes. + * @param elementList A list to store the first six characters. * @param status Information on any errors encountered. * @return The number of code units of the first six characters in inString. */ - int32_t initElementList(const UnicodeString &inString, Element* elementList, + int32_t initElementList(const UnicodeString &inString, UChar32* elementList, UErrorCode &status) const; /** * Evaluate whether the index is a potential breakpoint. * - * @param elementList A list including 6 elements for the breakpoint evaluation. + * @param elementList A list including six elements for the breakpoint evaluation. * @param index The breakpoint index to be evaluated. * @param numBreaks The accumulated number of breakpoints. 
* @param boundary A vector including the index of the breakpoint. * @param status Information on any errors encountered. */ - void evaluateBreakpoint(Element* elementList, int32_t index, int32_t &numBreaks, + void evaluateBreakpoint(UChar32* elementList, int32_t index, int32_t &numBreaks, UVector32 &boundary, UErrorCode &status) const; UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet; diff --git a/icu4c/source/data/brkitr/adaboost/jaml.txt b/icu4c/source/data/brkitr/adaboost/jaml.txt index 0500ff73fbf..4ddea6c78b2 100644 --- a/icu4c/source/data/brkitr/adaboost/jaml.txt +++ b/icu4c/source/data/brkitr/adaboost/jaml.txt @@ -2,939 +2,727 @@ // License & terms of use: http://www.unicode.org/copyright.html jaml { modelKeys { - "BB2:062071", - "UB3:061", - "UB3:071", - "TB2:062062062", - "TB4:062062062", - "UB3:063", - "UB4:071", - "BB3:062062", - "UB4:062", - "BB1:062071", - "BB1:062061", - "UB4:061", - "TB1:071071062", - "TB3:062063063", - "UB2:061", - "TB1:062071062", - "TB3:062062062", - "BB2:063063", - "UW3:は", - "UW3:に", - "TB3:062071062", - "UW3:が", - "UW4:こ", - "UB5:061", - "UW3:と", - "TB4:063063063", - "UW4:て", - "TB2:062062061", - "UW3:。", - "UW4:お", - "UW3:の", - "BB3:071071", - "BB3:062071", - "UW3:お", - "UW3:し", - "UW4:、", - "UW4:の", - "UW3:を", - "UW4:。", "UW3:、", - "UW5:で", - "UW4:あ", - "BB2:062062", - "UW4:っ", - "UW5:っ", - "UW3:も", - "UW5:う", - "UW3:「", - "UW5:な", - "UW4:そ", - "UW4:る", - "UW3:っ", - "UW4:「", - "UW4:い", - "BB2:087087", - "UB4:087", - "UW5:に", - "BW3:もの", - "UW5:し", - "UW6:う", - "BW2:とい", + "UW3:。", + "UW4:の", + "UW4:、", + "UW3:の", + "UW4:。", + "UW3:に", + "UW5:。", + "UW4:て", + "UW3:は", "UW4:に", - "UW3:る", - "TB2:071062071", - "UW4:で", - "UW5:が", - "BB1:071071", - "UW5:は", - "UW4:は", - "UW4:れ", - "UW5:き", - "BB2:071062", - "BB2:071071", - "UW3:・", - "BB2:071087", - "BB2:061062", - "TB1:062061062", - "UW3:れ", - "BB2:087062", - "TB2:087087087", - "UW4:ら", - "TB1:071071071", - "UB2:071", - "TB1:062062087", - "UW5:す", - "UW5:ん", - "UW3:で", - "UW4:が", 
- "UW3:こ", - "TB4:071062062", - "UW3:ら", - "UW6:に", - "UW6:。", - "UW3:た", - "TB1:061071071", - "UW5:く", - "UB1:063", - "UW1:そ", - "UW3:う", - "BW3:とい", - "BW3:とこ", - "UW3:ま", - "BW3:こと", - "UW2:っ", - "UW5:・", - "TB3:062062061", - "UW3:き", - "UW4:ん", - "UB3:062", - "UW3:く", - "UW3:」", - "UW5:あ", - "BB2:062087", - "BW3:いう", - "UW5:れ", - "UW2:一", - "UW3:,", - "UW1:に", - "UW2:と", - "TB2:071071062", - "TB2:071071071", - "UW5:を", - "UW4:り", - "BW1:から", - "UW3:ち", - "BW3:いい", - "UW2:は", - "UW6:た", - "TB1:063063062", - "UW4:1", - "UW4:や", - "UW2:ん", - "UW3:]", - "UW4:ほ", - "TB3:062087087", - "BW2:であ", - "UW4:だ", - "BB3:071062", - "TB1:087087087", - "BW3:・・", - "BW3:とき", - "UW4:を", - "UW3:て", - "UW4:か", - "UW2:そ", - "TB4:071071062", - "TB2:062061071", - "UW2:を", - "UW4:ご", - "UW2:で", - "TB3:071071071", - "BB1:087087", - "UW2:し", - "UW4:出", - "UW2:ま", - "UW4:,", - "UW5:と", - "UW4:ど", - "BW3:して", - "UW1:で", - "BB2:061071", - "BW3:ため", - "BW2:とし", - "BW2:ない", - "BW2:てい", - "UW3:間", - "UW3:!", - "UW5:ー", - "UW4:す", - "UW4:!", - "BW1:とが", - "UW5:の", - "TB4:062062071", - "TB2:061071071", - "UW6:・", - "UW3:.", - "UW2:て", - "UW3:笑", - "UW2:こ", - "UW5:も", - "BW3:よう", - "UW3:人", - "UW2:の", - "UW3:か", - "UW3:日", - "UW1:い", - "BW2:とこ", - "UW4:私", - "UW3:…", - "UW2:に", - "UW3:今", - "BB3:087062", - "UB3:055", - "UW4:(", - "BB1:087071", - "UW1:な", - "BB3:063063", - "UW5:来", - "UW3:?", - "TW3:ている", - "UW4:」", - "UW4:前", - "BW1:いう", - "UW4:つ", - "UW3:)", - "BW1:では", - "UW2:る", - "UW5:そ", - "UW4:ー", - "TW2:気に入", - "UW4:笑", - "UW4:ひ", - "TB4:087087087", - "UW4:け", - "UW2:も", - "BW3:ちょ", - "BW3:出来", - "TB2:062071062", - "UW4:『", - "UW3:[", - "UW4:2", - "UW5:つ", - "TB1:061071062", - "UW3:1", - "BW3:から", - "UB5:071", - "UW4:ま", - "UW3:ば", - "UW3:り", - "BW3:その", - "UW3:ご", - "UW4:わ", - "BW2:てお", - "TB2:071062062", - "BW1:ない", - "UW2:よ", - "UB2:087", - "UW6:の", - "UW2:毎", - "UW2:結", - "TW4:の京都", - "UW3:さ", - "UW2:最", - "BW2:です", - "UW2:」", - "UW5:え", - "UW3:だ", - "TW4:ところ", - "UW4:.", - "UB1:062", 
- "UW6:て", - "UW1:が", - "BW2:、と", - "UW3:0", - "UW3:ん", - "UW3:中", - "UW4:よ", - "BW3:この", - "UW2:が", - "UW3:み", - "TW2:ではな", - "UW6:と", - "UW4:[", - "TW3:、ある", - "BW3:ころ", - "UW4:?", - "UW6:、", - "UW4:電", - "BB1:062040", - "UW3:後", - "UW5:い", + "UW3:を", + "UW5:、", "UW2:、", - "UW5:て", - "BB2:062040", - "UW3:真", - "UW3:そ", - "UW5:さ", - "UB5:087", - "TW3:という", - "UW3:分", - "UB6:071", - "BW3:なっ", - "UW4:ろ", - "BB2:061061", - "TW3:ところ", - "UB1:071", - "UW1:、", - "BW1:とか", - "UW3:な", - "UW6:り", - "UW4:間", - "UW3:べ", - "UW5:べ", - "TB4:062071062", - "UW4:]", - "BW2:には", - "UW5:々", - "BW1:。・", - "BW1:その", - "UW1:す", - "UW4:)", - "UW6:っ", - "TB3:063063063", - "TB3:062071071", - "UB5:063", - "BW1:かも", - "UW6:る", - "TB4:062063063", - "UW3:ど", - "TW3:である", - "TW4:くらい", - "BW1:最近", - "BW1:しい", - "BW1:とも", - "BW2:と同", - "TW1:という", - "UW2:さ", - "BW2:帯電", - "TB1:071062062", - "BW3:そし", + "UW3:が", "UW2:。", - "UW5:か", - "UW5:こ", - "BW3:ない", - "BW1:んな", - "BW2:でき", - "UW4:3", - "UW3:け", - "TW4:ことが", - "BW1:こと", - "UB3:087", - "UW3:電", - "UW3:よ", - "BW1:たと", - "UW5:ま", - "UW5:た", - "UW5:ち", - "UW2:け", - "UW5:だ", - "UW3:度", - "BW1:たい", - "UW4:使", - "UW2:き", - "TW4:かなり", - "UB6:063", - "BB1:062062", - "UW4:込", - "TW3:と言っ", - "UW6:だ", - "UW5:り", - "UW5:よ", - "BW3:どう", - "UW4:…", - "UW3:や", - "BW1:かし", - "BW3:かっ", - "UW4:今", - "UW3:『", - "UW4:思", - "UB2:063", - "UW4:く", - "UW3:京", - "UW6:ー", - "UW1:ん", - "BW1:うな", - "TB2:062061061", - "UW1:と", - "TB4:062063062", - "TB2:061062062", - "BW1:この", - "BW2:ので", - "UW4:み", - "UW5:わ", - "UW6:や", - "BW1:れて", - "UW2:や", - "UW6:こ", - "UW4:な", - "UW5:め", - "BW1:もう", - "TB4:071062071", - "BW1:より", - "UW4:合", - "UW6:け", - "BW1:少し", - "BW2:でし", + "UW4:で", + "UW3:と", + "UW4:は", + "UW4:が", + "UW4:る", + "UW4:っ", + "UW3:も", + "UW5:な", + "UW3:で", + "UW3:る", + "UW5:で", + "UW4:を", + "UW4:か", + "UW3:っ", + "UW2:の", "UW4:と", - "TB1:063063063", - "UW3:ー", - "BW2:くな", - "UW2:く", - "UW2:我", - "BW2:いも", - "BW3:わか", - "TB2:071063071", - "UW4:も", - "UW1:あ", - "UW4:最", - 
"BW1:るの", - "UW2:全", - "UW6:0", - "UW4:放", - "UW4:京", - "BW3:かけ", - "UW2:少", - "BW3:もう", - "UW2:多", - "UW2:う", - "TB1:062062040", - "UW1:を", - "UW3:光", - "BW1:!!", - "UW2:ャ", - "BW3:すぐ", - "UW4:帯", - "UW6:し", - "BW3:でも", - "BW2:、そ", - "TB3:071087087", - "TB2:063062071", - "UW3:わ", - "UB4:063", - "TB4:071071071", - "UW5:都", - "UW5:ず", - "UW2:バ", - "UW2:京", - "UW3:ゃ", - "BW1:い、", - "BW3:よく", - "BW1:たら", - "BW2:のよ", - "UW2:思", - "BW1:うに", - "BW1:の間", - "UW6:ん", - "UW6:ず", - "BW1:った", - "TW3:ること", - "BW3:とて", - "TW1:ような", - "UW6:ぱ", - "TB3:063071062", - "TW4:って、", - "TW4:なんて", - "TW2:その後", - "UW6:ら", - "TW4:ことに", - "UW3:>", - "TW3:てしま", + "UW5:っ", + "UW4:な", + "UW3:て", + "UW4:た", + "UW4:こ", + "UW6:に", + "UW4:ら", + "UW3:た", + "UW2:を", + "UW3:ら", + "UW6:。", + "UW4:し", + "UW3:な", + "UW2:に", + "UW4:い", + "UW4:り", + "UW6:う", + "UW3:う", + "UW3:く", + "UW4:れ", + "UW2:は", + "UW4:だ", + "UW4:う", "UW3:い", - "TB4:071062061", - "UW2:ひ", - "UW6:め", + "UW6:い", + "UW4:ん", + "UW2:か", + "UW4:ー", + "UW6:を", + "UW2:も", + "UW5:き", + "UW3:り", "UW6:で", - "BW3:なる", - "UW5:ご", - "BW2:りし", - "UW6:電", + "UW2:る", + "UW2:と", + "UW3:]", + "UW4:そ", + "UW3:,", + "UW4:も", + "UW4:く", + "UW3:か", + "BW2:とい", + "UW4:お", + "UW4:ま", + "UW6:が", + "UW4:き", + "UW2:し", + "UW2:て", + "UW3:!", + "UW2:ま", + "UW5:に", + "UW3:や", + "UW6:て", + "BW3:もの", + "UW6:の", + "UW2:ん", + "UW2:が", + "UW5:が", + "BW1:いう", + "UW2:で", + "UW5:す", + "UW3:?", + "UW5:と", + "UW6:は", + "UW3:.", + "UW4:「", + "UW3:ば", + "UW5:ん", + "BW3:いう", + "UW4:す", + "BW1:から", + "UW3:ど", + "UW5:し", + "UW2:っ", + "UW4:思", + "UW3:…", + "UW5:る", + "BW2:てい", + "BW3:よう", + "UW5:え", + "UW4:私", + "UW3:・", + "UW4:人", + "UW5:く", + "UW3:)", + "UW4:京", + "BW2:ない", + "UW3:ー", + "BW3:とこ", + "UW5:は", + "UW4:」", + "UW2:一", + "UW4:よ", + "BW3:こと", + "UW5:ー", + "UW6:し", + "UW4:け", + "BW1:ない", + "BW2:です", + "UW4:一", + "UW5:帯", + "UW5:を", + "UW6:な", + "UW5:べ", + "BW3:いい", + "BW2:であ", + "BW2:ので", + "UW4:,", + "UW5:れ", + "UW5:ろ", + "UW1:そ", + "UW5:い", + "UW1:い", + "UW5:・", + 
"UW5:わ", + "UW4:1", + "UW5:う", + "UW4:大", + "UW3:ま", + "BW2:とこ", + "UW4:!", + "UW4:見", + "UW4:行", + "BW1:こと", + "UW1:な", + "UW2:さ", + "UW3:☆", + "UW4:さ", + "UW2:よ", + "BW1:とか", + "UW4:(", + "BW3:でも", + "UW5:の", + "UW4:・", + "UW5:た", + "UW1:す", + "UW5:か", + "UW4:使", + "UW3:♪", + "UW4:え", + "UW4:今", + "BW2:、と", + "BW3:とき", + "UW4:ろ", + "UW5:つ", + "UW1:に", + "UW5:じ", + "UW1:で", + "UW4:ン", + "UW3:ず", + "BW3:して", + "UW4:食", + "UW4:気", + "UW4:時", + "UW3:日", + "BW1:しい", + "UW4:自", + "UW3:笑", + "UW2:毎", + "TW1:という", + "UW4:み", + "UW4:…", + "TW2:ではな", + "UW6:さ", + "UW5:め", + "UW2:少", + "UW5:あ", + "UW4:2", + "UW3:へ", + "TW3:という", + "UW4:何", + "UW2:く", + "UW2:結", + "BW1:うな", + "BW1:もう", + "UW1:が", + "UW4:じ", + "UW2:う", + "UW4:ル", + "UW3:」", + "BW1:とが", + "UW2:最", + "BW1:るの", + "UW3:間", + "UW6:た", + "UW3:つ", + "UW4:ど", + "UW1:と", + "UW3:ん", + "UW4:.", + "UW3:だ", + "UW4:わ", + "UW4:最", + "UW4:?", + "UW3:ろ", + "UW4:ば", + "TW3:ている", + "BW3:この", + "UW5:も", + "UW3:人", + "BW3:とい", + "UW4:つ", + "BW3:その", + "BW3:もう", + "UW2:そ", + "BW2:には", + "BW3:かけ", + "TW4:の京都", + "TW4:ところ", + "UW3:京", + "UW4:携", + "BW1:かも", + "BW1:では", + "UW4:ち", + "UW3:分", + "UW4:べ", + "BW3:ころ", + "UW3:ゃ", + "UW2:す", + "BW1:。・", + "UW3:電", + "BW3:なっ", + "UW3:す", + "BW1:最近", + "UW4:め", + "UW3:ぐ", + "UW2:お", + "BW3:そし", + "BW1:かし", + "BW1:同じ", + "BW3:メー", + "UW5:て", + "UW6:り", + "TW4:くらい", + "UW3:今", + "UW5:そ", + "UW4:や", + "UW5:」", + "UW4:帯", + "UW6:ー", + "BW2:とし", + "TW1:ような", + "BW2:てお", + "UW4:笑", "UW1:は", - "BW1:いも", - "BW3:すご", - "UW4:通", - "BW3:おり", "BW3:かか", - "BW1:思い", + "TW4:かなり", + "UW4:)", + "BW1:んな", + "UW1:ち", + "TW2:気に入", + "TW1:・・・", + "UW6:と", + "UW5:ち", + "BW3:ため", + "UW4:ず", + "UW3:0", + "BW1:んで", + "UW3:中", + "UW3:々", + "BW2:のよ", + "BW2:帯電", + "BW2:でも", + "BW1:には", + "BW3:ちょ", + "UW4:せ", + "UW3:度", + "BW1:でも", + "BW1:が、", + "UW2:な", + "UW5:思", + "UW6:0", + "UW6:寺", + "BW3:とて", + "BW3:ある", + "BW2:もし", + "UW4:ッ", + "UW1:て", + "BW2:にも", + "BW1:れた", + "UW4:ひ", + "TW3:ること", + "BW1:てい", + "UW4:』", + 
"BW1:だけ", + "UW3:お", + "BW1:少し", + "TW3:、ある", + "UW5:!", + "UW6:ル", + "UW2:多", + "UW6:ご", + "UW6:や", + "UW3:後", + "BW2:てみ", + "BW1:とき", + "UW4:ゃ", + "BW1:たい", + "UW3:き", + "TW4:ことが", + "UW3:真", + "BW2:など", + "UW6:ぱ", + "BW1:った", + "BW1:ても", + "UW5:日", + "BW1:たと", + "UW4:]", + "UW3:ッ", + "TW4:メール", + "BW2:はな", + "BW3:・・", + "BW3:なる", + "BW1:とい", + "UW2:全", + "BW1:にも", + "BW1:たら", + "BW2:くな", + "UW3:「", + "BW1:その", + "UW3:観", + "BW1:うに", + "UW3:イ", + "BW3:もん", + "UW5:ず", + "BW3:しま", + "BW1:より", + "UW5:分", } modelValues:intvector { - 1800, - 271, - -857, - -417, - 285, - -583, - 388, - 828, - -853, - -820, - 502, - -708, - 358, - 1341, - -586, - -451, - 257, - -1876, - 2052, - 1698, - -458, - 2048, - 1182, - -551, - 980, - 773, - -1453, - -152, - 3201, - 2865, - 1203, - 144, - -369, - -2539, - -613, - -3574, - -1111, - 3110, - -3022, - 2039, + 3634, + 4347, + -2581, + -4812, + 2538, + -4206, + 2701, + -1455, + -2403, + 2977, + -2678, + 4165, + -818, + -1011, + 2996, + -904, + -1808, + 2064, + -2164, + -2180, + -2760, + -2310, + 2360, + -388, + 1842, + 1706, + -706, + -2408, + -1628, + -1005, + -434, + -1442, + 543, -1091, - 1241, - -560, - -1412, - 625, - 1350, - 297, - -2404, - -595, - 1007, - -1829, - -1662, - 3213, - 270, - -911, - 178, - -727, - 2716, - -484, - -344, - 929, - -1236, - 760, - -299, - -419, - -728, - 122, - -704, - -605, - -1507, - 545, - -68, - -320, - 1498, - 953, - -323, - -575, - -673, - 520, - -450, - -1767, - -247, - 56, - 231, - -764, - 536, - 794, - -703, - -566, - 51, - 390, - 52, - -182, - 466, - 133, - 354, - 107, - 492, - 488, - -1194, - 1145, - -847, - 812, - 151, - -517, - -314, - -553, - -783, - -117, - 736, - -88, - -598, - 569, - 606, - 287, - 744, - 1739, - -217, - -219, - -144, - 234, - -649, - -757, - 834, - -819, - 869, - -275, - -267, - 154, - 653, - 594, - 255, - 1018, - 1124, - 284, - -1624, - -372, - 440, - -184, - -1936, - 1318, - -1124, - 453, - -92, - -343, - 175, - 182, + 1355, + -1056, + 258, + 277, + -2999, + 1331, + 
-1305, + 1242, + -337, + -1073, + 1392, + -576, -886, - 930, - -223, - -57, - -113, - 103, - -200, - 510, - -2099, - -498, - 385, - 80, - -156, - 360, - 1289, - 771, - -1114, - -399, - 870, - 1230, - 79, - 472, - -1596, - -1092, - -572, - 55, - -151, - -124, - 1316, - -248, - 1280, - -125, - -284, - -1023, - 862, - 84, - 417, - 568, - -88, - -528, - 910, - 674, - -212, - 894, - -121, - 1108, - 762, - 260, - -197, - 91, - -53, - 1117, - -645, - -868, - -611, - 220, - 422, - 1431, - -532, - -157, - -476, - -846, - -1309, - -1614, - 1225, - 302, - -738, - -260, - 892, - -778, - -193, - 1221, - -779, - 489, - 420, - -85, - -525, - -830, - 26, - 270, - 439, - -120, - 1263, - -795, - 291, - -1310, - -23, - 347, - 312, - -107, - -114, - 701, - 830, - 1309, - -451, - 260, - -1080, - 536, - 188, - -60, - 643, - -1184, - 31, - -194, - -51, - -514, - -442, - -120, - 649, - 410, - 882, - -75, - -341, - -718, - -128, - 340, - -1245, - -164, - -1052, - 70, - -256, - 279, - 786, - 40, - -177, - 97, - -411, - 222, - -89, - -277, - -146, - 414, - 483, - 21, - -339, - -406, - -360, - -450, - -14, - -36, - 513, - 252, - 54, - -501, - -478, - 450, - -36, - -644, - -392, - 714, - 643, - -341, - 91, - -1018, - 34, - -177, - 123, - 80, - -695, - -44, - -357, - 253, - -389, - 613, - 515, - 418, - -396, - -553, - 193, - 298, - -334, - -57, - -315, - -77, - 33, - 88, - 137, - 280, - -448, - 196, - -136, - -295, - -329, - -92, - -360, - -132, - -288, - -45, - -43, - 174, - 75, - -60, - 330, - 360, - 217, - 130, - 473, - -41, - -23, - -340, - -530, - -69, - -71, - -115, - 297, - -240, - 229, - 507, - -348, - 171, - -320, - 239, - 16, - -195, - -277, - -41, - 69, - 280, - -264, - 30, - 249, - -97, - -163, - -221, - 96, - 83, - 82, - -218, - -93, - -53, - 40, - 28, - 285, - 27, - 283, - -211, - -92, - 214, - -225, - -54, - 53, - 105, - -198, - -53, - -277, - 198, - 184, - -264, - -106, - 14, - 185, - -155, - 185, - 106, - -119, - 53, - 208, - 92, - 262, - 106, - -52, - 105, - -25, - -79, - 104, 
- 141, - 129, - -114, - 26, - 64, - -113, - 26, - 77, - -64, - 13, - 13, - 26, - 89, - 115, - -49, - 89, - -114, - 51, - 64, - -64, - -51, - -38, - 89, - 13, - -64, - 13, + -2405, + -386, + 1031, + 1470, + -2105, + -594, + -1461, + -1160, + 964, -48, - 76, - 63, - 62, - 13, + -2158, + 110, + -1750, + 228, + -603, + 801, + 972, + 102, + -395, + -508, + 1640, + 191, + 2468, + -1580, + -1529, + 1148, + 515, + 539, + -774, + 111, + -1275, + 113, + -432, + 1736, + 588, + -413, + 1360, + 49, + 2322, + 48, + 255, + -521, + -366, + 529, + -493, + -557, + 1719, + -476, + 104, + 1311, + 1314, + 1307, + 520, + 666, + -412, + 627, + 1098, + -209, + 163, + 955, + 1798, + -39, + -753, + -1262, + 411, + 1247, + 914, + 522, + 348, + 2156, + 510, + -1522, + -243, + 1337, + -378, + -1957, + 834, + -450, + 235, + 87, + 236, + -1615, + 485, + -1445, + 488, + 404, + -333, + 66, + 787, + 647, + -1495, + -756, + -1700, + 279, + -81, + 260, + 162, + -51, + -851, + 462, + 493, + 161, + 396, + -238, + -1044, + -1685, + 433, + 276, + -695, + -148, + 416, + 1235, + -748, + 257, + 784, + 748, + 767, + -262, + -490, + -26, + 152, + 186, + 544, + 1035, + -711, + 549, + -517, + 799, + -1024, + 542, + -118, + 432, + -56, + -694, + 668, + 249, + 175, + 329, + 305, + 287, + 423, + 438, + 934, + 628, + 292, + -536, + -995, + -814, + 237, + 263, + 571, + -138, + 402, + 701, + 387, + 474, + -183, + 661, + 280, + 767, + -53, + -793, + -191, + -401, + 526, + -679, + 279, + -407, + 493, + -82, + 365, + -334, + 36, + 284, + -813, + 424, + -425, + 423, + -796, + 452, + -635, + -389, + 404, + -141, + 415, + -277, + -400, + 502, + 766, + -182, + -426, + 720, + 1005, + 422, + -396, + 123, + -533, + -91, + -355, + 333, + -596, + -333, + 434, + 31, + 567, + -356, + -309, + 251, + 365, + -399, + 411, + -235, + -526, + 468, + 438, + 136, + 103, + 74, + 585, + 324, + -115, + -219, + -217, + -289, + -88, + 143, + 361, + -558, + -614, + -56, + 456, + 441, + -566, + 102, 112, - -76, - -50, + -466, + 325, + -27, + 128, 
+ 294, + -321, + -224, + -206, + 252, + 209, + -207, + -224, + -207, + 109, + 316, + -234, + 222, + 95, + 192, + -40, + -98, + 82, + 68, + 230, + -28, + -67, + -149, + 14, + -120, + 95, + 122, + -81, + -67, + -296, + 122, + -81, + 134, + -200, + -67, + 14, + 67, + 119, + 40, + 118, + -92, + 91, + -105, + 53, + 40, + -51, + 39, + -64, + 105, + 13, + 39, + 26, + -52, + -52, + -52, + 26, + -26, + -39, + 13, -13, - -49, - 63, - -50, + 39, + 26, 13, + -39, + -26, + -26, + -26, + -13, + -13, + 39, + 26, + -13, + 26, 13, - -50, - 24, - -12, - 24, - 12, - 24, - 12, - -12, - -24, - 12, - -12, - -12, - 12, - -12, } } \ No newline at end of file diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt index 40c6745dd06..7a3c8e46f0e 100644 --- a/icu4c/source/test/testdata/rbbitst.txt +++ b/icu4c/source/test/testdata/rbbitst.txt @@ -1919,9 +1919,9 @@ Bangkok)• •\uff19\u6708\u306b•\u6771\u4eac\u304b\u3089•\u53cb\u9054\u304c•\u904a\u3073\u306b•\u6765\u305f• #る文字「そうだ、京都」-> る•文字•「そうだ、•京都」• •\u308b•\u6587\u5b57•\u300c\u305d\u3046\u3060\u3001•\u4eac\u90fd\u300d• -#Kana supplement: 𛁈(U+1B048) -> \uD82C\uDC48, 𛀸(U+1B038) -> \uD82C\uDC38, 𛀙(U+1B019)-> \uD82C\uDC19 -#𛁈る𛀸(しるこ)、あ𛀙よろし(あかよろし) -> 𛁈る𛀸•(しるこ)、•あ𛀙よろし•(あかよろし) -•\U0001B048\u308B\U0001B038•\uFF08\u3057\u308B\u3053\uFF09\u3001•\u3042\U0001B019\u3088\u308D\u3057•\uFF08\u3042\u304B\u3088\u308D\u3057\uFF09• +#Kana supplement: 𛁛 (U+1B05B), 𛂦(U+1B0A6) +#生 𛁛𛂦゙をいただく。-> 生 𛁛𛂦゙を•いただく。 +•\u751F\U0001B05B\U0001B0A6\u3099\u3092•\u3044\u305F\u3060\u304F\u3002• #中国の携帯は約500元から5000元です -> 中国の▁携帯は▁約▁500元から▁5000元です •\u4E2D\u56FD\u306E•\u643A\u5E2F\u306F•\u7D04•\uFF15\uFF10\uFF10\u5143\u304B\u3089•\uFF15\uFF10\uFF10\uFF10\u5143\u3067\u3059• #しかもロゴがUnicode!! -> しかも▁ロゴが▁Unicode!! 
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/MlBreakEngine.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/MlBreakEngine.java index ceeb4879ae5..196579d0a58 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/MlBreakEngine.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/MlBreakEngine.java @@ -24,61 +24,12 @@ public class MlBreakEngine { private static final int INVALID = '|'; private static final String INVALID_STRING = "|"; - private static final int MAX_FEATURE = 26; + private static final int MAX_FEATURE = 13; private UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet; private UnicodeSet fClosePunctuationSet; private HashMap fModel; - private int fNegativeSum; - static class Element { - private int character; - private String ublock; - - /** - * Default constructor. - */ - public Element() { - character = 0; - ublock = null; - } - - /** - * Set the character and its unicode block. - * - * @param ch A unicode character. - * @param str The unicode block of the character. - */ - public void setCharAndUblock(int ch, String str) { - Assert.assrt(str.length() <= 3); - this.character = ch; - ublock = str; - } - - /** - * Get the unicode character. - * - * @return The unicode character. - */ - public int getCharacter() { - return character; - } - - /** - * Get the unicode character's unicode block. - * - * @return The unicode block. - */ - public String getUblock() { - return ublock; - } - } - - private static boolean isValid(Element element) { - String ublock = element.getUblock(); - return ublock.length() != 1 || (int) ublock.charAt(0) != INVALID; - } - /** * Constructor for Chinese and Japanese phrase breaking. * @@ -114,12 +65,10 @@ public class MlBreakEngine { return 0; } ArrayList boundary = new ArrayList(numCodePts); - int ch; - String ublock; // The ML model groups six char to evaluate if the 4th char is a breakpoint. 
// Like a sliding window, the elementList removes the first char and appends the new char // from inString in each iteration so that its size always remains at six. - Element elementList[] = new Element[6]; + int elementList[] = new int[6]; initElementList(inString, elementList, numCodePts); // Add a break for the start. @@ -130,10 +79,7 @@ public class MlBreakEngine { break; } shiftLeftOne(elementList); - - ch = (i + 3) < numCodePts ? next32(inString) : INVALID; - ublock = (ch != INVALID) ? getUnicodeBlock(ch) : INVALID_STRING; - elementList[5].setCharAndUblock(ch, ublock); + elementList[5] = (i + 3) < numCodePts ? next32(inString) : INVALID; } // Add a break for the end if there is not one there already. @@ -181,11 +127,10 @@ public class MlBreakEngine { return correctedNumBreaks; } - private void shiftLeftOne(Element[] elementList) { + private void shiftLeftOne(int[] elementList) { int length = elementList.length; for (int i = 1; i < length; i++) { - elementList[i - 1].character = elementList[i].character; - elementList[i - 1].ublock = elementList[i].ublock; + elementList[i - 1] = elementList[i]; } } @@ -196,14 +141,14 @@ public class MlBreakEngine { * @param index The breakpoint index to be evaluated. * @param boundary An list including the index of the breakpoint. 
*/ - private void evaluateBreakpoint(Element[] elementList, int index, ArrayList boundary) { + private void evaluateBreakpoint(int[] elementList, int index, ArrayList boundary) { String[] featureList = new String[MAX_FEATURE]; - final int w1 = elementList[0].getCharacter(); - final int w2 = elementList[1].getCharacter(); - final int w3 = elementList[2].getCharacter(); - final int w4 = elementList[3].getCharacter(); - final int w5 = elementList[4].getCharacter(); - final int w6 = elementList[5].getCharacter(); + final int w1 = elementList[0]; + final int w2 = elementList[1]; + final int w3 = elementList[2]; + final int w4 = elementList[3]; + final int w5 = elementList[4]; + final int w6 = elementList[5]; StringBuilder sb = new StringBuilder(); int idx = 0; @@ -265,76 +210,7 @@ public class MlBreakEngine { featureList[idx++] = sb.append("TW4:").appendCodePoint(w4).appendCodePoint( w5).appendCodePoint(w6).toString(); } - if (isValid(elementList[0])) { - sb.setLength(0); - featureList[idx++] = sb.append("UB1:").append(elementList[0].getUblock()).toString(); - } - if (isValid(elementList[1])) { - sb.setLength(0); - featureList[idx++] = sb.append("UB2:").append(elementList[1].getUblock()).toString(); - } - if (isValid(elementList[2])) { - sb.setLength(0); - featureList[idx++] = sb.append("UB3:").append(elementList[2].getUblock()).toString(); - } - if (isValid(elementList[3])) { - sb.setLength(0); - featureList[idx++] = sb.append("UB4:").append(elementList[3].getUblock()).toString(); - } - if (isValid(elementList[4])) { - sb.setLength(0); - featureList[idx++] = sb.append("UB5:").append(elementList[4].getUblock()).toString(); - } - if (isValid(elementList[5])) { - sb.setLength(0); - featureList[idx++] = sb.append("UB6:").append(elementList[5].getUblock()).toString(); - } - if (isValid(elementList[1]) && isValid(elementList[2])) { - sb.setLength(0); - featureList[idx++] = sb.append("BB1:"). - append(elementList[1].getUblock()). 
- append(elementList[2].getUblock()).toString(); - } - if (isValid(elementList[2]) && isValid(elementList[3])) { - sb.setLength(0); - featureList[idx++] = sb.append("BB2:"). - append(elementList[2].getUblock()). - append(elementList[3].getUblock()).toString(); - } - if (isValid(elementList[3]) && isValid(elementList[4])) { - sb.setLength(0); - featureList[idx++] = sb.append("BB3:"). - append(elementList[3].getUblock()). - append(elementList[4].getUblock()).toString(); - } - if (isValid(elementList[0]) && isValid(elementList[1]) && isValid(elementList[2])) { - sb.setLength(0); - featureList[idx++] = sb.append("TB1:"). - append(elementList[0].getUblock()). - append(elementList[1].getUblock()). - append(elementList[2].getUblock()).toString(); - } - if (isValid(elementList[1]) && isValid(elementList[2]) && isValid(elementList[3])) { - sb.setLength(0); - featureList[idx++] = sb.append("TB2:"). - append(elementList[1].getUblock()). - append(elementList[2].getUblock()). - append(elementList[3].getUblock()).toString(); - } - if (isValid(elementList[2]) && isValid(elementList[3]) && isValid(elementList[4])) { - sb.setLength(0); - featureList[idx++] = sb.append("TB3:"). - append(elementList[2].getUblock()). - append(elementList[3].getUblock()). - append(elementList[4].getUblock()).toString(); - } - if (isValid(elementList[3]) && isValid(elementList[4]) && isValid(elementList[5])) { - sb.setLength(0); - featureList[idx++] = sb.append("TB4:"). - append(elementList[3].getUblock()). - append(elementList[4].getUblock()). - append(elementList[5].getUblock()).toString(); - } + int score = fNegativeSum; for (int j = 0; j < idx; j++) { if (fModel.containsKey(featureList[j])) { @@ -350,12 +226,11 @@ public class MlBreakEngine { * Initialize the element list from the input string. * * @param inString A input string to be segmented. - * @param elementList A list to store the first six characters and their unicode block codes. 
+ * @param elementList A list to store the first six characters. * @param numCodePts The number of code points of input string * @return The number of the code units of the first six characters in inString. */ - private int initElementList(CharacterIterator inString, Element[] elementList, - int numCodePts) { + private int initElementList(CharacterIterator inString, int[] elementList, int numCodePts) { int index = 0; inString.setIndex(index); int w1, w2, w3, w4, w5, w6; @@ -363,60 +238,29 @@ public class MlBreakEngine { if (numCodePts > 0) { w3 = current32(inString); index += Character.charCount(w3); + if (numCodePts > 1) { + w4 = next32(inString); + index += Character.charCount(w4); + if (numCodePts > 2) { + w5 = next32(inString); + index += Character.charCount(w5); + if (numCodePts > 3) { + w6 = next32(inString); + index += Character.charCount(w6); + } + } + } } - if (numCodePts > 1) { - w4 = next32(inString); - index += Character.charCount(w3); - } - if (numCodePts > 2) { - w5 = next32(inString); - index += Character.charCount(w5); - } - if (numCodePts > 3) { - w6 = next32(inString); - index += Character.charCount(w6); - } - - final String b1 = INVALID_STRING; - final String b2 = b1; - final String b3 = getUnicodeBlock(w3); - final String b4 = getUnicodeBlock(w4); - final String b5 = getUnicodeBlock(w5); - final String b6 = getUnicodeBlock(w6); - - elementList[0] = new Element(); - elementList[0].setCharAndUblock(w1, b1); - elementList[1] = new Element(); - elementList[1].setCharAndUblock(w2, b2); - elementList[2] = new Element(); - elementList[2].setCharAndUblock(w3, b3); - elementList[3] = new Element(); - elementList[3].setCharAndUblock(w4, b4); - elementList[4] = new Element(); - elementList[4].setCharAndUblock(w5, b5); - elementList[5] = new Element(); - elementList[5].setCharAndUblock(w6, b6); + elementList[0] = w1; + elementList[1] = w2; + elementList[2] = w3; + elementList[3] = w4; + elementList[4] = w5; + elementList[5] = w6; return index; } - /** - *
Get the character's unicode block code defined in UBlockCode. - * - * @param ch A char. - * @return The unicode block code which is 3 digits with '0' added in the beginning if the code - * is less than 3 digits. - */ - private String getUnicodeBlock(int ch) { - int blockId = UCharacter.UnicodeBlock.of(ch).getID(); - if (blockId == UCharacter.UnicodeBlock.NO_BLOCK.getID() - || blockId == UCharacter.UnicodeBlock.INVALID_CODE_ID) { - return INVALID_STRING; - } else { - return String.format("%03d", blockId); - } - } - /** * Load the machine learning's model file. */ diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt index 40c6745dd06..7a3c8e46f0e 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt @@ -1919,9 +1919,9 @@ Bangkok)• •\uff19\u6708\u306b•\u6771\u4eac\u304b\u3089•\u53cb\u9054\u304c•\u904a\u3073\u306b•\u6765\u305f• #る文字「そうだ、京都」-> る•文字•「そうだ、•京都」• •\u308b•\u6587\u5b57•\u300c\u305d\u3046\u3060\u3001•\u4eac\u90fd\u300d• -#Kana supplement: 𛁈(U+1B048) -> \uD82C\uDC48, 𛀸(U+1B038) -> \uD82C\uDC38, 𛀙(U+1B019)-> \uD82C\uDC19 -#𛁈る𛀸(しるこ)、あ𛀙よろし(あかよろし) -> 𛁈る𛀸•(しるこ)、•あ𛀙よろし•(あかよろし) -•\U0001B048\u308B\U0001B038•\uFF08\u3057\u308B\u3053\uFF09\u3001•\u3042\U0001B019\u3088\u308D\u3057•\uFF08\u3042\u304B\u3088\u308D\u3057\uFF09• +#Kana supplement: 𛁛 (U+1B05B), 𛂦(U+1B0A6) +#生 𛁛𛂦゙をいただく。-> 生 𛁛𛂦゙を•いただく。 +•\u751F\U0001B05B\U0001B0A6\u3099\u3092•\u3044\u305F\u3060\u304F\u3002• #中国の携帯は約500元から5000元です -> 中国の▁携帯は▁約▁500元から▁5000元です •\u4E2D\u56FD\u306E•\u643A\u5E2F\u306F•\u7D04•\uFF15\uFF10\uFF10\u5143\u304B\u3089•\uFF15\uFF10\uFF10\uFF10\u5143\u3067\u3059• #しかもロゴがUnicode!! -> しかも▁ロゴが▁Unicode!!