From 3f05361b4192d6d337c3dacc63a91f53c966da3e Mon Sep 17 00:00:00 2001 From: allenwtsu Date: Tue, 31 Jan 2023 18:17:02 +0800 Subject: [PATCH] ICU-22100 Modify ML model to improve Japanese phrase breaking performance --- icu4c/source/common/mlbe.cpp | 217 ++- icu4c/source/common/mlbe.h | 42 +- icu4c/source/data/brkitr/adaboost/jaml.txt | 1478 +++++++++-------- .../ibm/icu/impl/breakiter/MlBreakEngine.java | 287 ++-- 4 files changed, 1044 insertions(+), 980 deletions(-) diff --git a/icu4c/source/common/mlbe.cpp b/icu4c/source/common/mlbe.cpp index 14f68d2a126..7e734f2c8ad 100644 --- a/icu4c/source/common/mlbe.cpp +++ b/icu4c/source/common/mlbe.cpp @@ -18,11 +18,12 @@ U_NAMESPACE_BEGIN +enum class ModelIndex { kUWStart = 0, kBWStart = 6, kTWStart = 9 }; + MlBreakEngine::MlBreakEngine(const UnicodeSet &digitOrOpenPunctuationOrAlphabetSet, - const UnicodeSet &closePunctuationSet, UErrorCode &status) + const UnicodeSet &closePunctuationSet, UErrorCode &status) : fDigitOrOpenPunctuationOrAlphabetSet(digitOrOpenPunctuationOrAlphabetSet), fClosePunctuationSet(closePunctuationSet), - fModel(status), fNegativeSum(0) { if (U_FAILURE(status)) { return; @@ -32,14 +33,10 @@ MlBreakEngine::MlBreakEngine(const UnicodeSet &digitOrOpenPunctuationOrAlphabetS MlBreakEngine::~MlBreakEngine() {} -namespace { - const char16_t INVALID = u'|'; -} - int32_t MlBreakEngine::divideUpRange(UText *inText, int32_t rangeStart, int32_t rangeEnd, - UVector32 &foundBreaks, const UnicodeString &inString, - const LocalPointer &inputMap, - UErrorCode &status) const { + UVector32 &foundBreaks, const UnicodeString &inString, + const LocalPointer &inputMap, + UErrorCode &status) const { if (U_FAILURE(status)) { return 0; } @@ -53,30 +50,35 @@ int32_t MlBreakEngine::divideUpRange(UText *inText, int32_t rangeStart, int32_t return 0; } int32_t numBreaks = 0; - UnicodeString index; - // The ML model groups six char to evaluate if the 4th char is a breakpoint. 
- // Like a sliding window, the elementList removes the first char and appends the new char from - // inString in each iteration so that its size always remains at six. - UChar32 elementList[6]; - - int32_t codeUts = initElementList(inString, elementList, status); - int32_t length = inString.countChar32(); + int32_t codePointLength = inString.countChar32(); + // The ML algorithm groups six char and evaluates whether the 4th char is a breakpoint. + // In each iteration, it evaluates the 4th char and then moves forward one char like a sliding + // window. Initially, the first six values in the indexList are [-1, -1, 0, 1, 2, 3]. After + // moving forward, finally the last six values in the indexList are + // [length-4, length-3, length-2, length-1, -1, -1]. The "+4" here means four extra "-1". + int32_t indexSize = codePointLength + 4; + int32_t *indexList = (int32_t *)uprv_malloc(indexSize * sizeof(int32_t)); + if (indexList == nullptr) { + status = U_MEMORY_ALLOCATION_ERROR; + return 0; + } + int32_t numCodeUnits = initIndexList(inString, indexList, status); // Add a break for the start. boundary.addElement(0, status); numBreaks++; if (U_FAILURE(status)) return 0; - for (int32_t i = 1; i < length && U_SUCCESS(status); i++) { - evaluateBreakpoint(elementList, i, numBreaks, boundary, status); - if (i + 1 >= inString.countChar32()) break; - // Remove the first element and append a new element - uprv_memmove(elementList, elementList + 1, 5 * sizeof(UChar32)); - elementList[5] = inString.countChar32(0, codeUts) < length ? 
inString.char32At(codeUts) : INVALID; - if (elementList[5] != INVALID) { - codeUts += U16_LENGTH(elementList[5]); + for (int32_t idx = 0; idx + 1 < codePointLength && U_SUCCESS(status); idx++) { + numBreaks = + evaluateBreakpoint(inString, indexList, idx, numCodeUnits, numBreaks, boundary, status); + if (idx + 4 < codePointLength) { + indexList[idx + 6] = numCodeUnits; + numCodeUnits += U16_LENGTH(inString.char32At(indexList[idx + 6])); } } + uprv_free(indexList); + if (U_FAILURE(status)) return 0; // Add a break for the end if there is not one there already. @@ -128,119 +130,112 @@ int32_t MlBreakEngine::divideUpRange(UText *inText, int32_t rangeStart, int32_t return correctedNumBreaks; } -void MlBreakEngine::evaluateBreakpoint(UChar32* elementList, int32_t index, int32_t &numBreaks, - UVector32 &boundary, UErrorCode &status) const { +int32_t MlBreakEngine::evaluateBreakpoint(const UnicodeString &inString, int32_t *indexList, + int32_t startIdx, int32_t numCodeUnits, int32_t numBreaks, + UVector32 &boundary, UErrorCode &status) const { if (U_FAILURE(status)) { - return; + return numBreaks; } - - UnicodeString feature; + int32_t start = 0, end = 0; int32_t score = fNegativeSum; - if (elementList[0] != INVALID) { - // When the key doesn't exist, Hashtable.geti(key) returns 0 and 2 * 0 = 0. - // So, we can skip to check whether fModel includes key featureList[j] or not. - score += (2 * fModel.geti(feature.setTo(u"UW1:", 4).append(elementList[0]))); + for (int i = 0; i < 6; i++) { + // UW1 ~ UW6 + start = startIdx + i; + if (indexList[start] != -1) { + end = (indexList[start + 1] != -1) ? 
indexList[start + 1] : numCodeUnits; + score += fModel[static_cast(ModelIndex::kUWStart) + i].geti( + inString.tempSubString(indexList[start], end - indexList[start])); + } } - if (elementList[1] != INVALID) { - score += (2 * fModel.geti(feature.setTo(u"UW2:", 4).append(elementList[1]))); + for (int i = 0; i < 3; i++) { + // BW1 ~ BW3 + start = startIdx + i + 1; + if (indexList[start] != -1 && indexList[start + 1] != -1) { + end = (indexList[start + 2] != -1) ? indexList[start + 2] : numCodeUnits; + score += fModel[static_cast(ModelIndex::kBWStart) + i].geti( + inString.tempSubString(indexList[start], end - indexList[start])); + } } - if (elementList[2] != INVALID) { - score += (2 * fModel.geti(feature.setTo(u"UW3:", 4).append(elementList[2]))); - } - if (elementList[3] != INVALID) { - score += (2 * fModel.geti(feature.setTo(u"UW4:", 4).append(elementList[3]))); - } - if (elementList[4] != INVALID) { - score += (2 * fModel.geti(feature.setTo(u"UW5:", 4).append(elementList[4]))); - } - if (elementList[5] != INVALID) { - score += (2 * fModel.geti(feature.setTo(u"UW6:", 4).append(elementList[5]))); - } - if (elementList[1] != INVALID && elementList[2] != INVALID) { - score += (2 * fModel.geti( - feature.setTo(u"BW1:", 4).append(elementList[1]).append(elementList[2]))); - } - if (elementList[2] != INVALID && elementList[3] != INVALID) { - score += (2 * fModel.geti( - feature.setTo(u"BW2:", 4).append(elementList[2]).append(elementList[3]))); - } - if (elementList[3] != INVALID && elementList[4] != INVALID) { - score += (2 * fModel.geti( - feature.setTo(u"BW3:", 4).append(elementList[3]).append(elementList[4]))); - } - if (elementList[0] != INVALID && elementList[1] != INVALID && elementList[2] != INVALID) { - score += (2 * fModel.geti(feature.setTo(u"TW1:", 4) - .append(elementList[0]) - .append(elementList[1]) - .append(elementList[2]))); - } - if (elementList[1] != INVALID && elementList[2] != INVALID && elementList[3] != INVALID) { - score += (2 * 
fModel.geti(feature.setTo(u"TW2:", 4) - .append(elementList[1]) - .append(elementList[2]) - .append(elementList[3]))); - } - if (elementList[2] != INVALID && elementList[3] != INVALID && elementList[4] != INVALID) { - score += (2 * fModel.geti(feature.setTo(u"TW3:", 4) - .append(elementList[2]) - .append(elementList[3]) - .append(elementList[4]))); - } - if (elementList[3] != INVALID && elementList[4] != INVALID && elementList[5] != INVALID) { - score += (2 * fModel.geti(feature.setTo(u"TW4:", 4) - .append(elementList[3]) - .append(elementList[4]) - .append(elementList[5]))); + for (int i = 0; i < 4; i++) { + // TW1 ~ TW4 + start = startIdx + i; + if (indexList[start] != -1 && indexList[start + 1] != -1 && indexList[start + 2] != -1) { + end = (indexList[start + 3] != -1) ? indexList[start + 3] : numCodeUnits; + score += fModel[static_cast(ModelIndex::kTWStart) + i].geti( + inString.tempSubString(indexList[start], end - indexList[start])); + } } + if (score > 0) { - boundary.addElement(index, status); + boundary.addElement(startIdx + 1, status); numBreaks++; } + return numBreaks; } -int32_t MlBreakEngine::initElementList(const UnicodeString &inString, UChar32* elementList, - UErrorCode &status) const { +int32_t MlBreakEngine::initIndexList(const UnicodeString &inString, int32_t *indexList, + UErrorCode &status) const { if (U_FAILURE(status)) { return 0; } int32_t index = 0; int32_t length = inString.countChar32(); - UChar32 w1, w2, w3, w4, w5, w6; - w1 = w2 = w3 = w4 = w5 = w6 = INVALID; + // Set all (lenght+4) items inside indexLength to -1 presuming -1 is 4 bytes of 0xff. 
+ uprv_memset(indexList, 0xff, (length + 4) * sizeof(int32_t)); if (length > 0) { - w3 = inString.char32At(0); - index += U16_LENGTH(w3); + indexList[2] = 0; + index = U16_LENGTH(inString.char32At(0)); if (length > 1) { - w4 = inString.char32At(index); - index += U16_LENGTH(w4); + indexList[3] = index; + index += U16_LENGTH(inString.char32At(index)); if (length > 2) { - w5 = inString.char32At(index); - index += U16_LENGTH(w5); + indexList[4] = index; + index += U16_LENGTH(inString.char32At(index)); if (length > 3) { - w6 = inString.char32At(index); - index += U16_LENGTH(w6); + indexList[5] = index; + index += U16_LENGTH(inString.char32At(index)); } } } } - elementList[0] = w1; - elementList[1] = w2; - elementList[2] = w3; - elementList[3] = w4; - elementList[4] = w5; - elementList[5] = w6; - return index; } void MlBreakEngine::loadMLModel(UErrorCode &error) { - // BudouX's model consists of pairs of the feature and its score. - // As integrating it into jaml.txt, modelKeys denotes the ML feature; modelValues means the - // corresponding feature's score. + // BudouX's model consists of thirteen categories, each of which is make up of pairs of the + // feature and its score. As integrating it into jaml.txt, we define thirteen kinds of key and + // value to represent the feature and the corresponding score respectively. 
if (U_FAILURE(error)) return; + UnicodeString key; + StackUResourceBundle stackTempBundle; + ResourceDataValue modelKey; + + LocalUResourceBundlePointer rbp(ures_openDirect(U_ICUDATA_BRKITR, "jaml", &error)); + UResourceBundle *rb = rbp.getAlias(); + if (U_FAILURE(error)) return; + + int32_t index = 0; + initKeyValue(rb, "UW1Keys", "UW1Values", fModel[index++], error); + initKeyValue(rb, "UW2Keys", "UW2Values", fModel[index++], error); + initKeyValue(rb, "UW3Keys", "UW3Values", fModel[index++], error); + initKeyValue(rb, "UW4Keys", "UW4Values", fModel[index++], error); + initKeyValue(rb, "UW5Keys", "UW5Values", fModel[index++], error); + initKeyValue(rb, "UW6Keys", "UW6Values", fModel[index++], error); + initKeyValue(rb, "BW1Keys", "BW1Values", fModel[index++], error); + initKeyValue(rb, "BW2Keys", "BW2Values", fModel[index++], error); + initKeyValue(rb, "BW3Keys", "BW3Values", fModel[index++], error); + initKeyValue(rb, "TW1Keys", "TW1Values", fModel[index++], error); + initKeyValue(rb, "TW2Keys", "TW2Values", fModel[index++], error); + initKeyValue(rb, "TW3Keys", "TW3Values", fModel[index++], error); + initKeyValue(rb, "TW4Keys", "TW4Values", fModel[index++], error); + fNegativeSum /= 2; +} + +void MlBreakEngine::initKeyValue(UResourceBundle *rb, const char *keyName, const char *valueName, + Hashtable &model, UErrorCode &error) { int32_t keySize = 0; int32_t valueSize = 0; int32_t stringLength = 0; @@ -248,15 +243,13 @@ void MlBreakEngine::loadMLModel(UErrorCode &error) { StackUResourceBundle stackTempBundle; ResourceDataValue modelKey; - LocalUResourceBundlePointer rbp(ures_openDirect(U_ICUDATA_BRKITR, "jaml", &error)); - UResourceBundle* rb = rbp.orphan(); // get modelValues - LocalUResourceBundlePointer modelValue(ures_getByKey(rb, "modelValues", nullptr, &error)); - const int32_t* value = ures_getIntVector(modelValue.getAlias(), &valueSize, &error); + LocalUResourceBundlePointer modelValue(ures_getByKey(rb, valueName, nullptr, &error)); + const int32_t *value 
= ures_getIntVector(modelValue.getAlias(), &valueSize, &error); if (U_FAILURE(error)) return; // get modelKeys - ures_getValueWithFallback(rb, "modelKeys", stackTempBundle.getAlias(), modelKey, error); + ures_getValueWithFallback(rb, keyName, stackTempBundle.getAlias(), modelKey, error); ResourceArray stringArray = modelKey.getArray(error); keySize = stringArray.getSize(); if (U_FAILURE(error)) return; @@ -267,7 +260,7 @@ void MlBreakEngine::loadMLModel(UErrorCode &error) { if (U_SUCCESS(error)) { U_ASSERT(idx < valueSize); fNegativeSum -= value[idx]; - fModel.puti(key, value[idx], error); + model.puti(key, value[idx], error); } } } diff --git a/icu4c/source/common/mlbe.h b/icu4c/source/common/mlbe.h index 2f0edd6c4f2..38de47e5f57 100644 --- a/icu4c/source/common/mlbe.h +++ b/icu4c/source/common/mlbe.h @@ -5,6 +5,7 @@ #define MLBREAKENGINE_H #include "hash.h" +#include "unicode/resbund.h" #include "unicode/uniset.h" #include "unicode/utext.h" #include "uvectr32.h" @@ -27,7 +28,7 @@ class MlBreakEngine : public UMemory { * @param status Information on any errors encountered. */ MlBreakEngine(const UnicodeSet &digitOrOpenPunctuationOrAlphabetSet, - const UnicodeSet &closePunctuationSet, UErrorCode &status); + const UnicodeSet &closePunctuationSet, UErrorCode &status); /** * Virtual destructor. @@ -60,31 +61,50 @@ class MlBreakEngine : public UMemory { void loadMLModel(UErrorCode &error); /** - * Initialize the element list from the input string. + * In the machine learning's model file, specify the name of the key and value to load the + * corresponding feature and its score. + * + * @param rb A ResouceBundle corresponding to the model file. + * @param keyName The kay name in the model file. + * @param valueName The value name in the model file. + * @param model A hashtable to store the pairs of the feature and its score. + * @param error Information on any errors encountered. 
+ */ + void initKeyValue(UResourceBundle *rb, const char *keyName, const char *valueName, + Hashtable &model, UErrorCode &error); + + /** + * Initialize the index list from the input string. * * @param inString A input string to be segmented. - * @param elementList A list to store the first six characters. + * @param indexList A code unit index list of inString. * @param status Information on any errors encountered. - * @return The number of code units of the first six characters in inString. + * @return The number of code units of the first four characters in inString. */ - int32_t initElementList(const UnicodeString &inString, UChar32* elementList, - UErrorCode &status) const; + int32_t initIndexList(const UnicodeString &inString, int32_t *indexList, + UErrorCode &status) const; /** * Evaluate whether the index is a potential breakpoint. * - * @param elementList A list including six elements for the breakpoint evaluation. - * @param index The breakpoint index to be evaluated. + * @param inString A input string to be segmented. + * @param indexList A code unit index list of the inString. + * @param startIdx The start index of the indexList. + * @param numCodeUnits The current code unit boundary of the indexList. * @param numBreaks The accumulated number of breakpoints. * @param boundary A vector including the index of the breakpoint. * @param status Information on any errors encountered. + * @return The number of breakpoints */ - void evaluateBreakpoint(UChar32* elementList, int32_t index, int32_t &numBreaks, - UVector32 &boundary, UErrorCode &status) const; + int32_t evaluateBreakpoint(const UnicodeString &inString, int32_t *indexList, int32_t startIdx, + int32_t numCodeUnits, int32_t numBreaks, UVector32 &boundary, + UErrorCode &status) const; + + void printUnicodeString(const UnicodeString &s) const; UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet; UnicodeSet fClosePunctuationSet; - Hashtable fModel; + Hashtable fModel[13]; // {UW1, UW2, ... UW6, BW1, ... 
BW3, TW1, TW2, ... TW4} 6+3+4= 13 int32_t fNegativeSum; }; diff --git a/icu4c/source/data/brkitr/adaboost/jaml.txt b/icu4c/source/data/brkitr/adaboost/jaml.txt index 4ddea6c78b2..f931331229d 100644 --- a/icu4c/source/data/brkitr/adaboost/jaml.txt +++ b/icu4c/source/data/brkitr/adaboost/jaml.txt @@ -1,728 +1,776 @@ // © 2022 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html jaml { - modelKeys { - "UW3:、", - "UW3:。", - "UW4:の", - "UW4:、", - "UW3:の", - "UW4:。", - "UW3:に", - "UW5:。", - "UW4:て", - "UW3:は", - "UW4:に", - "UW3:を", - "UW5:、", - "UW2:、", - "UW3:が", - "UW2:。", - "UW4:で", - "UW3:と", - "UW4:は", - "UW4:が", - "UW4:る", - "UW4:っ", - "UW3:も", - "UW5:な", - "UW3:で", - "UW3:る", - "UW5:で", - "UW4:を", - "UW4:か", - "UW3:っ", - "UW2:の", - "UW4:と", - "UW5:っ", - "UW4:な", - "UW3:て", - "UW4:た", - "UW4:こ", - "UW6:に", - "UW4:ら", - "UW3:た", - "UW2:を", - "UW3:ら", - "UW6:。", - "UW4:し", - "UW3:な", - "UW2:に", - "UW4:い", - "UW4:り", - "UW6:う", - "UW3:う", - "UW3:く", - "UW4:れ", - "UW2:は", - "UW4:だ", - "UW4:う", - "UW3:い", - "UW6:い", - "UW4:ん", - "UW2:か", - "UW4:ー", - "UW6:を", - "UW2:も", - "UW5:き", - "UW3:り", - "UW6:で", - "UW2:る", - "UW2:と", - "UW3:]", - "UW4:そ", - "UW3:,", - "UW4:も", - "UW4:く", - "UW3:か", - "BW2:とい", - "UW4:お", - "UW4:ま", - "UW6:が", - "UW4:き", - "UW2:し", - "UW2:て", - "UW3:!", - "UW2:ま", - "UW5:に", - "UW3:や", - "UW6:て", - "BW3:もの", - "UW6:の", - "UW2:ん", - "UW2:が", - "UW5:が", - "BW1:いう", - "UW2:で", - "UW5:す", - "UW3:?", - "UW5:と", - "UW6:は", - "UW3:.", - "UW4:「", - "UW3:ば", - "UW5:ん", - "BW3:いう", - "UW4:す", - "BW1:から", - "UW3:ど", - "UW5:し", - "UW2:っ", - "UW4:思", - "UW3:…", - "UW5:る", - "BW2:てい", - "BW3:よう", - "UW5:え", - "UW4:私", - "UW3:・", - "UW4:人", - "UW5:く", - "UW3:)", - "UW4:京", - "BW2:ない", - "UW3:ー", - "BW3:とこ", - "UW5:は", - "UW4:」", - "UW2:一", - "UW4:よ", - "BW3:こと", - "UW5:ー", - "UW6:し", - "UW4:け", - "BW1:ない", - "BW2:です", - "UW4:一", - "UW5:帯", - "UW5:を", - "UW6:な", - "UW5:べ", - "BW3:いい", - "BW2:であ", - "BW2:ので", - 
"UW4:,", - "UW5:れ", - "UW5:ろ", - "UW1:そ", - "UW5:い", - "UW1:い", - "UW5:・", - "UW5:わ", - "UW4:1", - "UW5:う", - "UW4:大", - "UW3:ま", - "BW2:とこ", - "UW4:!", - "UW4:見", - "UW4:行", - "BW1:こと", - "UW1:な", - "UW2:さ", - "UW3:☆", - "UW4:さ", - "UW2:よ", - "BW1:とか", - "UW4:(", - "BW3:でも", - "UW5:の", - "UW4:・", - "UW5:た", - "UW1:す", - "UW5:か", - "UW4:使", - "UW3:♪", - "UW4:え", - "UW4:今", - "BW2:、と", - "BW3:とき", - "UW4:ろ", - "UW5:つ", - "UW1:に", - "UW5:じ", - "UW1:で", - "UW4:ン", - "UW3:ず", - "BW3:して", - "UW4:食", - "UW4:気", - "UW4:時", - "UW3:日", - "BW1:しい", - "UW4:自", - "UW3:笑", - "UW2:毎", - "TW1:という", - "UW4:み", - "UW4:…", - "TW2:ではな", - "UW6:さ", - "UW5:め", - "UW2:少", - "UW5:あ", - "UW4:2", - "UW3:へ", - "TW3:という", - "UW4:何", - "UW2:く", - "UW2:結", - "BW1:うな", - "BW1:もう", - "UW1:が", - "UW4:じ", - "UW2:う", - "UW4:ル", - "UW3:」", - "BW1:とが", - "UW2:最", - "BW1:るの", - "UW3:間", - "UW6:た", - "UW3:つ", - "UW4:ど", - "UW1:と", - "UW3:ん", - "UW4:.", - "UW3:だ", - "UW4:わ", - "UW4:最", - "UW4:?", - "UW3:ろ", - "UW4:ば", - "TW3:ている", - "BW3:この", - "UW5:も", - "UW3:人", - "BW3:とい", - "UW4:つ", - "BW3:その", - "BW3:もう", - "UW2:そ", - "BW2:には", - "BW3:かけ", - "TW4:の京都", - "TW4:ところ", - "UW3:京", - "UW4:携", - "BW1:かも", - "BW1:では", - "UW4:ち", - "UW3:分", - "UW4:べ", - "BW3:ころ", - "UW3:ゃ", - "UW2:す", - "BW1:。・", - "UW3:電", - "BW3:なっ", - "UW3:す", - "BW1:最近", - "UW4:め", - "UW3:ぐ", - "UW2:お", - "BW3:そし", - "BW1:かし", - "BW1:同じ", - "BW3:メー", - "UW5:て", - "UW6:り", - "TW4:くらい", - "UW3:今", - "UW5:そ", - "UW4:や", - "UW5:」", - "UW4:帯", - "UW6:ー", - "BW2:とし", - "TW1:ような", - "BW2:てお", - "UW4:笑", - "UW1:は", - "BW3:かか", - "TW4:かなり", - "UW4:)", - "BW1:んな", - "UW1:ち", - "TW2:気に入", - "TW1:・・・", - "UW6:と", - "UW5:ち", - "BW3:ため", - "UW4:ず", - "UW3:0", - "BW1:んで", - "UW3:中", - "UW3:々", - "BW2:のよ", - "BW2:帯電", - "BW2:でも", - "BW1:には", - "BW3:ちょ", - "UW4:せ", - "UW3:度", - "BW1:でも", - "BW1:が、", - "UW2:な", - "UW5:思", - "UW6:0", - "UW6:寺", - "BW3:とて", - "BW3:ある", - "BW2:もし", - "UW4:ッ", - "UW1:て", - "BW2:にも", - "BW1:れた", - "UW4:ひ", - "TW3:ること", - 
"BW1:てい", - "UW4:』", - "BW1:だけ", - "UW3:お", - "BW1:少し", - "TW3:、ある", - "UW5:!", - "UW6:ル", - "UW2:多", - "UW6:ご", - "UW6:や", - "UW3:後", - "BW2:てみ", - "BW1:とき", - "UW4:ゃ", - "BW1:たい", - "UW3:き", - "TW4:ことが", - "UW3:真", - "BW2:など", - "UW6:ぱ", - "BW1:った", - "BW1:ても", - "UW5:日", - "BW1:たと", - "UW4:]", - "UW3:ッ", - "TW4:メール", - "BW2:はな", - "BW3:・・", - "BW3:なる", - "BW1:とい", - "UW2:全", - "BW1:にも", - "BW1:たら", - "BW2:くな", - "UW3:「", - "BW1:その", - "UW3:観", - "BW1:うに", - "UW3:イ", - "BW3:もん", - "UW5:ず", - "BW3:しま", - "BW1:より", - "UW5:分", + BW1Keys { + "。・", + "いう", + "うな", + "うに", + "かし", + "かも", + "から", + "が、", + "こと", + "しい", + "その", + "たい", + "たと", + "たら", + "だけ", + "った", + "てい", + "ても", + "では", + "でも", + "とい", + "とか", + "とが", + "とき", + "ない", + "には", + "にも", + "もう", + "より", + "るの", + "れた", + "んで", + "んな", + "同じ", + "少し", + "最近", } - modelValues:intvector { - 3634, - 4347, - -2581, - -4812, - 2538, - -4206, - 2701, - -1455, - -2403, - 2977, - -2678, - 4165, - -818, - -1011, - 2996, - -904, - -1808, - 2064, - -2164, - -2180, - -2760, - -2310, - 2360, - -388, - 1842, - 1706, - -706, - -2408, - -1628, - -1005, - -434, - -1442, - 543, - -1091, - 1355, - -1056, - 258, - 277, - -2999, - 1331, - -1305, - 1242, - -337, - -1073, - 1392, - -576, - -886, - -2405, - -386, - 1031, - 1470, - -2105, - -594, - -1461, - -1160, - 964, - -48, - -2158, - 110, - -1750, - 228, - -603, - 801, - 972, - 102, - -395, - -508, - 1640, - 191, - 2468, - -1580, - -1529, - 1148, - 515, - 539, - -774, - 111, - -1275, - 113, - -432, - 1736, - 588, - -413, - 1360, - 49, - 2322, - 48, - 255, - -521, - -366, + BW1Values:intvector { + 567, 529, - -493, - -557, - 1719, - -476, - 104, - 1311, - 1314, - 1307, - 520, - 666, - -412, + 280, + -13, + 468, + -533, 627, - 1098, - -209, - 163, - 955, - 1798, + 192, + -695, + 423, + -26, + 53, + -52, + 13, + 122, + 13, + -67, + 39, + -91, + 95, + -13, + 784, + -679, + 91, + 485, + 109, + 26, + 767, + 26, + -407, + 95, + -206, + 102, + 438, + 134, + 365, + } + BW2Keys { + 
"、と", + "くな", + "てい", + "てお", + "てみ", + "であ", + "です", + "でも", + "とい", + "とこ", + "とし", + "ない", + "など", + "には", + "にも", + "ので", + "のよ", + "はな", + "もし", + "帯電", + } + BW2Values:intvector { + -517, -39, -753, - -1262, - 411, - 1247, - 914, - 522, - 348, - 2156, - 510, - -1522, - -243, - 1337, - -378, - -1957, - 834, - -450, - 235, - 87, - 236, - -1615, - 485, - -1445, - 488, - 404, - -333, - 66, - 787, - 647, - -1495, - -756, - -1700, - 279, - -81, - 260, - 162, - -51, - -851, - 462, - 493, - 161, - 396, - -238, - -1044, - -1685, - 433, - 276, - -695, - -148, - 416, - 1235, - -748, - 257, - 784, - 748, - 767, - -262, - -490, - -26, - 152, - 186, - 544, - 1035, - -711, - 549, - -517, - 799, - -1024, - 542, - -118, - 432, - -56, - -694, - 668, - 249, - 175, - 329, - 305, - 287, - 423, - 438, - 934, - 628, - 292, - -536, - -995, - -814, - 237, - 263, - 571, - -138, - 402, - 701, - 387, - 474, - -183, - 661, - 280, - 767, - -53, - -793, - -191, - -401, - 526, - -679, - 279, - -407, - 493, - -82, - 365, - -334, - 36, - 284, - -813, - 424, - -425, - 423, - -796, - 452, - -635, - -389, - 404, - -141, - 415, - -277, - -400, - 502, - 766, - -182, - -426, - 720, - 1005, - 422, - -396, - 123, - -533, - -91, - -355, - 333, - -596, - -333, - 434, - 31, - 567, - -356, - -309, - 251, - 365, - -399, - 411, - -235, - -526, - 468, - 438, - 136, - 103, - 74, - 585, - 324, - -115, - -219, - -217, - -289, - -88, - 143, - 361, -558, - -614, - -56, - 456, - 441, - -566, - 102, - 112, - -466, - 325, - -27, - 128, - 294, - -321, - -224, - -206, - 252, - 209, - -207, - -224, - -207, - 109, - 316, - -234, - 222, - 95, - 192, - -40, - -98, - 82, - 68, - 230, - -28, - -67, - -149, - 14, - -120, - 95, - 122, - -81, - -67, - -296, - 122, - -81, - 134, - -200, - -67, - 14, - 67, - 119, - 40, - 118, -92, - 91, - -105, - 53, - 40, - -51, - 39, + -1495, + -1445, + -207, + 515, + -1044, + 143, + -1522, -64, - 105, - 13, - 39, - 26, - -52, - -52, - -52, - 26, + -426, + -120, + -756, + -207, -26, - -39, - 
13, - -13, - 39, - 26, - 13, - -39, - -26, - -26, - -26, - -13, - -13, - 39, - 26, - -13, - 26, - 13, + -67, + -224, } -} \ No newline at end of file + BW3Keys { + "ある", + "いい", + "いう", + "かか", + "かけ", + "こと", + "この", + "ころ", + "して", + "しま", + "そし", + "その", + "ため", + "ちょ", + "でも", + "とい", + "とき", + "とこ", + "とて", + "なっ", + "なる", + "もう", + "もの", + "もん", + "よう", + "メー", + "・・", + } + BW3Values:intvector { + -28, + 647, + 666, + 456, + 720, + 235, + 404, + -333, + 249, + -13, + -526, + 502, + 294, + 316, + 767, + -277, + 799, + 1337, + 230, + -309, + 13, + 766, + 2322, + 39, + -1262, + 136, + -39, + } + TW1Keys { + "という", + "ような", + "・・・", + } + TW1Values:intvector { + 292, + 361, + 325, + } + TW2Keys { + "ではな", + "気に入", + } + TW2Values:intvector { + -814, + -466, + } + TW3Keys { + "、ある", + "ている", + "という", + "ること", + } + TW3Values:intvector { + -200, + -389, + 387, + -81, + } + TW4Keys { + "かなり", + "くらい", + "ことが", + "ところ", + "の京都", + "メール", + } + TW4Values:intvector { + 441, + 585, + -51, + 422, + 1005, + 26, + } + UW1Keys { + "い", + "が", + "す", + "そ", + "ち", + "て", + "で", + "と", + "な", + "に", + "は", + } + UW1Values:intvector { + -51, + -53, + 152, + 260, + 112, + 14, + -56, + 36, + -148, + -118, + -56, + } + UW2Keys { + "、", + "。", + "う", + "お", + "か", + "が", + "く", + "さ", + "し", + "す", + "そ", + "っ", + "て", + "で", + "と", + "な", + "に", + "の", + "は", + "ま", + "も", + "よ", + "る", + "を", + "ん", + "一", + "全", + "多", + "少", + "最", + "毎", + "結", + } + UW2Values:intvector { + -1011, + -904, + -191, + -235, + 110, + -521, + -183, + 416, + 113, + 31, + -182, + 163, + -432, + -493, + -508, + -40, + -576, + -434, + -594, + 588, + -603, + 257, + -395, + -1305, + 255, + 834, + 39, + 67, + 571, + 279, + 628, + 661, + } + UW3Keys { + "…", + "☆", + "♪", + "、", + "。", + "々", + "「", + "」", + "い", + "う", + "お", + "か", + "が", + "き", + "く", + "ぐ", + "す", + "ず", + "た", + "だ", + "っ", + "つ", + "て", + "で", + "と", + "ど", + "な", + "に", + "の", + "は", + "ば", + "へ", + "ま", + "も", + "ゃ", + "や", + 
"ら", + "り", + "る", + "ろ", + "を", + "ん", + "イ", + "ッ", + "・", + "ー", + "中", + "京", + "人", + "今", + "分", + "度", + "後", + "日", + "真", + "笑", + "観", + "間", + "電", + "!", + ")", + ",", + ".", + "0", + "?", + "]", + } + UW3Values:intvector { + 1798, + 1235, + 1035, + 3634, + 4347, + 209, + -26, + 526, + 964, + 1031, + -81, + 1148, + 2996, + 40, + 1470, + 411, + 251, + 668, + 1331, + 424, + -1005, + 365, + 1355, + 1842, + 2064, + 1098, + 1392, + 2701, + 2538, + 2977, + 1307, + 701, + -238, + 2360, + 434, + 1360, + 1242, + 972, + 1706, + 452, + 4165, + 284, + -13, + -52, + 914, + -243, + 252, + -396, + 415, + 324, + 333, + 222, + 118, + 287, + 39, + 934, + -26, + 493, + -356, + 1736, + 2156, + 2468, + 1311, + -224, + 1719, + 1640, + } + UW4Keys { + "…", + "、", + "。", + "「", + "」", + "』", + "い", + "う", + "え", + "お", + "か", + "が", + "き", + "く", + "け", + "こ", + "さ", + "し", + "じ", + "す", + "ず", + "せ", + "そ", + "た", + "だ", + "ち", + "っ", + "つ", + "て", + "で", + "と", + "ど", + "な", + "に", + "の", + "は", + "ば", + "ひ", + "べ", + "ま", + "み", + "め", + "も", + "ゃ", + "や", + "よ", + "ら", + "り", + "る", + "れ", + "ろ", + "わ", + "を", + "ん", + "ッ", + "ル", + "ン", + "・", + "ー", + "一", + "京", + "人", + "今", + "何", + "使", + "大", + "帯", + "思", + "携", + "時", + "最", + "気", + "私", + "笑", + "自", + "行", + "見", + "食", + "!", + "(", + ")", + ",", + ".", + "1", + "2", + "?", + "]", + } + UW4Values:intvector { + -995, + -4812, + -4206, + 1314, + -1957, + -296, + -886, + -1160, + -711, + 539, + -1628, + -2180, + -1275, + -1529, + -1615, + 258, + -748, + -1073, + -793, + -412, + -321, + -234, + 191, + -1056, + -1461, + -355, + -2310, + -400, + -2403, + -1808, + -1442, + -334, + -1091, + -2678, + -2581, + -2164, + -635, + 122, + -596, + -774, + -536, + -399, + -1580, + -105, + -219, + -450, + -2999, + -2405, + -2760, + -2105, + -1024, + -425, + -2408, + -2158, + -149, + -401, + -694, + -490, + -1750, + 488, + 510, + 522, + 549, + 474, + 544, + 396, + -289, + 955, + 123, + 305, + 423, + 329, + 1247, + -614, + 438, + 
276, + 433, + 175, + -1685, + 748, + -566, + -1700, + -813, + 493, + 402, + -796, + -52, + } + UW5Keys { + "、", + "。", + "」", + "あ", + "い", + "う", + "え", + "か", + "が", + "き", + "く", + "し", + "じ", + "す", + "ず", + "そ", + "た", + "ち", + "っ", + "つ", + "て", + "で", + "と", + "な", + "に", + "の", + "は", + "べ", + "め", + "も", + "る", + "れ", + "ろ", + "わ", + "を", + "ん", + "・", + "ー", + "分", + "帯", + "思", + "日", + "!", + } + UW5Values:intvector { + -818, + -1455, + -217, + -138, + 162, + 161, + 411, + 186, + -366, + 801, + 348, + -209, + 432, + -557, + 26, + -115, + -26, + 128, + 543, + 542, + 103, + -706, + -476, + -388, + -413, + -262, + -378, + 787, + 263, + -141, + -39, + 279, + -81, + 462, + -333, + 520, + -851, + 87, + 13, + 404, + -98, + 26, + -67, + } + UW6Keys { + "。", + "い", + "う", + "が", + "ご", + "さ", + "し", + "た", + "て", + "で", + "と", + "な", + "に", + "の", + "は", + "ぱ", + "や", + "り", + "を", + "ル", + "ー", + "寺", + "0", + } + UW6Values:intvector { + -337, + -48, + -386, + 111, + 119, + 237, + 236, + -82, + 49, + 102, + -27, + 66, + 277, + 48, + 104, + 105, + 40, + 74, + 228, + 14, + -88, + 68, + 82, + } +} diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/MlBreakEngine.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/MlBreakEngine.java index 196579d0a58..e09c1763d5f 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/MlBreakEngine.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/MlBreakEngine.java @@ -8,26 +8,36 @@ import static com.ibm.icu.impl.CharacterIteration.current32; import static com.ibm.icu.impl.CharacterIteration.next32; import static com.ibm.icu.impl.CharacterIteration.previous32; -import com.ibm.icu.impl.Assert; import com.ibm.icu.impl.ICUData; -import com.ibm.icu.lang.UCharacter; import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.util.UResourceBundle; import com.ibm.icu.util.UResourceBundleIterator; -import java.lang.System; import java.text.CharacterIterator; +import java.util.Arrays; import 
java.util.ArrayList; +import java.util.List; import java.util.HashMap; -public class MlBreakEngine { +enum ModelIndex { + kUWStart(0), kBWStart(6), kTWStart(9); + private final int value; - private static final int INVALID = '|'; - private static final String INVALID_STRING = "|"; + private ModelIndex(int value) { + this.value = value; + } + + public int getValue() { + return value; + } +} + +public class MlBreakEngine { + // {UW1, UW2, ... UW6, BW1, ... BW3, TW1, TW2, ... TW4} 6+3+4= 13 private static final int MAX_FEATURE = 13; private UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet; private UnicodeSet fClosePunctuationSet; - private HashMap fModel; + private List> fModel; private int fNegativeSum; /** @@ -41,7 +51,10 @@ public class MlBreakEngine { UnicodeSet closePunctuationSet) { fDigitOrOpenPunctuationOrAlphabetSet = digitOrOpenPunctuationOrAlphabetSet; fClosePunctuationSet = closePunctuationSet; - fModel = new HashMap(); + fModel = new ArrayList>(MAX_FEATURE); + for (int i = 0; i < MAX_FEATURE; i++) { + fModel.add(new HashMap()); + } fNegativeSum = 0; loadMLModel(); } @@ -49,42 +62,47 @@ public class MlBreakEngine { /** * Divide up a range of characters handled by this break engine. * - * @param inText A input text. - * @param startPos The start index of the input text. - * @param endPos The end index of the input text. - * @param inString A input string normalized from inText from startPos to endPos - * @param numCodePts The number of code points of inString - * @param charPositions A map that transforms inString's code point index to code unit index. - * @param foundBreaks A list to store the breakpoint. + * @param inText An input text. + * @param startPos The start index of the input text. + * @param endPos The end index of the input text. 
+ * @param inString A input string normalized from inText from startPos to endPos + * @param codePointLength The number of code points of inString + * @param charPositions A map that transforms inString's code point index to code unit index. + * @param foundBreaks A list to store the breakpoint. * @return The number of breakpoints */ public int divideUpRange(CharacterIterator inText, int startPos, int endPos, - CharacterIterator inString, int numCodePts, int[] charPositions, + CharacterIterator inString, int codePointLength, int[] charPositions, DictionaryBreakEngine.DequeI foundBreaks) { if (startPos >= endPos) { return 0; } - ArrayList boundary = new ArrayList(numCodePts); - // The ML model groups six char to evaluate if the 4th char is a breakpoint. - // Like a sliding window, the elementList removes the first char and appends the new char - // from inString in each iteration so that its size always remains at six. - int elementList[] = new int[6]; - initElementList(inString, elementList, numCodePts); + ArrayList boundary = new ArrayList(codePointLength); + String inputStr = transform(inString); + // The ML algorithm groups six char and evaluates whether the 4th char is a breakpoint. + // In each iteration, it evaluates the 4th char and then moves forward one char like + // sliding window. Initially, the first six values in the indexList are + // [-1, -1, 0, 1, 2, 3]. After moving forward, finally the last six values in the indexList + // are [length-4, length-3, length-2, length-1, -1, -1]. The "+4" here means four extra + // "-1". + int indexSize = codePointLength + 4; + int indexList[] = new int[indexSize]; + int numCodeUnits = initIndexList(inString, indexList, codePointLength); // Add a break for the start. 
boundary.add(0, 0); - for (int i = 1; i < numCodePts; i++) { - evaluateBreakpoint(elementList, i, boundary); - if (i + 1 > numCodePts) { - break; + + for (int idx = 0; idx + 1 < codePointLength; idx++) { + evaluateBreakpoint(inputStr, indexList, idx, numCodeUnits, boundary); + if (idx + 4 < codePointLength) { + indexList[idx + 6] = numCodeUnits; + numCodeUnits += Character.charCount(next32(inString)); } - shiftLeftOne(elementList); - elementList[5] = (i + 3) < numCodePts ? next32(inString) : INVALID; } // Add a break for the end if there is not one there already. - if (boundary.get(boundary.size() - 1) != numCodePts) { - boundary.add(numCodePts); + if (boundary.get(boundary.size() - 1) != codePointLength) { + boundary.add(codePointLength); } int correctedNumBreaks = 0; @@ -127,137 +145,94 @@ public class MlBreakEngine { return correctedNumBreaks; } - private void shiftLeftOne(int[] elementList) { - int length = elementList.length; - for (int i = 1; i < length; i++) { - elementList[i - 1] = elementList[i]; + /** + * Transform a CharacterIterator into a String. + */ + private String transform(CharacterIterator inString) { + StringBuilder sb = new StringBuilder(); + inString.setIndex(0); + for (char c = inString.first(); c != CharacterIterator.DONE; c = inString.next()) { + sb.append(c); } + return sb.toString(); } /** - * Evaluate whether the index is a potential breakpoint. + * Evaluate whether the breakpointIdx is a potential breakpoint. * - * @param elementList A list including six elements for the breakpoint evaluation. - * @param index The breakpoint index to be evaluated. - * @param boundary An list including the index of the breakpoint. + * @param inputStr An input string to be segmented. + * @param indexList A code unit index list of the inputStr. + * @param startIdx The start index of the indexList. + * @param numCodeUnits The current code unit boundary of the indexList. + * @param boundary A list including the index of the breakpoint. 
*/ - private void evaluateBreakpoint(int[] elementList, int index, ArrayList boundary) { - String[] featureList = new String[MAX_FEATURE]; - final int w1 = elementList[0]; - final int w2 = elementList[1]; - final int w3 = elementList[2]; - final int w4 = elementList[3]; - final int w5 = elementList[4]; - final int w6 = elementList[5]; - - StringBuilder sb = new StringBuilder(); - int idx = 0; - if (w1 != INVALID) { - featureList[idx++] = sb.append("UW1:").appendCodePoint(w1).toString(); - } - if (w2 != INVALID) { - sb.setLength(0); - featureList[idx++] = sb.append("UW2:").appendCodePoint(w2).toString(); - } - if (w3 != INVALID) { - sb.setLength(0); - featureList[idx++] = sb.append("UW3:").appendCodePoint(w3).toString(); - } - if (w4 != INVALID) { - sb.setLength(0); - featureList[idx++] = sb.append("UW4:").appendCodePoint(w4).toString(); - } - if (w5 != INVALID) { - sb.setLength(0); - featureList[idx++] = sb.append("UW5:").appendCodePoint(w5).toString(); - } - if (w6 != INVALID) { - sb.setLength(0); - featureList[idx++] = sb.append("UW6:").appendCodePoint(w6).toString(); - } - if (w2 != INVALID && w3 != INVALID) { - sb.setLength(0); - featureList[idx++] = sb.append("BW1:").appendCodePoint(w2).appendCodePoint( - w3).toString(); - } - if (w3 != INVALID && w4 != INVALID) { - sb.setLength(0); - featureList[idx++] = sb.append("BW2:").appendCodePoint(w3).appendCodePoint( - w4).toString(); - } - if (w4 != INVALID && w5 != INVALID) { - sb.setLength(0); - featureList[idx++] = sb.append("BW3:").appendCodePoint(w4).appendCodePoint( - w5).toString(); - } - if (w1 != INVALID && w2 != INVALID && w3 != INVALID) { - sb.setLength(0); - featureList[idx++] = sb.append("TW1:").appendCodePoint(w1).appendCodePoint( - w2).appendCodePoint(w3).toString(); - } - if (w2 != INVALID && w3 != INVALID && w4 != INVALID) { - sb.setLength(0); - featureList[idx++] = sb.append("TW2:").appendCodePoint(w2).appendCodePoint( - w3).appendCodePoint(w4).toString(); - } - if (w3 != INVALID && w4 != INVALID && 
w5 != INVALID) { - sb.setLength(0); - featureList[idx++] = sb.append("TW3:").appendCodePoint(w3).appendCodePoint( - w4).appendCodePoint(w5).toString(); - } - if (w4 != INVALID && w5 != INVALID && w6 != INVALID) { - sb.setLength(0); - featureList[idx++] = sb.append("TW4:").appendCodePoint(w4).appendCodePoint( - w5).appendCodePoint(w6).toString(); - } - + private void evaluateBreakpoint(String inputStr, int[] indexList, int startIdx, + int numCodeUnits, ArrayList boundary) { + int start = 0, end = 0; int score = fNegativeSum; - for (int j = 0; j < idx; j++) { - if (fModel.containsKey(featureList[j])) { - score += (2 * fModel.get(featureList[j])); + + for (int i = 0; i < 6; i++) { + // UW1 ~ UW6 + start = startIdx + i; + if (indexList[start] != -1) { + end = (indexList[start + 1] != -1) ? indexList[start + 1] : numCodeUnits; + score += fModel.get(ModelIndex.kUWStart.getValue() + i).getOrDefault( + inputStr.substring(indexList[start], end), 0); + } + } + for (int i = 0; i < 3; i++) { + // BW1 ~ BW3 + start = startIdx + i + 1; + if (indexList[start] != -1 && indexList[start + 1] != -1) { + end = (indexList[start + 2] != -1) ? indexList[start + 2] : numCodeUnits; + score += fModel.get(ModelIndex.kBWStart.getValue() + i).getOrDefault( + inputStr.substring(indexList[start], end), 0); + } + } + for (int i = 0; i < 4; i++) { + // TW1 ~ TW4 + start = startIdx + i; + if (indexList[start] != -1 + && indexList[start + 1] != -1 + && indexList[start + 2] != -1) { + end = (indexList[start + 3] != -1) ? indexList[start + 3] : numCodeUnits; + score += fModel.get(ModelIndex.kTWStart.getValue() + i).getOrDefault( + inputStr.substring(indexList[start], end), 0); } } if (score > 0) { - boundary.add(index); + boundary.add(startIdx + 1); } } /** - * Initialize the element list from the input string. + * Initialize the index list from the input string. * - * @param inString A input string to be segmented. - * @param elementList A list to store the first six characters. 
- * @param numCodePts The number of code points of input string + * @param inString An input string to be segmented. + * @param indexList A code unit index list of the inString. + * @param codePointLength The number of code points of the input string * @return The number of the code units of the first six characters in inString. */ - private int initElementList(CharacterIterator inString, int[] elementList, int numCodePts) { + private int initIndexList(CharacterIterator inString, int[] indexList, int codePointLength) { int index = 0; inString.setIndex(index); - int w1, w2, w3, w4, w5, w6; - w1 = w2 = w3 = w4 = w5 = w6 = INVALID; - if (numCodePts > 0) { - w3 = current32(inString); - index += Character.charCount(w3); - if (numCodePts > 1) { - w4 = next32(inString); - index += Character.charCount(w3); - if (numCodePts > 2) { - w5 = next32(inString); - index += Character.charCount(w5); - if (numCodePts > 3) { - w6 = next32(inString); - index += Character.charCount(w6); + Arrays.fill(indexList, -1); + if (codePointLength > 0) { + indexList[2] = 0; + index += Character.charCount(current32(inString)); + if (codePointLength > 1) { + indexList[3] = index; + index += Character.charCount(next32(inString)); + if (codePointLength > 2) { + indexList[4] = index; + index += Character.charCount(next32(inString)); + if (codePointLength > 3) { + indexList[5] = index; + index += Character.charCount(next32(inString)); } } } } - elementList[0] = w1; - elementList[1] = w2; - elementList[2] = w3; - elementList[3] = w4; - elementList[4] = w5; - elementList[5] = w6; - return index; } @@ -268,13 +243,41 @@ public class MlBreakEngine { int index = 0; UResourceBundle rb = UResourceBundle.getBundleInstance(ICUData.ICU_BRKITR_BASE_NAME, "jaml"); - UResourceBundle keyBundle = rb.get("modelKeys"); - UResourceBundle valueBundle = rb.get("modelValues"); + initKeyValue(rb, "UW1Keys", "UW1Values", fModel.get(index++)); + initKeyValue(rb, "UW2Keys", "UW2Values", fModel.get(index++)); + initKeyValue(rb, 
"UW3Keys", "UW3Values", fModel.get(index++)); + initKeyValue(rb, "UW4Keys", "UW4Values", fModel.get(index++)); + initKeyValue(rb, "UW5Keys", "UW5Values", fModel.get(index++)); + initKeyValue(rb, "UW6Keys", "UW6Values", fModel.get(index++)); + initKeyValue(rb, "BW1Keys", "BW1Values", fModel.get(index++)); + initKeyValue(rb, "BW2Keys", "BW2Values", fModel.get(index++)); + initKeyValue(rb, "BW3Keys", "BW3Values", fModel.get(index++)); + initKeyValue(rb, "TW1Keys", "TW1Values", fModel.get(index++)); + initKeyValue(rb, "TW2Keys", "TW2Values", fModel.get(index++)); + initKeyValue(rb, "TW3Keys", "TW3Values", fModel.get(index++)); + initKeyValue(rb, "TW4Keys", "TW4Values", fModel.get(index++)); + fNegativeSum /= 2; + } + + /** + * In the machine learning's model file, specify the name of the key and value to load the + * corresponding feature and its score. + * + * @param rb A ResourceBundle corresponding to the model file. + * @param keyName The key name in the model file. + * @param valueName The value name in the model file. + * @param map A HashMap to store the pairs of the feature and its score. + */ + private void initKeyValue(UResourceBundle rb, String keyName, String valueName, + HashMap map) { + int idx = 0; + UResourceBundle keyBundle = rb.get(keyName); + UResourceBundle valueBundle = rb.get(valueName); int[] value = valueBundle.getIntVector(); UResourceBundleIterator iterator = keyBundle.getIterator(); while (iterator.hasNext()) { - fNegativeSum -= value[index]; - fModel.put(iterator.nextString(), value[index++]); + fNegativeSum -= value[idx]; + map.put(iterator.nextString(), value[idx++]); } } }