mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-07 22:44:49 +00:00
ICU-22100 Remove unicode blocks from Japanese ML phrase breaking
See #2278
This commit is contained in:
parent
0c6d7fc98d
commit
80fb309c8a
6 changed files with 780 additions and 1337 deletions
|
@ -18,28 +18,6 @@
|
|||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
Element::Element() : length(0) {}
|
||||
|
||||
void Element::setCharAndUblock(UChar32 ch, const UnicodeString &idx) {
|
||||
character = ch;
|
||||
U_ASSERT(idx.length() <= 3);
|
||||
length = idx.length();
|
||||
idx.extract(0, length, ublock);
|
||||
ublock[length] = '\0';
|
||||
}
|
||||
|
||||
UChar32 Element::getCharacter() const {
|
||||
return character;
|
||||
}
|
||||
|
||||
char16_t* Element::getUblock() const {
|
||||
return (char16_t*)ublock;
|
||||
}
|
||||
|
||||
uint16_t Element::getLength() const {
|
||||
return length;
|
||||
}
|
||||
|
||||
MlBreakEngine::MlBreakEngine(const UnicodeSet &digitOrOpenPunctuationOrAlphabetSet,
|
||||
const UnicodeSet &closePunctuationSet, UErrorCode &status)
|
||||
: fDigitOrOpenPunctuationOrAlphabetSet(digitOrOpenPunctuationOrAlphabetSet),
|
||||
|
@ -56,12 +34,8 @@ MlBreakEngine::~MlBreakEngine() {}
|
|||
|
||||
namespace {
|
||||
const char16_t INVALID = u'|';
|
||||
const int32_t MAX_FEATURE = 26;
|
||||
const int32_t MAX_FEATURE_LENGTH = 14;
|
||||
|
||||
bool isValid(const Element& element) {
|
||||
return element.getLength() != 1 || element.getUblock()[0] != INVALID;
|
||||
}
|
||||
const int32_t MAX_FEATURE = 13;
|
||||
const int32_t MAX_FEATURE_LENGTH = 11;
|
||||
|
||||
void concatChar(const char16_t *str, const UChar32 *arr, int32_t length, char16_t *feature, UErrorCode &status) {
|
||||
if (U_FAILURE(status)) {
|
||||
|
@ -74,11 +48,6 @@ namespace {
|
|||
U_ASSERT(result.length() < MAX_FEATURE_LENGTH);
|
||||
result.extract(feature, MAX_FEATURE_LENGTH, status); // NUL-terminates
|
||||
}
|
||||
|
||||
void writeString(const UnicodeString &str, char16_t *feature, UErrorCode &status) {
|
||||
U_ASSERT(str.length() < MAX_FEATURE_LENGTH);
|
||||
str.extract(feature, MAX_FEATURE_LENGTH, status); // NUL-terminates
|
||||
}
|
||||
}
|
||||
|
||||
int32_t MlBreakEngine::divideUpRange(UText *inText, int32_t rangeStart, int32_t rangeEnd,
|
||||
|
@ -98,12 +67,11 @@ int32_t MlBreakEngine::divideUpRange(UText *inText, int32_t rangeStart, int32_t
|
|||
return 0;
|
||||
}
|
||||
int32_t numBreaks = 0;
|
||||
UChar32 ch;
|
||||
UnicodeString index;
|
||||
// The ML model groups six characters to evaluate whether the 4th character is a breakpoint.
|
||||
// Like a sliding window, the elementList removes the first char and appends the new char from
|
||||
// inString in each iteration so that its size always remains at six.
|
||||
Element elementList[6];
|
||||
UChar32 elementList[6];
|
||||
|
||||
int32_t codeUts = initElementList(inString, elementList, status);
|
||||
int32_t length = inString.countChar32();
|
||||
|
@ -117,12 +85,10 @@ int32_t MlBreakEngine::divideUpRange(UText *inText, int32_t rangeStart, int32_t
|
|||
evaluateBreakpoint(elementList, i, numBreaks, boundary, status);
|
||||
if (i + 1 >= inString.countChar32()) break;
|
||||
// Remove the first element and append a new element
|
||||
uprv_memmove(elementList, elementList + 1, 5 * sizeof(Element));
|
||||
ch = inString.countChar32(0, codeUts) < length ? inString.char32At(codeUts) : INVALID;
|
||||
index = (ch != INVALID) ? getUnicodeBlock(ch, status) : UnicodeString(INVALID);
|
||||
elementList[5].setCharAndUblock(ch, index);
|
||||
if (ch != INVALID) {
|
||||
codeUts += U16_LENGTH(ch);
|
||||
uprv_memmove(elementList, elementList + 1, 5 * sizeof(UChar32));
|
||||
elementList[5] = inString.countChar32(0, codeUts) < length ? inString.char32At(codeUts) : INVALID;
|
||||
if (elementList[5] != INVALID) {
|
||||
codeUts += U16_LENGTH(elementList[5]);
|
||||
}
|
||||
}
|
||||
if (U_FAILURE(status)) return 0;
|
||||
|
@ -176,7 +142,7 @@ int32_t MlBreakEngine::divideUpRange(UText *inText, int32_t rangeStart, int32_t
|
|||
return correctedNumBreaks;
|
||||
}
|
||||
|
||||
void MlBreakEngine::evaluateBreakpoint(Element* elementList, int32_t index, int32_t &numBreaks,
|
||||
void MlBreakEngine::evaluateBreakpoint(UChar32* elementList, int32_t index, int32_t &numBreaks,
|
||||
UVector32 &boundary, UErrorCode &status) const {
|
||||
char16_t featureList[MAX_FEATURE][MAX_FEATURE_LENGTH];
|
||||
if (U_FAILURE(status)) {
|
||||
|
@ -186,12 +152,12 @@ void MlBreakEngine::evaluateBreakpoint(Element* elementList, int32_t index, int3
|
|||
UChar32 arr[4] = {-1, -1, -1, -1};
|
||||
int32_t length = 0, listLength = 0;
|
||||
|
||||
const UChar32 w1 = elementList[0].getCharacter();
|
||||
const UChar32 w2 = elementList[1].getCharacter();
|
||||
const UChar32 w3 = elementList[2].getCharacter();
|
||||
const UChar32 w4 = elementList[3].getCharacter();
|
||||
const UChar32 w5 = elementList[4].getCharacter();
|
||||
const UChar32 w6 = elementList[5].getCharacter();
|
||||
const UChar32 w1 = elementList[0];
|
||||
const UChar32 w2 = elementList[1];
|
||||
const UChar32 w3 = elementList[2];
|
||||
const UChar32 w4 = elementList[3];
|
||||
const UChar32 w5 = elementList[4];
|
||||
const UChar32 w6 = elementList[5];
|
||||
|
||||
length = 1;
|
||||
if (w1 != INVALID) {
|
||||
|
@ -259,82 +225,6 @@ void MlBreakEngine::evaluateBreakpoint(Element* elementList, int32_t index, int3
|
|||
arr[2] = w6;
|
||||
concatChar(u"TW4:", arr, length, featureList[listLength++], status);
|
||||
}
|
||||
if (isValid(elementList[0])) {
|
||||
writeString(UnicodeString(u"UB1:").append(elementList[0].getUblock(), 0,
|
||||
elementList[0].getLength()),
|
||||
featureList[listLength++], status);
|
||||
}
|
||||
if (isValid(elementList[1])) {
|
||||
writeString(UnicodeString(u"UB2:").append(elementList[1].getUblock(), 0,
|
||||
elementList[1].getLength()),
|
||||
featureList[listLength++], status);
|
||||
}
|
||||
if (isValid(elementList[2])) {
|
||||
writeString(UnicodeString(u"UB3:").append(elementList[2].getUblock(), 0,
|
||||
elementList[2].getLength()),
|
||||
featureList[listLength++], status);
|
||||
}
|
||||
if (isValid(elementList[3])) {
|
||||
writeString(UnicodeString(u"UB4:").append(elementList[3].getUblock(), 0,
|
||||
elementList[3].getLength()),
|
||||
featureList[listLength++], status);
|
||||
}
|
||||
if (isValid(elementList[4])) {
|
||||
writeString(UnicodeString(u"UB5:").append(elementList[4].getUblock(), 0,
|
||||
elementList[4].getLength()),
|
||||
featureList[listLength++], status);
|
||||
}
|
||||
if (isValid(elementList[5])) {
|
||||
writeString(UnicodeString(u"UB6:").append(elementList[5].getUblock(), 0,
|
||||
elementList[5].getLength()),
|
||||
featureList[listLength++], status);
|
||||
}
|
||||
if (isValid(elementList[1]) && isValid(elementList[2])) {
|
||||
writeString(UnicodeString(u"BB1:")
|
||||
.append(elementList[1].getUblock(), 0, elementList[1].getLength())
|
||||
.append(elementList[2].getUblock(), 0, elementList[2].getLength()),
|
||||
featureList[listLength++], status);
|
||||
}
|
||||
if (isValid(elementList[2]) && isValid(elementList[3])) {
|
||||
writeString(UnicodeString(u"BB2:")
|
||||
.append(elementList[2].getUblock(), 0, elementList[2].getLength())
|
||||
.append(elementList[3].getUblock(), 0, elementList[3].getLength()),
|
||||
featureList[listLength++], status);
|
||||
}
|
||||
if (isValid(elementList[3]) && isValid(elementList[4])) {
|
||||
writeString(UnicodeString(u"BB3:")
|
||||
.append(elementList[3].getUblock(), 0, elementList[3].getLength())
|
||||
.append(elementList[4].getUblock(), 0, elementList[4].getLength()),
|
||||
featureList[listLength++], status);
|
||||
}
|
||||
if (isValid(elementList[0]) && isValid(elementList[1]) && isValid(elementList[2])) {
|
||||
writeString(UnicodeString(u"TB1:")
|
||||
.append(elementList[0].getUblock(), 0, elementList[0].getLength())
|
||||
.append(elementList[1].getUblock(), 0, elementList[1].getLength())
|
||||
.append(elementList[2].getUblock(), 0, elementList[2].getLength()),
|
||||
featureList[listLength++], status);
|
||||
}
|
||||
if (isValid(elementList[1]) && isValid(elementList[2]) && isValid(elementList[3])) {
|
||||
writeString(UnicodeString(u"TB2:")
|
||||
.append(elementList[1].getUblock(), 0, elementList[1].getLength())
|
||||
.append(elementList[2].getUblock(), 0, elementList[2].getLength())
|
||||
.append(elementList[3].getUblock(), 0, elementList[3].getLength()),
|
||||
featureList[listLength++], status);
|
||||
}
|
||||
if (isValid(elementList[2]) && isValid(elementList[3]) && isValid(elementList[4])) {
|
||||
writeString(UnicodeString(u"TB3:")
|
||||
.append(elementList[2].getUblock(), 0, elementList[2].getLength())
|
||||
.append(elementList[3].getUblock(), 0, elementList[3].getLength())
|
||||
.append(elementList[4].getUblock(), 0, elementList[4].getLength()),
|
||||
featureList[listLength++], status);
|
||||
}
|
||||
if (isValid(elementList[3]) && isValid(elementList[4]) && isValid(elementList[5])) {
|
||||
writeString(UnicodeString(u"TB4:")
|
||||
.append(elementList[3].getUblock(), 0, elementList[3].getLength())
|
||||
.append(elementList[4].getUblock(), 0, elementList[4].getLength())
|
||||
.append(elementList[5].getUblock(), 0, elementList[5].getLength()),
|
||||
featureList[listLength++], status);
|
||||
}
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
|
@ -351,7 +241,7 @@ void MlBreakEngine::evaluateBreakpoint(Element* elementList, int32_t index, int3
|
|||
}
|
||||
}
|
||||
|
||||
int32_t MlBreakEngine::initElementList(const UnicodeString &inString, Element* elementList,
|
||||
int32_t MlBreakEngine::initElementList(const UnicodeString &inString, UChar32* elementList,
|
||||
UErrorCode &status) const {
|
||||
if (U_FAILURE(status)) {
|
||||
return 0;
|
||||
|
@ -363,52 +253,29 @@ int32_t MlBreakEngine::initElementList(const UnicodeString &inString, Element* e
|
|||
if (length > 0) {
|
||||
w3 = inString.char32At(0);
|
||||
index += U16_LENGTH(w3);
|
||||
if (length > 1) {
|
||||
w4 = inString.char32At(index);
|
||||
index += U16_LENGTH(w4);
|
||||
if (length > 2) {
|
||||
w5 = inString.char32At(index);
|
||||
index += U16_LENGTH(w5);
|
||||
if (length > 3) {
|
||||
w6 = inString.char32At(index);
|
||||
index += U16_LENGTH(w6);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (length > 1) {
|
||||
w4 = inString.char32At(index);
|
||||
index += U16_LENGTH(w4);
|
||||
}
|
||||
if (length > 2) {
|
||||
w5 = inString.char32At(index);
|
||||
index += U16_LENGTH(w5);
|
||||
}
|
||||
if (length > 3) {
|
||||
w6 = inString.char32At(index);
|
||||
index += U16_LENGTH(w6);
|
||||
}
|
||||
|
||||
const UnicodeString b1(INVALID);
|
||||
const UnicodeString b2(b1);
|
||||
const UnicodeString b3(getUnicodeBlock(w3, status));
|
||||
const UnicodeString b4(getUnicodeBlock(w4, status));
|
||||
const UnicodeString b5(getUnicodeBlock(w5, status));
|
||||
const UnicodeString b6(getUnicodeBlock(w6, status));
|
||||
|
||||
elementList[0].setCharAndUblock(w1, b1);
|
||||
elementList[1].setCharAndUblock(w2, b2);
|
||||
elementList[2].setCharAndUblock(w3, b3);
|
||||
elementList[3].setCharAndUblock(w4, b4);
|
||||
elementList[4].setCharAndUblock(w5, b5);
|
||||
elementList[5].setCharAndUblock(w6, b6);
|
||||
elementList[0] = w1;
|
||||
elementList[1] = w2;
|
||||
elementList[2] = w3;
|
||||
elementList[3] = w4;
|
||||
elementList[4] = w5;
|
||||
elementList[5] = w6;
|
||||
|
||||
return index;
|
||||
}
|
||||
|
||||
UnicodeString MlBreakEngine::getUnicodeBlock(UChar32 ch, UErrorCode &status) const {
|
||||
if (U_FAILURE(status)) {
|
||||
return UnicodeString(INVALID);
|
||||
}
|
||||
|
||||
UBlockCode block = ublock_getCode(ch);
|
||||
if (block == UBLOCK_NO_BLOCK || block == UBLOCK_INVALID_CODE) {
|
||||
return UnicodeString(INVALID);
|
||||
} else {
|
||||
UnicodeString empty;
|
||||
// Same as sprintf("%03d", block)
|
||||
return ICU_Utility::appendNumber(empty, (int32_t)block, 10, 3);
|
||||
}
|
||||
}
|
||||
|
||||
void MlBreakEngine::loadMLModel(UErrorCode &error) {
|
||||
// BudouX's model consists of pairs of the feature and its score.
|
||||
// As integrating it into jaml.txt, modelKeys denotes the ML feature; modelValues means the
|
||||
|
|
|
@ -13,51 +13,6 @@ U_NAMESPACE_BEGIN
|
|||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
/**
|
||||
* A class used to encapsulate a character and its unicode block index
|
||||
*/
|
||||
class Element : public UMemory {
|
||||
public:
|
||||
/**
|
||||
* Default constructor.
|
||||
*/
|
||||
Element();
|
||||
|
||||
/**
|
||||
* Set the character and its unicode block.
|
||||
*
|
||||
* @param ch A unicode character.
|
||||
* @param ublock The unicode block of the character.
|
||||
*/
|
||||
void setCharAndUblock(UChar32 ch, const UnicodeString& ublock);
|
||||
|
||||
/**
|
||||
* Get the unicode character.
|
||||
*
|
||||
* @return The unicode character.
|
||||
*/
|
||||
UChar32 getCharacter() const;
|
||||
|
||||
/**
|
||||
* Get the unicode character's unicode block.
|
||||
*
|
||||
* @return The unicode block.
|
||||
*/
|
||||
char16_t* getUblock() const;
|
||||
|
||||
/**
|
||||
* Get the length of the unicode block.
|
||||
*
|
||||
* @return The unicode block length.
|
||||
*/
|
||||
uint16_t getLength() const;
|
||||
|
||||
private:
|
||||
UChar32 character;
|
||||
char16_t ublock[4];
|
||||
uint16_t length;
|
||||
};
|
||||
|
||||
/**
|
||||
* A machine learning break engine for the phrase breaking in Japanese.
|
||||
*/
|
||||
|
@ -104,38 +59,27 @@ class MlBreakEngine : public UMemory {
|
|||
*/
|
||||
void loadMLModel(UErrorCode &error);
|
||||
|
||||
/**
|
||||
* Get the character's unicode block code defined in UBlockCode.
|
||||
*
|
||||
* @param ch A character.
|
||||
* @param error Information on any errors encountered.
|
||||
* @return The unicode block code which is 3 digits with '0' added in the beginning if the code
|
||||
* is less than 3 digits.
|
||||
*
|
||||
*/
|
||||
UnicodeString getUnicodeBlock(UChar32 ch, UErrorCode &status) const;
|
||||
|
||||
/**
|
||||
* Initialize the element list from the input string.
|
||||
*
|
||||
* @param inString An input string to be segmented.
|
||||
* @param elementList A list to store the first six characters and their unicode block codes.
|
||||
* @param elementList A list to store the first six characters.
|
||||
* @param status Information on any errors encountered.
|
||||
* @return The number of code units of the first six characters in inString.
|
||||
*/
|
||||
int32_t initElementList(const UnicodeString &inString, Element* elementList,
|
||||
int32_t initElementList(const UnicodeString &inString, UChar32* elementList,
|
||||
UErrorCode &status) const;
|
||||
|
||||
/**
|
||||
* Evaluate whether the index is a potential breakpoint.
|
||||
*
|
||||
* @param elementList A list including 6 elements for the breakpoint evaluation.
|
||||
* @param elementList A list including six elements for the breakpoint evaluation.
|
||||
* @param index The breakpoint index to be evaluated.
|
||||
* @param numBreaks The accumulated number of breakpoints.
|
||||
* @param boundary A vector including the index of the breakpoint.
|
||||
* @param status Information on any errors encountered.
|
||||
*/
|
||||
void evaluateBreakpoint(Element* elementList, int32_t index, int32_t &numBreaks,
|
||||
void evaluateBreakpoint(UChar32* elementList, int32_t index, int32_t &numBreaks,
|
||||
UVector32 &boundary, UErrorCode &status) const;
|
||||
|
||||
UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet;
|
||||
|
|
File diff suppressed because it is too large
Load diff
6
icu4c/source/test/testdata/rbbitst.txt
vendored
6
icu4c/source/test/testdata/rbbitst.txt
vendored
|
@ -1919,9 +1919,9 @@ Bangkok)•</data>
|
|||
<data>•\uff19\u6708\u306b•\u6771\u4eac\u304b\u3089•\u53cb\u9054\u304c•\u904a\u3073\u306b•\u6765\u305f•</data>
|
||||
#る文字「そうだ、京都」-> る•文字•「そうだ、•京都」•
|
||||
<data>•\u308b•\u6587\u5b57•\u300c\u305d\u3046\u3060\u3001•\u4eac\u90fd\u300d•</data>
|
||||
#Kana supplement: 𛁈(U+1B048) -> \uD82C\uDC48, 𛀸(U+1B038) -> \uD82C\uDC38, 𛀙(U+1B019)-> \uD82C\uDC19</data>
|
||||
#𛁈る𛀸(しるこ)、あ𛀙よろし(あかよろし) -> 𛁈る𛀸•(しるこ)、•あ𛀙よろし•(あかよろし)
|
||||
<data>•\U0001B048\u308B\U0001B038•\uFF08\u3057\u308B\u3053\uFF09\u3001•\u3042\U0001B019\u3088\u308D\u3057•\uFF08\u3042\u304B\u3088\u308D\u3057\uFF09•</data>
|
||||
#Kana supplement: 𛁛 (U+1B05B), 𛂦(U+1B0A6)
|
||||
#生 𛁛𛂦゙をいただく。-> 生 𛁛𛂦゙を•いただく。
|
||||
<data>•\u751F\U0001B05B\U0001B0A6\u3099\u3092•\u3044\u305F\u3060\u304F\u3002•</data>
|
||||
#中国の携帯は約500元から5000元です -> 中国の▁携帯は▁約▁500元から▁5000元です
|
||||
<data>•\u4E2D\u56FD\u306E•\u643A\u5E2F\u306F•\u7D04•\uFF15\uFF10\uFF10\u5143\u304B\u3089•\uFF15\uFF10\uFF10\uFF10\u5143\u3067\u3059•</data>
|
||||
#しかもロゴがUnicode!! -> しかも▁ロゴが▁Unicode!!
|
||||
|
|
|
@ -24,61 +24,12 @@ public class MlBreakEngine {
|
|||
|
||||
private static final int INVALID = '|';
|
||||
private static final String INVALID_STRING = "|";
|
||||
private static final int MAX_FEATURE = 26;
|
||||
private static final int MAX_FEATURE = 13;
|
||||
private UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet;
|
||||
private UnicodeSet fClosePunctuationSet;
|
||||
private HashMap<String, Integer> fModel;
|
||||
|
||||
private int fNegativeSum;
|
||||
|
||||
static class Element {
|
||||
private int character;
|
||||
private String ublock;
|
||||
|
||||
/**
|
||||
* Default constructor.
|
||||
*/
|
||||
public Element() {
|
||||
character = 0;
|
||||
ublock = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the character and its unicode block.
|
||||
*
|
||||
* @param ch A unicode character.
|
||||
* @param str The unicode block of the character.
|
||||
*/
|
||||
public void setCharAndUblock(int ch, String str) {
|
||||
Assert.assrt(str.length() <= 3);
|
||||
this.character = ch;
|
||||
ublock = str;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the unicode character.
|
||||
*
|
||||
* @return The unicode character.
|
||||
*/
|
||||
public int getCharacter() {
|
||||
return character;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the unicode character's unicode block.
|
||||
*
|
||||
* @return The unicode block.
|
||||
*/
|
||||
public String getUblock() {
|
||||
return ublock;
|
||||
}
|
||||
}
|
||||
|
||||
private static boolean isValid(Element element) {
|
||||
String ublock = element.getUblock();
|
||||
return ublock.length() != 1 || (int) ublock.charAt(0) != INVALID;
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor for Chinese and Japanese phrase breaking.
|
||||
*
|
||||
|
@ -114,12 +65,10 @@ public class MlBreakEngine {
|
|||
return 0;
|
||||
}
|
||||
ArrayList<Integer> boundary = new ArrayList<Integer>(numCodePts);
|
||||
int ch;
|
||||
String ublock;
|
||||
// The ML model groups six characters to evaluate whether the 4th character is a breakpoint.
|
||||
// Like a sliding window, the elementList removes the first char and appends the new char
|
||||
// from inString in each iteration so that its size always remains at six.
|
||||
Element elementList[] = new Element[6];
|
||||
int elementList[] = new int[6];
|
||||
initElementList(inString, elementList, numCodePts);
|
||||
|
||||
// Add a break for the start.
|
||||
|
@ -130,10 +79,7 @@ public class MlBreakEngine {
|
|||
break;
|
||||
}
|
||||
shiftLeftOne(elementList);
|
||||
|
||||
ch = (i + 3) < numCodePts ? next32(inString) : INVALID;
|
||||
ublock = (ch != INVALID) ? getUnicodeBlock(ch) : INVALID_STRING;
|
||||
elementList[5].setCharAndUblock(ch, ublock);
|
||||
elementList[5] = (i + 3) < numCodePts ? next32(inString) : INVALID;
|
||||
}
|
||||
|
||||
// Add a break for the end if there is not one there already.
|
||||
|
@ -181,11 +127,10 @@ public class MlBreakEngine {
|
|||
return correctedNumBreaks;
|
||||
}
|
||||
|
||||
private void shiftLeftOne(Element[] elementList) {
|
||||
private void shiftLeftOne(int[] elementList) {
|
||||
int length = elementList.length;
|
||||
for (int i = 1; i < length; i++) {
|
||||
elementList[i - 1].character = elementList[i].character;
|
||||
elementList[i - 1].ublock = elementList[i].ublock;
|
||||
elementList[i - 1] = elementList[i];
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -196,14 +141,14 @@ public class MlBreakEngine {
|
|||
* @param index The breakpoint index to be evaluated.
|
||||
* @param boundary A list including the index of the breakpoint.
|
||||
*/
|
||||
private void evaluateBreakpoint(Element[] elementList, int index, ArrayList<Integer> boundary) {
|
||||
private void evaluateBreakpoint(int[] elementList, int index, ArrayList<Integer> boundary) {
|
||||
String[] featureList = new String[MAX_FEATURE];
|
||||
final int w1 = elementList[0].getCharacter();
|
||||
final int w2 = elementList[1].getCharacter();
|
||||
final int w3 = elementList[2].getCharacter();
|
||||
final int w4 = elementList[3].getCharacter();
|
||||
final int w5 = elementList[4].getCharacter();
|
||||
final int w6 = elementList[5].getCharacter();
|
||||
final int w1 = elementList[0];
|
||||
final int w2 = elementList[1];
|
||||
final int w3 = elementList[2];
|
||||
final int w4 = elementList[3];
|
||||
final int w5 = elementList[4];
|
||||
final int w6 = elementList[5];
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
int idx = 0;
|
||||
|
@ -265,76 +210,7 @@ public class MlBreakEngine {
|
|||
featureList[idx++] = sb.append("TW4:").appendCodePoint(w4).appendCodePoint(
|
||||
w5).appendCodePoint(w6).toString();
|
||||
}
|
||||
if (isValid(elementList[0])) {
|
||||
sb.setLength(0);
|
||||
featureList[idx++] = sb.append("UB1:").append(elementList[0].getUblock()).toString();
|
||||
}
|
||||
if (isValid(elementList[1])) {
|
||||
sb.setLength(0);
|
||||
featureList[idx++] = sb.append("UB2:").append(elementList[1].getUblock()).toString();
|
||||
}
|
||||
if (isValid(elementList[2])) {
|
||||
sb.setLength(0);
|
||||
featureList[idx++] = sb.append("UB3:").append(elementList[2].getUblock()).toString();
|
||||
}
|
||||
if (isValid(elementList[3])) {
|
||||
sb.setLength(0);
|
||||
featureList[idx++] = sb.append("UB4:").append(elementList[3].getUblock()).toString();
|
||||
}
|
||||
if (isValid(elementList[4])) {
|
||||
sb.setLength(0);
|
||||
featureList[idx++] = sb.append("UB5:").append(elementList[4].getUblock()).toString();
|
||||
}
|
||||
if (isValid(elementList[5])) {
|
||||
sb.setLength(0);
|
||||
featureList[idx++] = sb.append("UB6:").append(elementList[5].getUblock()).toString();
|
||||
}
|
||||
if (isValid(elementList[1]) && isValid(elementList[2])) {
|
||||
sb.setLength(0);
|
||||
featureList[idx++] = sb.append("BB1:").
|
||||
append(elementList[1].getUblock()).
|
||||
append(elementList[2].getUblock()).toString();
|
||||
}
|
||||
if (isValid(elementList[2]) && isValid(elementList[3])) {
|
||||
sb.setLength(0);
|
||||
featureList[idx++] = sb.append("BB2:").
|
||||
append(elementList[2].getUblock()).
|
||||
append(elementList[3].getUblock()).toString();
|
||||
}
|
||||
if (isValid(elementList[3]) && isValid(elementList[4])) {
|
||||
sb.setLength(0);
|
||||
featureList[idx++] = sb.append("BB3:").
|
||||
append(elementList[3].getUblock()).
|
||||
append(elementList[4].getUblock()).toString();
|
||||
}
|
||||
if (isValid(elementList[0]) && isValid(elementList[1]) && isValid(elementList[2])) {
|
||||
sb.setLength(0);
|
||||
featureList[idx++] = sb.append("TB1:").
|
||||
append(elementList[0].getUblock()).
|
||||
append(elementList[1].getUblock()).
|
||||
append(elementList[2].getUblock()).toString();
|
||||
}
|
||||
if (isValid(elementList[1]) && isValid(elementList[2]) && isValid(elementList[3])) {
|
||||
sb.setLength(0);
|
||||
featureList[idx++] = sb.append("TB2:").
|
||||
append(elementList[1].getUblock()).
|
||||
append(elementList[2].getUblock()).
|
||||
append(elementList[3].getUblock()).toString();
|
||||
}
|
||||
if (isValid(elementList[2]) && isValid(elementList[3]) && isValid(elementList[4])) {
|
||||
sb.setLength(0);
|
||||
featureList[idx++] = sb.append("TB3:").
|
||||
append(elementList[2].getUblock()).
|
||||
append(elementList[3].getUblock()).
|
||||
append(elementList[4].getUblock()).toString();
|
||||
}
|
||||
if (isValid(elementList[3]) && isValid(elementList[4]) && isValid(elementList[5])) {
|
||||
sb.setLength(0);
|
||||
featureList[idx++] = sb.append("TB4:").
|
||||
append(elementList[3].getUblock()).
|
||||
append(elementList[4].getUblock()).
|
||||
append(elementList[5].getUblock()).toString();
|
||||
}
|
||||
|
||||
int score = fNegativeSum;
|
||||
for (int j = 0; j < idx; j++) {
|
||||
if (fModel.containsKey(featureList[j])) {
|
||||
|
@ -350,12 +226,11 @@ public class MlBreakEngine {
|
|||
* Initialize the element list from the input string.
|
||||
*
|
||||
* @param inString An input string to be segmented.
|
||||
* @param elementList A list to store the first six characters and their unicode block codes.
|
||||
* @param elementList A list to store the first six characters.
|
||||
* @param numCodePts The number of code points of input string
|
||||
* @return The number of the code units of the first six characters in inString.
|
||||
*/
|
||||
private int initElementList(CharacterIterator inString, Element[] elementList,
|
||||
int numCodePts) {
|
||||
private int initElementList(CharacterIterator inString, int[] elementList, int numCodePts) {
|
||||
int index = 0;
|
||||
inString.setIndex(index);
|
||||
int w1, w2, w3, w4, w5, w6;
|
||||
|
@ -363,60 +238,29 @@ public class MlBreakEngine {
|
|||
if (numCodePts > 0) {
|
||||
w3 = current32(inString);
|
||||
index += Character.charCount(w3);
|
||||
if (numCodePts > 1) {
|
||||
w4 = next32(inString);
|
||||
index += Character.charCount(w3);
|
||||
if (numCodePts > 2) {
|
||||
w5 = next32(inString);
|
||||
index += Character.charCount(w5);
|
||||
if (numCodePts > 3) {
|
||||
w6 = next32(inString);
|
||||
index += Character.charCount(w6);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (numCodePts > 1) {
|
||||
w4 = next32(inString);
|
||||
index += Character.charCount(w3);
|
||||
}
|
||||
if (numCodePts > 2) {
|
||||
w5 = next32(inString);
|
||||
index += Character.charCount(w5);
|
||||
}
|
||||
if (numCodePts > 3) {
|
||||
w6 = next32(inString);
|
||||
index += Character.charCount(w6);
|
||||
}
|
||||
|
||||
final String b1 = INVALID_STRING;
|
||||
final String b2 = b1;
|
||||
final String b3 = getUnicodeBlock(w3);
|
||||
final String b4 = getUnicodeBlock(w4);
|
||||
final String b5 = getUnicodeBlock(w5);
|
||||
final String b6 = getUnicodeBlock(w6);
|
||||
|
||||
elementList[0] = new Element();
|
||||
elementList[0].setCharAndUblock(w1, b1);
|
||||
elementList[1] = new Element();
|
||||
elementList[1].setCharAndUblock(w2, b2);
|
||||
elementList[2] = new Element();
|
||||
elementList[2].setCharAndUblock(w3, b3);
|
||||
elementList[3] = new Element();
|
||||
elementList[3].setCharAndUblock(w4, b4);
|
||||
elementList[4] = new Element();
|
||||
elementList[4].setCharAndUblock(w5, b5);
|
||||
elementList[5] = new Element();
|
||||
elementList[5].setCharAndUblock(w6, b6);
|
||||
elementList[0] = w1;
|
||||
elementList[1] = w2;
|
||||
elementList[2] = w3;
|
||||
elementList[3] = w4;
|
||||
elementList[4] = w5;
|
||||
elementList[5] = w6;
|
||||
|
||||
return index;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the character's unicode block code defined in UBlockCode.
|
||||
*
|
||||
* @param ch A char.
|
||||
* @return The unicode block code which is 3 digits with '0' added in the beginning if the code
|
||||
* is less than 3 digits.
|
||||
*/
|
||||
private String getUnicodeBlock(int ch) {
|
||||
int blockId = UCharacter.UnicodeBlock.of(ch).getID();
|
||||
if (blockId == UCharacter.UnicodeBlock.NO_BLOCK.getID()
|
||||
|| blockId == UCharacter.UnicodeBlock.INVALID_CODE_ID) {
|
||||
return INVALID_STRING;
|
||||
} else {
|
||||
return String.format("%03d", blockId);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Load the machine learning's model file.
|
||||
*/
|
||||
|
|
|
@ -1919,9 +1919,9 @@ Bangkok)•</data>
|
|||
<data>•\uff19\u6708\u306b•\u6771\u4eac\u304b\u3089•\u53cb\u9054\u304c•\u904a\u3073\u306b•\u6765\u305f•</data>
|
||||
#る文字「そうだ、京都」-> る•文字•「そうだ、•京都」•
|
||||
<data>•\u308b•\u6587\u5b57•\u300c\u305d\u3046\u3060\u3001•\u4eac\u90fd\u300d•</data>
|
||||
#Kana supplement: 𛁈(U+1B048) -> \uD82C\uDC48, 𛀸(U+1B038) -> \uD82C\uDC38, 𛀙(U+1B019)-> \uD82C\uDC19</data>
|
||||
#𛁈る𛀸(しるこ)、あ𛀙よろし(あかよろし) -> 𛁈る𛀸•(しるこ)、•あ𛀙よろし•(あかよろし)
|
||||
<data>•\U0001B048\u308B\U0001B038•\uFF08\u3057\u308B\u3053\uFF09\u3001•\u3042\U0001B019\u3088\u308D\u3057•\uFF08\u3042\u304B\u3088\u308D\u3057\uFF09•</data>
|
||||
#Kana supplement: 𛁛 (U+1B05B), 𛂦(U+1B0A6)
|
||||
#生 𛁛𛂦゙をいただく。-> 生 𛁛𛂦゙を•いただく。
|
||||
<data>•\u751F\U0001B05B\U0001B0A6\u3099\u3092•\u3044\u305F\u3060\u304F\u3002•</data>
|
||||
#中国の携帯は約500元から5000元です -> 中国の▁携帯は▁約▁500元から▁5000元です
|
||||
<data>•\u4E2D\u56FD\u306E•\u643A\u5E2F\u306F•\u7D04•\uFF15\uFF10\uFF10\u5143\u304B\u3089•\uFF15\uFF10\uFF10\uFF10\u5143\u3067\u3059•</data>
|
||||
#しかもロゴがUnicode!! -> しかも▁ロゴが▁Unicode!!
|
||||
|
|
Loading…
Add table
Reference in a new issue