ICU-21699 Refactor codeunit handling

See #1965
This commit is contained in:
allenwtsu 2022-01-25 11:27:47 +00:00 committed by Frank Yung-Fong Tang
parent 32d279a437
commit a67bb90150
4 changed files with 20 additions and 11 deletions

View file

@ -1360,16 +1360,18 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
int32_t prevIdx = numCodePts;
int32_t codeUnitIdx = -1;
int32_t prevCodeUnitIdx = -1;
int32_t length = -1;
for (int32_t i = prev.elementAti(numCodePts); i > 0; i = prev.elementAti(i)) {
codeUnitIdx = inString.moveIndex32(0, i);
prevCodeUnitIdx = inString.moveIndex32(0, prevIdx);
// Calculate the length by using the code unit.
length = inString.moveIndex32(0, prevIdx) - codeUnitIdx;
length = prevCodeUnitIdx - codeUnitIdx;
prevIdx = i;
// Keep the breakpoint if the pattern is not in the fSkipSet and continuous Katakana
// characters don't occur.
if (!fSkipSet.containsKey(inString.tempSubString(codeUnitIdx, length))
&& (!isKatakana(inString.char32At(codeUnitIdx -1))
&& (!isKatakana(inString.char32At(inString.moveIndex32(codeUnitIdx, -1)))
|| !isKatakana(inString.char32At(codeUnitIdx)))) {
t_boundary.addElement(i, status);
numBreaks++;

View file

@ -1898,7 +1898,9 @@ Bangkok)•</data>
<data>•\uff3b\u643a\u5e2f•\u96fb\u8a71\uff3d•\u6b63\u3057\u3044•\u9078\u629e•</data>
#純金製百人一首にサッカーボール -> 純金•製•百人一首に•サッカーボール
<data>•\u7D14\u91D1•\u88FD•\u767E\u4EBA\u4E00\u9996\u306B•\u30B5\u30C3\u30AB\u30FC\u30DC\u30FC\u30EB•</data>
#Kana supplement: 𛁈(U+1B048) -> \uD82C\uDC48, 𛀸(U+1B038) -> \uD82C\uDC38, 𛀙(U+1B019) -> \uD82C\uDC19
#𛁈る𛀸(しるこ)、あ𛀙よろし(あかよろし) -> 𛁈る𛀸•(しるこ)、•あ𛀙よろし•(あ•かよろし)
<data>•\uD82C\uDC48\u308B\uD82C\uDC38•\uFF08\u3057\u308B\u3053\uFF09\u3001•\u3042\uD82C\uDC19\u3088\u308D\u3057•\uFF08\u3042•\u304B\u3088\u308D\u3057\uFF09•</data>
####################################################################################
#

View file

@ -11,6 +11,7 @@ package com.ibm.icu.impl.breakiter;
import static com.ibm.icu.impl.CharacterIteration.DONE32;
import static com.ibm.icu.impl.CharacterIteration.current32;
import static com.ibm.icu.impl.CharacterIteration.next32;
import static com.ibm.icu.impl.CharacterIteration.previous32;
import java.io.IOException;
import java.text.CharacterIterator;
@ -240,17 +241,18 @@ public class CjkBreakEngine extends DictionaryBreakEngine {
t_boundary[numBreaks] = numCodePts;
numBreaks++;
int prevIdx = numCodePts;
int codeUnitIdx = 0, length = 0;
int codeUnitIdx = 0, prevCodeUnitIdx = 0, length = 0;
for (int i = prev[numCodePts]; i > 0; i = prev[i]) {
codeUnitIdx = prenormstr.offsetByCodePoints(0, i);
length = prevIdx - i;
prevCodeUnitIdx = prenormstr.offsetByCodePoints(0, prevIdx);
length = prevCodeUnitIdx - codeUnitIdx;
prevIdx = i;
String pattern = getPatternFromText(text, s, codeUnitIdx, length);
// Keep the breakpoint if the pattern is not in the fSkipSet and continuous Katakana
// characters don't occur.
text.setIndex(codeUnitIdx - 1);
text.setIndex(codeUnitIdx);
if (!fSkipSet.contains(pattern)
&& (!isKatakana(current32(text)) || !isKatakana(next32(text)))) {
&& (!isKatakana(current32(text)) || !isKatakana(previous32(text)))) {
t_boundary[numBreaks] = i;
numBreaks++;
}
@ -308,11 +310,11 @@ public class CjkBreakEngine extends DictionaryBreakEngine {
private String getPatternFromText(CharacterIterator text, StringBuffer sb, int start,
int length) {
sb.setLength(0);
if(length > 0) {
if (length > 0) {
text.setIndex(start);
sb.appendCodePoint(current32(text));
for (int j = 1; j < length; j++) {
sb.appendCodePoint(next32(text));
sb.append(text.current());
for (int i = 1; i < length; i++) {
sb.append(text.next());
}
}
return sb.toString();

View file

@ -1898,6 +1898,9 @@ Bangkok)•</data>
<data>•\uff3b\u643a\u5e2f•\u96fb\u8a71\uff3d•\u6b63\u3057\u3044•\u9078\u629e•</data>
#純金製百人一首にサッカーボール -> 純金•製•百人一首に•サッカーボール
<data>•\u7D14\u91D1•\u88FD•\u767E\u4EBA\u4E00\u9996\u306B•\u30B5\u30C3\u30AB\u30FC\u30DC\u30FC\u30EB•</data>
#Kana supplement: 𛁈(U+1B048) -> \uD82C\uDC48, 𛀸(U+1B038) -> \uD82C\uDC38, 𛀙(U+1B019) -> \uD82C\uDC19
#𛁈る𛀸(しるこ)、あ𛀙よろし(あかよろし) -> 𛁈る𛀸•(しるこ)、•あ𛀙よろし•(あ•かよろし)
<data>•\uD82C\uDC48\u308B\uD82C\uDC38•\uFF08\u3057\u308B\u3053\uFF09\u3001•\u3042\uD82C\uDC19\u3088\u308D\u3057•\uFF08\u3042•\u304B\u3088\u308D\u3057\uFF09•</data>
####################################################################################