From a67bb901508d19abacb78d9ffea4c47f0b3c477b Mon Sep 17 00:00:00 2001 From: allenwtsu Date: Tue, 25 Jan 2022 11:27:47 +0000 Subject: [PATCH] ICU-21699 Refactor codeunit handling See #1965 --- icu4c/source/common/dictbe.cpp | 6 ++++-- icu4c/source/test/testdata/rbbitst.txt | 4 +++- .../ibm/icu/impl/breakiter/CjkBreakEngine.java | 18 ++++++++++-------- .../src/com/ibm/icu/dev/test/rbbi/rbbitst.txt | 3 +++ 4 files changed, 20 insertions(+), 11 deletions(-) diff --git a/icu4c/source/common/dictbe.cpp b/icu4c/source/common/dictbe.cpp index c0af19ef171..4621bf4e24f 100644 --- a/icu4c/source/common/dictbe.cpp +++ b/icu4c/source/common/dictbe.cpp @@ -1360,16 +1360,18 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText, int32_t prevIdx = numCodePts; int32_t codeUnitIdx = -1; + int32_t prevCodeUnitIdx = -1; int32_t length = -1; for (int32_t i = prev.elementAti(numCodePts); i > 0; i = prev.elementAti(i)) { codeUnitIdx = inString.moveIndex32(0, i); + prevCodeUnitIdx = inString.moveIndex32(0, prevIdx); // Calculate the length by using the code unit. - length = inString.moveIndex32(0, prevIdx) - codeUnitIdx; + length = prevCodeUnitIdx - codeUnitIdx; prevIdx = i; // Keep the breakpoint if the pattern is not in the fSkipSet and continuous Katakana // characters don't occur. if (!fSkipSet.containsKey(inString.tempSubString(codeUnitIdx, length)) - && (!isKatakana(inString.char32At(codeUnitIdx -1)) + && (!isKatakana(inString.char32At(inString.moveIndex32(codeUnitIdx, -1))) || !isKatakana(inString.char32At(codeUnitIdx)))) { t_boundary.addElement(i, status); numBreaks++; diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt index 1be45e9f3c5..702bb479038 100644 --- a/icu4c/source/test/testdata/rbbitst.txt +++ b/icu4c/source/test/testdata/rbbitst.txt @@ -1898,7 +1898,9 @@ Bangkok)• •\uff3b\u643a\u5e2f•\u96fb\u8a71\uff3d•\u6b63\u3057\u3044•\u9078\u629e• #純金製百人一首にサッカーボール -> 純金•製•百人一首に•サッカーボール •\u7D14\u91D1•\u88FD•\u767E\u4EBA\u4E00\u9996\u306B•\u30B5\u30C3\u30AB\u30FC\u30DC\u30FC\u30EB• - +#Kana supplement: 𛁈(U+1B048) -> \uD82C\uDC48, 𛀸(U+1B038) -> \uD82C\uDC38, 𛀙(U+1B019)-> \uD82C\uDC19 +#𛁈る𛀸(しるこ)、あ𛀙よろし(あかよろし) -> 𛁈る𛀸•(しるこ)、•あ𛀙よろし•(あ•かよろし) +•\uD82C\uDC48\u308B\uD82C\uDC38•\uFF08\u3057\u308B\u3053\uFF09\u3001•\u3042\uD82C\uDC19\u3088\u308D\u3057•\uFF08\u3042•\u304B\u3088\u308D\u3057\uFF09• #################################################################################### # diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/CjkBreakEngine.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/CjkBreakEngine.java index 06b93683771..166a4c75245 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/CjkBreakEngine.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/CjkBreakEngine.java @@ -11,6 +11,7 @@ package com.ibm.icu.impl.breakiter; import static com.ibm.icu.impl.CharacterIteration.DONE32; import static com.ibm.icu.impl.CharacterIteration.current32; import static com.ibm.icu.impl.CharacterIteration.next32; +import static com.ibm.icu.impl.CharacterIteration.previous32; import java.io.IOException; import java.text.CharacterIterator; @@ -240,17 +241,18 @@ public class CjkBreakEngine extends DictionaryBreakEngine { t_boundary[numBreaks] = numCodePts; numBreaks++; int prevIdx = numCodePts; - int codeUnitIdx = 0, length = 0; + int codeUnitIdx = 0, prevCodeUnitIdx = 0, length = 0; for (int i = prev[numCodePts]; i > 0; i = prev[i]) { codeUnitIdx = prenormstr.offsetByCodePoints(0, i); - length = prevIdx - i; + prevCodeUnitIdx = prenormstr.offsetByCodePoints(0, prevIdx); + length = prevCodeUnitIdx - codeUnitIdx; prevIdx = i; String pattern = getPatternFromText(text, s, codeUnitIdx, length); // Keep the breakpoint if the pattern is not in the fSkipSet and continuous Katakana // characters don't occur. - text.setIndex(codeUnitIdx - 1); + text.setIndex(codeUnitIdx); if (!fSkipSet.contains(pattern) - && (!isKatakana(current32(text)) || !isKatakana(next32(text)))) { + && (!isKatakana(current32(text)) || !isKatakana(previous32(text)))) { t_boundary[numBreaks] = i; numBreaks++; } @@ -308,11 +310,11 @@ public class CjkBreakEngine extends DictionaryBreakEngine { private String getPatternFromText(CharacterIterator text, StringBuffer sb, int start, int length) { sb.setLength(0); - if(length > 0) { + if (length > 0) { text.setIndex(start); - sb.appendCodePoint(current32(text)); - for (int j = 1; j < length; j++) { - sb.appendCodePoint(next32(text)); + sb.append(text.current()); + for (int i = 1; i < length; i++) { + sb.append(text.next()); } } return sb.toString(); diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt index 346da988d7d..2a238a80f9c 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt @@ -1898,6 +1898,9 @@ Bangkok)• •\uff3b\u643a\u5e2f•\u96fb\u8a71\uff3d•\u6b63\u3057\u3044•\u9078\u629e• #純金製百人一首にサッカーボール -> 純金•製•百人一首に•サッカーボール •\u7D14\u91D1•\u88FD•\u767E\u4EBA\u4E00\u9996\u306B•\u30B5\u30C3\u30AB\u30FC\u30DC\u30FC\u30EB• +#Kana supplement: 𛁈(U+1B048) -> \uD82C\uDC48, 𛀸(U+1B038) -> \uD82C\uDC38, 𛀙(U+1B019)-> \uD82C\uDC19 +#𛁈る𛀸(しるこ)、あ𛀙よろし(あかよろし) -> 𛁈る𛀸•(しるこ)、•あ𛀙よろし•(あ•かよろし) +•\uD82C\uDC48\u308B\uD82C\uDC38•\uFF08\u3057\u308B\u3053\uFF09\u3001•\u3042\uD82C\uDC19\u3088\u308D\u3057•\uFF08\u3042•\u304B\u3088\u308D\u3057\uFF09• ####################################################################################