From 08c3f99c0882ea84aebadd0fdb73f4d92fad859e Mon Sep 17 00:00:00 2001 From: allenwtsu Date: Wed, 22 Dec 2021 15:50:44 +0000 Subject: [PATCH] ICU-21878 Sync icu4j's CjkBreakEngine to icu4c's See #1953 --- icu4c/source/common/dictbe.cpp | 13 +++++++++++-- icu4c/source/test/testdata/rbbitst.txt | 3 +++ .../core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt | 3 +++ 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/icu4c/source/common/dictbe.cpp b/icu4c/source/common/dictbe.cpp index 36a35c411a9..6b6d4297ad4 100644 --- a/icu4c/source/common/dictbe.cpp +++ b/icu4c/source/common/dictbe.cpp @@ -1361,6 +1361,7 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText, // while reversing t_boundary and pushing values to foundBreaks. int32_t prevCPPos = -1; int32_t prevUTextPos = -1; + int correctedNumBreaks = 0; for (int32_t i = numBreaks-1; i >= 0; i--) { int32_t cpPos = t_boundary.elementAti(i); U_ASSERT(cpPos > prevCPPos); @@ -1369,7 +1370,10 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText, if (utextPos > prevUTextPos) { // Boundaries are added to foundBreaks output in ascending order. U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos); - foundBreaks.push(utextPos, status); + if (!(foundBreaks.contains(utextPos) || utextPos == rangeStart)) { + foundBreaks.push(utextPos, status); + correctedNumBreaks++; + } } else { // Normalization expanded the input text, the dictionary found a boundary // within the expansion, giving two boundaries with the same index in the @@ -1381,9 +1385,14 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText, } (void)prevCPPos; // suppress compiler warnings about unused variable + if (!foundBreaks.isEmpty() && foundBreaks.peeki() == rangeEnd) { + foundBreaks.popi(); + correctedNumBreaks--; + } + // inString goes out of scope // inputMap goes out of scope - return numBreaks; + return correctedNumBreaks; } #endif diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt index 7fb30c9e8e3..1948360277d 100644 --- a/icu4c/source/test/testdata/rbbitst.txt +++ b/icu4c/source/test/testdata/rbbitst.txt @@ -796,6 +796,9 @@ •ジョージア<400> • + +•[<0>携帯<400>電話<400>]<0>お金<400>が<400>かかる<400>ん<400>です<400>。<0> + # Test for #11723 •アレルギー性<400>結膜炎<400> diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt index 7fb30c9e8e3..1948360277d 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt @@ -796,6 +796,9 @@ •ジョージア<400> • + +•[<0>携帯<400>電話<400>]<0>お金<400>が<400>かかる<400>ん<400>です<400>。<0> + # Test for #11723 •アレルギー性<400>結膜炎<400>