mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 14:05:32 +00:00
ICU-21699 Fix CjkBreakEngine performance issue
1. vector.contains() uses sequential comparison, which is O(n). When the vector is large, this noticeably hurts performance. Remove this unnecessary vector.contains() check in C++. 2. In Java's CjkBreakEngine, replace "vector.contains()" with "if(pos > previous)" to handle duplicate breakpoint positions. This keeps the C++ and Java implementations in sync. Test: ant checkTest -Dtestclass='com.ibm.icu.dev.test.rbbi.RBBITest' (RBBITest#TestBreakAllChars() can generate duplicate positions for word breaks; it passes with this change.)
This commit is contained in:
parent
44c7137ae5
commit
06ef8867f3
2 changed files with 5 additions and 3 deletions
|
@@ -1370,7 +1370,7 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
|
|||
if (utextPos > prevUTextPos) {
|
||||
// Boundaries are added to foundBreaks output in ascending order.
|
||||
U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos);
|
||||
if (!(foundBreaks.contains(utextPos) || utextPos == rangeStart)) {
|
||||
if (utextPos != rangeStart) {
|
||||
foundBreaks.push(utextPos, status);
|
||||
correctedNumBreaks++;
|
||||
}
|
||||
|
|
|
@@ -209,12 +209,14 @@ public class CjkBreakEngine extends DictionaryBreakEngine {
|
|||
}
|
||||
|
||||
int correctedNumBreaks = 0;
|
||||
int previous = -1;
|
||||
for (int i = numBreaks - 1; i >= 0; i--) {
|
||||
int pos = charPositions[t_boundary[i]] + startPos;
|
||||
if (!(foundBreaks.contains(pos) || pos == startPos)) {
|
||||
foundBreaks.push(charPositions[t_boundary[i]] + startPos);
|
||||
if (pos > previous && pos != startPos) {
|
||||
foundBreaks.push(pos);
|
||||
correctedNumBreaks++;
|
||||
}
|
||||
previous = pos;
|
||||
}
|
||||
|
||||
if (!foundBreaks.isEmpty() && foundBreaks.peek() == endPos) {
|
||||
|
|
Loading…
Add table
Reference in a new issue