mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-10 07:39:16 +00:00
ICU-21946 RBBI Break Cache Optimizations
Adjust RuleBasedBreakIterator::BreakCache::populateNear() to retain the cache the cache contents in additional cases where are still useful, resulting in improved performance. This change is related to PR #2039, which addressed the same problem. This one retains the cache contents in more situations.
This commit is contained in:
parent
2673378260
commit
b95c6b1f3e
5 changed files with 267 additions and 52 deletions
|
@ -341,41 +341,87 @@ UBool RuleBasedBreakIterator::BreakCache::populateNear(int32_t position, UErrorC
|
|||
}
|
||||
U_ASSERT(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]);
|
||||
|
||||
// Find a boundary somewhere in the vicinity of the requested position.
|
||||
// Depending on the safe rules and the text data, it could be either before, at, or after
|
||||
// the requested position.
|
||||
|
||||
// Add boundaries to the cache near the specified position.
|
||||
// The given position need not be a boundary itself.
|
||||
// The input position must be within the range of the text, and
|
||||
// on a code point boundary.
|
||||
// If the requested position is a break boundary, leave the iteration
|
||||
// position on it.
|
||||
// If the requested position is not a boundary, leave the iteration
|
||||
// position on the preceding boundary and include both the
|
||||
// preceding and following boundaries in the cache.
|
||||
// Additional boundaries, either preceding or following, may be added
|
||||
// to the cache as a side effect.
|
||||
|
||||
// If the requested position is not near already cached positions, clear the existing cache,
|
||||
// find a near-by boundary and begin new cache contents there.
|
||||
|
||||
if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) {
|
||||
int32_t aBoundary = 0;
|
||||
int32_t ruleStatusIndex = 0;
|
||||
if (position > 20) {
|
||||
int32_t backupPos = fBI->handleSafePrevious(position);
|
||||
// Threshold for a text position to be considered near to existing cache contents.
|
||||
// TODO: See issue ICU-22024 "perf tuning of Cache needed."
|
||||
// This value is subject to change. See the ticket for more details.
|
||||
static constexpr int32_t CACHE_NEAR = 15;
|
||||
|
||||
if (backupPos > 0) {
|
||||
// Advance to the boundary following the backup position.
|
||||
// There is a complication: the safe reverse rules identify pairs of code points
|
||||
// that are safe. If advancing from the safe point moves forwards by less than
|
||||
// two code points, we need to advance one more time to ensure that the boundary
|
||||
// is good, including a correct rules status value.
|
||||
//
|
||||
fBI->fPosition = backupPos;
|
||||
aBoundary = fBI->handleNext();
|
||||
if (aBoundary <= backupPos + 4) {
|
||||
// +4 is a quick test for possibly having advanced only one codepoint.
|
||||
// Four being the length of the longest potential code point, a supplementary in UTF-8
|
||||
utext_setNativeIndex(&fBI->fText, aBoundary);
|
||||
if (backupPos == utext_getPreviousNativeIndex(&fBI->fText)) {
|
||||
// The initial handleNext() only advanced by a single code point. Go again.
|
||||
aBoundary = fBI->handleNext(); // Safe rules identify safe pairs.
|
||||
}
|
||||
int32_t aBoundary = -1;
|
||||
int32_t ruleStatusIndex = 0;
|
||||
bool retainCache = false;
|
||||
if ((position > fBoundaries[fStartBufIdx] - CACHE_NEAR) && position < (fBoundaries[fEndBufIdx] + CACHE_NEAR)) {
|
||||
// Requested position is near the existing cache. Retain it.
|
||||
retainCache = true;
|
||||
} else if (position <= CACHE_NEAR) {
|
||||
// Requested position is near the start of the text. Fill cache from start, skipping
|
||||
// the need to find a safe point.
|
||||
retainCache = false;
|
||||
aBoundary = 0;
|
||||
} else {
|
||||
// Requested position is not near the existing cache.
|
||||
// Find a safe point to refill the cache from.
|
||||
int32_t backupPos = fBI->handleSafePrevious(position);
|
||||
|
||||
if (fBoundaries[fEndBufIdx] < position && fBoundaries[fEndBufIdx] >= (backupPos - CACHE_NEAR)) {
|
||||
// The requested position is beyond the end of the existing cache, but the
|
||||
// reverse rules produced a position near or before the cached region.
|
||||
// Retain the existing cache, and fill from the end of it.
|
||||
retainCache = true;
|
||||
} else if (backupPos < CACHE_NEAR) {
|
||||
// The safe reverse rules moved us to near the start of text.
|
||||
// Take that (index 0) as the backup boundary, avoiding the complication
|
||||
// (in the following block) of moving forward from the safe point to a known boundary.
|
||||
//
|
||||
// Retain the cache if it begins not too far from the requested position.
|
||||
aBoundary = 0;
|
||||
retainCache = (fBoundaries[fStartBufIdx] <= (position + CACHE_NEAR));
|
||||
} else {
|
||||
// The safe reverse rules produced a position that is neither near the existing
|
||||
// cache, nor near the start of text.
|
||||
// Advance to the boundary following.
|
||||
// There is a complication: the safe reverse rules identify pairs of code points
|
||||
// that are safe. If advancing from the safe point moves forwards by less than
|
||||
// two code points, we need to advance one more time to ensure that the boundary
|
||||
// is good, including a correct rules status value.
|
||||
retainCache = false;
|
||||
fBI->fPosition = backupPos;
|
||||
aBoundary = fBI->handleNext();
|
||||
if (aBoundary != UBRK_DONE && aBoundary <= backupPos + 4) {
|
||||
// +4 is a quick test for possibly having advanced only one codepoint.
|
||||
// Four being the length of the longest potential code point, a supplementary in UTF-8
|
||||
utext_setNativeIndex(&fBI->fText, aBoundary);
|
||||
if (backupPos == utext_getPreviousNativeIndex(&fBI->fText)) {
|
||||
// The initial handleNext() only advanced by a single code point. Go again.
|
||||
aBoundary = fBI->handleNext(); // Safe rules identify safe pairs.
|
||||
}
|
||||
ruleStatusIndex = fBI->fRuleStatusIndex;
|
||||
}
|
||||
if (aBoundary == UBRK_DONE) {
|
||||
// Note (Andy Heninger): I don't think this condition can occur, but it's hard
|
||||
// to prove that it can't. We ran off the end of the string looking a boundary
|
||||
// following a safe point; choose the end of the string as that boundary.
|
||||
aBoundary = utext_nativeLength(&fBI->fText);
|
||||
}
|
||||
ruleStatusIndex = fBI->fRuleStatusIndex;
|
||||
}
|
||||
}
|
||||
|
||||
if (!retainCache) {
|
||||
U_ASSERT(aBoundary != -1);
|
||||
reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point.
|
||||
}
|
||||
|
||||
|
|
|
@ -139,6 +139,7 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
|
|||
TESTCASE_AUTO(TestUnpairedSurrogate);
|
||||
TESTCASE_AUTO(TestLSTMThai);
|
||||
TESTCASE_AUTO(TestLSTMBurmese);
|
||||
TESTCASE_AUTO(TestRandomAccess);
|
||||
|
||||
#if U_ENABLE_TRACING
|
||||
TESTCASE_AUTO(TestTraceCreateCharacter);
|
||||
|
@ -5453,4 +5454,66 @@ void RBBITest::TestLSTMBurmese() {
|
|||
runLSTMTestFromFile("Burmese_graphclust_model5_heavy_Test.txt", USCRIPT_MYANMAR);
|
||||
}
|
||||
|
||||
|
||||
// Test preceding(index) and following(index), with semi-random indexes.
|
||||
// The random indexes are produced in clusters that are relatively closely spaced,
|
||||
// to increase the occurrences of hits to the internal break cache.
|
||||
|
||||
void RBBITest::TestRandomAccess() {
|
||||
static constexpr int32_t CACHE_SIZE = 128;
|
||||
|
||||
UnicodeString testData;
|
||||
for (int i=0; i<CACHE_SIZE*2; ++i) {
|
||||
testData.append(u"aaaa\n");
|
||||
}
|
||||
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
LocalPointer<RuleBasedBreakIterator> bi(
|
||||
(RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status),
|
||||
status);
|
||||
if (!assertSuccess(WHERE, status)) { return; };
|
||||
|
||||
bi->setText(testData);
|
||||
|
||||
auto expectedPreceding = [](int from) {
|
||||
if (from == 0) {return UBRK_DONE;}
|
||||
if (from % 5 == 0) {return from - 5;}
|
||||
return from - (from % 5);
|
||||
};
|
||||
|
||||
auto expectedFollow = [testData](int from) {
|
||||
if (from >= testData.length()) {return UBRK_DONE;}
|
||||
if (from % 5 == 0) {return from + 5;}
|
||||
return from + (5 - (from % 5));
|
||||
};
|
||||
|
||||
auto randomStringIndex = [testData]() {
|
||||
static icu_rand randomGenerator; // produces random uint32_t values.
|
||||
static int lastNum;
|
||||
static int clusterCount;
|
||||
static constexpr int CLUSTER_SIZE = 100;
|
||||
static constexpr int CLUSTER_LENGTH = 10;
|
||||
|
||||
if (clusterCount < CLUSTER_LENGTH) {
|
||||
++clusterCount;
|
||||
lastNum += (randomGenerator() % CLUSTER_SIZE);
|
||||
lastNum -= CLUSTER_SIZE / 2;
|
||||
lastNum = std::max(0, lastNum);
|
||||
// Deliberately test indexes > testData.length.
|
||||
lastNum = std::min(testData.length() + 5, lastNum);
|
||||
} else {
|
||||
clusterCount = 0;
|
||||
lastNum = randomGenerator() % testData.length();
|
||||
}
|
||||
return lastNum;
|
||||
};
|
||||
|
||||
for (int i=0; i<5000; ++i) {
|
||||
int idx = randomStringIndex();
|
||||
assertEquals(WHERE, expectedFollow(idx), bi->following(idx));
|
||||
idx = randomStringIndex();
|
||||
assertEquals(WHERE, expectedPreceding(idx), bi->preceding(idx));
|
||||
}
|
||||
}
|
||||
|
||||
#endif // #if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
|
|
@ -95,6 +95,7 @@ public:
|
|||
void TestBug13590();
|
||||
void TestLSTMThai();
|
||||
void TestLSTMBurmese();
|
||||
void TestRandomAccess();
|
||||
|
||||
#if U_ENABLE_TRACING
|
||||
void TestTraceCreateCharacter();
|
||||
|
|
|
@ -1436,42 +1436,89 @@ class BreakCache {
|
|||
boolean populateNear(int position) {
|
||||
assert(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]);
|
||||
|
||||
// Find a boundary somewhere in the vicinity of the requested position.
|
||||
// Depending on the safe rules and the text data, it could be either before, at, or after
|
||||
// the requested position.
|
||||
|
||||
// Add boundaries to the cache near the specified position.
|
||||
// The given position need not be a boundary itself.
|
||||
// The input position must be within the range of the text, and
|
||||
// on a code point boundary.
|
||||
// If the requested position is a break boundary, leave the iteration
|
||||
// position on it.
|
||||
// If the requested position is not a boundary, leave the iteration
|
||||
// position on the preceding boundary and include both the
|
||||
// preceding and following boundaries in the cache.
|
||||
// Additional boundaries, either preceding or following, may be added
|
||||
// to the cache as a side effect.
|
||||
|
||||
// If the requested position is not near already cached positions, clear the existing cache,
|
||||
// find a near-by boundary and begin new cache contents there.
|
||||
|
||||
if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) {
|
||||
int aBoundary = fText.getBeginIndex();
|
||||
int ruleStatusIndex = 0;
|
||||
if (position > aBoundary + 20) {
|
||||
int backupPos = handleSafePrevious(position);
|
||||
if (backupPos > aBoundary) {
|
||||
// Advance to the boundary following the backup position.
|
||||
// There is a complication: the safe reverse rules identify pairs of code points
|
||||
// that are safe. If advancing from the safe point moves forwards by less than
|
||||
// two code points, we need to advance one more time to ensure that the boundary
|
||||
// is good, including a correct rules status value.
|
||||
//
|
||||
fPosition = backupPos;
|
||||
// Threshold for a text position to be considered near to existing cache contents.
|
||||
// TODO: See issue ICU-22024 "perf tuning of Cache needed."
|
||||
// This value is subject to change. See the ticket for more details.
|
||||
final int CACHE_NEAR = 15;
|
||||
|
||||
int startOfText = fText.getBeginIndex();
|
||||
int aBoundary = -1;
|
||||
int ruleStatusIndex = 0;
|
||||
boolean retainCache = false;
|
||||
if ((position > fBoundaries[fStartBufIdx] - CACHE_NEAR) && position < (fBoundaries[fEndBufIdx] + CACHE_NEAR)) {
|
||||
// Requested position is near the existing cache. Retain it.
|
||||
retainCache = true;
|
||||
} else if (position <= startOfText + CACHE_NEAR) {
|
||||
// Requested position is near the start of the text. Fill cache from start, skipping
|
||||
// the need to find a safe point.
|
||||
retainCache = false;
|
||||
aBoundary = startOfText;
|
||||
} else {
|
||||
// Requested position is not near the existing cache.
|
||||
// Find a safe point to refill the cache from.
|
||||
int backupPos = handleSafePrevious(position);
|
||||
|
||||
if (fBoundaries[fEndBufIdx] < position && fBoundaries[fEndBufIdx] >= (backupPos - CACHE_NEAR)) {
|
||||
// The requested position is beyond the end of the existing cache, but the
|
||||
// reverse rules produced a position near or before the cached region.
|
||||
// Retain the existing cache, and fill from the end of it.
|
||||
retainCache = true;
|
||||
} else if (backupPos < startOfText + CACHE_NEAR) {
|
||||
// The safe reverse rules moved us to near the start of text.
|
||||
// Take that (index 0) as the backup boundary, avoiding the complication
|
||||
// (in the following block) of moving forward from the safe point to a known boundary.
|
||||
//
|
||||
// Retain the cache if it begins not too far from the requested position.
|
||||
aBoundary = startOfText;
|
||||
retainCache = (fBoundaries[fStartBufIdx] <= (position + CACHE_NEAR));
|
||||
} else {
|
||||
// The safe reverse rules produced a position that is neither near the existing
|
||||
// cache, nor near the start of text.
|
||||
// Advance to the boundary following.
|
||||
// There is a complication: the safe reverse rules identify pairs of code points
|
||||
// that are safe. If advancing from the safe point moves forwards by less than
|
||||
// two code points, we need to advance one more time to ensure that the boundary
|
||||
// is good, including a correct rules status value.
|
||||
//
|
||||
retainCache = false;
|
||||
fPosition = backupPos;
|
||||
aBoundary = handleNext();
|
||||
if (aBoundary == backupPos + 1 ||
|
||||
(aBoundary == backupPos + 2 &&
|
||||
Character.isHighSurrogate(fText.setIndex(backupPos)) &&
|
||||
Character.isLowSurrogate(fText.next()))) {
|
||||
// The initial handleNext() only advanced by a single code point. Go again.
|
||||
// Safe rules identify safe pairs.
|
||||
aBoundary = handleNext();
|
||||
if (aBoundary == backupPos + 1 ||
|
||||
(aBoundary == backupPos + 2 &&
|
||||
Character.isHighSurrogate(fText.setIndex(backupPos)) &&
|
||||
Character.isLowSurrogate(fText.next()))) {
|
||||
// The initial handleNext() only advanced by a single code point. Go again.
|
||||
// Safe rules identify safe pairs.
|
||||
aBoundary = handleNext();
|
||||
}
|
||||
}
|
||||
if (aBoundary == BreakIterator.DONE) {
|
||||
aBoundary = fText.getEndIndex();
|
||||
}
|
||||
ruleStatusIndex = fRuleStatusIndex;
|
||||
}
|
||||
}
|
||||
|
||||
if (!retainCache) {
|
||||
assert(aBoundary != -1);
|
||||
reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point.
|
||||
}
|
||||
|
||||
|
||||
// Fill in boundaries between existing cache content and the new requested position.
|
||||
|
||||
if (fBoundaries[fEndBufIdx] < position) {
|
||||
|
|
|
@ -946,4 +946,62 @@ public class RBBITest extends TestFmwk {
|
|||
bi = new RuleBasedBreakIterator(rules);
|
||||
assertEquals("Rules does not match", rules, bi.toString());
|
||||
}
|
||||
|
||||
/* Test preceding(index) and following(index), with semi-random indexes.
|
||||
* The random indexes are produced in clusters that are relatively closely spaced,
|
||||
* to increase the occurrences of hits to the internal break cache.
|
||||
*/
|
||||
@Test
|
||||
public void TestRandomAccess() {
|
||||
final int CACHE_SIZE = 128;
|
||||
final StringBuilder testData = new StringBuilder();
|
||||
for (int i=0; i<CACHE_SIZE*2; ++i) {
|
||||
testData.append("aaaa\n");
|
||||
}
|
||||
RuleBasedBreakIterator bi =
|
||||
(RuleBasedBreakIterator)BreakIterator.getLineInstance(ULocale.ENGLISH);
|
||||
bi.setText(testData);
|
||||
|
||||
class Fns { // This class exists only to allow declaring nested functions
|
||||
// within TestRandomAccess().
|
||||
public int expectedPreceding(int from) {
|
||||
if (from == 0) {return BreakIterator.DONE;}
|
||||
if (from % 5 == 0) {return from - 5;}
|
||||
return from - (from % 5);
|
||||
};
|
||||
|
||||
public int expectedFollow(int from) {
|
||||
if (from >= testData.length()) {return BreakIterator.DONE;}
|
||||
if (from % 5 == 0) {return from + 5;}
|
||||
return from + (5 - (from % 5));
|
||||
};
|
||||
|
||||
ICU_Rand randomGenerator = new ICU_Rand(0);
|
||||
int lastNum;
|
||||
int clusterCount;
|
||||
static final int CLUSTER_SIZE = 100;
|
||||
static final int CLUSTER_LENGTH = 10;
|
||||
public int randomStringIndex() {
|
||||
if (clusterCount < CLUSTER_LENGTH) {
|
||||
++clusterCount;
|
||||
lastNum += (randomGenerator.next() % CLUSTER_SIZE);
|
||||
lastNum -= CLUSTER_SIZE / 2;
|
||||
lastNum = Math.max(0, lastNum);
|
||||
lastNum = Math.min(testData.length() + 5, lastNum);
|
||||
} else {
|
||||
clusterCount = 0;
|
||||
lastNum = randomGenerator.next() % testData.length();
|
||||
}
|
||||
return lastNum;
|
||||
};
|
||||
};
|
||||
Fns fns = new Fns();
|
||||
|
||||
for (int i=0; i<5000; ++i) {
|
||||
int idx = fns.randomStringIndex();
|
||||
assertEquals("following" + idx, fns.expectedFollow(idx), bi.following(idx));
|
||||
idx = fns.randomStringIndex();
|
||||
assertEquals("preceding" + idx, fns.expectedPreceding(idx), bi.preceding(idx));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue