ICU-21946 RBBI Break Cache Optimizations

Adjust RuleBasedBreakIterator::BreakCache::populateNear() to retain the cache
the cache contents in additional cases where are still useful, resulting in
improved performance.

This change is related to PR #2039, which addressed the same problem. This one
retains the cache contents in more situations.
This commit is contained in:
Andy Heninger 2022-05-09 17:29:04 -07:00 committed by Markus Scherer
parent 2673378260
commit b95c6b1f3e
5 changed files with 267 additions and 52 deletions

View file

@ -341,41 +341,87 @@ UBool RuleBasedBreakIterator::BreakCache::populateNear(int32_t position, UErrorC
}
U_ASSERT(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]);
// Find a boundary somewhere in the vicinity of the requested position.
// Depending on the safe rules and the text data, it could be either before, at, or after
// the requested position.
// Add boundaries to the cache near the specified position.
// The given position need not be a boundary itself.
// The input position must be within the range of the text, and
// on a code point boundary.
// If the requested position is a break boundary, leave the iteration
// position on it.
// If the requested position is not a boundary, leave the iteration
// position on the preceding boundary and include both the
// preceding and following boundaries in the cache.
// Additional boundaries, either preceding or following, may be added
// to the cache as a side effect.
// If the requested position is not near already cached positions, clear the existing cache,
// find a near-by boundary and begin new cache contents there.
if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) {
int32_t aBoundary = 0;
int32_t ruleStatusIndex = 0;
if (position > 20) {
int32_t backupPos = fBI->handleSafePrevious(position);
// Threshold for a text position to be considered near to existing cache contents.
// TODO: See issue ICU-22024 "perf tuning of Cache needed."
// This value is subject to change. See the ticket for more details.
static constexpr int32_t CACHE_NEAR = 15;
if (backupPos > 0) {
// Advance to the boundary following the backup position.
// There is a complication: the safe reverse rules identify pairs of code points
// that are safe. If advancing from the safe point moves forwards by less than
// two code points, we need to advance one more time to ensure that the boundary
// is good, including a correct rules status value.
//
fBI->fPosition = backupPos;
aBoundary = fBI->handleNext();
if (aBoundary <= backupPos + 4) {
// +4 is a quick test for possibly having advanced only one codepoint.
// Four being the length of the longest potential code point, a supplementary in UTF-8
utext_setNativeIndex(&fBI->fText, aBoundary);
if (backupPos == utext_getPreviousNativeIndex(&fBI->fText)) {
// The initial handleNext() only advanced by a single code point. Go again.
aBoundary = fBI->handleNext(); // Safe rules identify safe pairs.
}
int32_t aBoundary = -1;
int32_t ruleStatusIndex = 0;
bool retainCache = false;
if ((position > fBoundaries[fStartBufIdx] - CACHE_NEAR) && position < (fBoundaries[fEndBufIdx] + CACHE_NEAR)) {
// Requested position is near the existing cache. Retain it.
retainCache = true;
} else if (position <= CACHE_NEAR) {
// Requested position is near the start of the text. Fill cache from start, skipping
// the need to find a safe point.
retainCache = false;
aBoundary = 0;
} else {
// Requested position is not near the existing cache.
// Find a safe point to refill the cache from.
int32_t backupPos = fBI->handleSafePrevious(position);
if (fBoundaries[fEndBufIdx] < position && fBoundaries[fEndBufIdx] >= (backupPos - CACHE_NEAR)) {
// The requested position is beyond the end of the existing cache, but the
// reverse rules produced a position near or before the cached region.
// Retain the existing cache, and fill from the end of it.
retainCache = true;
} else if (backupPos < CACHE_NEAR) {
// The safe reverse rules moved us to near the start of text.
// Take that (index 0) as the backup boundary, avoiding the complication
// (in the following block) of moving forward from the safe point to a known boundary.
//
// Retain the cache if it begins not too far from the requested position.
aBoundary = 0;
retainCache = (fBoundaries[fStartBufIdx] <= (position + CACHE_NEAR));
} else {
// The safe reverse rules produced a position that is neither near the existing
// cache, nor near the start of text.
// Advance to the boundary following.
// There is a complication: the safe reverse rules identify pairs of code points
// that are safe. If advancing from the safe point moves forwards by less than
// two code points, we need to advance one more time to ensure that the boundary
// is good, including a correct rules status value.
retainCache = false;
fBI->fPosition = backupPos;
aBoundary = fBI->handleNext();
if (aBoundary != UBRK_DONE && aBoundary <= backupPos + 4) {
// +4 is a quick test for possibly having advanced only one codepoint.
// Four being the length of the longest potential code point, a supplementary in UTF-8
utext_setNativeIndex(&fBI->fText, aBoundary);
if (backupPos == utext_getPreviousNativeIndex(&fBI->fText)) {
// The initial handleNext() only advanced by a single code point. Go again.
aBoundary = fBI->handleNext(); // Safe rules identify safe pairs.
}
ruleStatusIndex = fBI->fRuleStatusIndex;
}
if (aBoundary == UBRK_DONE) {
// Note (Andy Heninger): I don't think this condition can occur, but it's hard
// to prove that it can't. We ran off the end of the string looking a boundary
// following a safe point; choose the end of the string as that boundary.
aBoundary = utext_nativeLength(&fBI->fText);
}
ruleStatusIndex = fBI->fRuleStatusIndex;
}
}
if (!retainCache) {
U_ASSERT(aBoundary != -1);
reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point.
}

View file

@ -139,6 +139,7 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
TESTCASE_AUTO(TestUnpairedSurrogate);
TESTCASE_AUTO(TestLSTMThai);
TESTCASE_AUTO(TestLSTMBurmese);
TESTCASE_AUTO(TestRandomAccess);
#if U_ENABLE_TRACING
TESTCASE_AUTO(TestTraceCreateCharacter);
@ -5453,4 +5454,66 @@ void RBBITest::TestLSTMBurmese() {
runLSTMTestFromFile("Burmese_graphclust_model5_heavy_Test.txt", USCRIPT_MYANMAR);
}
// Test preceding(index) and following(index), with semi-random indexes.
// The random indexes are produced in clusters that are relatively closely spaced,
// to increase the occurrences of hits to the internal break cache.
void RBBITest::TestRandomAccess() {
static constexpr int32_t CACHE_SIZE = 128;
UnicodeString testData;
for (int i=0; i<CACHE_SIZE*2; ++i) {
testData.append(u"aaaa\n");
}
UErrorCode status = U_ZERO_ERROR;
LocalPointer<RuleBasedBreakIterator> bi(
(RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status),
status);
if (!assertSuccess(WHERE, status)) { return; };
bi->setText(testData);
auto expectedPreceding = [](int from) {
if (from == 0) {return UBRK_DONE;}
if (from % 5 == 0) {return from - 5;}
return from - (from % 5);
};
auto expectedFollow = [testData](int from) {
if (from >= testData.length()) {return UBRK_DONE;}
if (from % 5 == 0) {return from + 5;}
return from + (5 - (from % 5));
};
auto randomStringIndex = [testData]() {
static icu_rand randomGenerator; // produces random uint32_t values.
static int lastNum;
static int clusterCount;
static constexpr int CLUSTER_SIZE = 100;
static constexpr int CLUSTER_LENGTH = 10;
if (clusterCount < CLUSTER_LENGTH) {
++clusterCount;
lastNum += (randomGenerator() % CLUSTER_SIZE);
lastNum -= CLUSTER_SIZE / 2;
lastNum = std::max(0, lastNum);
// Deliberately test indexes > testData.length.
lastNum = std::min(testData.length() + 5, lastNum);
} else {
clusterCount = 0;
lastNum = randomGenerator() % testData.length();
}
return lastNum;
};
for (int i=0; i<5000; ++i) {
int idx = randomStringIndex();
assertEquals(WHERE, expectedFollow(idx), bi->following(idx));
idx = randomStringIndex();
assertEquals(WHERE, expectedPreceding(idx), bi->preceding(idx));
}
}
#endif // #if !UCONFIG_NO_BREAK_ITERATION

View file

@ -95,6 +95,7 @@ public:
void TestBug13590();
void TestLSTMThai();
void TestLSTMBurmese();
void TestRandomAccess();
#if U_ENABLE_TRACING
void TestTraceCreateCharacter();

View file

@ -1436,42 +1436,89 @@ class BreakCache {
boolean populateNear(int position) {
assert(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]);
// Find a boundary somewhere in the vicinity of the requested position.
// Depending on the safe rules and the text data, it could be either before, at, or after
// the requested position.
// Add boundaries to the cache near the specified position.
// The given position need not be a boundary itself.
// The input position must be within the range of the text, and
// on a code point boundary.
// If the requested position is a break boundary, leave the iteration
// position on it.
// If the requested position is not a boundary, leave the iteration
// position on the preceding boundary and include both the
// preceding and following boundaries in the cache.
// Additional boundaries, either preceding or following, may be added
// to the cache as a side effect.
// If the requested position is not near already cached positions, clear the existing cache,
// find a near-by boundary and begin new cache contents there.
if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) {
int aBoundary = fText.getBeginIndex();
int ruleStatusIndex = 0;
if (position > aBoundary + 20) {
int backupPos = handleSafePrevious(position);
if (backupPos > aBoundary) {
// Advance to the boundary following the backup position.
// There is a complication: the safe reverse rules identify pairs of code points
// that are safe. If advancing from the safe point moves forwards by less than
// two code points, we need to advance one more time to ensure that the boundary
// is good, including a correct rules status value.
//
fPosition = backupPos;
// Threshold for a text position to be considered near to existing cache contents.
// TODO: See issue ICU-22024 "perf tuning of Cache needed."
// This value is subject to change. See the ticket for more details.
final int CACHE_NEAR = 15;
int startOfText = fText.getBeginIndex();
int aBoundary = -1;
int ruleStatusIndex = 0;
boolean retainCache = false;
if ((position > fBoundaries[fStartBufIdx] - CACHE_NEAR) && position < (fBoundaries[fEndBufIdx] + CACHE_NEAR)) {
// Requested position is near the existing cache. Retain it.
retainCache = true;
} else if (position <= startOfText + CACHE_NEAR) {
// Requested position is near the start of the text. Fill cache from start, skipping
// the need to find a safe point.
retainCache = false;
aBoundary = startOfText;
} else {
// Requested position is not near the existing cache.
// Find a safe point to refill the cache from.
int backupPos = handleSafePrevious(position);
if (fBoundaries[fEndBufIdx] < position && fBoundaries[fEndBufIdx] >= (backupPos - CACHE_NEAR)) {
// The requested position is beyond the end of the existing cache, but the
// reverse rules produced a position near or before the cached region.
// Retain the existing cache, and fill from the end of it.
retainCache = true;
} else if (backupPos < startOfText + CACHE_NEAR) {
// The safe reverse rules moved us to near the start of text.
// Take that (index 0) as the backup boundary, avoiding the complication
// (in the following block) of moving forward from the safe point to a known boundary.
//
// Retain the cache if it begins not too far from the requested position.
aBoundary = startOfText;
retainCache = (fBoundaries[fStartBufIdx] <= (position + CACHE_NEAR));
} else {
// The safe reverse rules produced a position that is neither near the existing
// cache, nor near the start of text.
// Advance to the boundary following.
// There is a complication: the safe reverse rules identify pairs of code points
// that are safe. If advancing from the safe point moves forwards by less than
// two code points, we need to advance one more time to ensure that the boundary
// is good, including a correct rules status value.
//
retainCache = false;
fPosition = backupPos;
aBoundary = handleNext();
if (aBoundary == backupPos + 1 ||
(aBoundary == backupPos + 2 &&
Character.isHighSurrogate(fText.setIndex(backupPos)) &&
Character.isLowSurrogate(fText.next()))) {
// The initial handleNext() only advanced by a single code point. Go again.
// Safe rules identify safe pairs.
aBoundary = handleNext();
if (aBoundary == backupPos + 1 ||
(aBoundary == backupPos + 2 &&
Character.isHighSurrogate(fText.setIndex(backupPos)) &&
Character.isLowSurrogate(fText.next()))) {
// The initial handleNext() only advanced by a single code point. Go again.
// Safe rules identify safe pairs.
aBoundary = handleNext();
}
}
if (aBoundary == BreakIterator.DONE) {
aBoundary = fText.getEndIndex();
}
ruleStatusIndex = fRuleStatusIndex;
}
}
if (!retainCache) {
assert(aBoundary != -1);
reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point.
}
// Fill in boundaries between existing cache content and the new requested position.
if (fBoundaries[fEndBufIdx] < position) {

View file

@ -946,4 +946,62 @@ public class RBBITest extends TestFmwk {
bi = new RuleBasedBreakIterator(rules);
assertEquals("Rules does not match", rules, bi.toString());
}
/* Test preceding(index) and following(index), with semi-random indexes.
* The random indexes are produced in clusters that are relatively closely spaced,
* to increase the occurrences of hits to the internal break cache.
*/
@Test
public void TestRandomAccess() {
final int CACHE_SIZE = 128;
final StringBuilder testData = new StringBuilder();
for (int i=0; i<CACHE_SIZE*2; ++i) {
testData.append("aaaa\n");
}
RuleBasedBreakIterator bi =
(RuleBasedBreakIterator)BreakIterator.getLineInstance(ULocale.ENGLISH);
bi.setText(testData);
class Fns { // This class exists only to allow declaring nested functions
// within TestRandomAccess().
public int expectedPreceding(int from) {
if (from == 0) {return BreakIterator.DONE;}
if (from % 5 == 0) {return from - 5;}
return from - (from % 5);
};
public int expectedFollow(int from) {
if (from >= testData.length()) {return BreakIterator.DONE;}
if (from % 5 == 0) {return from + 5;}
return from + (5 - (from % 5));
};
ICU_Rand randomGenerator = new ICU_Rand(0);
int lastNum;
int clusterCount;
static final int CLUSTER_SIZE = 100;
static final int CLUSTER_LENGTH = 10;
public int randomStringIndex() {
if (clusterCount < CLUSTER_LENGTH) {
++clusterCount;
lastNum += (randomGenerator.next() % CLUSTER_SIZE);
lastNum -= CLUSTER_SIZE / 2;
lastNum = Math.max(0, lastNum);
lastNum = Math.min(testData.length() + 5, lastNum);
} else {
clusterCount = 0;
lastNum = randomGenerator.next() % testData.length();
}
return lastNum;
};
};
Fns fns = new Fns();
for (int i=0; i<5000; ++i) {
int idx = fns.randomStringIndex();
assertEquals("following" + idx, fns.expectedFollow(idx), bi.following(idx));
idx = fns.randomStringIndex();
assertEquals("preceding" + idx, fns.expectedPreceding(idx), bi.preceding(idx));
}
}
}