ICU-13194 RBBI safe tables, all tests passing!

X-SVN-Rev: 41155
This commit is contained in:
Andy Heninger 2018-03-26 23:01:16 +00:00
parent 62dd66a13d
commit b1b0be93ea
9 changed files with 61 additions and 66 deletions

View file

@ -651,7 +651,7 @@ UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
}
// Adjust offset to be on a code point boundary and not beyond the end of the text.
// Note that isBoundary() is always be false for offsets that are not on code point boundaries.
// Note that isBoundary() is always false for offsets that are not on code point boundaries.
// But we still need the side effect of leaving iteration at the following boundary.
utext_setNativeIndex(&fText, offset);
@ -1116,7 +1116,7 @@ int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
UChar32 c;
int32_t result = 0;
const RBBIStateTable *stateTable = fData->fSafeRevTable;
const RBBIStateTable *stateTable = fData->fReverseTable;
UTEXT_SETNATIVEINDEX(&fText, fromPosition);
#ifdef RBBI_DEBUG
if (gTrace) {

View file

@ -206,7 +206,7 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo
* BreakCache implemetation
*/
RuleBasedBreakIterator::BreakCache::BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status) :
RuleBasedBreakIterator::BreakCache::BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status) :
fBI(bi), fSideBuffer(status) {
reset();
}
@ -299,7 +299,7 @@ void RuleBasedBreakIterator::BreakCache::previous(UErrorCode &status) {
fBI->fPosition = fTextIdx;
fBI->fRuleStatusIndex = fStatuses[fBufIdx];
return;
}
}
UBool RuleBasedBreakIterator::BreakCache::seek(int32_t pos) {
@ -317,7 +317,7 @@ UBool RuleBasedBreakIterator::BreakCache::seek(int32_t pos) {
fTextIdx = fBoundaries[fBufIdx];
return TRUE;
}
int32_t min = fStartBufIdx;
int32_t max = fEndBufIdx;
while (min != max) {
@ -354,20 +354,33 @@ UBool RuleBasedBreakIterator::BreakCache::populateNear(int32_t position, UErrorC
if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) {
int32_t aBoundary = 0;
int32_t ruleStatusIndex = 0;
// TODO: check for position == length of text. Although may still need to back up to get rule status.
if (position > 20) {
int32_t backupPos = fBI->handleSafePrevious(position);
fBI->fPosition = backupPos;
aBoundary = fBI->handleNext(); // Ignore dictionary, just finding a rule based boundary.
if (aBoundary == backupPos + 1) { // TODO: + 1 is wrong for supplementals.
// Safe rules work on pairs. +1 from start pos may be a false match.
if (backupPos > 0) {
// Advance to the boundary following the backup position.
// There is a complication: the safe reverse rules identify pairs of code points
// that are safe. If advancing from the safe point moves forwards by less than
// two code points, we need to advance one more time to ensure that the boundary
// is good, including a correct rules status value.
//
fBI->fPosition = backupPos;
aBoundary = fBI->handleNext();
if (aBoundary <= backupPos + 4) {
// +4 is a quick test for possibly having advanced only one codepoint.
// Four being the length of the longest potential code point, a supplementary in UTF-8
utext_setNativeIndex(&fBI->fText, aBoundary);
if (backupPos == utext_getPreviousNativeIndex(&fBI->fText)) {
// The initial handleNext() only advanced by a single code point. Go again.
aBoundary = fBI->handleNext(); // Safe rules identify safe pairs.
}
}
ruleStatusIndex = fBI->fRuleStatusIndex;
}
ruleStatusIndex = fBI->fRuleStatusIndex;
}
reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point.
}
// Fill in boundaries between existing cache content and the new requested position.
if (fBoundaries[fEndBufIdx] < position) {
@ -495,13 +508,24 @@ UBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status)
position = 0;
positionStatusIdx = 0;
} else {
fBI->fPosition = backupPosition; // TODO: pass starting position in a clearer way.
position = fBI->handleNext(); // TODO: supplementals don't work with the +1.
if (position == backupPosition + 1) {
position = fBI->handleNext(); // Safe rules identify safe pairs.
// Advance to the boundary following the backup position.
// There is a complication: the safe reverse rules identify pairs of code points
// that are safe. If advancing from the safe point moves forwards by less than
// two code points, we need to advance one more time to ensure that the boundary
// is good, including a correct rules status value.
//
fBI->fPosition = backupPosition;
position = fBI->handleNext();
if (position <= backupPosition + 4) {
// +4 is a quick test for possibly having advanced only one codepoint.
// Four being the length of the longest potential code point, a supplementary in UTF-8
utext_setNativeIndex(&fBI->fText, position);
if (backupPosition == utext_getPreviousNativeIndex(&fBI->fText)) {
// The initial handleNext() only advanced by a single code point. Go again.
position = fBI->handleNext(); // Safe rules identify safe pairs.
}
};
positionStatusIdx = fBI->fRuleStatusIndex;
}
} while (position >= fromPosition);
@ -540,7 +564,7 @@ UBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status)
}
U_ASSERT(position==dictSegEndPosition || position>=fromPosition);
}
if (!segmentHandledByDictionary && position < fromPosition) {
fSideBuffer.addElement(position, status);
fSideBuffer.addElement(positionStatusIdx, status);
@ -566,7 +590,7 @@ UBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status)
break;
}
}
return success;
}

View file

@ -106,7 +106,10 @@ void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
if (data->fFTableLen != 0) {
fForwardTable = (RBBIStateTable *)((char *)data + fHeader->fFTable);
}
if (data->fSRTableLen != 0) {
if (data->fRTableLen != 0) {
fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable);
}
if (data->fSRTableLen != 0) { // TODO: obsolete. Remove.
fSafeRevTable = (RBBIStateTable *)((char *)data + fHeader->fSRTable);
}

View file

@ -173,7 +173,8 @@ public:
/* */
const RBBIDataHeader *fHeader;
const RBBIStateTable *fForwardTable;
const RBBIStateTable *fSafeRevTable;
const RBBIStateTable *fReverseTable; // auto-generated safe reverse.
const RBBIStateTable *fSafeRevTable; // hand-written safe reverse. TODO: delete this.
const UChar *fRuleSource;
const int32_t *fRuleStatusTable;

View file

@ -300,10 +300,9 @@ RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
fForwardTables->buildSafe(status);
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();};
#ifdef RBBI_DEBUG
if (fDebugEnv && uprv_strstr(fDebugEnv, "states")) {
fForwardTables
fForwardTables->printStates();
fForwardTables->printRuleStatusTable();
fForwardTables->printSafeTable();
}

View file

@ -1312,10 +1312,10 @@ void RBBITableBuilder::buildSafe(UErrorCode &status) {
if (wantedEndState == endState) {
int32_t pair = c1 << 16 | c2;
safePairs.addElement(pair, status);
printf("(%d, %d) ", c1, c2);
// printf("(%d, %d) ", c1, c2);
}
}
printf("\n");
// printf("\n");
}
// Populate the initial safe table.
@ -1358,7 +1358,7 @@ void RBBITableBuilder::buildSafe(UErrorCode &status) {
rowState.setCharAt(c1, 0);
}
// Merge similar states.
// TODO: Merge similar states.
}
@ -1392,9 +1392,9 @@ int32_t RBBITableBuilder::getSafeTableSize() const {
//-----------------------------------------------------------------------------
//
// exportTable() export the state transition table in the format required
// by the runtime engine. getTableSize() bytes of memory
// must be available at the output address "where".
// exportSafeTable() export the state transition table in the format required
// by the runtime engine. getTableSize() bytes of memory
// must be available at the output address "where".
//
//-----------------------------------------------------------------------------
void RBBITableBuilder::exportSafeTable(void *where) {

View file

@ -308,14 +308,14 @@ static void TestBreakIteratorCAPI()
log_verbose("\nTesting the functions for sentence\n");
ubrk_first(sentence);
pos = ubrk_first(sentence);
pos = ubrk_current(sentence);
log_verbose("Current(sentence) = %d\n", (int32_t)pos);
pos = ubrk_last(sentence);
if(pos!=49)
log_err("error ubrk_last for sentence did not return 49\n");
log_verbose("Last (sentence) = %d\n", (int32_t)pos);
ubrk_first(sentence);
pos = ubrk_first(sentence);
to = ubrk_following( sentence, 0 );
if (to == 0) log_err("ubrk_following returned 0\n");
to = ubrk_preceding( sentence, to );

View file

@ -4551,40 +4551,6 @@ void RBBITest::TestDebug(void) {
UParseError pe;
LocalPointer<RuleBasedBreakIterator> newbi(new RuleBasedBreakIterator(rules, pe, status));
assertSuccess(WHERE, status);
#if 0
bi->dumpTables();
RBBIDataWrapper *dw = bi->fData;
const RBBIStateTable *fwtbl = dw->fForwardTable;
int32_t numCharClasses = dw->fHeader->fCatCount;
printf("Char Classes: %d states: %d\n", numCharClasses, fwtbl->fNumStates);
for (int32_t c1=0; c1<numCharClasses; ++c1) {
for (int32_t c2=0; c2 < numCharClasses; ++c2) {
int32_t wantedEndState = -1;
int32_t endState = 0;
for (int32_t startState = 1; startState < (int32_t)fwtbl->fNumStates; ++startState) {
RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * startState));
int32_t s2 = row->fNextState[c1];
row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * s2));
endState = row->fNextState[c2];
if (wantedEndState < 0) {
wantedEndState = endState;
} else {
if (wantedEndState != endState) {
break;
}
}
}
if (wantedEndState == endState) {
printf("(%d, %d) ", c1, c2);
}
}
printf("\n");
}
printf("\n");
#endif
}
void RBBITest::TestProperties() {

View file

@ -39,7 +39,9 @@
# Temp debugging tests
#
<word>
<data>•
•</data>
## FILTERED BREAK TESTS