mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 06:53:45 +00:00
ICU-13194 RBBI safe tables, all tests passing!
X-SVN-Rev: 41155
This commit is contained in:
parent
62dd66a13d
commit
b1b0be93ea
9 changed files with 61 additions and 66 deletions
|
@ -651,7 +651,7 @@ UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
|
|||
}
|
||||
|
||||
// Adjust offset to be on a code point boundary and not beyond the end of the text.
|
||||
// Note that isBoundary() is always be false for offsets that are not on code point boundaries.
|
||||
// Note that isBoundary() is always false for offsets that are not on code point boundaries.
|
||||
// But we still need the side effect of leaving iteration at the following boundary.
|
||||
|
||||
utext_setNativeIndex(&fText, offset);
|
||||
|
@ -1116,7 +1116,7 @@ int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
|
|||
UChar32 c;
|
||||
int32_t result = 0;
|
||||
|
||||
const RBBIStateTable *stateTable = fData->fSafeRevTable;
|
||||
const RBBIStateTable *stateTable = fData->fReverseTable;
|
||||
UTEXT_SETNATIVEINDEX(&fText, fromPosition);
|
||||
#ifdef RBBI_DEBUG
|
||||
if (gTrace) {
|
||||
|
|
|
@ -206,7 +206,7 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo
|
|||
* BreakCache implemetation
|
||||
*/
|
||||
|
||||
RuleBasedBreakIterator::BreakCache::BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status) :
|
||||
RuleBasedBreakIterator::BreakCache::BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status) :
|
||||
fBI(bi), fSideBuffer(status) {
|
||||
reset();
|
||||
}
|
||||
|
@ -299,7 +299,7 @@ void RuleBasedBreakIterator::BreakCache::previous(UErrorCode &status) {
|
|||
fBI->fPosition = fTextIdx;
|
||||
fBI->fRuleStatusIndex = fStatuses[fBufIdx];
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
UBool RuleBasedBreakIterator::BreakCache::seek(int32_t pos) {
|
||||
|
@ -317,7 +317,7 @@ UBool RuleBasedBreakIterator::BreakCache::seek(int32_t pos) {
|
|||
fTextIdx = fBoundaries[fBufIdx];
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
|
||||
int32_t min = fStartBufIdx;
|
||||
int32_t max = fEndBufIdx;
|
||||
while (min != max) {
|
||||
|
@ -354,20 +354,33 @@ UBool RuleBasedBreakIterator::BreakCache::populateNear(int32_t position, UErrorC
|
|||
if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) {
|
||||
int32_t aBoundary = 0;
|
||||
int32_t ruleStatusIndex = 0;
|
||||
// TODO: check for position == length of text. Although may still need to back up to get rule status.
|
||||
if (position > 20) {
|
||||
int32_t backupPos = fBI->handleSafePrevious(position);
|
||||
fBI->fPosition = backupPos;
|
||||
aBoundary = fBI->handleNext(); // Ignore dictionary, just finding a rule based boundary.
|
||||
if (aBoundary == backupPos + 1) { // TODO: + 1 is wrong for supplementals.
|
||||
// Safe rules work on pairs. +1 from start pos may be a false match.
|
||||
|
||||
if (backupPos > 0) {
|
||||
// Advance to the boundary following the backup position.
|
||||
// There is a complication: the safe reverse rules identify pairs of code points
|
||||
// that are safe. If advancing from the safe point moves forwards by less than
|
||||
// two code points, we need to advance one more time to ensure that the boundary
|
||||
// is good, including a correct rules status value.
|
||||
//
|
||||
fBI->fPosition = backupPos;
|
||||
aBoundary = fBI->handleNext();
|
||||
if (aBoundary <= backupPos + 4) {
|
||||
// +4 is a quick test for possibly having advanced only one codepoint.
|
||||
// Four being the length of the longest potential code point, a supplementary in UTF-8
|
||||
utext_setNativeIndex(&fBI->fText, aBoundary);
|
||||
if (backupPos == utext_getPreviousNativeIndex(&fBI->fText)) {
|
||||
// The initial handleNext() only advanced by a single code point. Go again.
|
||||
aBoundary = fBI->handleNext(); // Safe rules identify safe pairs.
|
||||
}
|
||||
}
|
||||
ruleStatusIndex = fBI->fRuleStatusIndex;
|
||||
}
|
||||
ruleStatusIndex = fBI->fRuleStatusIndex;
|
||||
}
|
||||
reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point.
|
||||
}
|
||||
|
||||
|
||||
// Fill in boundaries between existing cache content and the new requested position.
|
||||
|
||||
if (fBoundaries[fEndBufIdx] < position) {
|
||||
|
@ -495,13 +508,24 @@ UBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status)
|
|||
position = 0;
|
||||
positionStatusIdx = 0;
|
||||
} else {
|
||||
fBI->fPosition = backupPosition; // TODO: pass starting position in a clearer way.
|
||||
position = fBI->handleNext(); // TODO: supplementals don't work with the +1.
|
||||
if (position == backupPosition + 1) {
|
||||
position = fBI->handleNext(); // Safe rules identify safe pairs.
|
||||
// Advance to the boundary following the backup position.
|
||||
// There is a complication: the safe reverse rules identify pairs of code points
|
||||
// that are safe. If advancing from the safe point moves forwards by less than
|
||||
// two code points, we need to advance one more time to ensure that the boundary
|
||||
// is good, including a correct rules status value.
|
||||
//
|
||||
fBI->fPosition = backupPosition;
|
||||
position = fBI->handleNext();
|
||||
if (position <= backupPosition + 4) {
|
||||
// +4 is a quick test for possibly having advanced only one codepoint.
|
||||
// Four being the length of the longest potential code point, a supplementary in UTF-8
|
||||
utext_setNativeIndex(&fBI->fText, position);
|
||||
if (backupPosition == utext_getPreviousNativeIndex(&fBI->fText)) {
|
||||
// The initial handleNext() only advanced by a single code point. Go again.
|
||||
position = fBI->handleNext(); // Safe rules identify safe pairs.
|
||||
}
|
||||
};
|
||||
positionStatusIdx = fBI->fRuleStatusIndex;
|
||||
|
||||
}
|
||||
} while (position >= fromPosition);
|
||||
|
||||
|
@ -540,7 +564,7 @@ UBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status)
|
|||
}
|
||||
U_ASSERT(position==dictSegEndPosition || position>=fromPosition);
|
||||
}
|
||||
|
||||
|
||||
if (!segmentHandledByDictionary && position < fromPosition) {
|
||||
fSideBuffer.addElement(position, status);
|
||||
fSideBuffer.addElement(positionStatusIdx, status);
|
||||
|
@ -566,7 +590,7 @@ UBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status)
|
|||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return success;
|
||||
}
|
||||
|
||||
|
|
|
@ -106,7 +106,10 @@ void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
|
|||
if (data->fFTableLen != 0) {
|
||||
fForwardTable = (RBBIStateTable *)((char *)data + fHeader->fFTable);
|
||||
}
|
||||
if (data->fSRTableLen != 0) {
|
||||
if (data->fRTableLen != 0) {
|
||||
fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable);
|
||||
}
|
||||
if (data->fSRTableLen != 0) { // TODO: obsolete. Remove.
|
||||
fSafeRevTable = (RBBIStateTable *)((char *)data + fHeader->fSRTable);
|
||||
}
|
||||
|
||||
|
|
|
@ -173,7 +173,8 @@ public:
|
|||
/* */
|
||||
const RBBIDataHeader *fHeader;
|
||||
const RBBIStateTable *fForwardTable;
|
||||
const RBBIStateTable *fSafeRevTable;
|
||||
const RBBIStateTable *fReverseTable; // auto-generated safe reverse.
|
||||
const RBBIStateTable *fSafeRevTable; // hand-written safe reverse. TODO: delete this.
|
||||
const UChar *fRuleSource;
|
||||
const int32_t *fRuleStatusTable;
|
||||
|
||||
|
|
|
@ -300,10 +300,9 @@ RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
|
|||
fForwardTables->buildSafe(status);
|
||||
|
||||
|
||||
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();};
|
||||
#ifdef RBBI_DEBUG
|
||||
if (fDebugEnv && uprv_strstr(fDebugEnv, "states")) {
|
||||
fForwardTables
|
||||
fForwardTables->printStates();
|
||||
fForwardTables->printRuleStatusTable();
|
||||
fForwardTables->printSafeTable();
|
||||
}
|
||||
|
|
|
@ -1312,10 +1312,10 @@ void RBBITableBuilder::buildSafe(UErrorCode &status) {
|
|||
if (wantedEndState == endState) {
|
||||
int32_t pair = c1 << 16 | c2;
|
||||
safePairs.addElement(pair, status);
|
||||
printf("(%d, %d) ", c1, c2);
|
||||
// printf("(%d, %d) ", c1, c2);
|
||||
}
|
||||
}
|
||||
printf("\n");
|
||||
// printf("\n");
|
||||
}
|
||||
|
||||
// Populate the initial safe table.
|
||||
|
@ -1358,7 +1358,7 @@ void RBBITableBuilder::buildSafe(UErrorCode &status) {
|
|||
rowState.setCharAt(c1, 0);
|
||||
}
|
||||
|
||||
// Merge similar states.
|
||||
// TODO: Merge similar states.
|
||||
|
||||
}
|
||||
|
||||
|
@ -1392,9 +1392,9 @@ int32_t RBBITableBuilder::getSafeTableSize() const {
|
|||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// exportTable() export the state transition table in the format required
|
||||
// by the runtime engine. getTableSize() bytes of memory
|
||||
// must be available at the output address "where".
|
||||
// exportSafeTable() export the state transition table in the format required
|
||||
// by the runtime engine. getTableSize() bytes of memory
|
||||
// must be available at the output address "where".
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
void RBBITableBuilder::exportSafeTable(void *where) {
|
||||
|
|
|
@ -308,14 +308,14 @@ static void TestBreakIteratorCAPI()
|
|||
|
||||
|
||||
log_verbose("\nTesting the functions for sentence\n");
|
||||
ubrk_first(sentence);
|
||||
pos = ubrk_first(sentence);
|
||||
pos = ubrk_current(sentence);
|
||||
log_verbose("Current(sentence) = %d\n", (int32_t)pos);
|
||||
pos = ubrk_last(sentence);
|
||||
if(pos!=49)
|
||||
log_err("error ubrk_last for sentence did not return 49\n");
|
||||
log_verbose("Last (sentence) = %d\n", (int32_t)pos);
|
||||
ubrk_first(sentence);
|
||||
pos = ubrk_first(sentence);
|
||||
to = ubrk_following( sentence, 0 );
|
||||
if (to == 0) log_err("ubrk_following returned 0\n");
|
||||
to = ubrk_preceding( sentence, to );
|
||||
|
|
|
@ -4551,40 +4551,6 @@ void RBBITest::TestDebug(void) {
|
|||
UParseError pe;
|
||||
LocalPointer<RuleBasedBreakIterator> newbi(new RuleBasedBreakIterator(rules, pe, status));
|
||||
assertSuccess(WHERE, status);
|
||||
|
||||
#if 0
|
||||
bi->dumpTables();
|
||||
|
||||
RBBIDataWrapper *dw = bi->fData;
|
||||
const RBBIStateTable *fwtbl = dw->fForwardTable;
|
||||
int32_t numCharClasses = dw->fHeader->fCatCount;
|
||||
printf("Char Classes: %d states: %d\n", numCharClasses, fwtbl->fNumStates);
|
||||
|
||||
for (int32_t c1=0; c1<numCharClasses; ++c1) {
|
||||
for (int32_t c2=0; c2 < numCharClasses; ++c2) {
|
||||
int32_t wantedEndState = -1;
|
||||
int32_t endState = 0;
|
||||
for (int32_t startState = 1; startState < (int32_t)fwtbl->fNumStates; ++startState) {
|
||||
RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * startState));
|
||||
int32_t s2 = row->fNextState[c1];
|
||||
row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * s2));
|
||||
endState = row->fNextState[c2];
|
||||
if (wantedEndState < 0) {
|
||||
wantedEndState = endState;
|
||||
} else {
|
||||
if (wantedEndState != endState) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (wantedEndState == endState) {
|
||||
printf("(%d, %d) ", c1, c2);
|
||||
}
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
printf("\n");
|
||||
#endif
|
||||
}
|
||||
|
||||
void RBBITest::TestProperties() {
|
||||
|
|
4
icu4c/source/test/testdata/rbbitst.txt
vendored
4
icu4c/source/test/testdata/rbbitst.txt
vendored
|
@ -39,7 +39,9 @@
|
|||
|
||||
# Temp debugging tests
|
||||
#
|
||||
|
||||
<word>
|
||||
<data>•
|
||||
•</data>
|
||||
|
||||
## FILTERED BREAK TESTS
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue