mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 06:53:45 +00:00
ICU-2292 added safe forward and backwards rules
X-SVN-Rev: 13648
This commit is contained in:
parent
d0370e2786
commit
41ac2f557b
7 changed files with 301 additions and 169 deletions
|
@ -405,8 +405,8 @@ int32_t RuleBasedBreakIterator::previous(void) {
|
|||
return BreakIterator::DONE;
|
||||
}
|
||||
|
||||
if (fData->fSafeRevTable != NULL) {
|
||||
return handleNewPrevious();
|
||||
if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) {
|
||||
return handlePrevious(fData->fReverseTable);
|
||||
}
|
||||
|
||||
// old rule syntax
|
||||
|
@ -486,27 +486,56 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
|
|||
if (fData->fSafeRevTable != NULL) {
|
||||
// new rule syntax
|
||||
/// todo synwee
|
||||
/// fText->setIndex(offset);
|
||||
fText->setIndex(fText->startIndex());
|
||||
|
||||
result = fText->startIndex();
|
||||
}
|
||||
else {
|
||||
// otherwise, we have to sync up first. Use handlePrevious() to back
|
||||
// us up to a known break position before the specified position (if
|
||||
// we can determine that the specified position is a break position,
|
||||
// we don't back up at all). This may or may not be the last break
|
||||
// position at or before our starting position. Advance forward
|
||||
// from here until we've passed the starting position. The position
|
||||
// we stop on will be the first break position after the specified one.
|
||||
// old rule syntax
|
||||
|
||||
fText->setIndex(offset);
|
||||
if (offset == fText->startIndex()) {
|
||||
return handleNext();
|
||||
// move forward one codepoint to prepare for moving back to a
|
||||
// safe point.
|
||||
// this handles offset being in the middle of a supplementary character
|
||||
fText->next32();
|
||||
// handlePrevious will move most of the time to < 1 boundary away
|
||||
handlePrevious(fData->fSafeRevTable);
|
||||
int32_t result = next();
|
||||
while (result <= offset) {
|
||||
result = next();
|
||||
}
|
||||
result = previous();
|
||||
return result;
|
||||
}
|
||||
if (fData->fSafeFwdTable != NULL) {
|
||||
// backup plan if forward safe table is not available
|
||||
fText->setIndex(offset);
|
||||
fText->previous32();
|
||||
// handle next will give result >= offset
|
||||
handleNext(fData->fSafeFwdTable);
|
||||
// previous will give result 0 or 1 boundary away from offset,
|
||||
// most of the time
|
||||
// we have to
|
||||
int32_t oldresult = previous();
|
||||
while (oldresult > offset) {
|
||||
int32_t result = previous();
|
||||
if (result <= offset) {
|
||||
return oldresult;
|
||||
}
|
||||
oldresult = result;
|
||||
}
|
||||
int32_t result = next();
|
||||
if (result <= offset) {
|
||||
return next();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
// otherwise, we have to sync up first. Use handlePrevious() to back
|
||||
// us up to a known break position before the specified position (if
|
||||
// we can determine that the specified position is a break position,
|
||||
// we don't back up at all). This may or may not be the last break
|
||||
// position at or before our starting position. Advance forward
|
||||
// from here until we've passed the starting position. The position
|
||||
// we stop on will be the first break position after the specified one.
|
||||
// old rule syntax
|
||||
|
||||
fText->setIndex(offset);
|
||||
if (offset == fText->startIndex()) {
|
||||
return handleNext();
|
||||
}
|
||||
result = previous();
|
||||
|
||||
while (result != BreakIterator::DONE && result <= offset) {
|
||||
result = next();
|
||||
|
@ -537,15 +566,43 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
|
|||
// position specified by the caller, we can just use previous()
|
||||
// to carry out this operation
|
||||
|
||||
if (fData->fSafeRevTable != NULL) {
|
||||
if (fData->fSafeFwdTable != NULL) {
|
||||
/// todo synwee
|
||||
// new rule syntax
|
||||
int32_t result = fText->endIndex();
|
||||
fText->setIndex(result);
|
||||
while (result != BreakIterator::DONE && result >= offset) {
|
||||
fText->setIndex(offset);
|
||||
// move backwards one codepoint to prepare for moving forwards to a
|
||||
// safe point.
|
||||
// this handles offset being in the middle of a supplementary character
|
||||
fText->previous32();
|
||||
handleNext(fData->fSafeFwdTable);
|
||||
int32_t result = previous();
|
||||
while (result >= offset) {
|
||||
result = previous();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
if (fData->fSafeRevTable != NULL) {
|
||||
// backup plan if forward safe table is not available
|
||||
fText->setIndex(offset);
|
||||
fText->next32();
|
||||
// handle previous will give result <= offset
|
||||
handlePrevious(fData->fSafeRevTable);
|
||||
|
||||
// next will give result 0 or 1 boundary away from offset,
|
||||
// most of the time
|
||||
// we have to
|
||||
int32_t oldresult = next();
|
||||
while (oldresult < offset) {
|
||||
int32_t result = next();
|
||||
if (result >= offset) {
|
||||
return oldresult;
|
||||
}
|
||||
oldresult = result;
|
||||
}
|
||||
int32_t result = previous();
|
||||
if (result >= offset) {
|
||||
return previous();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -568,6 +625,11 @@ UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
|
|||
return TRUE;
|
||||
}
|
||||
|
||||
if (offset == fText->endIndex()) {
|
||||
last(); // For side effects on current position, tag values.
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
// out-of-range indexes are never boundary positions
|
||||
if (offset < fText->startIndex()) {
|
||||
first(); // For side effects on current position, tag values.
|
||||
|
@ -608,7 +670,11 @@ int32_t RuleBasedBreakIterator::current(void) const {
|
|||
// value every time the state machine passes through an accepting state.
|
||||
//
|
||||
//-----------------------------------------------------------------------------------
|
||||
int32_t RuleBasedBreakIterator::handleNext(void) {
|
||||
// Parameterless overload: delegates to handleNext(const RBBIStateTable *),
// passing the forward state table held in fData->fForwardTable.
int32_t RuleBasedBreakIterator::handleNext() {
|
||||
return handleNext(fData->fForwardTable);
|
||||
}
|
||||
|
||||
int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
|
||||
if (fTrace) {
|
||||
RBBIDebugPrintf("Handle Next pos char state category \n");
|
||||
}
|
||||
|
@ -637,7 +703,7 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
|
|||
fLastBreakTag = 0;
|
||||
|
||||
row = (RBBIStateTableRow *) // Point to starting row of state table.
|
||||
(fData->fForwardTable->fTableData + (fData->fForwardTable->fRowLen * state));
|
||||
(statetable->fTableData + (statetable->fRowLen * state));
|
||||
|
||||
// Character Category fetch for starting character.
|
||||
// See comments on character category code within loop, below.
|
||||
|
@ -700,7 +766,7 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
|
|||
// look up a state transition in the state table
|
||||
state = row->fNextState[category];
|
||||
row = (RBBIStateTableRow *)
|
||||
(fData->fForwardTable->fTableData + (fData->fForwardTable->fRowLen * state));
|
||||
(statetable->fTableData + (statetable->fRowLen * state));
|
||||
|
||||
// Get the next character. Doing it here positions the iterator
|
||||
// to the correct position for recording matches in the code that
|
||||
|
@ -913,14 +979,14 @@ continueOn:
|
|||
// The logic of this function is very similar to handleNext(), above.
|
||||
//
|
||||
//-----------------------------------------------------------------------------------
|
||||
int32_t RuleBasedBreakIterator::handleNewPrevious(void) {
|
||||
if (fText == NULL || fData == NULL) {
|
||||
int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) {
|
||||
if (fText == NULL || statetable == NULL) {
|
||||
return 0;
|
||||
}
|
||||
// break tag is no longer valid after icu switched to exact backwards
|
||||
// positioning.
|
||||
fLastBreakTagValid = FALSE;
|
||||
if (fData->fReverseTable == NULL) {
|
||||
if (statetable == NULL) {
|
||||
return fText->setToStart();
|
||||
}
|
||||
|
||||
|
@ -938,7 +1004,7 @@ int32_t RuleBasedBreakIterator::handleNewPrevious(void) {
|
|||
RBBIStateTableRow *row;
|
||||
|
||||
row = (RBBIStateTableRow *)
|
||||
(this->fData->fReverseTable->fTableData + (state * fData->fReverseTable->fRowLen));
|
||||
(statetable->fTableData + (state * statetable->fRowLen));
|
||||
UTRIE_GET16(&fData->fTrie, c, category);
|
||||
if ((category & 0x4000) != 0) {
|
||||
fDictionaryCharCount++;
|
||||
|
@ -954,8 +1020,7 @@ int32_t RuleBasedBreakIterator::handleNewPrevious(void) {
|
|||
// if (c == CharacterIterator::DONE && fText->hasPrevious()==FALSE) {
|
||||
if (hasPassedStartText) {
|
||||
// if we have already considered the start of the text
|
||||
if (fData->fLookAheadHardBreak == TRUE
|
||||
&& row->fLookAhead != 0) {
|
||||
if (row->fLookAhead != 0 && lookaheadResult == 0) {
|
||||
result = 0;
|
||||
}
|
||||
break;
|
||||
|
@ -987,7 +1052,7 @@ int32_t RuleBasedBreakIterator::handleNewPrevious(void) {
|
|||
// look up a state transition in the backwards state table
|
||||
state = row->fNextState[category];
|
||||
row = (RBBIStateTableRow *)
|
||||
(this->fData->fReverseTable->fTableData + (state * fData->fReverseTable->fRowLen));
|
||||
(statetable->fTableData + (state * statetable->fRowLen));
|
||||
|
||||
if (row->fAccepting == -1) {
|
||||
// Match found, common case, could have lookahead so we move on to check it
|
||||
|
|
|
@ -30,6 +30,7 @@ struct RBBIDataHeader;
|
|||
class RuleBasedBreakIteratorTables;
|
||||
class BreakIterator;
|
||||
class RBBIDataWrapper;
|
||||
struct RBBIStateTable;
|
||||
|
||||
|
||||
|
||||
|
@ -480,9 +481,21 @@ private:
|
|||
* The various calling methods then iterate forward from this safe position to
|
||||
* the appropriate position to return. (For more information, see the description
|
||||
* of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.)
|
||||
* @param statetable state table used for moving backwards
|
||||
* @internal
|
||||
*/
|
||||
int32_t handleNewPrevious(void);
|
||||
int32_t handlePrevious(const RBBIStateTable *statetable);
|
||||
|
||||
/**
|
||||
* This method is the actual implementation of the next() method. All iteration
|
||||
* vectors through here. This method initializes the state machine to state 1
|
||||
* and advances through the text character by character until we reach the end
|
||||
* of the text or the state machine transitions to state 0. We update our return
|
||||
* value every time the state machine passes through a possible end state.
|
||||
* @param statetable state table used for moving forwards
|
||||
* @internal
|
||||
*/
|
||||
int32_t handleNext(const RBBIStateTable *statetable);
|
||||
};
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
|
|
@ -50,4 +50,12 @@ $BackOneCluster;
|
|||
|
||||
!!safe_reverse;
|
||||
|
||||
$BackOneCluster;
|
||||
# rule 6, 7, 8
|
||||
$V+ $L;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_forward;
|
||||
|
||||
# rule 6, 7, 8
|
||||
$V+ $T;
|
||||
|
|
|
@ -341,7 +341,7 @@ $CM* $ALPlus $CM+ / $LB5Breaks;
|
|||
!!safe_reverse;
|
||||
|
||||
# LB 7
|
||||
$CM* [^$CM $BK $CR $LF $NL $ZW $SP];
|
||||
$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
|
||||
$CM+ $SP / .;
|
||||
|
||||
# LB 9
|
||||
|
@ -362,17 +362,17 @@ $CL $CM* ($NU | $IS);
|
|||
!!safe_forward;
|
||||
|
||||
# LB 7
|
||||
[^$BK $CR $LF $NL $ZW $SP] $CM*;
|
||||
$SP $CM+ / .;
|
||||
[^$BK $CR $LF $NL $ZW $SP] $CM+;
|
||||
$SP $CM+ / [^$CM];
|
||||
|
||||
# LB 9
|
||||
$OP $CM* $SP*;
|
||||
$OP $CM* $SP+;
|
||||
|
||||
# LB 10
|
||||
$QU $CM* $SP*;
|
||||
$QU $CM* $SP+;
|
||||
|
||||
# LB 11
|
||||
$CL $CM* $SP*;
|
||||
$CL $CM* $SP+;
|
||||
|
||||
# LB 18
|
||||
$HY $CM* $NU;
|
||||
|
|
|
@ -9,7 +9,6 @@
|
|||
# These rules are based on TR 29 version 4.0.0
|
||||
#
|
||||
|
||||
!!chain;
|
||||
|
||||
#
|
||||
# Character categories as defined in TR 29
|
||||
|
@ -31,85 +30,79 @@ $Term = [\u0021 \u003F \u0589 \u061F \u06D4 \u0700 \u0701 \u0702 \u0964 \u1362
|
|||
$Close = [[:Open_Punctuation:] [:Close_Punctuation:] [:Linebreak = Quotation:] -
|
||||
[[:name = HEBREW PUNCTUATION GERESH:] $ATerm $Term]];
|
||||
|
||||
$Extend = [[:Grapheme_Extend = TRUE:]];
|
||||
|
||||
|
||||
$ATermEx = $ATerm $Extend*;
|
||||
$NumericEx = $Numeric $Extend*;
|
||||
$UpperEx = $Upper $Extend*;
|
||||
$CloseEx = $Close $Extend*;
|
||||
$SpEx = $Sp $Extend*;
|
||||
$LowerEx = $Lower $Extend*;
|
||||
$TermEx = $Term $Extend*;
|
||||
# Define extended forms of the character classes,
|
||||
# incorporate grapheme cluster + format chars.
|
||||
|
||||
# rule 6
|
||||
$Extend = [[:Grapheme_Extend = TRUE:]];
|
||||
$ATermEx = $ATerm $Extend* $Format*;
|
||||
$NumericEx = $Numeric $Extend* $Format*;
|
||||
$UpperEx = $Upper $Extend* $Format*;
|
||||
$TermEx = $Term $Extend* $Format*;
|
||||
|
||||
$ATermEx $Format* $NumericEx;
|
||||
#
|
||||
# $SepSeq keeps together CRLF as a separator. (CRLF is a grapheme cluster)
|
||||
#
|
||||
$SepSeq = $Sep | \u000d\u000a;
|
||||
|
||||
# rule 7
|
||||
# $InteriorChars are those that never trigger a following break.
|
||||
$InteriorChars = [^$Term $ATerm $Sep]; #Note: includes Extend and Format chars
|
||||
|
||||
$UpperEx $ATermEx $Format* $UpperEx;
|
||||
## -------------------------------------------------
|
||||
|
||||
# rule 8
|
||||
!!forward;
|
||||
|
||||
$ATermEx $Format* $CloseEx* $Format* $SpEx $Format*
|
||||
[^$OLetter $Upper $Lower $Sep]* $Extend* $Format* $LowerEx;
|
||||
|
||||
# rule 9 forced to exit by / [^$Close $Sp]
|
||||
|
||||
($TermEx | $ATermEx) $Format* ($CloseEx $Format*)* $Sep;
|
||||
($TermEx | $ATermEx) $Format* ($CloseEx $Format*)* ($CloseEx | $SpEx) / [^$Close $Sp];
|
||||
|
||||
# rule 10 forced to exit by / [^$Sp];
|
||||
# Rule 6. Match an ATerm (.) that does not cause a break because a number immediately follows it.
|
||||
$NumberFollows = $InteriorChars* $ATermEx $NumericEx;
|
||||
|
||||
|
||||
($TermEx | $ATermEx) $Format* ($CloseEx $Format*)* ($SpEx $Format*)* $Sep;
|
||||
($TermEx | $ATermEx) $Format* ($CloseEx $Format*)* ($SpEx $Format*)* $SpEx / [^$Sp];
|
||||
# Rule 7. $UppersSurround Match a no-break sentence fragment containing a . surrounded by Uppers
|
||||
$UppersSurround = $InteriorChars* $UpperEx $ATermEx $UpperEx;
|
||||
|
||||
# Rule 8 Matches a sentence fragment containing "." that should not cause a sentence break,
|
||||
# because a lower case word follows the period.
|
||||
$LowerWordFollows = $InteriorChars* $ATermEx $Close* $Sp* [^$OLetter $Upper $Lower $Sep]* $Lower;
|
||||
|
||||
# rule 11 partly included in rule 9 and 10
|
||||
$TermEx;
|
||||
$ATermEx;
|
||||
# Rules 3, 9, 10, 11
|
||||
# Matches a simple sentence, or the trailing part of a complex sentence,
|
||||
# where a simple sentence contains no interior "."s.
|
||||
$TermEndSequence = $InteriorChars* ($TermEx | $ATermEx) $Close* $Sp* $SepSeq?;
|
||||
$EndSequence = $InteriorChars* $SepSeq?;
|
||||
|
||||
# rule 12
|
||||
|
||||
([^$Term $ATerm $Sep] $Extend*)+;
|
||||
([^$Term $ATerm $Sep] $Extend* $Format*)+ ($Term | $ATerm | $Sep);
|
||||
# Put them all together.
|
||||
($NumberFollows | $UppersSurround | $LowerWordFollows)* $TermEndSequence{0}; # status = UBRK_SENTENCE_TERM
|
||||
($NumberFollows | $UppersSurround | $LowerWordFollows)* $EndSequence{100}; # status = UBRK_SENTENCE_SEP
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!reverse;
|
||||
|
||||
#
|
||||
# Reverse Rules
|
||||
#
|
||||
$EndGorp = ($Term | $ATerm | $Sep | $Close | $Extend | $Format | $Sp);
|
||||
$RevEndSequence = $EndGorp* $InteriorChars* $EndGorp* | $Sep [^$ATerm $Term]*;
|
||||
$ReverseLowerWordFollows = $Lower [^$OLetter $Upper $Lower $Sep]* $ATerm $InteriorChars*;
|
||||
$ReverseUpperSurround = $Upper $Format* $Extend* $ATerm $Format* $Extend* $Upper $InteriorChars*;
|
||||
$ReverseNumberFollows = $Numeric $Format* $Extend* $ATerm $InteriorChars*;
|
||||
|
||||
$BackATermEx = $Extend* $ATerm;
|
||||
$BackNumericEx = $Extend* $Numeric;
|
||||
$BackUpperEx = $Extend* $Upper;
|
||||
$BackCloseEx = $Extend* $Close;
|
||||
$BackSpEx = $Extend* $Sp;
|
||||
$BackLowerEx = $Extend* $Lower;
|
||||
$BackTermEx = $Extend* $Term;
|
||||
$RevEndSequence ($ReverseLowerWordFollows | $ReverseUpperSurround | $ReverseNumberFollows)* .?;
|
||||
|
||||
# rule 3
|
||||
|
||||
! $Sep .;
|
||||
|
||||
# rule 6
|
||||
|
||||
! $BackNumericEx $Format* $BackATermEx;
|
||||
## -------------------------------------------------
|
||||
|
||||
## !!safe_reverse;
|
||||
|
||||
# rule 7
|
||||
## $Extend* $ATerm $Format* $Extend* $Upper;
|
||||
|
||||
! $BackUpperEx $Format* $BackATermEx $BackUpperEx;
|
||||
# rule 11
|
||||
## ($Extend* $Sp $Format*)* ($Extend* $Close $Format*)* $Extend* ($Term | $ATerm);
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_forward;
|
||||
|
||||
# rule 8
|
||||
|
||||
! $BackLowerEx $Format* $Extend* [^$OLetter $Upper $Lower $Sep]* $Format*
|
||||
$BackSpEx $Format* $BackCloseEx* $Format* $BackATermEx;
|
||||
|
||||
# rules 9, 10, 11, 12
|
||||
|
||||
$Any = [^$Term $ATerm $Sep];
|
||||
$Safe = [^$Term $ATerm $Sep $Sp $Close];
|
||||
$BackEnd = ($BackSpEx $Format*)* ($BackCloseEx $Format*)* ($BackTermEx | $BackATermEx);
|
||||
! $BackEnd;
|
||||
! $BackEnd? $Any* $Safe;
|
||||
! $BackEnd? $Any* $Close / ($BackSpEx $Format*)+ ($BackTermEx | $BackATermEx);
|
||||
! $BackEnd? $Any* $Sp / $Sep;
|
||||
## $Lower .;
|
||||
|
|
|
@ -173,5 +173,42 @@ $BackKatakanaEx $Format* $BackKatakanaEx;
|
|||
|
||||
!!safe_reverse;
|
||||
|
||||
$Extend* [^$Extend];
|
||||
$BackACMLetterEx / $Format;
|
||||
# rule 3
|
||||
$Extend+ [^$Extend];
|
||||
|
||||
# rule 4
|
||||
$Format+ $BackABaseLetterEx;
|
||||
$Format+ $BackACMLetterEx / $Format;
|
||||
$Format+ $BackNumericEx;
|
||||
$Format+ $BackMidLetterEx;
|
||||
$Format+ $BackMidNumLetEx;
|
||||
$Format+ $BackMidNumEx;
|
||||
$Format+ $BackKatakanaEx;
|
||||
|
||||
# rule 6
|
||||
($MidLetter | $MidNumLet) $Format* $BackABaseLetterEx;
|
||||
($MidLetter | $MidNumLet) $Format* $BackACMLetterEx / $Format;
|
||||
|
||||
# rule 11
|
||||
($MidNum | $MidNumLet) $Format* $BackNumericEx;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_forward;
|
||||
|
||||
# rule 3
|
||||
$Extend+;
|
||||
|
||||
# rule 4
|
||||
$Format+ $ALetterEx;
|
||||
$Format+ $NumericEx;
|
||||
$Format+ $MidLetterEx;
|
||||
$Format+ $MidNumLetEx;
|
||||
$Format+ $MidNumEx;
|
||||
$Format+ $KatakanaEx;
|
||||
|
||||
# rule 6
|
||||
($MidLetter | $MidNumLet) $Format* $ALetterEx;
|
||||
|
||||
# rule 11
|
||||
($MidNum | $MidNumLet) $Format* $NumericEx;
|
||||
|
|
|
@ -589,7 +589,6 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
|
|||
if(exec) TestWordBoundary(); break;
|
||||
case 14: name = "TestLineBreaks";
|
||||
if(exec) TestLineBreaks(); break;
|
||||
/***
|
||||
case 15: name = "TestSentBreaks";
|
||||
if(exec) TestSentBreaks(); break;
|
||||
case 16: name = "TestExtended";
|
||||
|
@ -603,7 +602,6 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
|
|||
#endif
|
||||
}
|
||||
break;
|
||||
***/
|
||||
default: name = ""; break; //needed to end loop
|
||||
}
|
||||
}
|
||||
|
@ -3005,6 +3003,74 @@ static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t d
|
|||
}
|
||||
#endif
|
||||
|
||||
// Shared checker used by the break-iterator tests.
//
// Sets ustr as the text of bi and verifies, in order:
//   1. forward iteration (first()/next()) yields exactly the positions in
//      expected[0 .. expectedcount-1];
//   2. isBoundary() is TRUE at each expected position except the last one
//      (which this loop never visits) and FALSE at every position strictly
//      between two consecutive expected positions;
//   3. backward iteration (last()/previous()) yields the same positions as
//      the forward pass, in reverse order;
//   4. preceding(j) returns expected[i] for every j in
//      (expected[i], expected[i + 1]].
//
// The first failure is reported through test->errln() and ends the check.
//
// @param test           test object that receives error reports
// @param ustr           text to iterate over
// @param bi             break iterator under test; its text is replaced
// @param expected       expected boundary positions, in ascending order
// @param expectedcount  number of entries in expected
static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
                                    BreakIterator *bi,
                                    int expected[],
                                    int expectedcount)
{
    int count = 0;
    int i = 0;
    int forward[20];
    // Capacity of forward[]; the forward pass must never write past it.
    const int forwardCapacity = (int)(sizeof(forward) / sizeof(forward[0]));

    bi->setText(ustr);
    for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
        if (count >= forwardCapacity) {
            // The iterator produced more boundaries than the local buffer
            // can hold; report it instead of overrunning the stack array.
            printStringBreaks(ustr, expected, expectedcount);
            test->errln("break forward test failed: more than %d boundaries found",
                        forwardCapacity);
            return;
        }
        forward[count] = i;
        if (count < expectedcount && expected[count] != i) {
            test->errln("break forward test failed: expected %d but got %d",
                        expected[count], i);
            break;
        }
        count ++;
    }
    if (count != expectedcount) {
        printStringBreaks(ustr, expected, expectedcount);
        test->errln("break test failed: missed %d match",
                    expectedcount - count);
        return;
    }

    // testing boundaries
    for (i = 1; i < expectedcount; i ++) {
        int j = expected[i - 1];
        if (!bi->isBoundary(j)) {
            printStringBreaks(ustr, expected, expectedcount);
            test->errln("Expected boundary at position %d", j);
            return;
        }
        // No position strictly between two expected boundaries may be one.
        for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
            if (bi->isBoundary(j)) {
                printStringBreaks(ustr, expected, expectedcount);
                test->errln("Not expecting boundary at position %d", j);
                return;
            }
        }
    }

    // testing reverse iteration against the positions recorded going forward
    for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
        if (count == 0) {
            // Backward iteration produced more boundaries than the forward
            // pass recorded; stop before indexing forward[-1].
            printStringBreaks(ustr, expected, expectedcount);
            test->errln("happy break test reverse failed: unexpected extra boundary at %d",
                        i);
            return;
        }
        count --;
        if (forward[count] != i) {
            test->errln("happy break test reverse failed: expected %d but got %d",
                        forward[count], i);
            break;
        }
    }
    if (count != 0) {
        printStringBreaks(ustr, expected, expectedcount);
        test->errln("happy break test failed: missed a match");
        return;
    }

    // testing preceding
    for (i = 0; i < expectedcount - 1; i ++) {
        int j = expected[i] + 1;
        for (; j <= expected[i + 1]; j ++) {
            if (bi->preceding(j) != expected[i]) {
                printStringBreaks(ustr, expected, expectedcount);
                test->errln("Not expecting backwards boundary at position %d", j);
                return;
            }
        }
    }
}
|
||||
|
||||
void RBBITest::TestWordBreaks(void)
|
||||
{
|
||||
// <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
|
||||
|
@ -3015,6 +3081,7 @@ void RBBITest::TestWordBreaks(void)
|
|||
UChar str[25];
|
||||
char *strlist[] =
|
||||
{
|
||||
"\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
|
||||
"\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
|
||||
"\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",
|
||||
"\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
|
||||
|
@ -3051,13 +3118,13 @@ void RBBITest::TestWordBreaks(void)
|
|||
};
|
||||
int loop;
|
||||
for (loop = 0; loop < (sizeof(strlist) / sizeof(char *)); loop ++) {
|
||||
// printf("looping %d\n", loop);
|
||||
u_unescape(strlist[loop], str, 25);
|
||||
UnicodeString ustr(str);
|
||||
// RBBICharMonkey monkey;
|
||||
RBBIWordMonkey monkey;
|
||||
|
||||
int expected[20];
|
||||
int forward[20];
|
||||
int expectedcount = 0;
|
||||
|
||||
monkey.setText(ustr);
|
||||
|
@ -3066,33 +3133,7 @@ void RBBITest::TestWordBreaks(void)
|
|||
expected[expectedcount ++] = i;
|
||||
}
|
||||
|
||||
int count = 0;
|
||||
bi->setText(ustr);
|
||||
for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
|
||||
forward[count] = i;
|
||||
if (count > 20 || expected[count] != i) {
|
||||
errln("happy break forward test failed: expected %d but got %d",
|
||||
expected[count], i);
|
||||
}
|
||||
count ++;
|
||||
}
|
||||
if (count != expectedcount) {
|
||||
printStringBreaks(ustr, expected, expectedcount);
|
||||
errln("happy break test failed: missed a match");
|
||||
break;
|
||||
}
|
||||
for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
|
||||
count --;
|
||||
if (forward[count] != i) {
|
||||
printStringBreaks(ustr, expected, expectedcount);
|
||||
errln("happy break test reverse failed: expected %d but got %d",
|
||||
forward[count], i);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (count != 0) {
|
||||
errln("happy break test failed: missed a match");
|
||||
}
|
||||
testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -3105,7 +3146,9 @@ void RBBITest::TestWordBoundary(void)
|
|||
BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
|
||||
UChar str[20];
|
||||
char *strlist[] =
|
||||
{"\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
|
||||
{
|
||||
"\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
|
||||
"\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
|
||||
"\\u003b\\u024a\\u102e\\U000e0071\\u0600",
|
||||
"\\u2027\\U000e0067\\u0a47\\u00b7",
|
||||
"\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
|
||||
|
@ -3136,6 +3179,7 @@ void RBBITest::TestWordBoundary(void)
|
|||
};
|
||||
int loop;
|
||||
for (loop = 0; loop < (sizeof(strlist) / sizeof(char *)); loop ++) {
|
||||
// printf("looping %d\n", loop);
|
||||
u_unescape(strlist[loop], str, 20);
|
||||
UnicodeString ustr(str);
|
||||
int forward[20];
|
||||
|
@ -3153,7 +3197,7 @@ void RBBITest::TestWordBoundary(void)
|
|||
printStringBreaks(ustr, forward, count);
|
||||
errln("happy boundary test failed: expected %d not a boundary",
|
||||
j);
|
||||
break;
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -3161,7 +3205,7 @@ void RBBITest::TestWordBoundary(void)
|
|||
printStringBreaks(ustr, forward, count);
|
||||
errln("happy boundary test failed: expected %d a boundary",
|
||||
i);
|
||||
break;
|
||||
return;
|
||||
}
|
||||
prev = i;
|
||||
}
|
||||
|
@ -3176,6 +3220,9 @@ void RBBITest::TestLineBreaks(void)
|
|||
UChar str[20];
|
||||
char *strlist[] =
|
||||
{
|
||||
"\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
|
||||
"\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
|
||||
"\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
|
||||
"\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
|
||||
"\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
|
||||
"\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
|
||||
|
@ -3207,7 +3254,6 @@ void RBBITest::TestLineBreaks(void)
|
|||
RBBILineMonkey monkey;
|
||||
|
||||
int expected[20];
|
||||
int forward[20];
|
||||
int expectedcount = 0;
|
||||
|
||||
monkey.setText(ustr);
|
||||
|
@ -3216,35 +3262,7 @@ void RBBITest::TestLineBreaks(void)
|
|||
expected[expectedcount ++] = i;
|
||||
}
|
||||
|
||||
int count = 0;
|
||||
bi->setText(ustr);
|
||||
for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
|
||||
forward[count] = i;
|
||||
if (count < expectedcount && expected[count] != i) {
|
||||
errln("happy break forward test failed: expected %d but got %d",
|
||||
expected[count], i);
|
||||
}
|
||||
count ++;
|
||||
}
|
||||
if (count != expectedcount) {
|
||||
printStringBreaks(ustr, expected, expectedcount);
|
||||
errln("happy break test failed: missed %d match",
|
||||
expectedcount - count);
|
||||
break;
|
||||
}
|
||||
for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
|
||||
count --;
|
||||
if (forward[count] != i) {
|
||||
printStringBreaks(ustr, expected, expectedcount);
|
||||
errln("happy break test reverse failed: expected %d but got %d",
|
||||
forward[count], i);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (count != 0) {
|
||||
errln("happy break test failed: missed a match");
|
||||
break;
|
||||
}
|
||||
testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -3266,12 +3284,10 @@ void RBBITest::TestSentBreaks(void)
|
|||
"Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
|
||||
};
|
||||
int loop;
|
||||
int forward[100];
|
||||
for (loop = 0; loop < (sizeof(strlist) / sizeof(char *)); loop ++) {
|
||||
printf("looping %d\n", loop);
|
||||
u_unescape(strlist[loop], str, 100);
|
||||
UnicodeString ustr(str);
|
||||
|
||||
int forward[20];
|
||||
|
||||
int count = 0;
|
||||
bi->setText(ustr);
|
||||
|
|
Loading…
Add table
Reference in a new issue