ICU-2292 line breaks passing on default option

X-SVN-Rev: 13636
This commit is contained in:
Syn Wee Quek 2003-11-07 22:49:38 +00:00
parent 062c626e85
commit 558442a420
7 changed files with 154 additions and 85 deletions

View file

@ -405,6 +405,10 @@ int32_t RuleBasedBreakIterator::previous(void) {
return BreakIterator::DONE;
}
if (fData->fSafeRevTable != NULL) {
return handleNewPrevious();
}
// old rule syntax
// set things up. handlePrevious() will back us up to some valid
// break position before the current position (we back our internal
@ -415,8 +419,7 @@ int32_t RuleBasedBreakIterator::previous(void) {
int32_t start = current();
fText->previous32();
int32_t lastResult = (fData->fSafeRevTable != NULL) ?
handleNewPrevious(): handlePrevious();
int32_t lastResult = handlePrevious();
int32_t result = lastResult;
int32_t lastTag = 0;
UBool breakTagValid = FALSE;
@ -450,9 +453,6 @@ int32_t RuleBasedBreakIterator::previous(void) {
return lastResult;
}
/**
* Sets the iterator to refer to the first boundary position following
* the specified position.
@ -954,6 +954,10 @@ int32_t RuleBasedBreakIterator::handleNewPrevious(void) {
// if (c == CharacterIterator::DONE && fText->hasPrevious()==FALSE) {
if (hasPassedStartText) {
// if we have already considered the start of the text
if (fData->fLookAheadHardBreak == TRUE
&& row->fLookAhead != 0) {
result = 0;
}
break;
}
@ -1007,6 +1011,17 @@ int32_t RuleBasedBreakIterator::handleNewPrevious(void) {
/// we need to make the lookahead rules not chain eventually.
/// return result;
/// this is going to be the longest match again
/// TODO (syn wee): this is hard-coded for the line break rules.
/// The rules need to provide a tag to ensure a stop here.
if (fData->fLookAheadHardBreak == TRUE) {
fText->setIndex(result);
return result;
}
category = lastCategory;
fText->setIndex(result);
goto continueOn;
}

View file

@ -107,6 +107,15 @@ void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
fRefCount = 1;
/// todo: maybe add this formally to the builder
UnicodeString hardbreak("!!lookAheadHardBreak");
if (fRuleString.indexOf(hardbreak) >= 0) {
fLookAheadHardBreak = TRUE;
}
else {
fLookAheadHardBreak = FALSE;
}
#ifdef RBBI_DEBUG
char *debugEnv = getenv("U_RBBIDEBUG");
if (debugEnv && uprv_strstr(debugEnv, "data")) {this->printData();}

View file

@ -127,6 +127,14 @@ public:
const UChar *fRuleSource;
UTrie fTrie;
// if fLookAheadHardBreak is true, we will break at the first lookahead match
// the search does not go on further to look for a longer match
// this also allows breaks at both ends of the string
// e.g. rule "ABC / D; ABCDE" and
// text "ABCD ABCDE ABC" will give breaks at
// 01234567890123
// {0, 3, 4, 5, 8, 9, 10, 11, 14}
UBool fLookAheadHardBreak;
private:
int32_t fRefCount;

View file

@ -480,6 +480,10 @@ UBool RBBIRuleScanner::doParseActions(EParseAction action)
fRB->fDefaultTree = &fRB->fSafeFwdTree;
} else if (opt == "safe_reverse") {
fRB->fDefaultTree = &fRB->fSafeRevTree;
} else if (opt == "lookAheadHardBreak") {
// at the moment do nothing for this
// the code is handled in rbbi.cpp
// todo: think about how to handle this
} else {
error(U_BRK_UNRECOGNIZED_OPTION);
}

View file

@ -14,6 +14,7 @@
!!chain;
!!LBCMNoChain;
!!lookAheadHardBreak;
$AI = [:LineBreak = Ambiguous:];
$AL = [:LineBreak = Alphabetic:];
@ -114,6 +115,9 @@ $QU $CM+;
$SP $CM+;
$SY $CM+;
## -------------------------------------------------
!!forward;
#
# Rule LB 3
@ -217,111 +221,122 @@ $CM* $ALcm+; # The $CM* is from rule 7C, and unattached CM is treated as AL
# at the current position.
#
!!reverse;
# !. . [^$LF $CR $NL $BK]* [$BK $CR $LF $NL];
## -------------------------------------------------
! $CM+ $ALPlus;
! $CM+ $BA;
! $CM+ $BB;
! $CM+ $B2;
! $CM+ $CL;
! $CM+ $EX;
! $CM+ $GL;
! $CM+ $HY;
! $CM+ $ID;
! $CM+ $IN;
! $CM+ $IS;
! $CM+ $NS;
! $CM+ $NU;
! $CM+ $OP;
! $CM+ $PO;
! $CM+ $PR;
! $CM+ $QU;
! $CM+ $SP;
! $CM+ $SY;
!!reverse;
$CM+ $ALPlus;
$CM+ $BA;
$CM+ $BB;
$CM+ $B2;
$CM+ $CL;
$CM+ $EX;
$CM+ $GL;
$CM+ $HY;
$CM+ $ID;
$CM+ $IN;
$CM+ $IS;
$CM+ $NS;
$CM+ $NU;
$CM+ $OP;
$CM+ $PO;
$CM+ $PR;
$CM+ $QU;
$CM+ $SP;
$CM+ $SY;
# LB 3
! ($BK | $CR | $LF | $NL) $LB3NonBreaks?;
! ($BK | $CR | $LF | $NL) $CM* $LB5NonBreaks;
! $LF $CR;
($BK | $CR | $LF | $NL) $LB3NonBreaks;
($BK | $CR | $LF | $NL) $CM* $LB5NonBreaks;
$LF $CR;
# LB 4 x SP
# x ZW
! [$SP $ZW] $LB3NonBreaks;
! [$SP $ZW] $CM* $LB5NonBreaks;
[$SP $ZW] $LB3NonBreaks;
[$SP $ZW] $CM* $LB5NonBreaks;
# LB 5 Break after zero width space
# LB 5 Break after zero width space
# LB 7 Combining marks. TODO: get it right!
# $SP $CM needs to behave like $ID.
# X $CM needs to behave like X, where X is not $SP.
# $CM not covered by the above needs to behave like $AL
! $CM+ $LB5NonBreaks; # Stick together any combining sequences that don't match other rules.
# LB 6 Jamo is treated like an alphabet
# LB 7 Combining marks.
# $SP $CM needs to behave like $ID.
# X $CM needs to behave like X, where X is not $SP.
# $CM not covered by the above needs to behave like $AL
# Stick together any combining sequences that don't match other rules.
$CM+ $LB5NonBreaks;
# LB 8
! $CL $CM* $LB5NonBreaks;
! $EX $CM* $LB5NonBreaks;
! $IS $CM* $LB5NonBreaks;
! $SY $CM* $LB5NonBreaks;
$CL $CM* $LB5NonBreaks;
$EX $CM* $LB5NonBreaks;
$IS $CM* $LB5NonBreaks;
$SY $CM* $LB5NonBreaks;
# LB 9
! .? $SP* $CM* $OP;
! $CM* $LB5NonBreaks $SP* $CM* $OP;
$LB5NonBreaks $SP* $CM* $OP;
# LB 10
! $CM* $OP $SP* $CM* $QU;
$CM* $OP $SP* $CM* $QU;
# LB 11
! $CM* $NS $SP* $CM* $CL;
$CM* $NS $SP* $CM* $CL;
# LB 11a
! ($CM* $B2)+;
($CM* $B2)+;
# LB 11b
! .? $CM* $GL $CM* $LB5NonBreaks;
! $CM* $LB5NonBreaks $CM* $GL $CM* $LB5NonBreaks;
! $LB3NonBreaks? $CM* $GL;
! $CM* $LB5NonBreaks $CM* $GL;
$CM* $GL $CM* $LB5NonBreaks;
$CM* $LB5NonBreaks $CM* $GL;
$LB3NonBreaks $CM* $GL;
# LB 12
# LB 14
! .? ($CM* $QU)+ $CM* $LB12NonBreaks;
! $CM* $LB5NonBreaks ($CM* $QU)+ $CM* $LB12NonBreaks;
! .? ($CM* $QU)+ $CM+ $SP; # LB7a SP CM+ behaves as ID
! $CM* $LB5NonBreaks ($CM* $QU)+ $CM+ $SP;
! $LB3NonBreaks? $CM* $QU;
! $CM* $LB5NonBreaks $CM* $QU; # Don't let a combining mark go onto $CR, $BK, etc.
$CM* $QU $CM* $LB12NonBreaks;
$CM* $QU $CM+ $SP;
$CM* $LB5NonBreaks $CM* $QU;
# LB 14a
$BackLB14CanBreakAfter = ($CM* [$LB14NonBreaks - $CM]) | ($CM+ $SP);
# LB 15
! ($CM* $BA | $CM* $HY | $CM* $NS) $BackLB14CanBreakAfter;
! ($CM* $BA | $CM* $HY | $CM* $NS) $CM+ / [$BK $CR $LF $NL $ZW];
! [^$CB] $CM* $BB;
! $CM* [^$CB $CR $LF $BK $NL $ZW] $CM* $BB;
$CM* ($BA | $HY | $NS) $BackLB14CanBreakAfter;
($CM* ($BA | $HY | $NS))+ $CM+ / [$BK $CR $LF $NL $ZW];
[$CR $LF $BK $NL $ZW] $CM* $BB;
$CM* [^$CB $CR $LF $BK $NL $ZW] $CM* $BB;
# LB 16
! $CM* $IN $CM* $ALPlus;
! $CM* $IN $CM+ / [$BK $CR $LF $NL $ZW]; # by rule 7c, any otherwise unattached CM behaves as AL
! $CM* $IN $CM* $ID;
! $CM* $IN $CM+ $SP; # by rule 7a, $SP $CM behaves like ID
! $CM* $IN $CM* $IN;
! $CM* $IN $CM* $NU;
$CM* $IN $CM* $ALPlus;
# by rule 7c, any otherwise unattached CM behaves as AL
$CM* $IN $CM+ / [$BK $CR $LF $NL $ZW];
$CM* $IN $CM* ($ID | $CM $SP);
$CM* $IN $CM* $IN;
$CM* $IN $CM* $NU;
# $LB 17
! $CM* $PO ($CM* $ID | $CM+ $SP);
! $CM* $NU ($CM* $ALPlus)+; # includes $LB19
! ($CM* $NU)+;
! ($CM* $NU)+ $CM+ / [$BK $CR $LF $NL $ZW]; # Rule 7c
! ($CM* $ALPlus)+ $CM* $NU;
$CM* $PO $CM* ($ID | $CM $SP);
$CM* $NU ($CM* $ALPlus)+; # includes $LB19
$CM* $NU $CM+ / [$BK $CR $LF $NL $ZW]; # Rule 7c
$CM* $ALPlus $CM* $NU;
# LB 18
! ($CM* $PO)? ($CM* $CL)? ($CM* $NU | $CM* $IS)* $CM* $NU ($CM* $OP | $CM* $HY)? ($CM* $PR)?;
($CM* $PO)? ($CM* $CL)? ($CM* $NU | $CM* $IS)* $CM* $NU ($CM* $OP | $CM* $HY)? ($CM* $PR)?;
# LB 19
! ($CM* $ALPlus)+;
! ($CM* $ALPlus)+ $CM+ / [$BK $CR $LF $NL $ZW]; # The $CM* is from rule 7C, and unattached CM is treated as AL
$CM* $ALPlus $CM* $ALPlus;
# The $CM* is from rule 7C, and unattached CM is treated as AL
$CM* $ALPlus $CM+ / [$BK $CR $LF $NL $ZW];
## Problem: the state table can't handle lookahead when it is at the
## start of the string; this is currently handled in the rbbi code.
## TODO: fix this.
## -------------------------------------------------
!!safe_reverse;
$CM* [^$CM];

View file

@ -587,9 +587,9 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
if(exec) TestWordBreaks(); break;
case 13: name = "TestWordBoundary";
if(exec) TestWordBoundary(); break;
/***
case 14: name = "TestLineBreaks";
if(exec) TestLineBreaks(); break;
/***
case 15: name = "TestSentBreaks";
if(exec) TestSentBreaks(); break;
case 16: name = "TestExtended";
@ -602,8 +602,8 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
logln("skipping TestMonkey (UCONFIG_NO_REGULAR_EXPRESSIONS)");
#endif
}
***/
break;
***/
default: name = ""; break; //needed to end loop
}
}
@ -2735,6 +2735,7 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
if (fSP->contains(thisChar)) {
continue;
}
if (fZW->contains(thisChar)) {
continue;
}
@ -2745,11 +2746,16 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
}
// LB 6, LB 7
int32_t oldpos = pos;
rule67Adjust(prevPos, &prevChar, &pos, &thisChar);
nextCPPos = fText->moveIndex32(pos, 1);
nextPos = nextCPPos;
c = fText->char32At(nextPos);
// another peculiarity of LB 4
if (fSP->contains(thisChar)) {
continue;
}
rule67Adjust(pos, &thisChar, &nextPos, &c);
// If the loop is still warming up - if we haven't shifted the initial
@ -2785,11 +2791,16 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
}
// LB 9 Don't break after OP SP*
/// UBool cmFlag = FALSE;
for (tPos=prevPos; ; tPos=fCharBI->preceding(tPos)) {
if (fOP->contains(fText->char32At(tPos))) {
break;
}
if (fSP->contains(fText->char32At(tPos)) == FALSE || tPos == 0) {
if (fSP->contains(prevChar) == FALSE
|| fSP->contains(fText->char32At(tPos)) == FALSE
|| tPos == 0) {
/// || cmFlag == TRUE) {
// if we have $SP$CM+ which is an $ID
goto fall_through_9;
}
}
@ -3164,7 +3175,13 @@ void RBBITest::TestLineBreaks(void)
BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
UChar str[20];
char *strlist[] =
{"\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
{
"\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
"\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
"\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
"\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
"\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
"\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
"\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
"\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
"\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
@ -3184,10 +3201,9 @@ void RBBITest::TestLineBreaks(void)
};
int loop;
for (loop = 0; loop < (sizeof(strlist) / sizeof(char *)); loop ++) {
printf("looping %d\n", loop);
// printf("looping %d\n", loop);
u_unescape(strlist[loop], str, 20);
UnicodeString ustr(str);
// RBBICharMonkey monkey;
RBBILineMonkey monkey;
int expected[20];
@ -3214,8 +3230,9 @@ void RBBITest::TestLineBreaks(void)
printStringBreaks(ustr, expected, expectedcount);
errln("happy break test failed: missed %d match",
expectedcount - count);
break;
}
for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
count --;
if (forward[count] != i) {
printStringBreaks(ustr, expected, expectedcount);
@ -3226,6 +3243,7 @@ void RBBITest::TestLineBreaks(void)
}
if (count != 0) {
errln("happy break test failed: missed a match");
break;
}
}
}

View file

@ -108,8 +108,8 @@ DataHeader dh ={
0, // reserved
{ 0x42, 0x72, 0x6b, 0x20 }, // dataFormat="Brk "
{ 2, 1, 0, 0 }, // formatVersion
{ 3, 1, 0, 0 } // dataVersion (Unicode version)
{ 3, 0, 0, 0 }, // formatVersion
{ 4, 0, 0, 0 } // dataVersion (Unicode version)
}};
#endif