mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-10 07:39:16 +00:00
ICU-2292 line breaks passing on default option
X-SVN-Rev: 13636
This commit is contained in:
parent
062c626e85
commit
558442a420
7 changed files with 154 additions and 85 deletions
|
@ -405,6 +405,10 @@ int32_t RuleBasedBreakIterator::previous(void) {
|
|||
return BreakIterator::DONE;
|
||||
}
|
||||
|
||||
if (fData->fSafeRevTable != NULL) {
|
||||
return handleNewPrevious();
|
||||
}
|
||||
|
||||
// old rule syntax
|
||||
// set things up. handlePrevious() will back us up to some valid
|
||||
// break position before the current position (we back our internal
|
||||
|
@ -415,8 +419,7 @@ int32_t RuleBasedBreakIterator::previous(void) {
|
|||
int32_t start = current();
|
||||
|
||||
fText->previous32();
|
||||
int32_t lastResult = (fData->fSafeRevTable != NULL) ?
|
||||
handleNewPrevious(): handlePrevious();
|
||||
int32_t lastResult = handlePrevious();
|
||||
int32_t result = lastResult;
|
||||
int32_t lastTag = 0;
|
||||
UBool breakTagValid = FALSE;
|
||||
|
@ -450,9 +453,6 @@ int32_t RuleBasedBreakIterator::previous(void) {
|
|||
return lastResult;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Sets the iterator to refer to the first boundary position following
|
||||
* the specified position.
|
||||
|
@ -954,6 +954,10 @@ int32_t RuleBasedBreakIterator::handleNewPrevious(void) {
|
|||
// if (c == CharacterIterator::DONE && fText->hasPrevious()==FALSE) {
|
||||
if (hasPassedStartText) {
|
||||
// if we have already considered the start of the text
|
||||
if (fData->fLookAheadHardBreak == TRUE
|
||||
&& row->fLookAhead != 0) {
|
||||
result = 0;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -1007,6 +1011,17 @@ int32_t RuleBasedBreakIterator::handleNewPrevious(void) {
|
|||
/// we need to make the lookahead rules not chain eventually.
|
||||
/// return result;
|
||||
/// this is going to be the longest match again
|
||||
|
||||
/// TODO (syn wee): this is hard-coded for the line-break rules;
|
||||
/// needs to provide a tag in rules to ensure a stop.
|
||||
|
||||
if (fData->fLookAheadHardBreak == TRUE) {
|
||||
fText->setIndex(result);
|
||||
return result;
|
||||
}
|
||||
category = lastCategory;
|
||||
fText->setIndex(result);
|
||||
|
||||
goto continueOn;
|
||||
}
|
||||
|
||||
|
|
|
@ -107,6 +107,15 @@ void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
|
|||
|
||||
fRefCount = 1;
|
||||
|
||||
/// todo: maybe add this formally to the builder
|
||||
UnicodeString hardbreak("!!lookAheadHardBreak");
|
||||
if (fRuleString.indexOf(hardbreak) >= 0) {
|
||||
fLookAheadHardBreak = TRUE;
|
||||
}
|
||||
else {
|
||||
fLookAheadHardBreak = FALSE;
|
||||
}
|
||||
|
||||
#ifdef RBBI_DEBUG
|
||||
char *debugEnv = getenv("U_RBBIDEBUG");
|
||||
if (debugEnv && uprv_strstr(debugEnv, "data")) {this->printData();}
|
||||
|
|
|
@ -127,6 +127,14 @@ public:
|
|||
const UChar *fRuleSource;
|
||||
|
||||
UTrie fTrie;
|
||||
// if fLookAheadHardBreak is true, we will break at the first lookahead match
|
||||
// the search does not go on further to look for a longer match
|
||||
// this also allows breaks at both ends of the string
|
||||
// e.g. rule "ABC / D; ABCDE" and
|
||||
// text "ABCD ABCDE ABC" will give breaks at
|
||||
// 01234567890123
|
||||
// {0, 3, 4, 5, 8, 9, 10, 11, 14}
|
||||
UBool fLookAheadHardBreak;
|
||||
|
||||
private:
|
||||
int32_t fRefCount;
|
||||
|
|
|
@ -480,6 +480,10 @@ UBool RBBIRuleScanner::doParseActions(EParseAction action)
|
|||
fRB->fDefaultTree = &fRB->fSafeFwdTree;
|
||||
} else if (opt == "safe_reverse") {
|
||||
fRB->fDefaultTree = &fRB->fSafeRevTree;
|
||||
} else if (opt == "lookAheadHardBreak") {
|
||||
// at the moment do nothing for this
|
||||
// the code is handled in rbbi.cpp
|
||||
// todo: think about how to handle this
|
||||
} else {
|
||||
error(U_BRK_UNRECOGNIZED_OPTION);
|
||||
}
|
||||
|
|
|
@ -14,6 +14,7 @@
|
|||
|
||||
!!chain;
|
||||
!!LBCMNoChain;
|
||||
!!lookAheadHardBreak;
|
||||
|
||||
$AI = [:LineBreak = Ambiguous:];
|
||||
$AL = [:LineBreak = Alphabetic:];
|
||||
|
@ -114,6 +115,9 @@ $QU $CM+;
|
|||
$SP $CM+;
|
||||
$SY $CM+;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!forward;
|
||||
|
||||
#
|
||||
# Rule LB 3
|
||||
|
@ -217,111 +221,122 @@ $CM* $ALcm+; # The $CM* is from rule 7C, and unattached CM is treated as AL
|
|||
# at the current position.
|
||||
#
|
||||
|
||||
!!reverse;
|
||||
# !. . [^$LF $CR $NL $BK]* [$BK $CR $LF $NL];
|
||||
## -------------------------------------------------
|
||||
|
||||
! $CM+ $ALPlus;
|
||||
! $CM+ $BA;
|
||||
! $CM+ $BB;
|
||||
! $CM+ $B2;
|
||||
! $CM+ $CL;
|
||||
! $CM+ $EX;
|
||||
! $CM+ $GL;
|
||||
! $CM+ $HY;
|
||||
! $CM+ $ID;
|
||||
! $CM+ $IN;
|
||||
! $CM+ $IS;
|
||||
! $CM+ $NS;
|
||||
! $CM+ $NU;
|
||||
! $CM+ $OP;
|
||||
! $CM+ $PO;
|
||||
! $CM+ $PR;
|
||||
! $CM+ $QU;
|
||||
! $CM+ $SP;
|
||||
! $CM+ $SY;
|
||||
!!reverse;
|
||||
|
||||
|
||||
$CM+ $ALPlus;
|
||||
$CM+ $BA;
|
||||
$CM+ $BB;
|
||||
$CM+ $B2;
|
||||
$CM+ $CL;
|
||||
$CM+ $EX;
|
||||
$CM+ $GL;
|
||||
$CM+ $HY;
|
||||
$CM+ $ID;
|
||||
$CM+ $IN;
|
||||
$CM+ $IS;
|
||||
$CM+ $NS;
|
||||
$CM+ $NU;
|
||||
$CM+ $OP;
|
||||
$CM+ $PO;
|
||||
$CM+ $PR;
|
||||
$CM+ $QU;
|
||||
$CM+ $SP;
|
||||
$CM+ $SY;
|
||||
|
||||
# LB 3
|
||||
|
||||
! ($BK | $CR | $LF | $NL) $LB3NonBreaks?;
|
||||
! ($BK | $CR | $LF | $NL) $CM* $LB5NonBreaks;
|
||||
! $LF $CR;
|
||||
($BK | $CR | $LF | $NL) $LB3NonBreaks;
|
||||
($BK | $CR | $LF | $NL) $CM* $LB5NonBreaks;
|
||||
$LF $CR;
|
||||
|
||||
# LB 4 x SP
|
||||
# x ZW
|
||||
! [$SP $ZW] $LB3NonBreaks;
|
||||
! [$SP $ZW] $CM* $LB5NonBreaks;
|
||||
[$SP $ZW] $LB3NonBreaks;
|
||||
[$SP $ZW] $CM* $LB5NonBreaks;
|
||||
|
||||
# LB 5 Break after zero width space
|
||||
# LB 5 Break after zero width space
|
||||
|
||||
# LB 7 Combining marks. TODO: get it right!
|
||||
# $SP $CM needs to behave like $ID.
|
||||
# X $CM needs to behave like X, where X is not $SP.
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
! $CM+ $LB5NonBreaks; # Stick together any combining sequences that don't match other rules.
|
||||
# LB 6 Jamo is treated like an alphabet
|
||||
|
||||
# LB 7 Combining marks.
|
||||
# $SP $CM needs to behave like $ID.
|
||||
# X $CM needs to behave like X, where X is not $SP.
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
# Stick together any combining sequences that don't match other rules.
|
||||
$CM+ $LB5NonBreaks;
|
||||
|
||||
# LB 8
|
||||
! $CL $CM* $LB5NonBreaks;
|
||||
! $EX $CM* $LB5NonBreaks;
|
||||
! $IS $CM* $LB5NonBreaks;
|
||||
! $SY $CM* $LB5NonBreaks;
|
||||
$CL $CM* $LB5NonBreaks;
|
||||
$EX $CM* $LB5NonBreaks;
|
||||
$IS $CM* $LB5NonBreaks;
|
||||
$SY $CM* $LB5NonBreaks;
|
||||
|
||||
# LB 9
|
||||
! .? $SP* $CM* $OP;
|
||||
! $CM* $LB5NonBreaks $SP* $CM* $OP;
|
||||
$LB5NonBreaks $SP* $CM* $OP;
|
||||
|
||||
# LB 10
|
||||
! $CM* $OP $SP* $CM* $QU;
|
||||
$CM* $OP $SP* $CM* $QU;
|
||||
|
||||
# LB 11
|
||||
! $CM* $NS $SP* $CM* $CL;
|
||||
$CM* $NS $SP* $CM* $CL;
|
||||
|
||||
# LB 11a
|
||||
! ($CM* $B2)+;
|
||||
($CM* $B2)+;
|
||||
|
||||
# LB 11b
|
||||
! .? $CM* $GL $CM* $LB5NonBreaks;
|
||||
! $CM* $LB5NonBreaks $CM* $GL $CM* $LB5NonBreaks;
|
||||
! $LB3NonBreaks? $CM* $GL;
|
||||
! $CM* $LB5NonBreaks $CM* $GL;
|
||||
$CM* $GL $CM* $LB5NonBreaks;
|
||||
$CM* $LB5NonBreaks $CM* $GL;
|
||||
$LB3NonBreaks $CM* $GL;
|
||||
|
||||
# LB 12
|
||||
|
||||
# LB 14
|
||||
! .? ($CM* $QU)+ $CM* $LB12NonBreaks;
|
||||
! $CM* $LB5NonBreaks ($CM* $QU)+ $CM* $LB12NonBreaks;
|
||||
! .? ($CM* $QU)+ $CM+ $SP; # LB7a SP CM+ behaves as ID
|
||||
! $CM* $LB5NonBreaks ($CM* $QU)+ $CM+ $SP;
|
||||
|
||||
! $LB3NonBreaks? $CM* $QU;
|
||||
! $CM* $LB5NonBreaks $CM* $QU; # Don't let a combining mark go onto $CR, $BK, etc.
|
||||
$CM* $QU $CM* $LB12NonBreaks;
|
||||
$CM* $QU $CM+ $SP;
|
||||
$CM* $LB5NonBreaks $CM* $QU;
|
||||
|
||||
# LB 14a
|
||||
$BackLB14CanBreakAfter = ($CM* [$LB14NonBreaks - $CM]) | ($CM+ $SP);
|
||||
|
||||
# LB 15
|
||||
! ($CM* $BA | $CM* $HY | $CM* $NS) $BackLB14CanBreakAfter;
|
||||
! ($CM* $BA | $CM* $HY | $CM* $NS) $CM+ / [$BK $CR $LF $NL $ZW];
|
||||
! [^$CB] $CM* $BB;
|
||||
! $CM* [^$CB $CR $LF $BK $NL $ZW] $CM* $BB;
|
||||
$CM* ($BA | $HY | $NS) $BackLB14CanBreakAfter;
|
||||
($CM* ($BA | $HY | $NS))+ $CM+ / [$BK $CR $LF $NL $ZW];
|
||||
[$CR $LF $BK $NL $ZW] $CM* $BB;
|
||||
$CM* [^$CB $CR $LF $BK $NL $ZW] $CM* $BB;
|
||||
|
||||
# LB 16
|
||||
! $CM* $IN $CM* $ALPlus;
|
||||
! $CM* $IN $CM+ / [$BK $CR $LF $NL $ZW]; # by rule 7c, any otherwise unattached CM behaves as AL
|
||||
! $CM* $IN $CM* $ID;
|
||||
! $CM* $IN $CM+ $SP; # by rule 7a, $SP $CM behaves like ID
|
||||
! $CM* $IN $CM* $IN;
|
||||
! $CM* $IN $CM* $NU;
|
||||
$CM* $IN $CM* $ALPlus;
|
||||
# by rule 7c, any otherwise unattached CM behaves as AL
|
||||
$CM* $IN $CM+ / [$BK $CR $LF $NL $ZW];
|
||||
|
||||
$CM* $IN $CM* ($ID | $CM $SP);
|
||||
$CM* $IN $CM* $IN;
|
||||
$CM* $IN $CM* $NU;
|
||||
|
||||
# $LB 17
|
||||
! $CM* $PO ($CM* $ID | $CM+ $SP);
|
||||
! $CM* $NU ($CM* $ALPlus)+; # includes $LB19
|
||||
! ($CM* $NU)+;
|
||||
! ($CM* $NU)+ $CM+ / [$BK $CR $LF $NL $ZW]; # Rule 7c
|
||||
! ($CM* $ALPlus)+ $CM* $NU;
|
||||
$CM* $PO $CM* ($ID | $CM $SP);
|
||||
$CM* $NU ($CM* $ALPlus)+; # includes $LB19
|
||||
$CM* $NU $CM+ / [$BK $CR $LF $NL $ZW]; # Rule 7c
|
||||
|
||||
$CM* $ALPlus $CM* $NU;
|
||||
|
||||
# LB 18
|
||||
! ($CM* $PO)? ($CM* $CL)? ($CM* $NU | $CM* $IS)* $CM* $NU ($CM* $OP | $CM* $HY)? ($CM* $PR)?;
|
||||
($CM* $PO)? ($CM* $CL)? ($CM* $NU | $CM* $IS)* $CM* $NU ($CM* $OP | $CM* $HY)? ($CM* $PR)?;
|
||||
|
||||
# LB 19
|
||||
! ($CM* $ALPlus)+;
|
||||
! ($CM* $ALPlus)+ $CM+ / [$BK $CR $LF $NL $ZW]; # The $CM* is from rule 7C, and unattached CM is treated as AL
|
||||
$CM* $ALPlus $CM* $ALPlus;
|
||||
# The $CM* is from rule 7C, and unattached CM is treated as AL
|
||||
$CM* $ALPlus $CM+ / [$BK $CR $LF $NL $ZW];
|
||||
|
||||
## Problem: the state table can't handle lookahead when it is at the
|
||||
## start of the string, currently handled in the rbbi code
|
||||
## todo fix this
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_reverse;
|
||||
|
||||
$CM* [^$CM];
|
|
@ -587,9 +587,9 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
|
|||
if(exec) TestWordBreaks(); break;
|
||||
case 13: name = "TestWordBoundary";
|
||||
if(exec) TestWordBoundary(); break;
|
||||
/***
|
||||
case 14: name = "TestLineBreaks";
|
||||
if(exec) TestLineBreaks(); break;
|
||||
/***
|
||||
case 15: name = "TestSentBreaks";
|
||||
if(exec) TestSentBreaks(); break;
|
||||
case 16: name = "TestExtended";
|
||||
|
@ -602,8 +602,8 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
|
|||
logln("skipping TestMonkey (UCONFIG_NO_REGULAR_EXPRESSIONS)");
|
||||
#endif
|
||||
}
|
||||
***/
|
||||
break;
|
||||
***/
|
||||
default: name = ""; break; //needed to end loop
|
||||
}
|
||||
}
|
||||
|
@ -2735,6 +2735,7 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
|
|||
if (fSP->contains(thisChar)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (fZW->contains(thisChar)) {
|
||||
continue;
|
||||
}
|
||||
|
@ -2745,11 +2746,16 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
|
|||
}
|
||||
|
||||
// LB 6, LB 7
|
||||
int32_t oldpos = pos;
|
||||
rule67Adjust(prevPos, &prevChar, &pos, &thisChar);
|
||||
|
||||
|
||||
nextCPPos = fText->moveIndex32(pos, 1);
|
||||
nextPos = nextCPPos;
|
||||
c = fText->char32At(nextPos);
|
||||
// another peculiarity of LB 4
|
||||
if (fSP->contains(thisChar)) {
|
||||
continue;
|
||||
}
|
||||
rule67Adjust(pos, &thisChar, &nextPos, &c);
|
||||
|
||||
// If the loop is still warming up - if we haven't shifted the initial
|
||||
|
@ -2785,11 +2791,16 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
|
|||
}
|
||||
|
||||
// LB 9 Don't break after OP SP*
|
||||
/// UBool cmFlag = FALSE;
|
||||
for (tPos=prevPos; ; tPos=fCharBI->preceding(tPos)) {
|
||||
if (fOP->contains(fText->char32At(tPos))) {
|
||||
break;
|
||||
}
|
||||
if (fSP->contains(fText->char32At(tPos)) == FALSE || tPos == 0) {
|
||||
if (fSP->contains(prevChar) == FALSE
|
||||
|| fSP->contains(fText->char32At(tPos)) == FALSE
|
||||
|| tPos == 0) {
|
||||
/// || cmFlag == TRUE) {
|
||||
// if we have $SP$CM+ which is an $ID
|
||||
goto fall_through_9;
|
||||
}
|
||||
}
|
||||
|
@ -3164,7 +3175,13 @@ void RBBITest::TestLineBreaks(void)
|
|||
BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
|
||||
UChar str[20];
|
||||
char *strlist[] =
|
||||
{"\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
|
||||
{
|
||||
"\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
|
||||
"\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
|
||||
"\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
|
||||
"\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
|
||||
"\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
|
||||
"\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
|
||||
"\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
|
||||
"\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
|
||||
"\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
|
||||
|
@ -3184,10 +3201,9 @@ void RBBITest::TestLineBreaks(void)
|
|||
};
|
||||
int loop;
|
||||
for (loop = 0; loop < (sizeof(strlist) / sizeof(char *)); loop ++) {
|
||||
printf("looping %d\n", loop);
|
||||
// printf("looping %d\n", loop);
|
||||
u_unescape(strlist[loop], str, 20);
|
||||
UnicodeString ustr(str);
|
||||
// RBBICharMonkey monkey;
|
||||
RBBILineMonkey monkey;
|
||||
|
||||
int expected[20];
|
||||
|
@ -3214,8 +3230,9 @@ void RBBITest::TestLineBreaks(void)
|
|||
printStringBreaks(ustr, expected, expectedcount);
|
||||
errln("happy break test failed: missed %d match",
|
||||
expectedcount - count);
|
||||
break;
|
||||
}
|
||||
for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
|
||||
for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
|
||||
count --;
|
||||
if (forward[count] != i) {
|
||||
printStringBreaks(ustr, expected, expectedcount);
|
||||
|
@ -3226,6 +3243,7 @@ void RBBITest::TestLineBreaks(void)
|
|||
}
|
||||
if (count != 0) {
|
||||
errln("happy break test failed: missed a match");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -108,8 +108,8 @@ DataHeader dh ={
|
|||
0, // reserved
|
||||
|
||||
{ 0x42, 0x72, 0x6b, 0x20 }, // dataFormat="Brk "
|
||||
{ 2, 1, 0, 0 }, // formatVersion
|
||||
{ 3, 1, 0, 0 } // dataVersion (Unicode version)
|
||||
{ 3, 0, 0, 0 }, // formatVersion
|
||||
{ 4, 0, 0, 0 } // dataVersion (Unicode version)
|
||||
}};
|
||||
|
||||
#endif
|
||||
|
|
Loading…
Add table
Reference in a new issue