ICU-2292 sentence break rules updated

X-SVN-Rev: 13649
This commit is contained in:
Syn Wee Quek 2003-11-09 20:32:00 +00:00
parent 41ac2f557b
commit cea200bf0a
2 changed files with 61 additions and 27 deletions

View file

@ -78,31 +78,75 @@ $EndSequence = $InteriorChars* $SepSeq?;
!!reverse;
#
# Reverse Rules
#
$EndGorp = ($Term | $ATerm | $Sep | $Close | $Extend | $Format | $Sp);
$RevEndSequence = $EndGorp* $InteriorChars* $EndGorp* | $Sep [^$ATerm $Term]*;
$ReverseLowerWordFollows = $Lower [^$OLetter $Upper $Lower $Sep]* $ATerm $InteriorChars*;
$ReverseUpperSurround = $Upper $Format* $Extend* $ATerm $Format* $Extend* $Upper $InteriorChars*;
$ReverseNumberFollows = $Numeric $Format* $Extend* $ATerm $InteriorChars*;
# rule 6
$RevEndSequence ($ReverseLowerWordFollows | $ReverseUpperSurround | $ReverseNumberFollows)* .?;
$RULE6 = $Numeric $Format* $Extend* $ATerm;
# rule 7
$RULE7 = $Upper $Format* $Extend* $ATerm $Format* $Extend* $Upper;
# rule 8
$RULE8 = $Lower ($Format* $Extend* [^$OLetter $Upper $Lower $Sep])*
($Format* $Extend* $Sp)* ($Format* $Extend* $Close)*
$Format* $Extend* $ATerm;
# rule 9, 10, 11
# $CR $LF -- written below as \u000a\u000d (LF CR) because reverse rules match the text backwards
$End = $Sep | \u000a\u000d
| $Format* $Extend* $Sp* $Format* $Extend* $Close* $Format*
$Extend* ($Term | $ATerm)
| $Sep $Format* $Extend* $Sp* $Format* $Extend* $Close* $Format*
$Extend* ($Term | $ATerm);
# rule 12
$RULE12 = [^$Sep $Term $ATerm];
$Join = ($RULE6 | $RULE7 | $RULE8 | $RULE12)*;
$End;
$End? $Join [$RULE12 - $Sp - $Close];
# forces a break at the beginning of text "$Sp blah blah blah"
# remember that the break iterator takes the longest match
$End? $Join $Sp / [^$Term $ATerm $Sp $Close];
# forces a break at the beginning of text "$Close blah blah blah"
$End? $Join $Close / [^$Term $ATerm $Close];
## -------------------------------------------------
## !!safe_reverse;
!!safe_reverse;
# rule 4
$Extend+ [^$Extend];
# rule 7
## $Extend* $ATerm $Format* $Extend* $Upper;
$Extend* $ATerm $Format* $Extend* $Upper;
# rule 8
($Extend* $Term)+ ($Extend* $Sp $Format*)* ($Extend* $Close $Format*)* $Extend* $ATerm;
# rule 11
## ($Extend* $Sp $Format*)* ($Extend* $Close $Format*)* $Extend* ($Term | $ATerm);
($Extend* $Sp $Format*)* ($Extend* $Close $Format*)*;
($Extend* $Sp $Format*)* ($Extend* $Close $Format*)* $Extend* ($Term | $ATerm);
## -------------------------------------------------
!!safe_forward;
# rule 7
$ATerm $Extend* $Format* $Upper;
# rule 8
## $Lower .;
$Lower .;
# rule 11
($Close $Extend* $Format*)* ($Sp $Extend* $Format*)*;

View file

@ -3273,7 +3273,9 @@ void RBBITest::TestSentBreaks(void)
BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
UChar str[100];
char *strlist[] =
{"This\n",
{
"Now\ris\nthe\r\ntime\n\rfor\r\r",
"This\n",
"Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
"\"Sentence ending with a quote.\" Bye.",
" (This is it). Testing the sentence iterator. \"This isn't it.\"",
@ -3295,19 +3297,7 @@ void RBBITest::TestSentBreaks(void)
for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
forward[count ++] = i;
}
int tempcount = count;
for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
tempcount --;
if (forward[tempcount] != i) {
printStringBreaks(ustr, forward, count);
errln("happy break test reverse failed: expected %d but got %d",
forward[tempcount], i);
break;
}
}
if (tempcount != 0) {
errln("happy break test failed: missed a match");
}
testBreakBoundPreceding(this, ustr, bi, forward, count);
}
}