diff --git a/icu4c/source/data/brkitr/sent.txt b/icu4c/source/data/brkitr/sent.txt index 4f5449c7197..efb5a000c07 100644 --- a/icu4c/source/data/brkitr/sent.txt +++ b/icu4c/source/data/brkitr/sent.txt @@ -78,31 +78,75 @@ $EndSequence = $InteriorChars* $SepSeq?; !!reverse; -# -# Reverse Rules -# -$EndGorp = ($Term | $ATerm | $Sep | $Close | $Extend | $Format | $Sp); -$RevEndSequence = $EndGorp* $InteriorChars* $EndGorp* | $Sep [^$ATerm $Term]*; -$ReverseLowerWordFollows = $Lower [^$OLetter $Upper $Lower $Sep]* $ATerm $InteriorChars*; -$ReverseUpperSurround = $Upper $Format* $Extend* $ATerm $Format* $Extend* $Upper $InteriorChars*; -$ReverseNumberFollows = $Numeric $Format* $Extend* $ATerm $InteriorChars*; +# rule 6 -$RevEndSequence ($ReverseLowerWordFollows | $ReverseUpperSurround | $ReverseNumberFollows)* .?; +$RULE6 = $Numeric $Format* $Extend* $ATerm; + +# rule 7 + +$RULE7 = $Upper $Format* $Extend* $ATerm $Format* $Extend* $Upper; + +# rule 8 + +$RULE8 = $Lower ($Format* $Extend* [^$OLetter $Upper $Lower $Sep])* + ($Format* $Extend* $Sp)* ($Format* $Extend* $Close)* + $Format* $Extend* $ATerm; + +# rule 9, 10, 11 + +# $CR $LF +$End = $Sep | \u000a\u000d + | $Format* $Extend* $Sp* $Format* $Extend* $Close* $Format* + $Extend* ($Term | $ATerm) + | $Sep $Format* $Extend* $Sp* $Format* $Extend* $Close* $Format* + $Extend* ($Term | $ATerm); + +# rule 12 + +$RULE12 = [^$Sep $Term $ATerm]; + +$Join = ($RULE6 | $RULE7 | $RULE8 | $RULE12)*; + +$End; + +$End? $Join [$RULE12 - $Sp - $Close]; + +# forces a break at the beginning of text "$Sp blah blah blah" +# remember the break iterators takes the longest match +$End? $Join $Sp / [^$Term $ATerm $Sp $Close]; + +# forces a break at the beginning of text "$Close blah blah blah" +$End? $Join $Close / [^$Term $ATerm $Close]; ## ------------------------------------------------- -## !!safe_reverse; +!!safe_reverse; + +# rule 4 +$Extend+ [^$Extend]; # rule 7 -## $Extend* $ATerm $Format* $Extend* $Upper; +$Extend* $ATerm $Format* $Extend* $Upper; + +# rule 8 +($Extend* $Term)+ ($Extend* $Sp $Format*)* ($Extend* $Close $Format*)* $Extend* $ATerm; # rule 11 -## ($Extend* $Sp $Format*)* ($Extend* $Close $Format*)* $Extend* ($Term | $ATerm); +($Extend* $Sp $Format*)* ($Extend* $Close $Format*)*; +($Extend* $Sp $Format*)* ($Extend* $Close $Format*)* $Extend* ($Term | $ATerm); ## ------------------------------------------------- !!safe_forward; +# rule 7 + +$ATerm $Extend* $Format* $Upper; + # rule 8 -## $Lower .; +$Lower .; + +# rule 11 + +($Close $Extend* $Format*)* ($Sp $Extend* $Format*)*; \ No newline at end of file diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index b6952473bc7..51271ed649a 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -3273,7 +3273,9 @@ void RBBITest::TestSentBreaks(void) BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status); UChar str[100]; char *strlist[] = - {"This\n", + { + "Now\ris\nthe\r\ntime\n\rfor\r\r", + "This\n", "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.", "\"Sentence ending with a quote.\" Bye.", " (This is it). Testing the sentence iterator. \"This isn't it.\"", @@ -3295,19 +3297,7 @@ void RBBITest::TestSentBreaks(void) for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) { forward[count ++] = i; } - int tempcount = count; - for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) { - tempcount --; - if (forward[tempcount] != i) { - printStringBreaks(ustr, forward, count); - errln("happy break test reverse failed: expected %d but got %d", - forward[tempcount], i); - break; - } - } - if (tempcount != 0) { - errln("happy break test failed: missed a match"); - } + testBreakBoundPreceding(this, ustr, bi, forward, count); } }