ICU-2231 RBBI Sentence Break Rules and test updated to match draft of TR 29

X-SVN-Rev: 9823
This commit is contained in:
Andy Heninger 2002-08-30 21:37:59 +00:00
parent bbc62da408
commit 3144b2665e
3 changed files with 103 additions and 95 deletions

View file

@ -1,80 +1,76 @@
# file: sent.txt Sentence Boundary Rules.
#
#
# Copyright (C) 2002, International Business Machines Corporation and others.
# All Rights Reserved.
#
# file: sent.txt
#
# ICU Sentence Break Rules
# See Unicode Technical Report #29.
# These rules are based on the proposed draft dated 2002-08-09
#
# Separators are line or paragraph ends that will attach to the end of sentences.
$Sep =[\n \r \u0085 \u2028 \u2029];
$SepSeq = $Sep | \u000d\u000a;
$Sp = [[:Zs:] - $Sep];
# $ATerm contains ambiguous terminators, characters that may or may not terminate
# a sentence depending on the context.
# $Term contains $ATerm + all characters that unambiguously end sentences.
#
$ATerm = [\u002e \u0589 \u3001]; # same as Terminal_Punctuation2 from TR29
$Term = [$ATerm \u0021 \u003f \u037e \u061f \u06d4 \u203c \u203d
\u3002 \u2048 \u2049
\u0964]; # TODO: these (this line) not yet decided in TR29.
$Lower = [[:Ll:] [:Sk:]];
$Upper = [[:Lu:] [:Lt:]];
$NotLetter = [^[:L:] $Term];
$Open = [:Ps:];
$Close = [[:Pe:] \" \'];
#
# Combining chars. Copied from UNIDATA/DerivedCoreProperties.txt
#
$Extend =
[\u0300-\u034E \u0360-\u036F \u0483-\u0486 \u0488-\u0489 \u0591-\u05A1 \u05A3-\u05B9
\u05BB-\u05BD \u05BF \u05C1-\u05C2 \u05C4 \u064B-\u0655 \u0670 \u06D6-\u06DC
\u06DE \u06DF-\u06E4 \u06E7-\u06E8 \u06EA-\u06ED \u0711 \u0730-\u074A
\u07A6-\u07B0 \u0901-\u0902 \u0903 \u093C \u093E-\u0940 \u0941-\u0948
\u0949-\u094C \u0951-\u0954 \u0962-\u0963 \u0981 \u0982-\u0983 \u09BC
\u09BE-\u09C0 \u09C1-\u09C4 \u09C7-\u09C8 \u09CB-\u09CC \u09D7 \u09E2-\u09E3
\u0A02 \u0A3C \u0A3E-\u0A40 \u0A41-\u0A42 \u0A47-\u0A48 \u0A4B-\u0A4C
\u0A70-\u0A71 \u0A81-\u0A82 \u0A83 \u0ABC \u0ABE-\u0AC0 \u0AC1-\u0AC5
\u0AC7-\u0AC8 \u0AC9 \u0ACB-\u0ACC \u0B01 \u0B02-\u0B03 \u0B3C \u0B3E
\u0B3F \u0B40 \u0B41-\u0B43 \u0B47-\u0B48 \u0B4B-\u0B4C \u0B56 \u0B57
\u0B82 \u0BBE-\u0BBF \u0BC0 \u0BC1-\u0BC2 \u0BC6-\u0BC8 \u0BCA-\u0BCC \u0BD7
\u0C01-\u0C03 \u0C3E-\u0C40 \u0C41-\u0C44 \u0C46-\u0C48 \u0C4A-\u0C4C
\u0C55-\u0C56 \u0C82-\u0C83 \u0CBE \u0CBF \u0CC0-\u0CC4 \u0CC6
\u0CC7-\u0CC8 \u0CCA-\u0CCB \u0CCC \u0CD5-\u0CD6 \u0D02-\u0D03 \u0D3E-\u0D40
\u0D41-\u0D43 \u0D46-\u0D48 \u0D4A-\u0D4C \u0D57 \u0D82-\u0D83 \u0DCF-\u0DD1
\u0DD2-\u0DD4 \u0DD6 \u0DD8-\u0DDF \u0DF2-\u0DF3 \u0E31 \u0E34-\u0E39
\u0E47-\u0E4E \u0EB1 \u0EB4-\u0EB9 \u0EBB-\u0EBC \u0EC8-\u0ECD \u0F18-\u0F19
\u0F35 \u0F37 \u0F39 \u0F3E-\u0F3F \u0F71-\u0F7E \u0F7F \u0F80-\u0F84
\u0F86-\u0F87 \u0F90-\u0F97 \u0F99-\u0FBC \u0FC6 \u102C \u102D-\u1030 \u1031
\u1032 \u1036-\u1037 \u1038 \u1056-\u1057 \u1058-\u1059 \u1712-\u1714
\u1732-\u1734 \u1752-\u1753 \u1772-\u1773 \u17B4-\u17B6 \u17B7-\u17BD
\u17BE-\u17C5 \u17C6 \u17C7-\u17C8 \u17C9-\u17D1 \u17D3 \u180B-\u180D
\u18A9 \u20D0-\u20DC \u20DD-\u20E0 \u20E1 \u20E2-\u20E4 \u20E5-\u20EA
\u302A-\u302F \u3099-\u309A \uFB1E \uFE00-\uFE0F \uFE20-\uFE23 \uFF9E-\uFF9F
\U0001D165-\U0001D166 \U0001D167-\U0001D169 \U0001D16D-\U0001D172
\U0001D17B-\U0001D182 \U0001D185-\U0001D18B \U0001D1AA-\U0001D1AD];
#
# Character categories as defined in TR 29
#
$Sep = [\u000d \u000a \u0085 \u2028 \u2029];
$Format = [[:Cf:]];
$Sp = [[:Whitespace:] - $Sep];
$Lower = [[:Lowercase:]];
$Upper = [[:Lt:] [:Uppercase:]];
$OLetter = [[:Alphabetic:] \u02b9-\u02ba \u02c2-\u02cf \u02d2-\u02df \u02e5-\u02ed \u05f3];
# The chars listed by number below are those with "Linebreak=QU"
$Close = [[:Pe:] [:Po:] \u0022 \u0027 \u00AB \u00BB \u2018 \u2019 \u201B-\u201C
\u201D \u201F \u2039 \u203A \u23B6 \u275B-\u275E ];
$ATerm = [\u002e];
$Term = [\u0021 \u003F \u0589 \u061F \u06D4 \u0701 \u0702 \u0700 \u0964
\u1362 \u1367 \u1368 \u1803 \u203C \u203D \u2048 \u2049 \u3002
\uFE52 \uFE57 \uFF01 \uFF0E \uFF1F \uFF61];
$AnyTerm = [$ATerm $Term];
# From Grapheme Cluster
$Extend = [[:Mn:] [:Me:] [:Mc:] \uff9e-\uff9f]; # FF9E..FF9F ; Other_Grapheme_Extend
#
# $SepSeq keeps together CRLF as a separator.
#
$SepSeq = $Sep | \u000d\u000a;
# $InteriorChars are those that never cause a break.
$InteriorChars = [^$AnyTerm $Sep];
$EndSequence = [^$Term]* $Term ($Close | $Term | $Extend)* $Sp* $SepSeq?;
$LowerWordFollows = [^$Term]* $ATerm $Close* $Sp* $SepSeq? $NotLetter* $Lower;
$UpperWordPrecedes = [^$Term]* $Upper ($Lower | $Extend)* $ATerm $Close* $Sp* $SepSeq?;
($LowerWordFollows | $UpperWordPrecedes)* $EndSequence;
#
# In cases where the input text ends without a normal end-of-sentence sequence,
# this rule will match whatever text is there.
#
[^$Term]*;
# Sentence Break Rules 8, 9, 11
# $EndSequence matches a "Normal" sentence, which is one not containing any extra ATerms (periods)
# that do not cause a break for one exceptional reason or another.
$EndSequence = $InteriorChars* $AnyTerm? ($Close | $AnyTerm | $Format | $Extend)*
($AnyTerm | $Format | $Sp | $Extend)* $SepSeq?;
# Rule 6. Matches a sentence fragment containing "." that should not cause a sentence break,
# because a lower case word follows the period.
$LowerWordFollows = $InteriorChars* $ATerm [^$OLetter $Upper]* $Lower;
# Rule 7. $UpperFollowsImmediately
# Matches a fragment containing a "." that should not cause a sentence break
# because an uppercase letter follows the period with no intervening spaces.
$UpperFollowsImmediately = $InteriorChars* $ATerm ($Format | $Extend)* $Upper;
# Put them all together.
($LowerWordFollows | $UpperFollowsImmediately)* $EndSequence;
#
# Reverse Rules
#
$RevEndSequence = [^$Term]* ($Term | $Close | $Extend)* [^$Term]*;
$ReverseLowerWordFollows = $Lower ($Close | $Sp | $Sep | $Extend | $NotLetter)* $ATerm [^$Term]*;
$ReverseUpperWordPrecedes = $ATerm ($Lower | $Extend)* $Upper [^$Term]*;
! $RevEndSequence? ($ReverseLowerWordFollows | $ReverseUpperWordPrecedes)* $Term?;
!.;
#
# Reverse Rules
#
$EndGorp = ($AnyTerm | $Sep | $Close | $Extend | $Format | $Sp);
$RevEndSequence = $EndGorp* $InteriorChars* $EndGorp*;
$ReverseLowerWordFollows = $Lower [^$OLetter $Upper]* $ATerm $InteriorChars*;
$ReverseUpperFollowsIm = $Upper ($Format | $Extend)* $ATerm $InteriorChars*;
! $RevEndSequence? ($ReverseLowerWordFollows | $ReverseUpperFollowsIm)* .?;
#! .*;

View file

@ -339,6 +339,8 @@ void RBBIAPITest::TestFirstNextFollowing()
status=U_ZERO_ERROR;
testString="Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.";
// 0123456789012345678901234567890123456789012345678901234567890123 45678901234567890123456789
// 0 1 2 3 4 5 6 7 8
RuleBasedBreakIterator* sentIter1=(RuleBasedBreakIterator*)RuleBasedBreakIterator::createSentenceInstance(Locale::getDefault(), status);
if(U_FAILURE(status))
errln("FAIL : in construction");
@ -357,10 +359,10 @@ void RBBIAPITest::TestFirstNextFollowing()
q=sentIter1->next(-2);
doTest(testString, p, q, 7, "how are you? I'am fine. ");
p=q;
q=sentIter1->next(3);
q=sentIter1->next(4);
doTest(testString, p, q, 60, "how are you? I'am fine. Thankyou. How are you doing? ");
p=q;
q=sentIter1->next();
q=sentIter1->next(2);
doTest(testString, p, q, 83, "This\n costs $20,00,000.");
q=sentIter1->following(1);
doTest(testString, 1, q, 7, "ello! ");
@ -511,12 +513,13 @@ void RBBIAPITest::TestLastPreviousPreceding()
if(p != testString.length() )
errln((UnicodeString)"ERROR: last() returned" + p + (UnicodeString)"instead of " + testString.length());
q=sentIter1->previous();
q=sentIter1->previous();
doTest(testString, p, q, 60, "This\n costs $20,00,000.");
p=q;
q=sentIter1->previous();
doTest(testString, p, q, 31, "Thankyou. How are you doing? ");
// q=sentIter1->preceding(40);
// doTest(testString, 40, q, 31, "Thankyou.");
doTest(testString, p, q, 41, "How are you doing? ");
q=sentIter1->preceding(40);
doTest(testString, 40, q, 31, "Thankyou.");
q=sentIter1->preceding(25);
doTest(testString, 25, q, 20, "I'am ");
sentIter1->first();

View file

@ -143,13 +143,12 @@ void BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx
int32_t expected = fExpectedBreakPositions.elementAti(expectedIdx);
int32_t actual = fActualBreakPositions.elementAti(actualIdx);
int32_t o = 0;
int32_t line = fLineNum.elementAti(0);
int32_t line = fLineNum.elementAti(expectedIdx);
if (expectedIdx > 0) {
// The line numbers are off by one because a premature break occurs somewhere
// within the previous item, rather than at the start of the current (expected) item.
// Similarly, we want to report the offset of the unexpected break from the start of
// We want to report the offset of the unexpected break from the start of
// this previous item.
line = fLineNum.elementAti(expectedIdx-1);
o = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
}
if (actual < expected) {
@ -591,7 +590,6 @@ void RBBITest::TestDefaultRuleBasedSentenceIteration()
ADD_DATACHUNK(sentdata, "Testing the sentence iterator. ", 0, status);
ADD_DATACHUNK(sentdata, "\"This isn\'t it.\" ", 0, status);
ADD_DATACHUNK(sentdata, "Hi! ", 0, status);
//sentdata = new Vector();
ADD_DATACHUNK(sentdata, "This is a simple sample sentence. ", 0, status);
ADD_DATACHUNK(sentdata, "(This is it.) ", 0, status);
ADD_DATACHUNK(sentdata, "This is a simple sample sentence. ", 0, status);
@ -609,7 +607,6 @@ void RBBITest::TestDefaultRuleBasedSentenceIteration()
ADD_DATACHUNK(sentdata, "Not on my time (el timo.)! ", 0, status);
ADD_DATACHUNK(sentdata, "So what!!\\u2029", 0, status); // Paragraph Separator
ADD_DATACHUNK(sentdata, "\"But now,\" he said, \"I know!\" ", 0, status);
ADD_DATACHUNK(sentdata, "Harris thumbed down several, including \"Away We Go\" (which became the huge success Oklahoma!). ", 0, status);
ADD_DATACHUNK(sentdata, "One species, B. anthracis, is highly virulent.\n", 0, status);
@ -621,7 +618,18 @@ void RBBITest::TestDefaultRuleBasedSentenceIteration()
ADD_DATACHUNK(sentdata, "What is the proper use of the abbreviation pp.? ", 0, status);
ADD_DATACHUNK(sentdata, "Yes, I am definatelly 12\" tall!!", 0, status);
// test for bug #4113835: \n and \r count as spaces, not as paragraph breaks
ADD_DATACHUNK(sentdata, "Now\ris\nthe\r\ntime\n\rfor\r\rall\\u037e", 0, status);
// And then, revised again for TR29. \n and \r do count as paragraph breaks.
ADD_DATACHUNK(sentdata, "Now\r", 0, status);
ADD_DATACHUNK(sentdata, "is\n", 0, status);
ADD_DATACHUNK(sentdata, "the\r\n", 0, status);
ADD_DATACHUNK(sentdata, "time\n", 0, status);
ADD_DATACHUNK(sentdata, "\r", 0, status);
ADD_DATACHUNK(sentdata, "for\r", 0, status);
ADD_DATACHUNK(sentdata, "\r", 0, status);
// ADD_DATACHUNK(sentdata, "all\\u037e", 0, status); TODO: Greek question mark
// Why isn't it a sentence ender?
ADD_DATACHUNK(sentdata, "No breaks when . is followed .Immediately by an .Upper case Letter. ", 0, status);
// test that it doesn't break sentences at the boundary between CJK
// and other letters
@ -638,21 +646,19 @@ void RBBITest::TestDefaultRuleBasedSentenceIteration()
// Treat fullwidth variants of .!? the same as their
// normal counterparts
#if 0 // Not according to TR29. TODO: what is the right thing for these chars?
ADD_DATACHUNK(sentdata, "I know I'm right\\uff0e ", 0, status);
ADD_DATACHUNK(sentdata, "Right\\uff1f ", 0, status);
ADD_DATACHUNK(sentdata, "Right\\uff01 ", 0, status);
#endif
// Don't break sentences at boundary between CJK and digits
ADD_DATACHUNK(sentdata, "\\u5487\\u67ff\\ue591\\u5017\\u61b3\\u60a1\\u9510\\u8165\\u9de8"
"\\u97e48888\\u821c\\u8165\\u7fc8\\u51ce\\u306d\\ue30b\\u2494\\u56d8\\u4ec0"
"\\u60b1\\u8560\\u51ba\\u611d\\u57b6\\u2510\\u5d46\\u97e5\\u7751\\u3001", 0, status);
"\\u60b1\\u8560\\u51ba\\u611d\\u57b6\\u2510\\u5d46\\u97e5\\u7751\\u3002", 0, status);
// Break sentence between a sentence terminator and
// opening punctuation
ADD_DATACHUNK(sentdata, "How do you do?", 0, status);
ADD_DATACHUNK(sentdata, "(fine).", 0, status);
ADD_DATACHUNK(sentdata, "(fine). ", 0, status);
// test for bug #4158381: Don't break sentence after period if it isn't
// followed by a space
@ -675,7 +681,8 @@ void RBBITest::TestDefaultRuleBasedSentenceIteration()
// letter are treated correctly
// Unicode TR29 reverses above bug: Don't break a sentence if the last word begins with an upper case letter.
ADD_DATACHUNK(sentdata, "The type of all primitive <code>boolean</code> values accessed in the "
"target VM. Calls to xxx will return an implementor of this interface. \\u2029", 0, status);
"target VM. ", 0, status);
ADD_DATACHUNK(sentdata, "Calls to xxx will return an implementor of this interface. \\u2029", 0, status);
// test for bug #4152117: Make sure sentence breaking is handling
// punctuation correctly [COULD NOT REPRODUCE THIS BUG, BUT TEST IS
@ -697,8 +704,10 @@ void RBBITest::TestDefaultRuleBasedSentenceIteration()
"\\u0939\\u0948?", 0, status);
ADD_DATACHUNK(sentdata,
"\\u092e\\u0948 \\u0905" halfCHA "\\u091b\\u093e \\u0939\\u0942\\u0901\\u0964 ", 0, status);
ADD_DATACHUNK(sentdata, "\\u0905\\u093e\\u092a\r\n \\u0915\\u0948\\u0938\\u0947 \\u0939\\u0948?", 0, status);
ADD_DATACHUNK(sentdata, "\\u0935\\u0939 " halfKA "\\u092f\\u093e\n \\u0939\\u0948?", 0, status);
ADD_DATACHUNK(sentdata, "\\u0905\\u093e\\u092a\r\n", 0, status);
ADD_DATACHUNK(sentdata, "\\u0915\\u0948\\u0938\\u0947 \\u0939\\u0948?", 0, status);
ADD_DATACHUNK(sentdata, "\\u0935\\u0939 " halfKA "\\u092f\\u093e\n", 0, status);
ADD_DATACHUNK(sentdata, "\\u0939\\u0948?", 0, status);
ADD_DATACHUNK(sentdata, "\\u092f\\u0939 \\u0905\\u093e\\u092e \\u0939\\u0948. ", 0, status);
ADD_DATACHUNK(sentdata, "\\u092f\\u0939 means \"this\". ", 0, status);
ADD_DATACHUNK(sentdata, "\"\\u092a\\u095d\\u093e\\u0908\" meaning \"education\" or \"studies\". ", 0, status);
@ -734,12 +743,11 @@ void RBBITest::TestDefaultRuleBasedSentenceIteration()
// Try a few more of the less common sentence endings.
ADD_DATACHUNK(sentdata, "Hello, world\\u3002 ", 0, status);
ADD_DATACHUNK(sentdata, "Hello, world\\u037e ", 0, status);
// ADD_DATACHUNK(sentdata, "Hello, world\\u037e ", 0, status); // Greek Question Mark, omitted from TR29. TODO:
ADD_DATACHUNK(sentdata, "Hello, world\\u2048 ", 0, status);
ADD_DATACHUNK(sentdata, "Hello, world\\u203c ", 0, status);
ADD_DATACHUNK(sentdata, "Let's end here. ", 0, status);
generalIteratorTest(*sentIterDefault, sentdata);
delete sentIterDefault;
@ -2186,8 +2194,9 @@ void RBBITest::doOtherInvariantTest(BreakIterator& tb, UnicodeString& testChars)
tb.setText(work);
for (int k = tb.first(); k != BreakIterator::DONE; k = tb.next())
if (k == 2) {
errln("Break between U+" + UCharToUnicodeString(work[1])
+ " and U+" + UCharToUnicodeString(work[2]));
//errln("Break between U+" + UCharToUnicodeString(work[1])
// + " and U+" + UCharToUnicodeString(work[2]));
errln("Unexpected Break between %6x and %6x", c1, c2);
errCount++;
if (errCount >= 75)
return;