mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-13 08:53:20 +00:00
ICU-2231 RBBI Sentence Break Rules and test updated to match draft of TR 29
X-SVN-Rev: 9823
This commit is contained in:
parent
bbc62da408
commit
3144b2665e
3 changed files with 103 additions and 95 deletions
|
@ -1,80 +1,76 @@
|
|||
# file: sent.txt Sentence Boundary Rules.
|
||||
#
|
||||
#
|
||||
# Copyright (C) 2002, International Business Machines Corporation and others.
|
||||
# All Rights Reserved.
|
||||
#
|
||||
# file: sent.txt
|
||||
#
|
||||
# ICU Sentence Break Rules
|
||||
# See Unicode Technical Report #29.
|
||||
# These rules are based on the proposed draft dated 2002-08-09
|
||||
#
|
||||
|
||||
|
||||
# Separators are line or paragraph ends that will attach to the end of sentences.
|
||||
$Sep =[\n \r \u0085 \u2028 \u2029];
|
||||
$SepSeq = $Sep | \u000d\u000a;
|
||||
$Sp = [[:Zs:] - $Sep];
|
||||
|
||||
# $ATerm contains ambiguous terminators, characters that may or may not terminate
|
||||
# sentence depending on the context.
|
||||
# $Term contains $ATerm + all characters that unambiguously end sentences.
|
||||
#
|
||||
$ATerm = [\u002e \u0589 \u3001]; # same as Terminal_Punctuation2 from TR29
|
||||
$Term = [$ATerm \u0021 \u003f \u037e \u061f \u06d4 \u203c \u203d
|
||||
\u3002 \u2048 \u2049
|
||||
\u0964]; # TODO: these (this line) not yet decided in TR29.
|
||||
|
||||
$Lower = [[:Ll:] [:Sk:]];
|
||||
$Upper = [[:Lu:] [:Lt:]];
|
||||
$NotLetter = [^[:L:] $Term];
|
||||
$Open = [:Ps:];
|
||||
$Close = [[:Pe:] \" \'];
|
||||
|
||||
#
|
||||
# Combining chars. Copied from UNIDATA/DerivedCoreProperties.txt
|
||||
#
|
||||
$Extend =
|
||||
[\u0300-\u034E \u0360-\u036F \u0483-\u0486 \u0488-\u0489 \u0591-\u05A1 \u05A3-\u05B9
|
||||
\u05BB-\u05BD \u05BF \u05C1-\u05C2 \u05C4 \u064B-\u0655 \u0670 \u06D6-\u06DC
|
||||
\u06DE \u06DF-\u06E4 \u06E7-\u06E8 \u06EA-\u06ED \u0711 \u0730-\u074A
|
||||
\u07A6-\u07B0 \u0901-\u0902 \u0903 \u093C \u093E-\u0940 \u0941-\u0948
|
||||
\u0949-\u094C \u0951-\u0954 \u0962-\u0963 \u0981 \u0982-\u0983 \u09BC
|
||||
\u09BE-\u09C0 \u09C1-\u09C4 \u09C7-\u09C8 \u09CB-\u09CC \u09D7 \u09E2-\u09E3
|
||||
\u0A02 \u0A3C \u0A3E-\u0A40 \u0A41-\u0A42 \u0A47-\u0A48 \u0A4B-\u0A4C
|
||||
\u0A70-\u0A71 \u0A81-\u0A82 \u0A83 \u0ABC \u0ABE-\u0AC0 \u0AC1-\u0AC5
|
||||
\u0AC7-\u0AC8 \u0AC9 \u0ACB-\u0ACC \u0B01 \u0B02-\u0B03 \u0B3C \u0B3E
|
||||
\u0B3F \u0B40 \u0B41-\u0B43 \u0B47-\u0B48 \u0B4B-\u0B4C \u0B56 \u0B57
|
||||
\u0B82 \u0BBE-\u0BBF \u0BC0 \u0BC1-\u0BC2 \u0BC6-\u0BC8 \u0BCA-\u0BCC \u0BD7
|
||||
\u0C01-\u0C03 \u0C3E-\u0C40 \u0C41-\u0C44 \u0C46-\u0C48 \u0C4A-\u0C4C
|
||||
\u0C55-\u0C56 \u0C82-\u0C83 \u0CBE \u0CBF \u0CC0-\u0CC4 \u0CC6
|
||||
\u0CC7-\u0CC8 \u0CCA-\u0CCB \u0CCC \u0CD5-\u0CD6 \u0D02-\u0D03 \u0D3E-\u0D40
|
||||
\u0D41-\u0D43 \u0D46-\u0D48 \u0D4A-\u0D4C \u0D57 \u0D82-\u0D83 \u0DCF-\u0DD1
|
||||
\u0DD2-\u0DD4 \u0DD6 \u0DD8-\u0DDF \u0DF2-\u0DF3 \u0E31 \u0E34-\u0E39
|
||||
\u0E47-\u0E4E \u0EB1 \u0EB4-\u0EB9 \u0EBB-\u0EBC \u0EC8-\u0ECD \u0F18-\u0F19
|
||||
\u0F35 \u0F37 \u0F39 \u0F3E-\u0F3F \u0F71-\u0F7E \u0F7F \u0F80-\u0F84
|
||||
\u0F86-\u0F87 \u0F90-\u0F97 \u0F99-\u0FBC \u0FC6 \u102C \u102D-\u1030 \u1031
|
||||
\u1032 \u1036-\u1037 \u1038 \u1056-\u1057 \u1058-\u1059 \u1712-\u1714
|
||||
\u1732-\u1734 \u1752-\u1753 \u1772-\u1773 \u17B4-\u17B6 \u17B7-\u17BD
|
||||
\u17BE-\u17C5 \u17C6 \u17C7-\u17C8 \u17C9-\u17D1 \u17D3 \u180B-\u180D
|
||||
\u18A9 \u20D0-\u20DC \u20DD-\u20E0 \u20E1 \u20E2-\u20E4 \u20E5-\u20EA
|
||||
\u302A-\u302F \u3099-\u309A \uFB1E \uFE00-\uFE0F \uFE20-\uFE23 \uFF9E-\uFF9F
|
||||
\U0001D165-\U0001D166 \U0001D167-\U0001D169 \U0001D16D-\U0001D172
|
||||
\U0001D17B-\U0001D182 \U0001D185-\U0001D18B \U0001D1AA-\U0001D1AD];
|
||||
#
|
||||
# Character categories as defined in TR 29
|
||||
#
|
||||
$Sep = [\u000d \u000a \u0085 \u2028 \u2029];
|
||||
$Format = [[:Cf:]];
|
||||
$Sp = [[:Whitespace:] - $Sep];
|
||||
$Lower = [[:Lowercase:]];
|
||||
$Upper = [[:Lt:] [:Uppercase:]];
|
||||
$OLetter = [[:Alphabetic:] \u02b9-\u02ba \u02c2-\u02cf \u02d2-\u02df \u02e5-\u02ed \u05f3];
|
||||
|
||||
# The chars listed by number below are those with "Linebreak=QU"
|
||||
$Close = [[:Pe:] [:Po:] \u0022 \u0027 \u00AB \u00BB \u2018 \u2019 \u201B-\u201C
|
||||
\u201D \u201F \u2039 \u203A \u23B6 \u275B-\u275E ];
|
||||
|
||||
$ATerm = [\u002e];
|
||||
$Term = [\u0021 \u003F \u0589 \u061F \u06D4 \u0701 \u0702 \u0700 \u0964
|
||||
\u1362 \u1367 \u1368 \u1803 \u203C \u203D \u2048 \u2049 \u3002
|
||||
\uFE52 \uFE57 \uFF01 \uFF0E \uFF1F \uFF61];
|
||||
$AnyTerm = [$ATerm $Term];
|
||||
|
||||
# From Grapheme Cluster
|
||||
$Extend = [[:Mn:] [:Me:] [:Mc:] \uff9e-\uff9f]; # FF9E..FF9F ; Other_Grapheme_Extend
|
||||
|
||||
#
|
||||
# $SepSeq keeps together CRLF as a separator.
|
||||
#
|
||||
$SepSeq = $Sep | \u000d\u000a;
|
||||
|
||||
# $InteriorChars are those that never cause a break.
|
||||
$InteriorChars = [^$AnyTerm $Sep];
|
||||
|
||||
|
||||
$EndSequence = [^$Term]* $Term ($Close | $Term | $Extend)* $Sp* $SepSeq?;
|
||||
$LowerWordFollows = [^$Term]* $ATerm $Close* $Sp* $SepSeq? $NotLetter* $Lower;
|
||||
$UpperWordPrecedes = [^$Term]* $Upper ($Lower | $Extend)* $ATerm $Close* $Sp* $SepSeq?;
|
||||
|
||||
|
||||
($LowerWordFollows | $UpperWordPrecedes)* $EndSequence;
|
||||
|
||||
#
|
||||
# In cases where the input text ends without a normal end-of-sentence sequence,
|
||||
# this rule will match whatever text is there.
|
||||
#
|
||||
[^$Term]*;
|
||||
# Sentence Break Rules 8, 9, 11
|
||||
# $EndSequence matches a "Normal" sentence, which is one not containing any extra ATerms (periods)
|
||||
# that do not cause a break for one exceptional reason or another.
|
||||
$EndSequence = $InteriorChars* $AnyTerm? ($Close | $AnyTerm | $Format | $Extend)*
|
||||
($AnyTerm | $Format | $Sp | $Extend)* $SepSeq?;
|
||||
|
||||
# Rule 6. Matches a sentence fragment containing "." that should not cause a sentence break,
|
||||
# because a lower case word follows the period.
|
||||
$LowerWordFollows = $InteriorChars* $ATerm [^$OLetter $Upper]* $Lower;
|
||||
|
||||
|
||||
# Rule 7. $UpperFollowsImmediately
|
||||
# Matches a fragment containing a "." that should not cause a sentence break
|
||||
# because an uppercase letter follows the period with no intervening spaces.
|
||||
$UpperFollowsImmediately = $InteriorChars* $ATerm ($Format | $Extend)* $Upper;
|
||||
|
||||
# Put them all together.
|
||||
($LowerWordFollows | $UpperFollowsImmediately)* $EndSequence;
|
||||
|
||||
|
||||
|
||||
#
|
||||
# Reverse Rules
|
||||
#
|
||||
$RevEndSequence = [^$Term]* ($Term | $Close | $Extend)* [^$Term]*;
|
||||
$ReverseLowerWordFollows = $Lower ($Close | $Sp | $Sep | $Extend | $NotLetter)* $ATerm [^$Term]*;
|
||||
$ReverseUpperWordPrecedes = $ATerm ($Lower | $Extend)* $Upper [^$Term]*;
|
||||
|
||||
! $RevEndSequence? ($ReverseLowerWordFollows | $ReverseUpperWordPrecedes)* $Term?;
|
||||
!.;
|
||||
#
|
||||
# Reverse Rules
|
||||
#
|
||||
$EndGorp = ($AnyTerm | $Sep | $Close | $Extend | $Format | $Sp);
|
||||
$RevEndSequence = $EndGorp* $InteriorChars* $EndGorp*;
|
||||
$ReverseLowerWordFollows = $Lower [^$OLetter $Upper]* $ATerm $InteriorChars*;
|
||||
$ReverseUpperFollowsIm = $Upper ($Format | $Extend)* $ATerm $InteriorChars*;
|
||||
|
||||
! $RevEndSequence? ($ReverseLowerWordFollows | $ReverseUpperFollowsIm)* .?;
|
||||
#! .*;
|
||||
|
||||
|
|
|
@ -339,6 +339,8 @@ void RBBIAPITest::TestFirstNextFollowing()
|
|||
|
||||
status=U_ZERO_ERROR;
|
||||
testString="Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.";
|
||||
// 0123456789012345678901234567890123456789012345678901234567890123 45678901234567890123456789
|
||||
// 0 1 2 3 4 5 6 7 8
|
||||
RuleBasedBreakIterator* sentIter1=(RuleBasedBreakIterator*)RuleBasedBreakIterator::createSentenceInstance(Locale::getDefault(), status);
|
||||
if(U_FAILURE(status))
|
||||
errln("FAIL : in construction");
|
||||
|
@ -357,10 +359,10 @@ void RBBIAPITest::TestFirstNextFollowing()
|
|||
q=sentIter1->next(-2);
|
||||
doTest(testString, p, q, 7, "how are you? I'am fine. ");
|
||||
p=q;
|
||||
q=sentIter1->next(3);
|
||||
q=sentIter1->next(4);
|
||||
doTest(testString, p, q, 60, "how are you? I'am fine. Thankyou. How are you doing? ");
|
||||
p=q;
|
||||
q=sentIter1->next();
|
||||
q=sentIter1->next(2);
|
||||
doTest(testString, p, q, 83, "This\n costs $20,00,000.");
|
||||
q=sentIter1->following(1);
|
||||
doTest(testString, 1, q, 7, "ello! ");
|
||||
|
@ -511,12 +513,13 @@ void RBBIAPITest::TestLastPreviousPreceding()
|
|||
if(p != testString.length() )
|
||||
errln((UnicodeString)"ERROR: last() returned" + p + (UnicodeString)"instead of " + testString.length());
|
||||
q=sentIter1->previous();
|
||||
q=sentIter1->previous();
|
||||
doTest(testString, p, q, 60, "This\n costs $20,00,000.");
|
||||
p=q;
|
||||
q=sentIter1->previous();
|
||||
doTest(testString, p, q, 31, "Thankyou. How are you doing? ");
|
||||
// q=sentIter1->preceding(40);
|
||||
// doTest(testString, 40, q, 31, "Thankyou.");
|
||||
doTest(testString, p, q, 41, "How are you doing? ");
|
||||
q=sentIter1->preceding(40);
|
||||
doTest(testString, 40, q, 31, "Thankyou.");
|
||||
q=sentIter1->preceding(25);
|
||||
doTest(testString, 25, q, 20, "I'am ");
|
||||
sentIter1->first();
|
||||
|
|
|
@ -143,13 +143,12 @@ void BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx
|
|||
int32_t expected = fExpectedBreakPositions.elementAti(expectedIdx);
|
||||
int32_t actual = fActualBreakPositions.elementAti(actualIdx);
|
||||
int32_t o = 0;
|
||||
int32_t line = fLineNum.elementAti(0);
|
||||
int32_t line = fLineNum.elementAti(expectedIdx);
|
||||
if (expectedIdx > 0) {
|
||||
// The line numbers are off by one because a premature break occurs somewhere
|
||||
// within the previous item, rather than at the start of the current (expected) item.
|
||||
// Similarly, we want to report the offset of the unexpected break from the start of
|
||||
// We want to report the offset of the unexpected break from the start of
|
||||
// this previous item.
|
||||
line = fLineNum.elementAti(expectedIdx-1);
|
||||
o = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
|
||||
}
|
||||
if (actual < expected) {
|
||||
|
@ -591,7 +590,6 @@ void RBBITest::TestDefaultRuleBasedSentenceIteration()
|
|||
ADD_DATACHUNK(sentdata, "Testing the sentence iterator. ", 0, status);
|
||||
ADD_DATACHUNK(sentdata, "\"This isn\'t it.\" ", 0, status);
|
||||
ADD_DATACHUNK(sentdata, "Hi! ", 0, status);
|
||||
//sentdata = new Vector();
|
||||
ADD_DATACHUNK(sentdata, "This is a simple sample sentence. ", 0, status);
|
||||
ADD_DATACHUNK(sentdata, "(This is it.) ", 0, status);
|
||||
ADD_DATACHUNK(sentdata, "This is a simple sample sentence. ", 0, status);
|
||||
|
@ -609,7 +607,6 @@ void RBBITest::TestDefaultRuleBasedSentenceIteration()
|
|||
ADD_DATACHUNK(sentdata, "Not on my time (el timo.)! ", 0, status);
|
||||
|
||||
ADD_DATACHUNK(sentdata, "So what!!\\u2029", 0, status); // Paragraph Separator
|
||||
|
||||
ADD_DATACHUNK(sentdata, "\"But now,\" he said, \"I know!\" ", 0, status);
|
||||
ADD_DATACHUNK(sentdata, "Harris thumbed down several, including \"Away We Go\" (which became the huge success Oklahoma!). ", 0, status);
|
||||
ADD_DATACHUNK(sentdata, "One species, B. anthracis, is highly virulent.\n", 0, status);
|
||||
|
@ -621,7 +618,18 @@ void RBBITest::TestDefaultRuleBasedSentenceIteration()
|
|||
ADD_DATACHUNK(sentdata, "What is the proper use of the abbreviation pp.? ", 0, status);
|
||||
ADD_DATACHUNK(sentdata, "Yes, I am definatelly 12\" tall!!", 0, status);
|
||||
// test for bug #4113835: \n and \r count as spaces, not as paragraph breaks
|
||||
ADD_DATACHUNK(sentdata, "Now\ris\nthe\r\ntime\n\rfor\r\rall\\u037e", 0, status);
|
||||
// And then, revised again for TR29. \n and \r do count as paragraph breaks.
|
||||
ADD_DATACHUNK(sentdata, "Now\r", 0, status);
|
||||
ADD_DATACHUNK(sentdata, "is\n", 0, status);
|
||||
ADD_DATACHUNK(sentdata, "the\r\n", 0, status);
|
||||
ADD_DATACHUNK(sentdata, "time\n", 0, status);
|
||||
ADD_DATACHUNK(sentdata, "\r", 0, status);
|
||||
ADD_DATACHUNK(sentdata, "for\r", 0, status);
|
||||
ADD_DATACHUNK(sentdata, "\r", 0, status);
|
||||
// ADD_DATACHUNK(sentdata, "all\\u037e", 0, status); TODO: Greek question mark
|
||||
// Why isn't it a sentence ender?
|
||||
|
||||
ADD_DATACHUNK(sentdata, "No breaks when . is followed .Immediately by an .Upper case Letter. ", 0, status);
|
||||
|
||||
// test that it doesn't break sentences at the boundary between CJK
|
||||
// and other letters
|
||||
|
@ -638,21 +646,19 @@ void RBBITest::TestDefaultRuleBasedSentenceIteration()
|
|||
|
||||
// Treat fullwidth variants of .!? the same as their
|
||||
// normal counterparts
|
||||
#if 0 // Not according to TR29. TODO: what is the right thing for these chars?
|
||||
ADD_DATACHUNK(sentdata, "I know I'm right\\uff0e ", 0, status);
|
||||
ADD_DATACHUNK(sentdata, "Right\\uff1f ", 0, status);
|
||||
ADD_DATACHUNK(sentdata, "Right\\uff01 ", 0, status);
|
||||
#endif
|
||||
|
||||
// Don't break sentences at boundary between CJK and digits
|
||||
ADD_DATACHUNK(sentdata, "\\u5487\\u67ff\\ue591\\u5017\\u61b3\\u60a1\\u9510\\u8165\\u9de8"
|
||||
"\\u97e48888\\u821c\\u8165\\u7fc8\\u51ce\\u306d\\ue30b\\u2494\\u56d8\\u4ec0"
|
||||
"\\u60b1\\u8560\\u51ba\\u611d\\u57b6\\u2510\\u5d46\\u97e5\\u7751\\u3001", 0, status);
|
||||
"\\u60b1\\u8560\\u51ba\\u611d\\u57b6\\u2510\\u5d46\\u97e5\\u7751\\u3002", 0, status);
|
||||
|
||||
// Break sentence between a sentence terminator and
|
||||
// opening punctuation
|
||||
ADD_DATACHUNK(sentdata, "How do you do?", 0, status);
|
||||
ADD_DATACHUNK(sentdata, "(fine).", 0, status);
|
||||
ADD_DATACHUNK(sentdata, "(fine). ", 0, status);
|
||||
|
||||
// test for bug #4158381: Don't break sentence after period if it isn't
|
||||
// followed by a space
|
||||
|
@ -675,7 +681,8 @@ void RBBITest::TestDefaultRuleBasedSentenceIteration()
|
|||
// letter are treated correctly
|
||||
// Unicode TR29 reverses above bug: Don't break a sentence if the last word begins with an upper case letter.
|
||||
ADD_DATACHUNK(sentdata, "The type of all primitive <code>boolean</code> values accessed in the "
|
||||
"target VM. Calls to xxx will return an implementor of this interface. \\u2029", 0, status);
|
||||
"target VM. ", 0, status);
|
||||
ADD_DATACHUNK(sentdata, "Calls to xxx will return an implementor of this interface. \\u2029", 0, status);
|
||||
|
||||
// test for bug #4152117: Make sure sentence breaking is handling
|
||||
// punctuation correctly [COULD NOT REPRODUCE THIS BUG, BUT TEST IS
|
||||
|
@ -697,8 +704,10 @@ void RBBITest::TestDefaultRuleBasedSentenceIteration()
|
|||
"\\u0939\\u0948?", 0, status);
|
||||
ADD_DATACHUNK(sentdata,
|
||||
"\\u092e\\u0948 \\u0905" halfCHA "\\u091b\\u093e \\u0939\\u0942\\u0901\\u0964 ", 0, status);
|
||||
ADD_DATACHUNK(sentdata, "\\u0905\\u093e\\u092a\r\n \\u0915\\u0948\\u0938\\u0947 \\u0939\\u0948?", 0, status);
|
||||
ADD_DATACHUNK(sentdata, "\\u0935\\u0939 " halfKA "\\u092f\\u093e\n \\u0939\\u0948?", 0, status);
|
||||
ADD_DATACHUNK(sentdata, "\\u0905\\u093e\\u092a\r\n", 0, status);
|
||||
ADD_DATACHUNK(sentdata, "\\u0915\\u0948\\u0938\\u0947 \\u0939\\u0948?", 0, status);
|
||||
ADD_DATACHUNK(sentdata, "\\u0935\\u0939 " halfKA "\\u092f\\u093e\n", 0, status);
|
||||
ADD_DATACHUNK(sentdata, "\\u0939\\u0948?", 0, status);
|
||||
ADD_DATACHUNK(sentdata, "\\u092f\\u0939 \\u0905\\u093e\\u092e \\u0939\\u0948. ", 0, status);
|
||||
ADD_DATACHUNK(sentdata, "\\u092f\\u0939 means \"this\". ", 0, status);
|
||||
ADD_DATACHUNK(sentdata, "\"\\u092a\\u095d\\u093e\\u0908\" meaning \"education\" or \"studies\". ", 0, status);
|
||||
|
@ -734,12 +743,11 @@ void RBBITest::TestDefaultRuleBasedSentenceIteration()
|
|||
|
||||
// Try a few more of the less common sentence endings.
|
||||
ADD_DATACHUNK(sentdata, "Hello, world\\u3002 ", 0, status);
|
||||
ADD_DATACHUNK(sentdata, "Hello, world\\u037e ", 0, status);
|
||||
// ADD_DATACHUNK(sentdata, "Hello, world\\u037e ", 0, status); // Greek Question Mark, omitted from TR29. TODO:
|
||||
ADD_DATACHUNK(sentdata, "Hello, world\\u2048 ", 0, status);
|
||||
ADD_DATACHUNK(sentdata, "Hello, world\\u203c ", 0, status);
|
||||
ADD_DATACHUNK(sentdata, "Let's end here. ", 0, status);
|
||||
|
||||
|
||||
generalIteratorTest(*sentIterDefault, sentdata);
|
||||
|
||||
delete sentIterDefault;
|
||||
|
@ -2186,8 +2194,9 @@ void RBBITest::doOtherInvariantTest(BreakIterator& tb, UnicodeString& testChars)
|
|||
tb.setText(work);
|
||||
for (int k = tb.first(); k != BreakIterator::DONE; k = tb.next())
|
||||
if (k == 2) {
|
||||
errln("Break between U+" + UCharToUnicodeString(work[1])
|
||||
+ " and U+" + UCharToUnicodeString(work[2]));
|
||||
//errln("Break between U+" + UCharToUnicodeString(work[1])
|
||||
// + " and U+" + UCharToUnicodeString(work[2]));
|
||||
errln("Unexpected Break between %6x and %6x", c1, c2);
|
||||
errCount++;
|
||||
if (errCount >= 75)
|
||||
return;
|
||||
|
|
Loading…
Add table
Reference in a new issue