ICU-2231 RBBI Sentence Break Rules and test updated to match draft of TR 29

X-SVN-Rev: 9823
2025-04-13 08:53:20 +00:00 · 2002-08-30 21:37:59 +00:00 · 2002-08-30 21:37:59 +00:00 · 3144b2665e
commit 3144b2665e
parent bbc62da408
3 changed files with 103 additions and 95 deletions
--- a/icu4c/source/data/brkitr/sent.txt
+++ b/icu4c/source/data/brkitr/sent.txt
@ -1,80 +1,76 @@
-    # file: sent.txt         Sentence Boundary Rules.
-    #
+#
+#   Copyright (C) 2002, International Business Machines Corporation and others.
+#       All Rights Reserved.
+#
+#   file:  sent.txt   
+#
+#   ICU Sentence Break Rules
+#      See Unicode Technical Report #29.
+#      These rules are based on the proposed draft dated 2002-08-09
+#
    

-    # Separators are line or paragraph ends that will attach to the end of sentences.
-    $Sep    =[\n \r \u0085 \u2028 \u2029];
-    $SepSeq = $Sep | \u000d\u000a;
-    $Sp    = [[:Zs:] - $Sep];
-    
-    # $ATerm contains ambiguous terminators, characters that may or may not terminate 
-    #        sentence depending on the context.
-    # $Term  contains $ATerm + all characters that unambiguously end sentences.
-    #
-    $ATerm = [\u002e \u0589 \u3001];   # same as Terminal_Punctuation2 from TR29
-    $Term  = [$ATerm \u0021 \u003f \u037e \u061f \u06d4 \u203c \u203d
-			     \u3002 \u2048 \u2049
-			     \u0964];      # TODO:  these (this line) not yet decided in TR29.
-		
-    $Lower     = [[:Ll:] [:Sk:]];
-    $Upper     = [[:Lu:] [:Lt:]];
-    $NotLetter = [^[:L:] $Term];
-    $Open      = [:Ps:];
-    $Close     = [[:Pe:] \" \'];
-    
-    #
-    #  Combining chars.   Copied from UNIDATA/DerivedCoreProperties.txt
-    #
-    $Extend     = 
-    	[\u0300-\u034E \u0360-\u036F \u0483-\u0486 \u0488-\u0489 \u0591-\u05A1 \u05A3-\u05B9
-    	\u05BB-\u05BD \u05BF   \u05C1-\u05C2 \u05C4   \u064B-\u0655 \u0670   \u06D6-\u06DC
-    	\u06DE   \u06DF-\u06E4 \u06E7-\u06E8 \u06EA-\u06ED \u0711   \u0730-\u074A
-    	\u07A6-\u07B0 \u0901-\u0902 \u0903   \u093C   \u093E-\u0940 \u0941-\u0948
-    	\u0949-\u094C \u0951-\u0954 \u0962-\u0963 \u0981   \u0982-\u0983 \u09BC
-    	\u09BE-\u09C0 \u09C1-\u09C4 \u09C7-\u09C8 \u09CB-\u09CC \u09D7   \u09E2-\u09E3
-    	\u0A02   \u0A3C   \u0A3E-\u0A40 \u0A41-\u0A42 \u0A47-\u0A48 \u0A4B-\u0A4C
-    	\u0A70-\u0A71 \u0A81-\u0A82 \u0A83   \u0ABC   \u0ABE-\u0AC0 \u0AC1-\u0AC5
-    	\u0AC7-\u0AC8 \u0AC9   \u0ACB-\u0ACC \u0B01   \u0B02-\u0B03 \u0B3C   \u0B3E
-    	\u0B3F   \u0B40   \u0B41-\u0B43 \u0B47-\u0B48 \u0B4B-\u0B4C \u0B56   \u0B57
-    	\u0B82   \u0BBE-\u0BBF \u0BC0   \u0BC1-\u0BC2 \u0BC6-\u0BC8 \u0BCA-\u0BCC \u0BD7
-    	\u0C01-\u0C03 \u0C3E-\u0C40 \u0C41-\u0C44 \u0C46-\u0C48 \u0C4A-\u0C4C
-    	\u0C55-\u0C56 \u0C82-\u0C83 \u0CBE   \u0CBF   \u0CC0-\u0CC4 \u0CC6
-    	\u0CC7-\u0CC8 \u0CCA-\u0CCB \u0CCC   \u0CD5-\u0CD6 \u0D02-\u0D03 \u0D3E-\u0D40
-    	\u0D41-\u0D43 \u0D46-\u0D48 \u0D4A-\u0D4C \u0D57   \u0D82-\u0D83 \u0DCF-\u0DD1
-    	\u0DD2-\u0DD4 \u0DD6   \u0DD8-\u0DDF \u0DF2-\u0DF3 \u0E31   \u0E34-\u0E39
-    	\u0E47-\u0E4E \u0EB1   \u0EB4-\u0EB9 \u0EBB-\u0EBC \u0EC8-\u0ECD \u0F18-\u0F19
-    	\u0F35   \u0F37   \u0F39   \u0F3E-\u0F3F \u0F71-\u0F7E \u0F7F   \u0F80-\u0F84
-    	\u0F86-\u0F87 \u0F90-\u0F97 \u0F99-\u0FBC \u0FC6   \u102C   \u102D-\u1030 \u1031
-    	\u1032   \u1036-\u1037 \u1038   \u1056-\u1057 \u1058-\u1059 \u1712-\u1714
-    	\u1732-\u1734 \u1752-\u1753 \u1772-\u1773 \u17B4-\u17B6 \u17B7-\u17BD
-    	\u17BE-\u17C5 \u17C6   \u17C7-\u17C8 \u17C9-\u17D1 \u17D3   \u180B-\u180D
-    	\u18A9   \u20D0-\u20DC \u20DD-\u20E0 \u20E1   \u20E2-\u20E4 \u20E5-\u20EA
-    	\u302A-\u302F \u3099-\u309A \uFB1E   \uFE00-\uFE0F \uFE20-\uFE23 \uFF9E-\uFF9F
-    	\U0001D165-\U0001D166 \U0001D167-\U0001D169 \U0001D16D-\U0001D172 
-    	\U0001D17B-\U0001D182 \U0001D185-\U0001D18B \U0001D1AA-\U0001D1AD];
+#
+# Character categories as defined in TR 29
+#
+$Sep     = [\u000d \u000a \u0085 \u2028 \u2029];
+$Format  = [[:Cf:]];
+$Sp      = [[:Whitespace:] - $Sep];
+$Lower   = [[:Lowercase:]];
+$Upper   = [[:Lt:] [:Uppercase:]];
+$OLetter = [[:Alphabetic:] \u02b9-\u02ba  \u02c2-\u02cf  \u02d2-\u02df  \u02e5-\u02ed  \u05f3];
+
+                           #  The chars listed by number below are those with "Linebreak=QU"
+$Close   = [[:Pe:] [:Po:]  \u0022 \u0027 \u00AB \u00BB \u2018 \u2019 \u201B-\u201C 
+                           \u201D \u201F \u2039 \u203A \u23B6 \u275B-\u275E ];
+                           
+$ATerm = [\u002e];  
+$Term  = [\u0021 \u003F \u0589 \u061F \u06D4 \u0701 \u0702 \u0700 \u0964
+          \u1362 \u1367 \u1368 \u1803 \u203C \u203D \u2048 \u2049 \u3002
+          \uFE52 \uFE57 \uFF01 \uFF0E \uFF1F \uFF61];
+$AnyTerm = [$ATerm $Term];	
+
+# From Grapheme Cluster
+$Extend   = [[:Mn:] [:Me:] [:Mc:] \uff9e-\uff9f];   #  FF9E..FF9F    ; Other_Grapheme_Extend
+
+#
+#  $SepSeq keeps together CRLF as a separator.
+#
+$SepSeq  = $Sep | \u000d\u000a;
+
+# $InteriorChars are those that never cause a break.
+$InteriorChars = [^$AnyTerm $Sep];


-    $EndSequence       = [^$Term]* $Term ($Close | $Term | $Extend)* $Sp* $SepSeq?;
-    $LowerWordFollows  = [^$Term]* $ATerm $Close* $Sp* $SepSeq? $NotLetter* $Lower;
-    $UpperWordPrecedes = [^$Term]* $Upper ($Lower | $Extend)* $ATerm $Close* $Sp* $SepSeq?;

-    
-    ($LowerWordFollows | $UpperWordPrecedes)*  $EndSequence;
-    
-    #
-    # In cases where the input text ends without a normal end-of-sentence sequence,
-    #   this rule will match whatever text is there.
-    #
-    [^$Term]*;
+# Sentence Break Rules 8, 9, 11
+# $EndSequence matches a "Normal" sentence, which is one not containing any extra ATerms (periods)
+#              that do not cause a break for one exceptional reason or another.
+$EndSequence       = $InteriorChars* $AnyTerm? ($Close | $AnyTerm | $Format | $Extend)*
+                               ($AnyTerm | $Format | $Sp | $Extend)*  $SepSeq?;
+
+# Rule 6   Matches a sentence fragment containing "." that should not cause a sentence break,
+#          because a lower case word follows the period.
+$LowerWordFollows  = $InteriorChars* $ATerm [^$OLetter $Upper]* $Lower;
+
+
+# Rule 7.  $UpperFollowsImmediately
+#          Matches a fragment containing a "." that should not cause a sentence break
+#          because an uppercase letter follows the period with no intervening spaces.
+$UpperFollowsImmediately = $InteriorChars* $ATerm ($Format | $Extend)* $Upper;
+
+# Put them all together.  
+($LowerWordFollows |  $UpperFollowsImmediately)*  $EndSequence;
+
     
-     
-     #
-     #  Reverse Rules
-     #
-     $RevEndSequence           = [^$Term]* ($Term | $Close | $Extend)* [^$Term]*;
-     $ReverseLowerWordFollows  = $Lower ($Close | $Sp | $Sep | $Extend | $NotLetter)* $ATerm [^$Term]*;
-     $ReverseUpperWordPrecedes = $ATerm ($Lower | $Extend)* $Upper  [^$Term]*;
-     
-     ! $RevEndSequence? ($ReverseLowerWordFollows | $ReverseUpperWordPrecedes)* $Term?;
-     !.;
+#
+#  Reverse Rules
+#
+$EndGorp                  = ($AnyTerm | $Sep | $Close | $Extend | $Format | $Sp);
+$RevEndSequence           = $EndGorp* $InteriorChars* $EndGorp*;
+$ReverseLowerWordFollows  = $Lower [^$OLetter $Upper]* $ATerm $InteriorChars*;
+$ReverseUpperFollowsIm    = $Upper ($Format | $Extend)* $ATerm $InteriorChars*;
+
+! $RevEndSequence? ($ReverseLowerWordFollows | $ReverseUpperFollowsIm)* .?;
+#! .*;
 
--- a/icu4c/source/test/intltest/rbbiapts.cpp
+++ b/icu4c/source/test/intltest/rbbiapts.cpp
@ -339,6 +339,8 @@ void RBBIAPITest::TestFirstNextFollowing()

    status=U_ZERO_ERROR;
    testString="Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.";
+    //          0123456789012345678901234567890123456789012345678901234567890123 45678901234567890123456789
+    //          0         1         2         3         4         5         6          7         8
    RuleBasedBreakIterator* sentIter1=(RuleBasedBreakIterator*)RuleBasedBreakIterator::createSentenceInstance(Locale::getDefault(), status);
    if(U_FAILURE(status))
        errln("FAIL : in construction");
@ -357,10 +359,10 @@ void RBBIAPITest::TestFirstNextFollowing()
        q=sentIter1->next(-2);
        doTest(testString, p, q, 7, "how are you? I'am fine. ");
        p=q;
-        q=sentIter1->next(3);
+        q=sentIter1->next(4);
        doTest(testString, p, q, 60, "how are you? I'am fine. Thankyou. How are you doing? ");
        p=q; 
-        q=sentIter1->next();
+        q=sentIter1->next(2);
        doTest(testString, p, q, 83, "This\n costs $20,00,000.");
        q=sentIter1->following(1);
        doTest(testString, 1, q, 7, "ello! ");
@ -511,12 +513,13 @@ void RBBIAPITest::TestLastPreviousPreceding()
        if(p != testString.length() )
            errln((UnicodeString)"ERROR: last() returned" + p + (UnicodeString)"instead of " + testString.length());
        q=sentIter1->previous();
+        q=sentIter1->previous();
        doTest(testString, p, q, 60, "This\n costs $20,00,000.");
        p=q;
        q=sentIter1->previous();
-        doTest(testString, p, q, 31, "Thankyou. How are you doing? ");
-        // q=sentIter1->preceding(40);
-        // doTest(testString, 40, q, 31, "Thankyou.");
+        doTest(testString, p, q, 41, "How are you doing? ");
+        q=sentIter1->preceding(40);
+        doTest(testString, 40, q, 31, "Thankyou.");
        q=sentIter1->preceding(25);
        doTest(testString, 25, q, 20, "I'am "); 
        sentIter1->first();
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@ -143,13 +143,12 @@ void    BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx
    int32_t   expected = fExpectedBreakPositions.elementAti(expectedIdx);
    int32_t   actual   = fActualBreakPositions.elementAti(actualIdx);
    int32_t   o        = 0;
-    int32_t   line     = fLineNum.elementAti(0);
+    int32_t   line     = fLineNum.elementAti(expectedIdx);
    if (expectedIdx > 0) {
        // The line numbers are off by one because a premature break occurs somewhere
        //    within the previous item, rather than at the start of the current (expected) item.
-        //    Similarly, we want to report the offset of the unexpected break from the start of
+        //    We want to report the offset of the unexpected break from the start of
        //      this previous item.
-        line = fLineNum.elementAti(expectedIdx-1);
        o    = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
    }
    if (actual < expected) {
@ -591,7 +590,6 @@ void RBBITest::TestDefaultRuleBasedSentenceIteration()
      ADD_DATACHUNK(sentdata, "Testing the sentence iterator. ", 0, status);
      ADD_DATACHUNK(sentdata, "\"This isn\'t it.\" ", 0, status);
      ADD_DATACHUNK(sentdata, "Hi! ", 0, status);
-      //sentdata = new Vector();
      ADD_DATACHUNK(sentdata, "This is a simple sample sentence. ", 0, status);
      ADD_DATACHUNK(sentdata, "(This is it.) ", 0, status);
      ADD_DATACHUNK(sentdata, "This is a simple sample sentence. ", 0, status);
@ -609,7 +607,6 @@ void RBBITest::TestDefaultRuleBasedSentenceIteration()
      ADD_DATACHUNK(sentdata, "Not on my time (el timo.)! ", 0, status);

      ADD_DATACHUNK(sentdata, "So what!!\\u2029", 0, status);              // Paragraph Separator
-
      ADD_DATACHUNK(sentdata, "\"But now,\" he said, \"I know!\" ", 0, status);
      ADD_DATACHUNK(sentdata, "Harris thumbed down several, including \"Away We Go\" (which became the huge success Oklahoma!). ", 0, status);
      ADD_DATACHUNK(sentdata, "One species, B. anthracis, is highly virulent.\n", 0, status);
@ -621,7 +618,18 @@ void RBBITest::TestDefaultRuleBasedSentenceIteration()
      ADD_DATACHUNK(sentdata, "What is the proper use of the abbreviation pp.? ", 0, status);
      ADD_DATACHUNK(sentdata, "Yes, I am definatelly 12\" tall!!", 0, status);
      // test for bug #4113835: \n and \r count as spaces, not as paragraph breaks
-      ADD_DATACHUNK(sentdata, "Now\ris\nthe\r\ntime\n\rfor\r\rall\\u037e", 0, status);
+      //   And then, revised again for TR29.   \n and \r do count as paragraph breaks.
+      ADD_DATACHUNK(sentdata, "Now\r", 0, status);
+      ADD_DATACHUNK(sentdata, "is\n", 0, status);
+      ADD_DATACHUNK(sentdata, "the\r\n", 0, status);
+      ADD_DATACHUNK(sentdata, "time\n", 0, status);
+      ADD_DATACHUNK(sentdata, "\r", 0, status);
+      ADD_DATACHUNK(sentdata, "for\r", 0, status);
+      ADD_DATACHUNK(sentdata, "\r", 0, status);
+     // ADD_DATACHUNK(sentdata, "all\\u037e", 0, status);  TODO:  Greek question mark
+      //                                                           Why isn't it a sentence ender?
+
+      ADD_DATACHUNK(sentdata, "No breaks when . is followed .Immediately by an .Upper case Letter.  ", 0, status);

    // test that it doesn't break sentences at the boundary between CJK
    // and other letters
@ -638,21 +646,19 @@ void RBBITest::TestDefaultRuleBasedSentenceIteration()

      // Treat fullwidth variants of .!? the same as their
      // normal counterparts
-#if 0   // Not according to TR29.  TODO:  what is the right thing for these chars?
      ADD_DATACHUNK(sentdata, "I know I'm right\\uff0e ", 0, status);
      ADD_DATACHUNK(sentdata, "Right\\uff1f ", 0, status);
      ADD_DATACHUNK(sentdata, "Right\\uff01 ", 0, status);
-#endif

      // Don't break sentences at boundary between CJK and digits
      ADD_DATACHUNK(sentdata, "\\u5487\\u67ff\\ue591\\u5017\\u61b3\\u60a1\\u9510\\u8165\\u9de8"
                   "\\u97e48888\\u821c\\u8165\\u7fc8\\u51ce\\u306d\\ue30b\\u2494\\u56d8\\u4ec0"
-                   "\\u60b1\\u8560\\u51ba\\u611d\\u57b6\\u2510\\u5d46\\u97e5\\u7751\\u3001", 0, status);
+                   "\\u60b1\\u8560\\u51ba\\u611d\\u57b6\\u2510\\u5d46\\u97e5\\u7751\\u3002", 0, status);

      // Break sentence between a sentence terminator and
      // opening punctuation
      ADD_DATACHUNK(sentdata, "How do you do?", 0, status);
-      ADD_DATACHUNK(sentdata, "(fine).", 0, status);
+      ADD_DATACHUNK(sentdata, "(fine). ", 0, status);

      // test for bug #4158381: Don't break sentence after period if it isn't
      // followed by a space
@ -675,7 +681,8 @@ void RBBITest::TestDefaultRuleBasedSentenceIteration()
      // letter are treated correctly
      // Unicode TR29 reverses above bug:  Don't break a sentence if the last word begins with an upper case letter.
      ADD_DATACHUNK(sentdata, "The type of all primitive <code>boolean</code> values accessed in the "            
-          "target VM.  Calls to xxx will return an implementor of this interface.  \\u2029", 0, status);
+          "target VM.  ", 0, status);
+      ADD_DATACHUNK(sentdata, "Calls to xxx will return an implementor of this interface.  \\u2029", 0, status);
      
      // test for bug #4152117: Make sure sentence breaking is handling
      // punctuation correctly [COULD NOT REPRODUCE THIS BUG, BUT TEST IS
@ -697,8 +704,10 @@ void RBBITest::TestDefaultRuleBasedSentenceIteration()
                                    "\\u0939\\u0948?", 0, status);
      ADD_DATACHUNK(sentdata,
              "\\u092e\\u0948 \\u0905"  halfCHA "\\u091b\\u093e \\u0939\\u0942\\u0901\\u0964 ", 0, status);
-      ADD_DATACHUNK(sentdata, "\\u0905\\u093e\\u092a\r\n \\u0915\\u0948\\u0938\\u0947 \\u0939\\u0948?", 0, status);
-      ADD_DATACHUNK(sentdata, "\\u0935\\u0939 " halfKA "\\u092f\\u093e\n \\u0939\\u0948?", 0, status);
+      ADD_DATACHUNK(sentdata, "\\u0905\\u093e\\u092a\r\n", 0, status);
+      ADD_DATACHUNK(sentdata, "\\u0915\\u0948\\u0938\\u0947 \\u0939\\u0948?", 0, status);
+      ADD_DATACHUNK(sentdata, "\\u0935\\u0939 " halfKA "\\u092f\\u093e\n", 0, status);
+      ADD_DATACHUNK(sentdata, "\\u0939\\u0948?", 0, status);
      ADD_DATACHUNK(sentdata, "\\u092f\\u0939 \\u0905\\u093e\\u092e \\u0939\\u0948. ", 0, status);
      ADD_DATACHUNK(sentdata, "\\u092f\\u0939 means \"this\". ", 0, status);
      ADD_DATACHUNK(sentdata, "\"\\u092a\\u095d\\u093e\\u0908\" meaning \"education\" or \"studies\". ", 0, status);
@ -734,12 +743,11 @@ void RBBITest::TestDefaultRuleBasedSentenceIteration()

      // Try a few more of the less common sentence endings.
      ADD_DATACHUNK(sentdata, "Hello, world\\u3002 ", 0, status);
-      ADD_DATACHUNK(sentdata, "Hello, world\\u037e ", 0, status);  
+      // ADD_DATACHUNK(sentdata, "Hello, world\\u037e ", 0, status);  // Greek Question Mark, omitted from TR29.  TODO:
      ADD_DATACHUNK(sentdata, "Hello, world\\u2048 ", 0, status);
      ADD_DATACHUNK(sentdata, "Hello, world\\u203c ", 0, status);
      ADD_DATACHUNK(sentdata, "Let's end here. ", 0, status);

-
      generalIteratorTest(*sentIterDefault, sentdata);

      delete sentIterDefault;
@ -2186,8 +2194,9 @@ void RBBITest::doOtherInvariantTest(BreakIterator& tb, UnicodeString& testChars)
            tb.setText(work);
            for (int k = tb.first(); k != BreakIterator::DONE; k = tb.next())
                if (k == 2) {
-                    errln("Break between U+" + UCharToUnicodeString(work[1])
-                            + " and U+" + UCharToUnicodeString(work[2]));
+                    //errln("Break between U+" + UCharToUnicodeString(work[1])
+                    //        + " and U+" + UCharToUnicodeString(work[2]));
+                    errln("Unexpected Break between %6x and %6x", c1, c2);
                    errCount++;
                    if (errCount >= 75)
                        return;