ICU-7151 ICU4J RBBI Line break tests updated for Unicode 5.2

X-SVN-Rev: 26639
2025-04-14 17:24:01 +00:00 · 2009-09-16 21:56:30 +00:00 · 2009-09-16 21:56:30 +00:00 · 5e28363a15
commit 5e28363a15
parent 1cc5cd1a8b
4 changed files with 85 additions and 25 deletions
--- a/icu4j/main/shared/data/icudata.jar
+++ b/icu4j/main/shared/data/icudata.jar
@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:91a5fa298946e4adc34f127284de8aef86b89a4b4353fc27dcd02303ca61b20d
-size 6408595
+oid sha256:569abd84cb1c912c72e1e86d6d82a5b42c3ba3037445495bb0c56738413c6992
+size 6408913
--- a/icu4j/main/shared/data/testdata.jar
+++ b/icu4j/main/shared/data/testdata.jar
@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5321a103d09e461c153c530e0bf3ab9ea456a51c68bb73109731ad87b7c26fbc
+oid sha256:eb381ea2266664ace19cac6ca554bbe6ff7623d2d9bb1bd9e30707990f992c38
 size 717057
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java
@ -1,6 +1,6 @@
 /*
 *******************************************************************************
- * Copyright (C) 2003-2008 International Business Machines Corporation and     *
+ * Copyright (C) 2003-2009 International Business Machines Corporation and     *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
@ -470,6 +470,8 @@ public class RBBITestMonkey extends TestFmwk {
    }

 
+    // TODO:  for class fCP, fCL: when Unicode 5.2 properties become available, change the definitions of
+    //        these classes to use them.
    static class RBBILineMonkey extends RBBIMonkeyKind {
        
        List        fSets;
@ -490,6 +492,7 @@ public class RBBITestMonkey extends TestFmwk {
        UnicodeSet  fBB;
        UnicodeSet  fHY;
        UnicodeSet  fCL;
+        UnicodeSet  fCP;
        UnicodeSet  fEX;
        UnicodeSet  fIN;
        UnicodeSet  fNS;
@ -535,7 +538,9 @@ public class RBBITestMonkey extends TestFmwk {
            fBA    = new UnicodeSet("[\\p{Line_break=BA}]");
            fBB    = new UnicodeSet("[\\p{Line_break=BB}]");
            fHY    = new UnicodeSet("[\\p{Line_break=HY}]");
-            fCL    = new UnicodeSet("[\\p{Line_break=CL}]");
+            fCL    = new UnicodeSet("[[\\p{Line_break=CL}]-[\\u0029\\u005d]]");
+            fCP    = new UnicodeSet("[\\u0029\\u005d]");
+            // fCP    = new UnicodeSet("[\\p{Line_break=CP}]");
            fEX    = new UnicodeSet("[\\p{Line_break=EX}]");
            fIN    = new UnicodeSet("[\\p{Line_break=IN}]");
            fNS    = new UnicodeSet("[\\p{Line_break=NS}]");
@ -583,6 +588,7 @@ public class RBBITestMonkey extends TestFmwk {
            fSets.add(fH2);
            fSets.add(fH3);
            fSets.add(fCL);
+            fSets.add(fCP);
            fSets.add(fEX);
            fSets.add(fIN);
            fSets.add(fJL);
@ -763,13 +769,14 @@ public class RBBITestMonkey extends TestFmwk {
                
                
                // LB 13  Don't break before closings.
-                //       NU x CL  and NU x IS are not matched here so that they will
+                //       NU x CL, NU x CP, NU x IS and NU x SY are not matched here so that they will
                //       fall into LB 17 and the more general number regular expression.
                //
                if (!fNU.contains(prevChar) && fCL.contains(thisChar) ||
-                        fEX.contains(thisChar) ||
-                        !fNU.contains(prevChar) && fIS.contains(thisChar) ||
-                        !fNU.contains(prevChar) && fSY.contains(thisChar))    {
+                    !fNU.contains(prevChar) && fCP.contains(thisChar) ||
+                                               fEX.contains(thisChar) ||
+                    !fNU.contains(prevChar) && fIS.contains(thisChar) ||
+                    !fNU.contains(prevChar) && fSY.contains(thisChar))    {
                    continue;
                }
                
@ -806,7 +813,7 @@ public class RBBITestMonkey extends TestFmwk {
                    }
                }               
                
-                // LB 16   CL SP* x NS
+                // LB 16   (CL | CP) SP* x NS
                if (fNS.contains(thisChar)) {
                    tPos = prevPos;
                    while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
@ -815,7 +822,7 @@ public class RBBITestMonkey extends TestFmwk {
                    while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
                        tPos = moveIndex32(fText, tPos, -1);
                    }
-                    if (fCL.contains(UTF16.charAt(fText, tPos))) {
+                    if (fCL.contains(UTF16.charAt(fText, tPos)) || fCP.contains(UTF16.charAt(fText, tPos))) {
                        continue;
                    }
                }               
@ -960,7 +967,16 @@ public class RBBITestMonkey extends TestFmwk {
                    continue;
                }
                
-                // LB 30  (Withdrawn as of Unicode 5.1)
+                // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
+                //          (AL | NU) x OP
+                //          CP x (AL | NU)
+                if ((fAL.contains(prevChar) || fNU.contains(prevChar)) && fOP.contains(thisChar)) {
+                    continue;
+                }
+                if (fCP.contains(prevChar) && (fAL.contains(thisChar) || fNU.contains(thisChar))) {
+                    continue;
+                }
+
              
                // LB 31    Break everywhere else
                break;            
@ -972,8 +988,8 @@ public class RBBITestMonkey extends TestFmwk {
        
        
        // Match the following regular expression in the input text.
-        //    ((PR | PO) CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * (CL CM*)?  (PR | PO) CM*)?
-        //      0    0   1       3    3    4              7    7    7    7      9   9     11   11    (match states)
+        //    ((PR | PO) CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * ((CL | CP) CM*)? ((PR | PO) CM*)?
+        //      0    0   1       3    3    4              7    7    7    7      9    9    9     11   11    (match states)
        //  retVals array  [0]  index of the start of the match, or -1 if no match
        //                 [1]  index of first char following the match.
        //  Can not use Java regex because need supplementary character support,
@ -1065,6 +1081,10 @@ public class RBBITestMonkey extends TestFmwk {
                            matchState = 9;
                            break;                           
                        }
+                        //if (cLBType == UCharacter.LineBreak.CLOSE_PARENTHESIS) {
+                        //    matchState = 9;
+                        //    break;                           
+                        //}
                        if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
                            matchState = 11;
                            break;                           
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt
@ -1,4 +1,4 @@
-# Copyright (c) 2001-2008 International Business Machines
+# Copyright (c) 2001-2009 International Business Machines
 # Corporation and others. All Rights Reserved.
 #
 # RBBI Test Data
@ -24,13 +24,8 @@


 #   Temp debugging tests 
-<sent>
-<data>•  (This is it).  •Testing the sentence iterator. •\
-"This isn't it." •Hi! \
-•This is a simple sample sentence. •(This is it.) •This is a simple sample sentence. •\
-"This isn't it." •\
-Hi! •This is a simple sample sentence. •It does not have to make any sense as you can see. •Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. •Che la dritta via aveo smarrita. •He said, that I said, that you said!! •Don't rock the boat.\u2029•Because I am the daddy, that is why. 
-•Not on my time (el timo.)! •</data>
+<line>
+<data>•\ufffc•\u30e3\u000c<100>\u1b39\u300a\u002f\u203a\u200b•\ufffc•\uaf64•\udcfb•</data>

 ########################################################################################
 #
@ -95,6 +90,28 @@ Hi! •This is a simple sample sentence. •It does not have to make any sense a
 #  Treat Japanese Half Width voicing marks as combining
 <data>•A\uff9e•B\uff9f\uff9e\uff9f•C•</data>

+########################################################################################
+#
+#
+#       E x t e n d e d    G r a p h e m e    C l u s t e r     T e s t s
+#
+#
+##########################################################################################
+#<xgc>
+
+# Plain Vanilla grapheme clusters
+#<data>•a•b•c•</data>
+#<data>•a\u0301\u0302• •b\u0303\u0304•</data>
+
+# Assorted Hindi combining marks
+#<data>•\u0904\u0903• •\u0937\u093E• •\u0904\u093F• •\u0937\u0940• •\u0937\u0949• •\u0937\u094A• •\u0937\u094B• •\u0937\u094C•</data>
+
+# Thai Clusters
+# $Prepend $Extend* $PrependBase $Extend*;
+#
+#<data>•\u0e40\u0e01•\u0e44\u0301\u0e23\u0302\u0303•\u0e40•\u0e40\u0e02•\u0e02• •</data>
+
+
 ########################################################################################
 #
 #
@ -486,15 +503,28 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal

 #      Regression for bug 836
 #        Note:  Unicode 5.1 changed this behavior
-#               ICU will want to change it back before releasing,
-#               so there is no break preceding the '('
-<data>•AAA•(AAA •</data> 
+#               Unicode 5.2 changed it again; there is no break preceding the '('
+<data>•AAA(AAA •</data> 

 #       Try some words from other scripts.
 #          Greek, Cyrillic, Hebrew, Arabic, Arabic, Georgian, Latin
 #      
 <data>•ΑΒΓ •БВГ •אבג֓ •ابت •١٢٣ •\u10A0\u10A1\u10A2 •ABC •</data>

+#
+#       ticket #4853:  unpaired surrogates should behave like AL
+#
+<data>•abc\ud801xyz•</data>
+
+#
+#     Regression tests for failures that originally came from the monkey test.
+#     Monkey test failure lines can, with slight reformatting, be copied into this section
+#     as test cases.  The error display from here is more informative.
+#
+<data>•\ufffc•\u30e3\u000c<100>\u1b39\u300a\u002f\u203a\u200b•\ufffc•\uaf64•\udcfb•</data>
+<data>•\u114d\u31f3•\ube44\u002d•\u0362\u24e2\u276e\u2014\u205f\ufe16•\uc877•\u0fd0\u000a<100>\u20a3•</data>
+<data>•\u080a\u215b\U0001d7d3\u002c•\u2025\U000e012e•\u02df\u118d\u0029\ua8d6\u0085<100>\u6cc4\u2024\u202f\ufffc•</data>
+ 

 ########################################################################################
 #
@ -539,4 +569,14 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
 #
 <data>•สวัสดี<000>ครับ<000>สบาย<000>ดี<000>ไหม<000> •ครับ<000></data>

+#
+#  Trac ticket 5595 Test Case
+#     (Omitted from ICU4J because of old dictionary implementation.)
+#
+# <data>•บท<000>ที่๑พายุ<000>ไซโคลน<000>โด<000>โรธี<000>อาศัย<000>อยู่<000>ท่ามกลาง<000>\
+# ทุ่งใหญ่<000>ใน<000>แคนซัส<000>กับ<000>ลุง<000>เฮ<000>นรี<000>ชาวไร่<000>และ<000>ป้า<000>เอ็ม<000>\
+# ภรรยา<000>ชาวไร่<000>บ้าน<000>ของ<000>พวก<000>เขา<000>หลัง<000>เล็ก<000>เพราะ<000>ไม้<000>\
+# สร้าง<000>บ้าน<000>ต้อง<000>ขน<000>มา<000>ด้วย<000>เกวียน<000>เป็น<000>ระยะ<000>ทาง<000>หลาย<000>\
+# ไมล์<000></data>
+