ICU-2924 RBBI, line break rules, monkey test, a few more fixes

X-SVN-Rev: 13402
2025-04-13 08:53:20 +00:00 · 2003-10-13 22:01:53 +00:00 · 2003-10-13 22:01:53 +00:00 · 94a9e101e7
commit 94a9e101e7
parent e1776d90c6
2 changed files with 50 additions and 33 deletions
--- a/icu4c/source/data/brkitr/line.txt
+++ b/icu4c/source/data/brkitr/line.txt
@ -46,6 +46,18 @@ $XX = [:LineBreak =  Unknown:];
 $ZW = [:LineBreak =  ZWSpace:];


+#
+# Korean Syllable Definitions
+#
+$L   = [:Hangul_Syllable_Type = L:];
+$V   = [:Hangul_Syllable_Type = V:];
+$T   = [:Hangul_Syllable_Type = T:];
+
+$LV  = [:Hangul_Syllable_Type = LV:];
+$LVT = [:Hangul_Syllable_Type = LVT:];
+
+$HangulSyllable = $L+ | ($L* ($LV? $V+ | $LV | $LVT) $T*) | $T+;
+

 #
 #  Rule LB1.  By default, treat AI  (characters with ambiguous east Asian width),
@ -66,7 +78,7 @@ $CLcm = $CL $CM*;
 $EXcm = $EX $CM*;
 $GLcm = $GL $CM*;
 $HYcm = $HY $CM*;
-$IDcm = $ID $CM*;
+$IDcm = ($ID | $HangulSyllable) $CM*;   # a whole Hangul syllable is grouped with ID
 $INcm = $IN $CM*;
 $IScm = $IS $CM*;
 $NScm = $NS $CM*;
@ -199,10 +211,10 @@ $CM* $ALcm+;    # The $CM* is from rule 7C, and unattached CM is treated as AL

 #
 #  Reverse Rules.
-#
-#     Back up to a hard break or a space that will cause a boundary.
-#     Not all spaces cause line breaks.  $SpaceGlue represents a sequence
-#     containing a space that may inhibit a break from occuring.
+#     TODO:  Something more efficient.  These rules just back up to a hard break.
+#            Note that the initial ". ." backs over both halves of a CR/LF sequence
+#            at the current position.
 #

-!.*;
+!. . [^$LF $CR $NL $BK]* [$BK $CR $LF $NL];
+#!.*;
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@ -2553,12 +2553,14 @@ void RBBILineMonkey::rule67Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPo
    //  advance over any CM class chars.  (Line Break CM class is different from
    //    grapheme cluster CM, so we need to do this even for HangulSyllables.
    //    Line Break may eat additional stuff as combining, beyond what graphem cluster did.
-    for (;;) {
-        *nextChar = fText->char32At(nPos);
-        if (!fCM->contains(*nextChar)) {
-            break;
+    if (!(fBK->contains(*posChar) || *posChar==0x0a || *posChar==0x0d || *posChar==0x85)) {
+        for (;;) {
+            *nextChar = fText->char32At(nPos);
+            if (!fCM->contains(*nextChar)) {
+                break;
+            }
+            nPos = fText->moveIndex32(nPos, 1);
        }
-        nPos = fText->moveIndex32(nPos, 1);
    }
    
    
@ -2652,27 +2654,31 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
        }

        // LB 10    QU SP* x OP
-        UnicodeString  subStr10(*fText, prevPos);
-        fLB10Matcher->reset(subStr10);
-        status = U_ZERO_ERROR;
-        if (fLB10Matcher->lookingAt(status)) {  //   /QU CM* SP* (OP) CM*/;
-            // TODO:  Check status codes
-            pos      = prevPos + fLB10Matcher->start(1, status);
-            nextPos  = prevPos + fLB10Matcher->end(0, status);
-            thisChar = fText->char32At(pos);
-            continue;
+        if (prevPos >= 0) {
+            UnicodeString  subStr10(*fText, prevPos);
+            fLB10Matcher->reset(subStr10);
+            status = U_ZERO_ERROR;
+            if (fLB10Matcher->lookingAt(status)) {  //   /QU CM* SP* (OP) CM*/;
+                // TODO:  Check status codes
+                pos      = prevPos + fLB10Matcher->start(1, status);
+                nextPos  = prevPos + fLB10Matcher->end(0, status);
+                thisChar = fText->char32At(pos);
+                continue;
+            }
        }

        // LB 11   CL SP* x NS
-        UnicodeString  subStr11(*fText, prevPos);
-        fLB11Matcher->reset(subStr11);
-        status = U_ZERO_ERROR;
-        if (fLB11Matcher->lookingAt(status)) {  //   /QU CM* SP* (OP) CM*/;
-            // TODO:  Check status codes
-            pos      = prevPos + fLB11Matcher->start(1, status);
-            nextPos  = prevPos + fLB11Matcher->end(0, status);
-            thisChar = fText->char32At(pos);
-            continue;
+        if (prevPos >= 0) {
+            UnicodeString  subStr11(*fText, prevPos);
+            fLB11Matcher->reset(subStr11);
+            status = U_ZERO_ERROR;
+            if (fLB11Matcher->lookingAt(status)) {  //   presumably /CL CM* SP* (NS) CM*/ per LB 11 -- TODO confirm against the matcher's pattern
+                // TODO:  Check status codes
+                pos      = prevPos + fLB11Matcher->start(1, status);
+                nextPos  = prevPos + fLB11Matcher->end(0, status);
+                thisChar = fText->char32At(pos);
+                continue;
+            }
        }

        // LB 4  Don't break before spaces or zero-width space.
@ -2984,8 +2990,7 @@ void RBBITest::TestMonkey(char *params) {
        RBBILineMonkey  m;
        BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
        if (params == NULL) {
-            // TODO:  Resolve rule ambiguities, unpin loop count.
-            loopCount = 2;
+            loopCount = 50;
        }
        RunMonkey(bi, m, "line", seed, loopCount);
        delete bi;
@ -3041,8 +3046,8 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, char *name, uint
        }
    }

-    while (loopCount <= numIterations || numIterations == -1) {
-        if (numIterations == -1 && loopCount % 500 == 0) {
+    while (loopCount < numIterations || numIterations == -1) {
+        if (numIterations == -1 && loopCount % 10 == 0) {
            // If test is running in an infinite loop, display a periodic tic so
            //   we can tell that it is making progress.
            fprintf(stderr, ".");