ICU-12677 RBBI, fix incorrect stripping of comments from saved rules.

X-SVN-Rev: 40837
2025-04-10 07:39:16 +00:00 · 2018-02-03 19:10:50 +00:00 · 2018-02-03 19:10:50 +00:00 · aaebaf90a3
commit aaebaf90a3
parent f74b42f960
9 changed files with 77 additions and 35 deletions
--- a/icu4c/source/common/rbbirb.cpp
+++ b/icu4c/source/common/rbbirb.cpp
@ -47,7 +47,7 @@ U_NAMESPACE_BEGIN
 RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString   &rules,
                                       UParseError     *parseErr,
                                       UErrorCode      &status)
- : fRules(rules)
+ : fRules(rules), fStrippedRules(rules)
 {
    fStatus = &status; // status is checked below
    fParseError = parseErr;
@ -147,8 +147,9 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
        return NULL;
    }

-    // Remove comments and whitespace from the rules to make it smaller.
-    UnicodeString strippedRules((const UnicodeString&)RBBIRuleScanner::stripRules(fRules));
+    // Remove whitespace from the rules to make it smaller.
+    // The rule parser has already removed comments.
+    fStrippedRules = fScanner->stripRules(fStrippedRules);

    // Calculate the size of each section in the data.
    //   Sizes here are padded up to a multiple of 8 for better memory alignment.
@ -162,7 +163,7 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
    int32_t safeRevTableSize  = align8(fSafeRevTables->getTableSize());
    int32_t trieSize          = align8(fSetBuilder->getTrieSize());
    int32_t statusTableSize   = align8(fRuleStatusVals->size() * sizeof(int32_t));
-    int32_t rulesSize         = align8((strippedRules.length()+1) * sizeof(UChar));
+    int32_t rulesSize         = align8((fStrippedRules.length()+1) * sizeof(UChar));

    (void)safeFwdTableSize;

@ -225,7 +226,7 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
    data->fStatusTable   = data->fTrie    + trieSize;
    data->fStatusTableLen= statusTableSize;
    data->fRuleSource    = data->fStatusTable + statusTableSize;
-    data->fRuleSourceLen = strippedRules.length() * sizeof(UChar);
+    data->fRuleSourceLen = fStrippedRules.length() * sizeof(UChar);

    uprv_memset(data->fReserved, 0, sizeof(data->fReserved));

@ -245,7 +246,7 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
        ruleStatusTable[i] = fRuleStatusVals->elementAti(i);
    }

-    strippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus);
+    fStrippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus);

    return data;
 }
--- a/icu4c/source/common/rbbirb.h
+++ b/icu4c/source/common/rbbirb.h
@ -130,6 +130,7 @@ public:
    UErrorCode                    *fStatus;          // Error reporting.  Keeping status
    UParseError                   *fParseError;      //   here avoids passing it everywhere.
    const UnicodeString           &fRules;           // The rule string that we are compiling
+    UnicodeString                 fStrippedRules;    // The rule string, with comments stripped.

    RBBIRuleScanner               *fScanner;         // The scanner.
    RBBINode                      *fForwardTree;     // The parse trees, generated by the scanner,
--- a/icu4c/source/common/rbbiscan.cpp
+++ b/icu4c/source/common/rbbiscan.cpp
@ -822,27 +822,24 @@ static const UChar      chRParen    = 0x29;

 //------------------------------------------------------------------------------
 //
-//  stripRules    Return a rules string without unnecessary
-//                characters.
+//  stripRules    Return a rules string without extra spaces.
+//                (Comments are removed separately, during rule parsing.)
 //
 //------------------------------------------------------------------------------
 UnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) {
    UnicodeString strippedRules;
-    int rulesLength = rules.length();
-    for (int idx = 0; idx < rulesLength; ) {
-        UChar ch = rules[idx++];
-        if (ch == chPound) {
-            while (idx < rulesLength
-                && ch != chCR && ch != chLF && ch != chNEL)
-            {
-                ch = rules[idx++];
-            }
-        }
-        if (!u_isISOControl(ch)) {
-            strippedRules.append(ch);
+    int32_t rulesLength = rules.length();
+    bool skippingSpaces = false;
+    
+    for (int32_t idx=0; idx<rulesLength; idx = rules.moveIndex32(idx, 1)) {
+        UChar32 cp = rules.char32At(idx);
+        bool whiteSpace = u_hasBinaryProperty(cp, UCHAR_PATTERN_WHITE_SPACE);
+        if (skippingSpaces && whiteSpace) {
+            continue;
        }
+        strippedRules.append(cp);
+        skippingSpaces = whiteSpace;
    }
-    // strippedRules = strippedRules.unescape();
    return strippedRules;
 }

@ -942,6 +939,7 @@ void RBBIRuleScanner::nextChar(RBBIRuleChar &c) {
            //  It will be treated as white-space, and serves to break up anything
            //    that might otherwise incorrectly clump together with a comment in
            //    the middle (a variable name, for example.)
+            int32_t commentStart = fScanIndex;
            for (;;) {
                c.fChar = nextCharLL();
                if (c.fChar == (UChar32)-1 ||  // EOF
@ -950,6 +948,9 @@ void RBBIRuleScanner::nextChar(RBBIRuleChar &c) {
                    c.fChar == chNEL    ||
                    c.fChar == chLS)       {break;}
            }
+            for (int32_t i=commentStart; i<fNextIndex-1; ++i) {
+                fRB->fStrippedRules.setCharAt(i, u' ');
+            }
        }
        if (c.fChar == (UChar32)-1) {
            return;
--- a/icu4c/source/test/intltest/rbbiapts.cpp
+++ b/icu4c/source/test/intltest/rbbiapts.cpp
@ -1035,7 +1035,7 @@ void RBBIAPITest::RoundtripRule(const char *dataFile) {

    builtRules = (const uint8_t *)udata_getMemory(data.getAlias());
    builtSource = (const UChar *)(builtRules + ((RBBIDataHeader*)builtRules)->fRuleSource);
-    RuleBasedBreakIterator *brkItr = new RuleBasedBreakIterator(builtSource, parseError, status);
+    LocalPointer<RuleBasedBreakIterator> brkItr (new RuleBasedBreakIterator(builtSource, parseError, status));
    if (U_FAILURE(status)) {
        errln("%s:%d createRuleBasedBreakIterator: ICU Error \"%s\"  at line %d, column %d\n",
                __FILE__, __LINE__, u_errorName(status), parseError.line, parseError.offset);
@ -1048,7 +1048,6 @@ void RBBIAPITest::RoundtripRule(const char *dataFile) {
        errln("%s:%d Built rules and rebuilt rules are different %s", __FILE__, __LINE__, dataFile);
        return;
    }
-    delete brkItr;
 }

 void RBBIAPITest::TestRoundtripRules() {
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@ -105,6 +105,7 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
    TESTCASE_AUTO(TestBug12932);
    TESTCASE_AUTO(TestEmoji);
    TESTCASE_AUTO(TestBug12519);
+    TESTCASE_AUTO(TestBug12677);
    TESTCASE_AUTO_END;
 }

@ -4436,6 +4437,23 @@ void RBBITest::TestBug12519() {
    assertTrue(WHERE "after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr == *biDe);
 }

+void RBBITest::TestBug12677() {
+    // Check that stripping of comments from rules for getRules() is not confused by
+    // the presence of '#' characters in the rules that do not introduce comments.
+    UnicodeString rules(u"!!forward; \n"
+                         "$x = [ab#];  # a set with a # literal. \n"
+                         " # .;        # a comment that looks sort of like a rule.   \n"
+                         " '#' '?';    # a rule with a quoted #   \n"
+                       );
+    
+    UErrorCode status = U_ZERO_ERROR;
+    UParseError pe;
+    RuleBasedBreakIterator bi (rules, pe, status);
+    assertSuccess(WHERE, status);
+    UnicodeString rtRules = bi.getRules();
+    assertEquals(WHERE, UnicodeString(u"!!forward; $x = [ab#]; '#' '?'; "),  rtRules); 
+}
+
 //
 //  TestDebug    -  A place-holder test for debugging purposes.
 //                  For putting in fragments of other tests that can be invoked
--- a/icu4c/source/test/intltest/rbbitst.h
+++ b/icu4c/source/test/intltest/rbbitst.h
@ -74,6 +74,7 @@ public:
    void TestBug12932();
    void TestEmoji();
    void TestBug12519();
+    void TestBug12677();

    void TestDebug();
    void TestProperties();
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java
@ -28,6 +28,7 @@ class RBBIRuleBuilder {

    String fDebugEnv;              // controls debug trace output
    String fRules;                 // The rule string that we are compiling
+    StringBuilder fStrippedRules;  // The rule string, with comments stripped.
    RBBIRuleScanner fScanner;      // The scanner.


@ -142,6 +143,7 @@ class RBBIRuleBuilder {
        fDebugEnv       = ICUDebug.enabled("rbbi") ?
                            ICUDebug.value("rbbi") : null;
        fRules          = rules;
+        fStrippedRules  = new StringBuilder(rules);
        fUSetNodes      = new ArrayList<RBBINode>();
        fRuleStatusVals = new ArrayList<Integer>();
        fScanner        = new RBBIRuleScanner(this);
@ -165,8 +167,9 @@ class RBBIRuleBuilder {
        DataOutputStream dos = new DataOutputStream(os);
        int i;

-        //  Remove comments and whitespace from the rules to make it smaller.
-        String strippedRules = RBBIRuleScanner.stripRules(fRules);
+        //  Remove whitespace from the rules to make it smaller.
+        //  The rule parser has already removed comments.
+        String strippedRules = RBBIRuleScanner.stripRules(fStrippedRules.toString());

        // Calculate the size of each section in the data in bytes.
        //   Sizes here are padded up to a multiple of 8 for better memory alignment.
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java
@ -14,6 +14,7 @@ import java.util.HashMap;
 import com.ibm.icu.impl.Assert;
 import com.ibm.icu.impl.Utility;
 import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.lang.UProperty;

 /**
  *  This class is part of the Rule Based Break Iterator rule compiler.
@ -696,17 +697,16 @@ class RBBIRuleScanner {
    static String stripRules(String rules) {
        StringBuilder strippedRules = new StringBuilder();
        int rulesLength = rules.length();
-        for (int idx = 0; idx < rulesLength;) {
-            char ch = rules.charAt(idx++);
-            if (ch == '#') {
-                while (idx < rulesLength
-                        && ch != '\r' && ch != '\n' && ch != chNEL) {
-                    ch = rules.charAt(idx++);
-                }
-            }
-            if (!UCharacter.isISOControl(ch)) {
-                strippedRules.append(ch);
+        boolean skippingSpaces = false;
+
+        for (int idx = 0; idx < rulesLength; idx = rules.offsetByCodePoints(idx, 1)) {
+            int cp = rules.codePointAt(idx);
+            boolean whiteSpace = UCharacter.hasBinaryProperty(cp, UProperty.PATTERN_WHITE_SPACE);
+            if (skippingSpaces && whiteSpace) {
+                continue;
            }
+            strippedRules.appendCodePoint(cp);
+            skippingSpaces = whiteSpace;
        }
        return strippedRules.toString();
    }
@ -800,6 +800,7 @@ class RBBIRuleScanner {
                //  It will be treated as white-space, and serves to break up anything
                //    that might otherwise incorrectly clump together with a comment in
                //    the middle (a variable name, for example.)
+                int commentStart = fScanIndex;
                for (;;) {
                    c.fChar = nextCharLL();
                    if (c.fChar == -1 || // EOF
@ -811,6 +812,9 @@ class RBBIRuleScanner {
                        break;
                    }
                }
+                for (int i=commentStart; i<fNextIndex-1; ++i) {
+                    fRB.fStrippedRules.setCharAt(i, ' ');
+                }
            }
            if (c.fChar == -1) {
                return;
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java
@ -548,4 +548,18 @@ public class RBBITest extends TestFmwk {
        assertEquals("", t1.fExpectedBoundaries, t1.fBoundaries);
        assertEquals("", t2.fExpectedBoundaries, t2.fBoundaries);
    }
+
+    @Test
+    public void TestBug12677() {
+        // Check that stripping of comments from rules for getRules() is not confused by
+        // the presence of '#' characters in the rules that do not introduce comments.
+        String rules = "!!forward; \n"
+                     + "$x = [ab#];  # a set with a # literal. \n"
+                     + " # .;        # a comment that looks sort of like a rule.   \n"
+                     + " '#' '?';    # a rule with a quoted #   \n";
+
+        RuleBasedBreakIterator bi  = new RuleBasedBreakIterator(rules);
+        String rtRules = bi.toString();        // getRules() in C++
+        assertEquals("Break Iterator rule stripping test", "!!forward; $x = [ab#]; '#' '?'; ",  rtRules);
+    }
 }