ICU-2648 Reduce the size of .brk files, and test that the rules roundtrip.

X-SVN-Rev: 11076
2025-04-17 02:37:25 +00:00 · 2003-02-17 18:06:42 +00:00 · 2003-02-17 18:06:42 +00:00 · f1919822df
commit f1919822df
parent cd9de6d966
5 changed files with 95 additions and 8 deletions
--- a/icu4c/source/common/rbbirb.cpp
+++ b/icu4c/source/common/rbbirb.cpp
@ -112,6 +112,9 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
        return NULL;
    }

+    // Remove comments and whitespace from the rules to make it smaller.
+    UnicodeString strippedRules(RBBIRuleScanner::stripRules(fRules));
+
    // Calculate the size of each section in the data.
    //   Sizes here are padded up to a multiple of 8 for better memory alignment.
    //   Sections sizes actually stored in the header are for the actual data
@ -121,7 +124,7 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
    int32_t forwardTableSize  = align8(fForwardTables->getTableSize());
    int32_t reverseTableSize  = align8(fReverseTables->getTableSize());
    int32_t trieSize          = align8(fSetBuilder->getTrieSize());
-    int32_t rulesSize         = align8((fRules.length()+1) * sizeof(UChar));
+    int32_t rulesSize         = align8((strippedRules.length()+1) * sizeof(UChar));

    int32_t         totalSize = headerSize + forwardTableSize + reverseTableSize
                                + trieSize + rulesSize;
@ -145,14 +148,14 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
    data->fTrie          = data->fRTable + reverseTableSize;
    data->fTrieLen       = fSetBuilder->getTrieSize();
    data->fRuleSource    = data->fTrie   + trieSize;
-    data->fRuleSourceLen = fRules.length() * sizeof(UChar);
+    data->fRuleSourceLen = strippedRules.length() * sizeof(UChar);

    uprv_memset(data->fReserved, 0, sizeof(data->fReserved));

    fForwardTables->exportTable((uint8_t *)data + data->fFTable);
    fReverseTables->exportTable((uint8_t *)data + data->fRTable);
    fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
-    fRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus);
+    strippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus);

    return data;
 }
--- a/icu4c/source/common/rbbiscan.cpp
+++ b/icu4c/source/common/rbbiscan.cpp
@ -699,6 +699,33 @@ static const UChar      chLParen    = 0x28;
 static const UChar      chRParen    = 0x29;


+//----------------------------------------------------------------------------------------
+//
+//  stripRules    Return the rules with '#' comments and whitespace removed.
+//                Quoting is not tracked: both are dropped wherever they occur.
+//
+//----------------------------------------------------------------------------------------
+UnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) {
+    UnicodeString strippedRules;
+    int32_t rulesLength = rules.length();
+    for (int32_t idx = 0; idx < rulesLength; ) {
+        UChar ch = rules[idx++];
+        if (ch == chPound) {
+            // Drop the comment through its line terminator, or through end of input.
+            while (idx < rulesLength) {
+                ch = rules[idx++];
+                if (ch == chCR || ch == chLF || ch == chNEL) {
+                    break;
+                }
+            }
+        } else if (!u_isWhitespace(ch)) {
+            strippedRules.append(ch);
+        }
+    }
+    return strippedRules;
+}
+
+
 //----------------------------------------------------------------------------------------
 //
 //  nextCharLL    Low Level Next Char from rule input source.
--- a/icu4c/source/common/rbbiscan.h
+++ b/icu4c/source/common/rbbiscan.h
@ -73,6 +73,12 @@ public:
                                                    //   trees, one each for the forward and
                                                    //   reverse rules,
                                                    //   and a list of UnicodeSets encountered.
+
+    /**
+     * Return a copy of the rules with '#' comments and
+     * whitespace characters removed.
+     */
+    static UnicodeString stripRules(const UnicodeString &rules);
 private:

    UBool       doParseActions(EParseAction a);
--- a/icu4c/source/test/intltest/rbbiapts.cpp
+++ b/icu4c/source/test/intltest/rbbiapts.cpp
@ -18,8 +18,8 @@
 #include "unicode/rbbi.h"
 #include "unicode/schriter.h"
 #include "rbbiapts.h"
-#include "string.h"
-#include "stdio.h"
+#include "rbbidata.h"
+#include "cstring.h"

 /**
 * API Test the RuleBasedBreakIterator class
@ -850,6 +850,52 @@ void RBBIAPITest::TestRegistration() {
  delete root_char;
 }

+// Rebuild a break iterator from the rule source embedded in the prebuilt "brk" data named
+// by dataFile, and report an error unless the rebuilt binary rules match the prebuilt data.
+void RBBIAPITest::RoundtripRule(const char *dataFile) {
+    UErrorCode status = U_ZERO_ERROR;
+    UParseError parseError;
+    parseError.line = 0;
+    parseError.offset = 0;
+    uint32_t length = 0;
+    UDataMemory *data = udata_open(NULL, "brk", dataFile, &status);
+    if (U_FAILURE(status)) {
+        errln("Can't open \"%s\"", dataFile);
+        return;
+    }
+
+    // The header's fRuleSource field is the byte offset of the rule source text.
+    const uint8_t *builtRules = (const uint8_t *)udata_getMemory(data);
+    const UChar *builtSource =
+        (const UChar *)(builtRules + ((RBBIDataHeader*)builtRules)->fRuleSource);
+    RuleBasedBreakIterator *brkItr = new RuleBasedBreakIterator(builtSource, parseError, status);
+    if (U_FAILURE(status)) {
+        errln("createRuleBasedBreakIterator: ICU Error \"%s\"  at line %d, column %d\n",
+                u_errorName(status), parseError.line, parseError.offset);
+    } else {
+        const uint8_t *rbbiRules = brkItr->getBinaryRules(length);
+        logln("Comparing \"%s\" len=%d", dataFile, length);
+        if (memcmp(builtRules, rbbiRules, (int32_t)length) != 0) {
+            errln("Built rules and rebuilt rules are different %s", dataFile);
+        }
+    }
+    // Every path past a successful open releases the iterator and the data.
+    delete brkItr;
+    udata_close(data);
+}
+
+void RBBIAPITest::TestRoundtripRules() {
+    static const char * const always[] = { "word", "title", "sent", "line", "char" };
+    for (int32_t i = 0; i < (int32_t)(sizeof(always) / sizeof(always[0])); ++i) {
+        RoundtripRule(always[i]);   // these data files are checked on every run, in order
+    }
+    if (quick) {
+        return;                     // the two *_th files are skipped when `quick` is set
+    }
+    RoundtripRule("word_th");
+    RoundtripRule("line_th");
+}
+
 //---------------------------------------------
 // runIndexedTest
 //---------------------------------------------
@ -872,6 +918,7 @@ void RBBIAPITest::runIndexedTest( int32_t index, UBool exec, const char* &name,
        case 10: name = "TestBug2190"; if (exec) TestBug2190(); break;
        case 11: name = "TestRegistration"; if (exec) TestRegistration(); break;
        case 12: name = "TestBoilerPlate"; if (exec) TestBoilerPlate(); break;
+        case 13: name = "TestRoundtripRules"; if (exec) TestRoundtripRules(); break;

        default: name = ""; break; /*needed to end loop*/
    }
--- a/icu4c/source/test/intltest/rbbiapts.h
+++ b/icu4c/source/test/intltest/rbbiapts.h
@ -59,17 +59,21 @@ public:
    /**
     * Tests the method IsBoundary() of RuleBasedBreakIterator
     **/
-   void TestIsBoundary(void);
+    void TestIsBoundary(void);

    /**
     * Tests creating RuleBasedBreakIterator from rules strings.
     **/
-   void TestBuilder(void);
+    void TestBuilder(void);
+
+    void TestRoundtripRules(void);
+
+    void RoundtripRule(const char *dataFile);

    /**
     * Tests grouping effect of 'single quotes' in rules.
     **/
-   void TestQuoteGrouping();
+    void TestQuoteGrouping();

    /**
     *  Tests word break status returns.