ICU-2648 Reduce the size of .brk files, and test that the rules roundtrip.

X-SVN-Rev: 11076
This commit is contained in:
George Rhoten 2003-02-17 18:06:42 +00:00
parent cd9de6d966
commit f1919822df
5 changed files with 95 additions and 8 deletions

View file

@ -112,6 +112,9 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
return NULL;
}
// Remove comments and whitespace from the rules so the stored rule source is smaller.
UnicodeString strippedRules(RBBIRuleScanner::stripRules(fRules));
// Calculate the size of each section in the data.
// Sizes here are padded up to a multiple of 8 for better memory alignment.
// Sections sizes actually stored in the header are for the actual data
@ -121,7 +124,7 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
int32_t forwardTableSize = align8(fForwardTables->getTableSize());
int32_t reverseTableSize = align8(fReverseTables->getTableSize());
int32_t trieSize = align8(fSetBuilder->getTrieSize());
int32_t rulesSize = align8((fRules.length()+1) * sizeof(UChar));
int32_t rulesSize = align8((strippedRules.length()+1) * sizeof(UChar));
int32_t totalSize = headerSize + forwardTableSize + reverseTableSize
+ trieSize + rulesSize;
@ -145,14 +148,14 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
data->fTrie = data->fRTable + reverseTableSize;
data->fTrieLen = fSetBuilder->getTrieSize();
data->fRuleSource = data->fTrie + trieSize;
data->fRuleSourceLen = fRules.length() * sizeof(UChar);
data->fRuleSourceLen = strippedRules.length() * sizeof(UChar);
uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
fForwardTables->exportTable((uint8_t *)data + data->fFTable);
fReverseTables->exportTable((uint8_t *)data + data->fRTable);
fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
fRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus);
strippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus);
return data;
}

View file

@ -699,6 +699,33 @@ static const UChar chLParen = 0x28;
static const UChar chRParen = 0x29;
//----------------------------------------------------------------------------------------
//
//  stripRules    Return a copy of a rules string with '#' comments and
//                whitespace removed.
//
//                A comment starts at '#' and runs through the next CR, LF or NEL,
//                or to the end of the input if there is no line terminator.
//                The scan is purely lexical: '#' starts a comment wherever it
//                appears.  NOTE(review): no quoting or escaping of '#' is
//                recognized here - confirm that rule sources never need one.
//
//  @param rules  the rule source text; it is not modified.
//  @return       the reduced rule text.
//
//----------------------------------------------------------------------------------------
UnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) {
    UnicodeString strippedRules;
    int32_t rulesLength = rules.length();
    for (int32_t idx = 0; idx < rulesLength; ) {
        UChar ch = rules[idx++];
        if (ch == chPound) {
            // Consume the comment, leaving ch on its line terminator.
            while (idx < rulesLength
                && ch != chCR && ch != chLF && ch != chNEL)
            {
                ch = rules[idx++];
            }
            if (ch != chCR && ch != chLF && ch != chNEL) {
                // The input ended inside the comment.  ch is still part of the
                // comment (or the '#' itself), so it must not be copied out.
                break;
            }
            // Otherwise fall through: the terminator goes through the same
            // whitespace test as any other character.
        }
        if (!u_isWhitespace(ch)) {
            strippedRules.append(ch);
        }
    }
    // strippedRules = strippedRules.unescape();
    return strippedRules;
}
//----------------------------------------------------------------------------------------
//
// nextCharLL Low Level Next Char from rule input source.

View file

@ -73,6 +73,12 @@ public:
// trees, one each for the forward and
// reverse rules,
// and a list of UnicodeSets encountered.
/**
 * Return a copy of a rules string with the characters the
 * implementation treats as unnecessary removed: '#' comments
 * (up to the end of their line) and whitespace.
 *
 * @param rules  the rule source text; it is not modified.
 * @return       the reduced rule text.
 */
static UnicodeString stripRules(const UnicodeString &rules);
private:
UBool doParseActions(EParseAction a);

View file

@ -18,8 +18,8 @@
#include "unicode/rbbi.h"
#include "unicode/schriter.h"
#include "rbbiapts.h"
#include "string.h"
#include "stdio.h"
#include "rbbidata.h"
#include "cstring.h"
/**
* API Test the RuleBasedBreakIterator class
@ -850,6 +850,52 @@ void RBBIAPITest::TestRegistration() {
delete root_char;
}
void RBBIAPITest::RoundtripRule(const char *dataFile) {
UErrorCode status = U_ZERO_ERROR;
UParseError parseError;
parseError.line = 0;
parseError.offset = 0;
UDataMemory *data = udata_open(NULL, "brk", dataFile, &status);
uint32_t length;
const UChar *builtSource;
const uint8_t *rbbiRules;
const uint8_t *builtRules;
if (U_FAILURE(status)) {
errln("Can't open \"%s\"", dataFile);
return;
}
builtRules = (const uint8_t *)udata_getMemory(data);
builtSource = (const UChar *)(builtRules + ((RBBIDataHeader*)builtRules)->fRuleSource);
RuleBasedBreakIterator *brkItr = new RuleBasedBreakIterator(builtSource, parseError, status);
if (U_FAILURE(status)) {
errln("createRuleBasedBreakIterator: ICU Error \"%s\" at line %d, column %d\n",
u_errorName(status), parseError.line, parseError.offset);
return;
};
rbbiRules = brkItr->getBinaryRules(length);
logln("Comparing \"%s\" len=%d", dataFile, length);
if (memcmp(builtRules, rbbiRules, (int32_t)length) != 0) {
errln("Built rules and rebuilt rules are different %s", dataFile);
return;
}
delete brkItr;
udata_close(data);
}
void RBBIAPITest::TestRoundtripRules() {
RoundtripRule("word");
RoundtripRule("title");
RoundtripRule("sent");
RoundtripRule("line");
RoundtripRule("char");
if (!quick) {
RoundtripRule("word_th");
RoundtripRule("line_th");
}
}
//---------------------------------------------
// runIndexedTest
//---------------------------------------------
@ -872,6 +918,7 @@ void RBBIAPITest::runIndexedTest( int32_t index, UBool exec, const char* &name,
case 10: name = "TestBug2190"; if (exec) TestBug2190(); break;
case 11: name = "TestRegistration"; if (exec) TestRegistration(); break;
case 12: name = "TestBoilerPlate"; if (exec) TestBoilerPlate(); break;
case 13: name = "TestRoundtripRules"; if (exec) TestRoundtripRules(); break;
default: name = ""; break; /*needed to end loop*/
}

View file

@ -59,17 +59,21 @@ public:
/**
* Tests the method IsBoundary() of RuleBasedBreakIterator
**/
void TestIsBoundary(void);
void TestIsBoundary(void);
/**
* Tests creating RuleBasedBreakIterator from rules strings.
**/
void TestBuilder(void);
void TestBuilder(void);
void TestRoundtripRules(void);
void RoundtripRule(const char *dataFile);
/**
* Tests grouping effect of 'single quotes' in rules.
**/
void TestQuoteGrouping();
void TestQuoteGrouping();
/**
* Tests word break status returns.