mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-17 02:37:25 +00:00
ICU-2648 Reduce the size of .brk files, and test that the rules roundtrip.
X-SVN-Rev: 11076
This commit is contained in:
parent
cd9de6d966
commit
f1919822df
5 changed files with 95 additions and 8 deletions
|
@ -112,6 +112,9 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
|
|||
return NULL;
|
||||
}
|
||||
|
||||
// Remove comments and whitespace from the rules to make them smaller.
|
||||
UnicodeString strippedRules(RBBIRuleScanner::stripRules(fRules));
|
||||
|
||||
// Calculate the size of each section in the data.
|
||||
// Sizes here are padded up to a multiple of 8 for better memory alignment.
|
||||
// Section sizes actually stored in the header are for the actual data
|
||||
|
@ -121,7 +124,7 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
|
|||
int32_t forwardTableSize = align8(fForwardTables->getTableSize());
|
||||
int32_t reverseTableSize = align8(fReverseTables->getTableSize());
|
||||
int32_t trieSize = align8(fSetBuilder->getTrieSize());
|
||||
int32_t rulesSize = align8((fRules.length()+1) * sizeof(UChar));
|
||||
int32_t rulesSize = align8((strippedRules.length()+1) * sizeof(UChar));
|
||||
|
||||
int32_t totalSize = headerSize + forwardTableSize + reverseTableSize
|
||||
+ trieSize + rulesSize;
|
||||
|
@ -145,14 +148,14 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
|
|||
data->fTrie = data->fRTable + reverseTableSize;
|
||||
data->fTrieLen = fSetBuilder->getTrieSize();
|
||||
data->fRuleSource = data->fTrie + trieSize;
|
||||
data->fRuleSourceLen = fRules.length() * sizeof(UChar);
|
||||
data->fRuleSourceLen = strippedRules.length() * sizeof(UChar);
|
||||
|
||||
uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
|
||||
|
||||
fForwardTables->exportTable((uint8_t *)data + data->fFTable);
|
||||
fReverseTables->exportTable((uint8_t *)data + data->fRTable);
|
||||
fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
|
||||
fRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus);
|
||||
strippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus);
|
||||
|
||||
return data;
|
||||
}
|
||||
|
|
|
@ -699,6 +699,33 @@ static const UChar chLParen = 0x28;
|
|||
static const UChar chRParen = 0x29;
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------------------
|
||||
//
|
||||
// stripRules Return a rules string without unnecessary
|
||||
// characters.
|
||||
//
|
||||
//----------------------------------------------------------------------------------------
|
||||
UnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) {
|
||||
UnicodeString strippedRules;
|
||||
int rulesLength = rules.length();
|
||||
for (int idx = 0; idx < rulesLength; ) {
|
||||
UChar ch = rules[idx++];
|
||||
if (ch == chPound) {
|
||||
while (idx < rulesLength
|
||||
&& ch != chCR && ch != chLF && ch != chNEL)
|
||||
{
|
||||
ch = rules[idx++];
|
||||
}
|
||||
}
|
||||
if (!u_isWhitespace(ch)) {
|
||||
strippedRules.append(ch);
|
||||
}
|
||||
}
|
||||
// strippedRules = strippedRules.unescape();
|
||||
return strippedRules;
|
||||
}
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------------------
|
||||
//
|
||||
// nextCharLL Low Level Next Char from rule input source.
|
||||
|
|
|
@ -73,6 +73,12 @@ public:
|
|||
// trees, one each for the forward and
|
||||
// reverse rules,
|
||||
// and a list of UnicodeSets encountered.
|
||||
|
||||
/**
|
||||
* Return a rules string without unnecessary
|
||||
* characters.
|
||||
*/
|
||||
static UnicodeString stripRules(const UnicodeString &rules);
|
||||
private:
|
||||
|
||||
UBool doParseActions(EParseAction a);
|
||||
|
|
|
@ -18,8 +18,8 @@
|
|||
#include "unicode/rbbi.h"
|
||||
#include "unicode/schriter.h"
|
||||
#include "rbbiapts.h"
|
||||
#include "string.h"
|
||||
#include "stdio.h"
|
||||
#include "rbbidata.h"
|
||||
#include "cstring.h"
|
||||
|
||||
/**
|
||||
* API Test the RuleBasedBreakIterator class
|
||||
|
@ -850,6 +850,52 @@ void RBBIAPITest::TestRegistration() {
|
|||
delete root_char;
|
||||
}
|
||||
|
||||
void RBBIAPITest::RoundtripRule(const char *dataFile) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UParseError parseError;
|
||||
parseError.line = 0;
|
||||
parseError.offset = 0;
|
||||
UDataMemory *data = udata_open(NULL, "brk", dataFile, &status);
|
||||
uint32_t length;
|
||||
const UChar *builtSource;
|
||||
const uint8_t *rbbiRules;
|
||||
const uint8_t *builtRules;
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
errln("Can't open \"%s\"", dataFile);
|
||||
return;
|
||||
}
|
||||
|
||||
builtRules = (const uint8_t *)udata_getMemory(data);
|
||||
builtSource = (const UChar *)(builtRules + ((RBBIDataHeader*)builtRules)->fRuleSource);
|
||||
RuleBasedBreakIterator *brkItr = new RuleBasedBreakIterator(builtSource, parseError, status);
|
||||
if (U_FAILURE(status)) {
|
||||
errln("createRuleBasedBreakIterator: ICU Error \"%s\" at line %d, column %d\n",
|
||||
u_errorName(status), parseError.line, parseError.offset);
|
||||
return;
|
||||
};
|
||||
rbbiRules = brkItr->getBinaryRules(length);
|
||||
logln("Comparing \"%s\" len=%d", dataFile, length);
|
||||
if (memcmp(builtRules, rbbiRules, (int32_t)length) != 0) {
|
||||
errln("Built rules and rebuilt rules are different %s", dataFile);
|
||||
return;
|
||||
}
|
||||
delete brkItr;
|
||||
udata_close(data);
|
||||
}
|
||||
|
||||
void RBBIAPITest::TestRoundtripRules() {
|
||||
RoundtripRule("word");
|
||||
RoundtripRule("title");
|
||||
RoundtripRule("sent");
|
||||
RoundtripRule("line");
|
||||
RoundtripRule("char");
|
||||
if (!quick) {
|
||||
RoundtripRule("word_th");
|
||||
RoundtripRule("line_th");
|
||||
}
|
||||
}
|
||||
|
||||
//---------------------------------------------
|
||||
// runIndexedTest
|
||||
//---------------------------------------------
|
||||
|
@ -872,6 +918,7 @@ void RBBIAPITest::runIndexedTest( int32_t index, UBool exec, const char* &name,
|
|||
case 10: name = "TestBug2190"; if (exec) TestBug2190(); break;
|
||||
case 11: name = "TestRegistration"; if (exec) TestRegistration(); break;
|
||||
case 12: name = "TestBoilerPlate"; if (exec) TestBoilerPlate(); break;
|
||||
case 13: name = "TestRoundtripRules"; if (exec) TestRoundtripRules(); break;
|
||||
|
||||
default: name = ""; break; /*needed to end loop*/
|
||||
}
|
||||
|
|
|
@ -59,17 +59,21 @@ public:
|
|||
/**
|
||||
* Tests the method IsBoundary() of RuleBasedBreakIterator
|
||||
**/
|
||||
void TestIsBoundary(void);
|
||||
void TestIsBoundary(void);
|
||||
|
||||
/**
|
||||
* Tests creating RuleBasedBreakIterator from rules strings.
|
||||
**/
|
||||
void TestBuilder(void);
|
||||
void TestBuilder(void);
|
||||
|
||||
void TestRoundtripRules(void);
|
||||
|
||||
void RoundtripRule(const char *dataFile);
|
||||
|
||||
/**
|
||||
* Tests grouping effect of 'single quotes' in rules.
|
||||
**/
|
||||
void TestQuoteGrouping();
|
||||
void TestQuoteGrouping();
|
||||
|
||||
/**
|
||||
* Tests word break status returns.
|
||||
|
|
Loading…
Add table
Reference in a new issue