mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-10 07:39:16 +00:00
ICU-2924 RBBI rule builder, changes for safe point rules. Work in progress.
X-SVN-Rev: 13578
This commit is contained in:
parent
bdb879222e
commit
a9cdcba39e
6 changed files with 105 additions and 26 deletions
|
@ -75,11 +75,21 @@ void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
|
|||
}
|
||||
|
||||
fUDataMem = NULL;
|
||||
fForwardTable = (RBBIStateTable *)((char *)data + fHeader->fFTable);
|
||||
fReverseTable = NULL;
|
||||
fSafeFwdTable = NULL;
|
||||
fSafeRevTable = NULL;
|
||||
if (data->fFTableLen != 0) {
|
||||
fForwardTable = (RBBIStateTable *)((char *)data + fHeader->fFTable);
|
||||
}
|
||||
if (data->fRTableLen != 0) {
|
||||
fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable);
|
||||
}
|
||||
if (data->fSFTableLen != 0) {
|
||||
fSafeFwdTable = (RBBIStateTable *)((char *)data + fHeader->fSFTable);
|
||||
}
|
||||
if (data->fSRTableLen != 0) {
|
||||
fSafeRevTable = (RBBIStateTable *)((char *)data + fHeader->fSRTable);
|
||||
}
|
||||
|
||||
|
||||
utrie_unserialize(&fTrie,
|
||||
|
@ -185,38 +195,48 @@ const UnicodeString &RBBIDataWrapper::getRuleSourceString() {
|
|||
// print - debugging function to dump the runtime data tables.
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
void RBBIDataWrapper::printData() {
|
||||
void RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *table) {
|
||||
#ifdef RBBI_DEBUG
|
||||
uint32_t c, s;
|
||||
uint32_t c;
|
||||
uint32_t s;
|
||||
|
||||
RBBIDebugPrintf("RBBI Data at %p\n", (void *)fHeader);
|
||||
RBBIDebugPrintf(" Version = %d\n", fHeader->fVersion);
|
||||
RBBIDebugPrintf(" total length of data = %d\n", fHeader->fLength);
|
||||
RBBIDebugPrintf(" number of character categories = %d\n\n", fHeader->fCatCount);
|
||||
|
||||
RBBIDebugPrintf(" Forward State Transition Table\n");
|
||||
RBBIDebugPrintf(" %s\n", heading);
|
||||
RBBIDebugPrintf("State | Acc LA Tag");
|
||||
for (c=0; c<fHeader->fCatCount; c++) {RBBIDebugPrintf("%3d ", c);}
|
||||
RBBIDebugPrintf("\n------|---------------"); for (c=0;c<fHeader->fCatCount; c++) {RBBIDebugPrintf("----");}
|
||||
RBBIDebugPrintf("\n------|---------------"); for (c=0;c<fHeader->fCatCount; c++) {
|
||||
RBBIDebugPrintf("----");
|
||||
}
|
||||
RBBIDebugPrintf("\n");
|
||||
|
||||
for (s=0; s<fForwardTable->fNumStates; s++) {
|
||||
for (s=0; s<table->fNumStates; s++) {
|
||||
RBBIStateTableRow *row = (RBBIStateTableRow *)
|
||||
(fForwardTable->fTableData + (fForwardTable->fRowLen * s));
|
||||
(table->fTableData + (table->fRowLen * s));
|
||||
RBBIDebugPrintf("%4d | %3d %3d %3d ", s, row->fAccepting, row->fLookAhead, row->fTag);
|
||||
for (c=0; c<fHeader->fCatCount; c++) {
|
||||
RBBIDebugPrintf("%3d ", row->fNextState[c]);
|
||||
}
|
||||
RBBIDebugPrintf("\n");
|
||||
}
|
||||
RBBIDebugPrintf("\n");
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
void RBBIDataWrapper::printData() {
|
||||
#ifdef RBBI_DEBUG
|
||||
RBBIDebugPrintf("RBBI Data at %p\n", (void *)fHeader);
|
||||
RBBIDebugPrintf(" Version = %d\n", fHeader->fVersion);
|
||||
RBBIDebugPrintf(" total length of data = %d\n", fHeader->fLength);
|
||||
RBBIDebugPrintf(" number of character categories = %d\n\n", fHeader->fCatCount);
|
||||
|
||||
printTable("Forward State Transition Table", fForwardTable);
|
||||
printTable("Reverse State Transition Table", fReverseTable);
|
||||
printTable("Safe Forward State Transition Table", fSafeFwdTable);
|
||||
printTable("Safe Reverse State Transition Table", fSafeRevTable);
|
||||
|
||||
RBBIDebugPrintf("\nOrignal Rules source:\n");
|
||||
c = 0;
|
||||
for (;;) {
|
||||
if (fRuleSource[c] == 0)
|
||||
break;
|
||||
for (int32_t c=0; fRuleSource[c] != 0; c++) {
|
||||
RBBIDebugPrintf("%c", fRuleSource[c]);
|
||||
c++;
|
||||
}
|
||||
RBBIDebugPrintf("\n\n");
|
||||
#endif
|
||||
|
@ -318,6 +338,14 @@ ubrk_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outD
|
|||
ds->swapArray32(ds, inBytes+rbbiDH->fRTable, 8, outBytes, status);
|
||||
ds->swapArray16(ds, inBytes+rbbiDH->fRTable+8, rbbiDH->fRTableLen-8, outBytes, status);
|
||||
|
||||
// Safe Forward state table. Same layout as forward table, above.
|
||||
ds->swapArray32(ds, inBytes+rbbiDH->fSFTable, 8, outBytes, status);
|
||||
ds->swapArray16(ds, inBytes+rbbiDH->fSFTable+8, rbbiDH->fSFTableLen-8, outBytes, status);
|
||||
|
||||
// Safe Reverse state table. Same layout as forward table, above.
|
||||
ds->swapArray32(ds, inBytes+rbbiDH->fSRTable, 8, outBytes, status);
|
||||
ds->swapArray16(ds, inBytes+rbbiDH->fSRTable+8, rbbiDH->fSRTableLen-8, outBytes, status);
|
||||
|
||||
// Trie table for character categories
|
||||
utrie_swap(ds, inBytes+rbbiDH->fTrie, rbbiDH->fTrieLen, outBytes+rbbiDH->fTrie, status);
|
||||
|
||||
|
|
|
@ -54,6 +54,10 @@ struct RBBIDataHeader {
|
|||
uint32_t fFTableLen;
|
||||
uint32_t fRTable; // Offset to the reverse state transition table.
|
||||
uint32_t fRTableLen;
|
||||
uint32_t fSFTable; // safe point forward transition table
|
||||
uint32_t fSFTableLen;
|
||||
uint32_t fSRTable; // safe point reverse transition table
|
||||
uint32_t fSRTableLen;
|
||||
uint32_t fTrie; // Offset to Trie data for character categories
|
||||
uint32_t fTrieLen;
|
||||
uint32_t fRuleSource; // Offset to the source for the break
|
||||
|
@ -110,6 +114,7 @@ public:
|
|||
int32_t hashCode();
|
||||
const UnicodeString &getRuleSourceString();
|
||||
void printData();
|
||||
void printTable(const char *heading, const RBBIStateTable *table);
|
||||
|
||||
//
|
||||
// Pointers to items within the data
|
||||
|
@ -117,6 +122,8 @@ public:
|
|||
const RBBIDataHeader *fHeader;
|
||||
const RBBIStateTable *fForwardTable;
|
||||
const RBBIStateTable *fReverseTable;
|
||||
const RBBIStateTable *fSafeFwdTable;
|
||||
const RBBIStateTable *fSafeRevTable;
|
||||
const UChar *fRuleSource;
|
||||
|
||||
UTrie fTrie;
|
||||
|
|
|
@ -57,8 +57,13 @@ RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules,
|
|||
|
||||
fForwardTree = NULL;
|
||||
fReverseTree = NULL;
|
||||
fSafeFwdTree = NULL;
|
||||
fSafeRevTree = NULL;
|
||||
fDefaultTree = &fForwardTree;
|
||||
fForwardTables = NULL;
|
||||
fReverseTables = NULL;
|
||||
fSafeFwdTables = NULL;
|
||||
fSafeRevTables = NULL;
|
||||
fChainRules = FALSE;
|
||||
fLBCMNoChain = FALSE;
|
||||
|
||||
|
@ -100,8 +105,13 @@ RBBIRuleBuilder::~RBBIRuleBuilder() {
|
|||
delete fSetBuilder;
|
||||
delete fForwardTables;
|
||||
delete fReverseTables;
|
||||
delete fSafeFwdTables;
|
||||
delete fSafeRevTables;
|
||||
|
||||
delete fForwardTree;
|
||||
delete fReverseTree;
|
||||
delete fSafeFwdTree;
|
||||
delete fSafeRevTree;
|
||||
delete fScanner;
|
||||
}
|
||||
|
||||
|
@ -134,6 +144,8 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
|
|||
int32_t headerSize = align8(sizeof(RBBIDataHeader));
|
||||
int32_t forwardTableSize = align8(fForwardTables->getTableSize());
|
||||
int32_t reverseTableSize = align8(fReverseTables->getTableSize());
|
||||
int32_t safeFwdTableSize = align8(fSafeFwdTables->getTableSize());
|
||||
int32_t safeRevTableSize = align8(fSafeRevTables->getTableSize());
|
||||
int32_t trieSize = align8(fSetBuilder->getTrieSize());
|
||||
int32_t rulesSize = align8((strippedRules.length()+1) * sizeof(UChar));
|
||||
|
||||
|
@ -154,17 +166,24 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
|
|||
|
||||
data->fFTable = headerSize;
|
||||
data->fFTableLen = forwardTableSize;
|
||||
data->fRTable = data->fFTable + forwardTableSize;
|
||||
data->fRTable = data->fFTable + forwardTableSize;
|
||||
data->fRTableLen = reverseTableSize;
|
||||
data->fTrie = data->fRTable + reverseTableSize;
|
||||
data->fSFTable = data->fRTable + reverseTableSize;
|
||||
data->fSFTableLen = safeFwdTableSize;
|
||||
data->fSRTable = data->fSFTable + safeFwdTableSize;
|
||||
data->fSRTableLen = safeRevTableSize;
|
||||
|
||||
data->fTrie = data->fSRTable + safeRevTableSize;
|
||||
data->fTrieLen = fSetBuilder->getTrieSize();
|
||||
data->fRuleSource = data->fTrie + trieSize;
|
||||
data->fRuleSource = data->fTrie + trieSize;
|
||||
data->fRuleSourceLen = strippedRules.length() * sizeof(UChar);
|
||||
|
||||
uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
|
||||
|
||||
fForwardTables->exportTable((uint8_t *)data + data->fFTable);
|
||||
fReverseTables->exportTable((uint8_t *)data + data->fRTable);
|
||||
fSafeFwdTables->exportTable((uint8_t *)data + data->fSFTable);
|
||||
fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable);
|
||||
fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
|
||||
strippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus);
|
||||
|
||||
|
@ -213,8 +232,11 @@ RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules,
|
|||
//
|
||||
builder.fForwardTables = new RBBITableBuilder(&builder, &builder.fForwardTree);
|
||||
builder.fReverseTables = new RBBITableBuilder(&builder, &builder.fReverseTree);
|
||||
builder.fSafeFwdTables = new RBBITableBuilder(&builder, &builder.fSafeFwdTree);
|
||||
builder.fSafeRevTables = new RBBITableBuilder(&builder, &builder.fSafeRevTree);
|
||||
if (U_SUCCESS(status)
|
||||
&& (builder.fForwardTables == NULL || builder.fReverseTables == NULL))
|
||||
&& (builder.fForwardTables == NULL || builder.fReverseTables == NULL ||
|
||||
builder.fSafeFwdTables == NULL || builder.fSafeRevTables == NULL))
|
||||
{
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return NULL;
|
||||
|
@ -222,11 +244,12 @@ RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules,
|
|||
|
||||
builder.fForwardTables->build();
|
||||
builder.fReverseTables->build();
|
||||
builder.fSafeFwdTables->build();
|
||||
builder.fSafeRevTables->build();
|
||||
if (U_FAILURE(status)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// Package up the compiled data into a memory image
|
||||
// in the run-time format.
|
||||
|
|
|
@ -122,6 +122,11 @@ public:
|
|||
RBBIRuleScanner *fScanner; // The scanner.
|
||||
RBBINode *fForwardTree; // The parse trees, generated by the scanner,
|
||||
RBBINode *fReverseTree; // then manipulated by subsequent steps.
|
||||
RBBINode *fSafeFwdTree;
|
||||
RBBINode *fSafeRevTree;
|
||||
|
||||
RBBINode **fDefaultTree; // For rules not qualified with a !
|
||||
// the tree to which they belong.
|
||||
|
||||
UBool fChainRules; // True for chained Unicode TR style rules.
|
||||
// False for traditional regexp rules.
|
||||
|
@ -134,6 +139,8 @@ public:
|
|||
|
||||
RBBITableBuilder *fForwardTables; // State transition tables
|
||||
RBBITableBuilder *fReverseTables;
|
||||
RBBITableBuilder *fSafeFwdTables;
|
||||
RBBITableBuilder *fSafeRevTables;
|
||||
|
||||
RBBIDataHeader *flattenData(); // Create the flattened (runtime format)
|
||||
// data tables..
|
||||
|
|
|
@ -319,10 +319,12 @@ UBool RBBIRuleScanner::doParseActions(EParseAction action)
|
|||
// The ';' that terminates an expression really just functions as a '|' with
|
||||
// a low operator precedence.
|
||||
//
|
||||
// Forward and reverse rules are collected separately. Or this rule into
|
||||
// the appropriate group of them.
|
||||
// Each of the four sets of rules is collected separately.
|
||||
// (Forward, Reverse, ForwardSafe, ReverseSafe)
|
||||
// OR this rule into the appropriate group of them.
|
||||
//
|
||||
RBBINode **destRules = (fReverseRule? &fRB->fReverseTree : &fRB->fForwardTree);
|
||||
// RBBINode **destRules = (fReverseRule? &fRB->fReverseTree : &fRB->fForwardTree); TODO: delete
|
||||
RBBINode **destRules = (fReverseRule? &fRB->fReverseTree : fRB->fDefaultTree);
|
||||
|
||||
if (*destRules != NULL) {
|
||||
// This is not the first rule encountered.
|
||||
|
@ -471,6 +473,14 @@ UBool RBBIRuleScanner::doParseActions(EParseAction action)
|
|||
fRB->fChainRules = TRUE;
|
||||
} else if (opt == "LBCMNoChain") {
|
||||
fRB->fLBCMNoChain = TRUE;
|
||||
} else if (opt == "forward") {
|
||||
fRB->fDefaultTree = &fRB->fForwardTree;
|
||||
} else if (opt == "reverse") {
|
||||
fRB->fDefaultTree = &fRB->fReverseTree;
|
||||
} else if (opt == "safe_forward") {
|
||||
fRB->fDefaultTree = &fRB->fSafeFwdTree;
|
||||
} else if (opt == "safe_reverse") {
|
||||
fRB->fDefaultTree = &fRB->fSafeRevTree;
|
||||
} else {
|
||||
error(U_BRK_UNRECOGNIZED_OPTION);
|
||||
}
|
||||
|
@ -1025,6 +1035,10 @@ void RBBIRuleScanner::parse() {
|
|||
fRB->fForwardTree->printTree();
|
||||
RBBIDebugPrintf("\nCompleted Reverse Rules Parse Tree...\n");
|
||||
fRB->fReverseTree->printTree();
|
||||
RBBIDebugPrintf("\nCompleted Safe Point Forward Rules Parse Tree...\n");
|
||||
fRB->fSafeFwdTree->printTree();
|
||||
RBBIDebugPrintf("\nCompleted Safe Point Reverse Rules Parse Tree...\n");
|
||||
fRB->fSafeRevTree->printTree();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -216,6 +216,6 @@ $CM* $ALcm+; # The $CM* is from rule 7C, and unattached CM is treated as AL
|
|||
# Note that the initial .. is to back over both halves of a CR/LF sequence
|
||||
# at the current position.
|
||||
#
|
||||
|
||||
!!reverse;
|
||||
!. . [^$LF $CR $NL $BK]* [$BK $CR $LF $NL];
|
||||
#!.*;
|
||||
|
|
Loading…
Add table
Reference in a new issue