mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-10 07:39:16 +00:00
ICU-13194 RBBI auto reverse tables: size reduction, and remove hand written rules.
X-SVN-Rev: 41163
This commit is contained in:
parent
e5ab76b130
commit
aead9fb553
21 changed files with 103 additions and 659 deletions
|
@ -937,169 +937,6 @@ int32_t RuleBasedBreakIterator::handleNext() {
|
|||
}
|
||||
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------------
|
||||
//
|
||||
// handlePrevious()
|
||||
//
|
||||
// Iterate backwards using the safe reverse rules.
|
||||
// The logic of this function is very similar to handleNext(), above.
|
||||
//
|
||||
//-----------------------------------------------------------------------------------
|
||||
int32_t RuleBasedBreakIterator::handlePrevious(int32_t fromPosition) {
|
||||
int32_t state;
|
||||
uint16_t category = 0;
|
||||
RBBIRunMode mode;
|
||||
RBBIStateTableRow *row;
|
||||
UChar32 c;
|
||||
LookAheadResults lookAheadMatches;
|
||||
int32_t result = 0;
|
||||
int32_t initialPosition = 0;
|
||||
|
||||
const RBBIStateTable *stateTable = fData->fSafeRevTable;
|
||||
UTEXT_SETNATIVEINDEX(&fText, fromPosition);
|
||||
#ifdef RBBI_DEBUG
|
||||
if (gTrace) {
|
||||
RBBIDebugPuts("Handle Previous pos char state category");
|
||||
}
|
||||
#endif
|
||||
|
||||
// if we're already at the start of the text, return DONE.
|
||||
if (fData == NULL || UTEXT_GETNATIVEINDEX(&fText)==0) {
|
||||
return BreakIterator::DONE;
|
||||
}
|
||||
|
||||
// Set up the starting char.
|
||||
initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
|
||||
result = initialPosition;
|
||||
c = UTEXT_PREVIOUS32(&fText);
|
||||
|
||||
// Set the initial state for the state machine
|
||||
state = START_STATE;
|
||||
row = (RBBIStateTableRow *)
|
||||
(stateTable->fTableData + (stateTable->fRowLen * state));
|
||||
category = 3;
|
||||
mode = RBBI_RUN;
|
||||
if (stateTable->fFlags & RBBI_BOF_REQUIRED) {
|
||||
category = 2;
|
||||
mode = RBBI_START;
|
||||
}
|
||||
|
||||
|
||||
// loop until we reach the start of the text or transition to state 0
|
||||
//
|
||||
for (;;) {
|
||||
if (c == U_SENTINEL) {
|
||||
// Reached end of input string.
|
||||
if (mode == RBBI_END) {
|
||||
// We have already run the loop one last time with the
|
||||
// character set to the psueudo {eof} value. Now it is time
|
||||
// to unconditionally bail out.
|
||||
break;
|
||||
}
|
||||
// Run the loop one last time with the fake end-of-input character category.
|
||||
mode = RBBI_END;
|
||||
category = 1;
|
||||
}
|
||||
|
||||
//
|
||||
// Get the char category. An incoming category of 1 or 2 means that
|
||||
// we are preset for doing the beginning or end of input, and
|
||||
// that we shouldn't get a category from an actual text input character.
|
||||
//
|
||||
if (mode == RBBI_RUN) {
|
||||
// look up the current character's character category, which tells us
|
||||
// which column in the state table to look at.
|
||||
// Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
|
||||
// not the size of the character going in, which is a UChar32.
|
||||
//
|
||||
// And off the dictionary flag bit. For reverse iteration it is not used.
|
||||
category = UTRIE2_GET16(fData->fTrie, c);
|
||||
category &= ~0x4000;
|
||||
}
|
||||
|
||||
#ifdef RBBI_DEBUG
|
||||
if (gTrace) {
|
||||
RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(&fText));
|
||||
if (0x20<=c && c<0x7f) {
|
||||
RBBIDebugPrintf("\"%c\" ", c);
|
||||
} else {
|
||||
RBBIDebugPrintf("%5x ", c);
|
||||
}
|
||||
RBBIDebugPrintf("%3d %3d\n", state, category);
|
||||
}
|
||||
#endif
|
||||
|
||||
// State Transition - move machine to its next state
|
||||
//
|
||||
|
||||
// fNextState is a variable-length array.
|
||||
U_ASSERT(category<fData->fHeader->fCatCount);
|
||||
state = row->fNextState[category]; /*Not accessing beyond memory*/
|
||||
row = (RBBIStateTableRow *)
|
||||
(stateTable->fTableData + (stateTable->fRowLen * state));
|
||||
|
||||
if (row->fAccepting == -1) {
|
||||
// Match found, common case.
|
||||
result = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
|
||||
}
|
||||
|
||||
int16_t completedRule = row->fAccepting;
|
||||
if (completedRule > 0) {
|
||||
// Lookahead match is completed.
|
||||
int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule);
|
||||
if (lookaheadResult >= 0) {
|
||||
UTEXT_SETNATIVEINDEX(&fText, lookaheadResult);
|
||||
return lookaheadResult;
|
||||
}
|
||||
}
|
||||
int16_t rule = row->fLookAhead;
|
||||
if (rule != 0) {
|
||||
// At the position of a '/' in a look-ahead match. Record it.
|
||||
int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
|
||||
lookAheadMatches.setPosition(rule, pos);
|
||||
}
|
||||
|
||||
if (state == STOP_STATE) {
|
||||
// This is the normal exit from the lookup state machine.
|
||||
// We have advanced through the string until it is certain that no
|
||||
// longer match is possible, no matter what characters follow.
|
||||
break;
|
||||
}
|
||||
|
||||
// Move (backwards) to the next character to process.
|
||||
// If this is a beginning-of-input loop iteration, don't advance
|
||||
// the input position. The next iteration will be processing the
|
||||
// first real input character.
|
||||
if (mode == RBBI_RUN) {
|
||||
c = UTEXT_PREVIOUS32(&fText);
|
||||
} else {
|
||||
if (mode == RBBI_START) {
|
||||
mode = RBBI_RUN;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// The state machine is done. Check whether it found a match...
|
||||
|
||||
// If the iterator failed to advance in the match engine, force it ahead by one.
|
||||
// (This really indicates a defect in the break rules. They should always match
|
||||
// at least one character.)
|
||||
if (result == initialPosition) {
|
||||
UTEXT_SETNATIVEINDEX(&fText, initialPosition);
|
||||
UTEXT_PREVIOUS32(&fText);
|
||||
result = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
|
||||
}
|
||||
|
||||
#ifdef RBBI_DEBUG
|
||||
if (gTrace) {
|
||||
RBBIDebugPrintf("result = %d\n\n", result);
|
||||
}
|
||||
#endif
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------------
|
||||
//
|
||||
// handleSafePrevious()
|
||||
|
|
|
@ -80,7 +80,7 @@ UBool RBBIDataWrapper::isDataVersionAcceptable(const UVersionInfo version) {
|
|||
void RBBIDataWrapper::init0() {
|
||||
fHeader = NULL;
|
||||
fForwardTable = NULL;
|
||||
fSafeRevTable = NULL;
|
||||
fReverseTable = NULL;
|
||||
fRuleSource = NULL;
|
||||
fRuleStatusTable = NULL;
|
||||
fTrie = NULL;
|
||||
|
@ -109,9 +109,6 @@ void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
|
|||
if (data->fRTableLen != 0) {
|
||||
fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable);
|
||||
}
|
||||
if (data->fSRTableLen != 0) { // TODO: obsolete. Remove.
|
||||
fSafeRevTable = (RBBIStateTable *)((char *)data + fHeader->fSRTable);
|
||||
}
|
||||
|
||||
fTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
|
||||
(uint8_t *)data + fHeader->fTrie,
|
||||
|
@ -262,7 +259,7 @@ void RBBIDataWrapper::printData() {
|
|||
RBBIDebugPrintf(" number of character categories = %d\n\n", fHeader->fCatCount);
|
||||
|
||||
printTable("Forward State Transition Table", fForwardTable);
|
||||
printTable("Safe Reverse State Transition Table", fSafeRevTable);
|
||||
printTable("Reverse State Transition Table", fReverseTable);
|
||||
|
||||
RBBIDebugPrintf("\nOrignal Rules source:\n");
|
||||
for (int32_t c=0; fRuleSource[c] != 0; c++) {
|
||||
|
@ -402,28 +399,6 @@ ubrk_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outD
|
|||
outBytes+tableStartOffset+topSize, status);
|
||||
}
|
||||
|
||||
// Safe Forward state table. Same layout as forward table, above.
|
||||
tableStartOffset = ds->readUInt32(rbbiDH->fSFTable);
|
||||
tableLength = ds->readUInt32(rbbiDH->fSFTableLen);
|
||||
|
||||
if (tableLength > 0) {
|
||||
ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
|
||||
outBytes+tableStartOffset, status);
|
||||
ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
|
||||
outBytes+tableStartOffset+topSize, status);
|
||||
}
|
||||
|
||||
// Safe Reverse state table. Same layout as forward table, above.
|
||||
tableStartOffset = ds->readUInt32(rbbiDH->fSRTable);
|
||||
tableLength = ds->readUInt32(rbbiDH->fSRTableLen);
|
||||
|
||||
if (tableLength > 0) {
|
||||
ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
|
||||
outBytes+tableStartOffset, status);
|
||||
ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
|
||||
outBytes+tableStartOffset+topSize, status);
|
||||
}
|
||||
|
||||
// Trie table for character categories
|
||||
utrie2_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen),
|
||||
outBytes+ds->readUInt32(rbbiDH->fTrie), status);
|
||||
|
|
|
@ -81,10 +81,6 @@ struct RBBIDataHeader {
|
|||
uint32_t fFTableLen;
|
||||
uint32_t fRTable; /* Offset to the reverse state transition table. */
|
||||
uint32_t fRTableLen;
|
||||
uint32_t fSFTable; /* safe point forward transition table */
|
||||
uint32_t fSFTableLen;
|
||||
uint32_t fSRTable; /* safe point reverse transition table */
|
||||
uint32_t fSRTableLen;
|
||||
uint32_t fTrie; /* Offset to Trie data for character categories */
|
||||
uint32_t fTrieLen;
|
||||
uint32_t fRuleSource; /* Offset to the source for the break */
|
||||
|
@ -173,8 +169,7 @@ public:
|
|||
/* */
|
||||
const RBBIDataHeader *fHeader;
|
||||
const RBBIStateTable *fForwardTable;
|
||||
const RBBIStateTable *fReverseTable; // auto-generated safe reverse.
|
||||
const RBBIStateTable *fSafeRevTable; // hand-written safe reverse. TODO: delete this.
|
||||
const RBBIStateTable *fReverseTable;
|
||||
const UChar *fRuleSource;
|
||||
const int32_t *fRuleStatusTable;
|
||||
|
||||
|
|
|
@ -62,8 +62,7 @@ RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules,
|
|||
fSafeFwdTree = NULL;
|
||||
fSafeRevTree = NULL;
|
||||
fDefaultTree = &fForwardTree;
|
||||
fForwardTables = NULL;
|
||||
fSafeRevTables = NULL;
|
||||
fForwardTable = NULL;
|
||||
fRuleStatusVals = NULL;
|
||||
fChainRules = FALSE;
|
||||
fLBCMNoChain = FALSE;
|
||||
|
@ -112,9 +111,7 @@ RBBIRuleBuilder::~RBBIRuleBuilder() {
|
|||
|
||||
delete fUSetNodes;
|
||||
delete fSetBuilder;
|
||||
delete fForwardTables;
|
||||
delete fSafeRevTables;
|
||||
|
||||
delete fForwardTable;
|
||||
delete fForwardTree;
|
||||
delete fReverseTree;
|
||||
delete fSafeFwdTree;
|
||||
|
@ -153,9 +150,8 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
|
|||
// without the padding.
|
||||
//
|
||||
int32_t headerSize = align8(sizeof(RBBIDataHeader));
|
||||
int32_t forwardTableSize = align8(fForwardTables->getTableSize());
|
||||
int32_t reverseTableSize = align8(fForwardTables->getSafeTableSize());
|
||||
int32_t safeRevTableSize = align8(fSafeRevTables->getTableSize()); // TODO: remove hand-written rules.
|
||||
int32_t forwardTableSize = align8(fForwardTable->getTableSize());
|
||||
int32_t reverseTableSize = align8(fForwardTable->getSafeTableSize());
|
||||
int32_t trieSize = align8(fSetBuilder->getTrieSize());
|
||||
int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t));
|
||||
int32_t rulesSize = align8((fStrippedRules.length()+1) * sizeof(UChar));
|
||||
|
@ -163,7 +159,6 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
|
|||
int32_t totalSize = headerSize
|
||||
+ forwardTableSize
|
||||
+ reverseTableSize
|
||||
+ safeRevTableSize
|
||||
+ statusTableSize + trieSize + rulesSize;
|
||||
|
||||
RBBIDataHeader *data = (RBBIDataHeader *)uprv_malloc(totalSize);
|
||||
|
@ -188,16 +183,7 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
|
|||
data->fRTable = data->fFTable + data->fFTableLen;
|
||||
data->fRTableLen = reverseTableSize;
|
||||
|
||||
// Do not save the Safe Forward table.
|
||||
data->fSFTable = data->fRTable + data->fRTableLen;
|
||||
data->fSFTableLen = 0;
|
||||
|
||||
// Hand written reverse rules. TODO: remove, once synthesized ones are working.
|
||||
data->fSRTable = data->fSFTable + data->fSFTableLen;
|
||||
data->fSRTableLen = safeRevTableSize;
|
||||
U_ASSERT(safeRevTableSize > 0);
|
||||
|
||||
data->fTrie = data->fSRTable + data->fSRTableLen;
|
||||
data->fTrie = data->fRTable + data->fRTableLen;
|
||||
data->fTrieLen = fSetBuilder->getTrieSize();
|
||||
data->fStatusTable = data->fTrie + trieSize;
|
||||
data->fStatusTableLen= statusTableSize;
|
||||
|
@ -206,10 +192,8 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
|
|||
|
||||
uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
|
||||
|
||||
fForwardTables->exportTable((uint8_t *)data + data->fFTable);
|
||||
fForwardTables->exportSafeTable((uint8_t *)data + data->fRTable);
|
||||
fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable);
|
||||
|
||||
fForwardTable->exportTable((uint8_t *)data + data->fFTable);
|
||||
fForwardTable->exportSafeTable((uint8_t *)data + data->fRTable);
|
||||
fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
|
||||
|
||||
int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable);
|
||||
|
@ -286,25 +270,22 @@ RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
|
|||
//
|
||||
// Generate the DFA state transition table.
|
||||
//
|
||||
fForwardTables = new RBBITableBuilder(this, &fForwardTree, status);
|
||||
fSafeRevTables = new RBBITableBuilder(this, &fSafeRevTree, status);
|
||||
if (fForwardTables == nullptr || fSafeRevTables == nullptr)
|
||||
{
|
||||
fForwardTable = new RBBITableBuilder(this, &fForwardTree, status);
|
||||
if (fForwardTable == nullptr) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
fForwardTables->build();
|
||||
fSafeRevTables->build();
|
||||
fForwardTable->buildForwardTable();
|
||||
optimizeTables();
|
||||
fForwardTables->buildSafe(status);
|
||||
fForwardTable->buildSafeReverseTable(status);
|
||||
|
||||
|
||||
#ifdef RBBI_DEBUG
|
||||
if (fDebugEnv && uprv_strstr(fDebugEnv, "states")) {
|
||||
fForwardTables->printStates();
|
||||
fForwardTables->printRuleStatusTable();
|
||||
fForwardTables->printSafeTable();
|
||||
fForwardTable->printStates();
|
||||
fForwardTable->printRuleStatusTable();
|
||||
fForwardTable->printReverseTable();
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -327,14 +308,11 @@ void RBBIRuleBuilder::optimizeTables() {
|
|||
|
||||
leftClass = 3;
|
||||
rightClass = 0;
|
||||
while (fForwardTables->findDuplCharClassFrom(leftClass, rightClass)) {
|
||||
while (fForwardTable->findDuplCharClassFrom(leftClass, rightClass)) {
|
||||
fSetBuilder->mergeCategories(leftClass, rightClass);
|
||||
fForwardTables->removeColumn(rightClass);
|
||||
fSafeRevTables->removeColumn(rightClass);
|
||||
fForwardTable->removeColumn(rightClass);
|
||||
}
|
||||
|
||||
fForwardTables->removeDuplicateStates();
|
||||
fSafeRevTables->removeDuplicateStates();
|
||||
fForwardTable->removeDuplicateStates();
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
|
|
@ -168,8 +168,7 @@ public:
|
|||
RBBISetBuilder *fSetBuilder; // Set and Character Category builder.
|
||||
UVector *fUSetNodes; // Vector of all uset nodes.
|
||||
|
||||
RBBITableBuilder *fForwardTables; // State transition tables
|
||||
RBBITableBuilder *fSafeRevTables;
|
||||
RBBITableBuilder *fForwardTable; // State transition table, build time form.
|
||||
|
||||
UVector *fRuleStatusVals; // The values that can be returned
|
||||
// from getRuleStatus().
|
||||
|
|
|
@ -61,7 +61,7 @@ RBBITableBuilder::~RBBITableBuilder() {
|
|||
// table from the RBBI rules parse tree.
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
void RBBITableBuilder::build() {
|
||||
void RBBITableBuilder::buildForwardTable() {
|
||||
|
||||
if (U_FAILURE(*fStatus)) {
|
||||
return;
|
||||
|
@ -1150,6 +1150,35 @@ bool RBBITableBuilder::findDuplicateState(int32_t &firstState, int32_t &duplStat
|
|||
return false;
|
||||
}
|
||||
|
||||
|
||||
// Find the next pair of equivalent rows in the safe table (fSafeTable).
//
// The search resumes at *firstState. Two rows are considered equivalent when,
// for every column of the first row, the two entries are either equal or both
// refer to one of the two states being compared.
//
// On a true return, *firstState and *duplState hold the matching pair.
// Both values are updated in place while the search runs.
bool RBBITableBuilder::findDuplicateSafeState(int32_t *firstState, int32_t *duplState) {
    const int32_t stateCount = fSafeTable->size();
    int32_t &keep = *firstState;
    int32_t &dup  = *duplState;

    while (keep < stateCount - 1) {
        const UnicodeString *keepRow = static_cast<UnicodeString *>(fSafeTable->elementAt(keep));
        const int32_t columnCount = keepRow->length();

        dup = keep + 1;
        while (dup < stateCount) {
            const UnicodeString *dupRow = static_cast<UnicodeString *>(fSafeTable->elementAt(dup));

            // Walk the columns until one is found that rules out a match.
            int32_t col = 0;
            for (; col < columnCount; ++col) {
                const int32_t keepVal = keepRow->charAt(col);
                const int32_t dupVal  = dupRow->charAt(col);
                if (keepVal == dupVal) {
                    continue;
                }
                // Differing entries are still acceptable when each of them
                // is one of the two states under comparison.
                const bool keepValInPair = (keepVal == keep || keepVal == dup);
                const bool dupValInPair  = (dupVal == keep || dupVal == dup);
                if (!(keepValInPair && dupValInPair)) {
                    break;
                }
            }
            if (col == columnCount) {
                // Every column passed: the rows are duplicates.
                return true;
            }
            ++dup;
        }
        ++keep;
    }
    return false;
}
|
||||
|
||||
|
||||
void RBBITableBuilder::removeState(int32_t keepState, int32_t duplState) {
|
||||
U_ASSERT(keepState < duplState);
|
||||
U_ASSERT(duplState < fDStates->size());
|
||||
|
@ -1185,6 +1214,29 @@ void RBBITableBuilder::removeState(int32_t keepState, int32_t duplState) {
|
|||
}
|
||||
}
|
||||
|
||||
// Remove row duplState from the safe table and renumber the remaining
// transitions: entries equal to duplState become keepState, and entries
// greater than duplState are decremented by one.
// Requires keepState < duplState < fSafeTable->size() (asserted).
void RBBITableBuilder::removeSafeState(int32_t keepState, int32_t duplState) {
    U_ASSERT(keepState < duplState);
    U_ASSERT(duplState < fSafeTable->size());

    // Per the original note here, fSafeTable has a deleter function and
    // will auto-delete the removed element.
    fSafeTable->removeElementAt(duplState);

    const int32_t rowCount    = fSafeTable->size();
    const int32_t columnCount = fRB->fSetBuilder->getNumCharCategories();

    for (int32_t rowIdx = 0; rowIdx < rowCount; ++rowIdx) {
        UnicodeString *tableRow = static_cast<UnicodeString *>(fSafeTable->elementAt(rowIdx));
        for (int32_t col = 0; col < columnCount; ++col) {
            const int32_t target = tableRow->charAt(col);
            const int32_t remapped = (target == duplState) ? keepState
                                   : (target > duplState)  ? target - 1
                                   :                         target;
            tableRow->setCharAt(col, remapped);
        }
    }
}
|
||||
|
||||
|
||||
/*
|
||||
* RemoveDuplicateStates
|
||||
|
@ -1198,6 +1250,7 @@ void RBBITableBuilder::removeDuplicateStates() {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// getTableSize() Calculate the size of the runtime form of this
|
||||
|
@ -1277,7 +1330,7 @@ void RBBITableBuilder::exportTable(void *where) {
|
|||
/**
|
||||
* Synthesize a safe state table from the main state table.
|
||||
*/
|
||||
void RBBITableBuilder::buildSafe(UErrorCode &status) {
|
||||
void RBBITableBuilder::buildSafeReverseTable(UErrorCode &status) {
|
||||
// Find safe char class pairs.
|
||||
|
||||
// make a state table row for each trailing class, and map from class to row.
|
||||
|
@ -1358,8 +1411,13 @@ void RBBITableBuilder::buildSafe(UErrorCode &status) {
|
|||
rowState.setCharAt(c1, 0);
|
||||
}
|
||||
|
||||
// TODO: Merge similar states.
|
||||
|
||||
// Remove duplicate or redundant rows from the table.
|
||||
int32_t firstState = 1;
|
||||
int32_t duplicateState = 0; // initial value is not used; set by findDuplicateSafeState().
|
||||
while (findDuplicateSafeState(&firstState, &duplicateState)) {
|
||||
// printf("Removing duplicate safe states (%d, %d)\n", firstState, duplicateState);
|
||||
removeSafeState(firstState, duplicateState);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
@ -1493,7 +1551,7 @@ void RBBITableBuilder::printStates() {
|
|||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
#ifdef RBBI_DEBUG
|
||||
void RBBITableBuilder::printSafeTable() {
|
||||
void RBBITableBuilder::printReverseTable() {
|
||||
int c; // input "character"
|
||||
int n; // state number
|
||||
|
||||
|
|
|
@ -40,7 +40,7 @@ public:
|
|||
RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status);
|
||||
~RBBITableBuilder();
|
||||
|
||||
void build();
|
||||
void buildForwardTable();
|
||||
|
||||
/** Return the runtime size in bytes of the built state table. */
|
||||
int32_t getTableSize() const;
|
||||
|
@ -63,7 +63,8 @@ public:
|
|||
/** Check for, and remove duplicate states (table rows). */
|
||||
void removeDuplicateStates();
|
||||
|
||||
void buildSafe(UErrorCode &status);
|
||||
/** Build the safe reverse table from the already-constructed forward table. */
|
||||
void buildSafeReverseTable(UErrorCode &status);
|
||||
|
||||
/** Return the runtime size in bytes of the built safe reverse state table. */
|
||||
int32_t getSafeTableSize() const;
|
||||
|
@ -109,6 +110,21 @@ private:
|
|||
*/
|
||||
void removeState(int32_t keepState, int32_t duplState);
|
||||
|
||||
/** Find the next duplicate state in the safe reverse table. An iterator function.
|
||||
* @param firstState ptr to state variable. Begin looking at this state, set to the first of the
|
||||
* pair of duplicates on return.
|
||||
* @param duplicateState ptr to where to return the duplicate state of firstState. Output only.
|
||||
* @return true if a duplicate pair of states was found.
|
||||
*/
|
||||
bool findDuplicateSafeState(int32_t *firstState, int32_t *duplicateState);
|
||||
|
||||
/** Remove a duplicate state from the safe table.
|
||||
* @param keepState First of the duplicate pair. Keep it.
|
||||
* @param duplState Duplicate state. Remove it. Redirect all table references to the duplicate state
|
||||
* to refer to keepState instead.
|
||||
*/
|
||||
void removeSafeState(int32_t keepState, int32_t duplState);
|
||||
|
||||
// Set functions for UVector.
|
||||
// TODO: make a USet subclass of UVector
|
||||
|
||||
|
@ -123,7 +139,7 @@ public:
|
|||
void printPosSets(RBBINode *n /* = NULL*/);
|
||||
void printStates();
|
||||
void printRuleStatusTable();
|
||||
void printSafeTable();
|
||||
void printReverseTable();
|
||||
#else
|
||||
#define printSet(s)
|
||||
#define printPosSets(n)
|
||||
|
|
|
@ -49,7 +49,6 @@ $EmojiNRK = [[\p{Emoji}] - [\p{Grapheme_Cluster_Break = Regional_Indicator}*\
|
|||
## -------------------------------------------------
|
||||
!!chain;
|
||||
!!lookAheadHardBreak;
|
||||
!!forward;
|
||||
|
||||
$CR $LF;
|
||||
|
||||
|
@ -80,9 +79,3 @@ $Prepend [^$Control $CR $LF];
|
|||
|
||||
# GB 999 Match a single code point if no other rule applies.
|
||||
.;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_reverse;
|
||||
$Regional_Indicator $Regional_Indicator;
|
||||
($Extend | $ZWJ | $EmojiNRK | $Extended_Pict)+ .;
|
||||
|
|
|
@ -99,8 +99,6 @@ $ALPlus = [$AL $AI $SG $XX [$SA-[[:Mn:][:Mc:]]]];
|
|||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!forward;
|
||||
|
||||
#
|
||||
# CAN_CM is the set of characters that may combine with CM combining chars.
|
||||
# Note that Linebreak UAX 14's concept of a combining char and the rules
|
||||
|
@ -338,36 +336,3 @@ $EB $CM* $EM;
|
|||
# LB 31 Break everywhere else.
|
||||
# Match a single code point if no other rule applies.
|
||||
.;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_reverse;
|
||||
|
||||
# LB 9
|
||||
^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
|
||||
|
||||
# LB 14
|
||||
$SP+ $CM* $OP;
|
||||
|
||||
# LB 15
|
||||
$SP+ $CM* $QU;
|
||||
|
||||
# LB 16
|
||||
$SP+ $CM* ($CL | $CP);
|
||||
|
||||
# LB 17
|
||||
$SP+ $CM* $B2;
|
||||
|
||||
# LB 21
|
||||
$CM* ($HY | $BA) $CM* $HL;
|
||||
|
||||
# LB 25
|
||||
($CM* ($IS | $SY))+ $CM* $NU;
|
||||
($CL | $CP) $CM* ($NU | $IS | $SY);
|
||||
|
||||
# LB 30
|
||||
($CM* $RI)+;
|
||||
|
||||
# For dictionary-based break
|
||||
$dictionary $dictionary;
|
||||
|
||||
|
|
|
@ -105,8 +105,6 @@ $ALPlus = [$AL $AI $SG $XX [$SA-[[:Mn:][:Mc:]]]];
|
|||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!forward;
|
||||
|
||||
#
|
||||
# CAN_CM is the set of characters that may combine with CM combining chars.
|
||||
# Note that Linebreak UAX 14's concept of a combining char and the rules
|
||||
|
@ -347,35 +345,3 @@ $EB $CM* $EM;
|
|||
# LB 31 Break everywhere else.
|
||||
# Match a single code point if no other rule applies.
|
||||
.;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_reverse;
|
||||
|
||||
# LB 9
|
||||
^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
|
||||
|
||||
# LB 14
|
||||
$SP+ $CM* $OP;
|
||||
|
||||
# LB 15
|
||||
$SP+ $CM* $QU;
|
||||
|
||||
# LB 16
|
||||
$SP+ $CM* ($CL | $CP);
|
||||
|
||||
# LB 17
|
||||
$SP+ $CM* $B2;
|
||||
|
||||
# LB 21
|
||||
$CM* ($HY | $BA | $HH) $CM* $HL;
|
||||
|
||||
# LB 25
|
||||
($CM* ($IS | $SY))+ $CM* $NU;
|
||||
($CL | $CP) $CM* ($NU | $IS | $SY);
|
||||
|
||||
# LB 30
|
||||
($CM* $RI)+;
|
||||
|
||||
# For dictionary-based break
|
||||
$dictionary $dictionary;
|
||||
|
|
|
@ -108,8 +108,6 @@ $ALPlus = [$AL $AI $SG $XX [$SA-[[:Mn:][:Mc:]]]];
|
|||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!forward;
|
||||
|
||||
#
|
||||
# CAN_CM is the set of characters that may combine with CM combining chars.
|
||||
# Note that Linebreak UAX 14's concept of a combining char and the rules
|
||||
|
@ -350,35 +348,3 @@ $EB $CM* $EM;
|
|||
# LB 31 Break everywhere else.
|
||||
# Match a single code point if no other rule applies.
|
||||
.;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_reverse;
|
||||
|
||||
# LB 9
|
||||
^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
|
||||
|
||||
# LB 14
|
||||
$SP+ $CM* $OP;
|
||||
|
||||
# LB 15
|
||||
$SP+ $CM* $QU;
|
||||
|
||||
# LB 16
|
||||
$SP+ $CM* ($CL | $CP);
|
||||
|
||||
# LB 17
|
||||
$SP+ $CM* $B2;
|
||||
|
||||
# LB 21
|
||||
$CM* ($HY | $BA) $CM* $HL;
|
||||
|
||||
# LB 25
|
||||
($CM* ($IS | $SY))+ $CM* $NU;
|
||||
($CL | $CP) $CM* ($NU | $IS | $SY);
|
||||
|
||||
# LB 30
|
||||
($CM* $RI)+;
|
||||
|
||||
# For dictionary-based break
|
||||
$dictionary $dictionary;
|
||||
|
|
|
@ -118,8 +118,6 @@ $ALPlus = [$AL $AI $SG $XX [$SA-[[:Mn:][:Mc:]]]];
|
|||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!forward;
|
||||
|
||||
#
|
||||
# CAN_CM is the set of characters that may combine with CM combining chars.
|
||||
# Note that Linebreak UAX 14's concept of a combining char and the rules
|
||||
|
@ -364,35 +362,3 @@ $EB $CM* $EM;
|
|||
# LB 31 Break everywhere else.
|
||||
# Match a single code point if no other rule applies.
|
||||
.;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_reverse;
|
||||
|
||||
# LB 9
|
||||
^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
|
||||
|
||||
# LB 14
|
||||
$SP+ $CM* $OP;
|
||||
|
||||
# LB 15
|
||||
$SP+ $CM* $QU;
|
||||
|
||||
# LB 16
|
||||
$SP+ $CM* ($CL | $CP);
|
||||
|
||||
# LB 17
|
||||
$SP+ $CM* $B2;
|
||||
|
||||
# LB 21
|
||||
$CM* ($HY | $BA | $BAX) $CM* $HL;
|
||||
|
||||
# LB 25
|
||||
($CM* ($IS | $SY))+ $CM* $NU;
|
||||
($CL | $CP) $CM* ($NU | $IS | $SY);
|
||||
|
||||
# LB 30
|
||||
($CM* $RI)+;
|
||||
|
||||
# For dictionary-based break
|
||||
$dictionary $dictionary;
|
||||
|
|
|
@ -104,8 +104,6 @@ $ALPlus = [$AL $AI $SG $XX [$SA-[[:Mn:][:Mc:]]]];
|
|||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!forward;
|
||||
|
||||
#
|
||||
# CAN_CM is the set of characters that may combine with CM combining chars.
|
||||
# Note that Linebreak UAX 14's concept of a combining char and the rules
|
||||
|
@ -349,35 +347,3 @@ $EB $CM* $EM;
|
|||
# LB 31 Break everywhere else.
|
||||
# Match a single code point if no other rule applies.
|
||||
.;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_reverse;
|
||||
|
||||
# LB 9
|
||||
^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
|
||||
|
||||
# LB 14
|
||||
$SP+ $CM* $OP;
|
||||
|
||||
# LB 15
|
||||
$SP+ $CM* $QU;
|
||||
|
||||
# LB 16
|
||||
$SP+ $CM* ($CL | $CP);
|
||||
|
||||
# LB 17
|
||||
$SP+ $CM* $B2;
|
||||
|
||||
# LB 21
|
||||
$CM* ($HY | $BA | $HH) $CM* $HL;
|
||||
|
||||
# LB 25
|
||||
($CM* ($IS | $SY))+ $CM* $NU;
|
||||
($CL | $CP) $CM* ($NU | $IS | $SY);
|
||||
|
||||
# LB 30
|
||||
($CM* $RI)+;
|
||||
|
||||
# For dictionary-based break
|
||||
$dictionary $dictionary;
|
||||
|
|
|
@ -103,8 +103,6 @@ $ALPlus = [$AL $AI $SG $XX [$SA-[[:Mn:][:Mc:]]]];
|
|||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!forward;
|
||||
|
||||
#
|
||||
# CAN_CM is the set of characters that may combine with CM combining chars.
|
||||
# Note that Linebreak UAX 14's concept of a combining char and the rules
|
||||
|
@ -342,35 +340,3 @@ $EB $CM* $EM;
|
|||
# LB 31 Break everywhere else.
|
||||
# Match a single code point if no other rule applies.
|
||||
.;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_reverse;
|
||||
|
||||
# LB 9
|
||||
^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
|
||||
|
||||
# LB 14
|
||||
$SP+ $CM* $OP;
|
||||
|
||||
# LB 15
|
||||
$SP+ $CM* $QU;
|
||||
|
||||
# LB 16
|
||||
$SP+ $CM* ($CL | $CP);
|
||||
|
||||
# LB 17
|
||||
$SP+ $CM* $B2;
|
||||
|
||||
# LB 21
|
||||
$CM* ($HY | $BA) $CM* $HL;
|
||||
|
||||
# LB 25
|
||||
($CM* ($IS | $SY))+ $CM* $NU;
|
||||
($CL | $CP) $CM* ($NU | $IS | $SY);
|
||||
|
||||
# LB 30
|
||||
($CM* $RI)+;
|
||||
|
||||
# For dictionary-based break
|
||||
$dictionary $dictionary;
|
||||
|
|
|
@ -106,8 +106,6 @@ $ALPlus = [$AL $AI $SG $XX [$SA-[[:Mn:][:Mc:]]]];
|
|||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!forward;
|
||||
|
||||
#
|
||||
# CAN_CM is the set of characters that may combine with CM combining chars.
|
||||
# Note that Linebreak UAX 14's concept of a combining char and the rules
|
||||
|
@ -348,35 +346,3 @@ $EB $CM* $EM;
|
|||
# LB 31 Break everywhere else.
|
||||
# Match a single code point if no other rule applies.
|
||||
.;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_reverse;
|
||||
|
||||
# LB 9
|
||||
^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
|
||||
|
||||
# LB 14
|
||||
$SP+ $CM* $OP;
|
||||
|
||||
# LB 15
|
||||
$SP+ $CM* $QU;
|
||||
|
||||
# LB 16
|
||||
$SP+ $CM* ($CL | $CP);
|
||||
|
||||
# LB 17
|
||||
$SP+ $CM* $B2;
|
||||
|
||||
# LB 21
|
||||
$CM* ($HY | $BA | $BAX) $CM* $HL;
|
||||
|
||||
# LB 25
|
||||
($CM* ($IS | $SY))+ $CM* $NU;
|
||||
($CL | $CP) $CM* ($NU | $IS | $SY);
|
||||
|
||||
# LB 30
|
||||
($CM* $RI)+;
|
||||
|
||||
# For dictionary-based break
|
||||
$dictionary $dictionary;
|
||||
|
|
|
@ -103,8 +103,6 @@ $ALPlus = [$AL $AI $SG $XX [$SA-[[:Mn:][:Mc:]]]];
|
|||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!forward;
|
||||
|
||||
#
|
||||
# CAN_CM is the set of characters that may combine with CM combining chars.
|
||||
# Note that Linebreak UAX 14's concept of a combining char and the rules
|
||||
|
@ -345,36 +343,3 @@ $EB $CM* $EM;
|
|||
# LB 31 Break everywhere else.
|
||||
# Match a single code point if no other rule applies.
|
||||
.;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_reverse;
|
||||
|
||||
# LB 9
|
||||
^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
|
||||
^$CM+ $SP / .;
|
||||
|
||||
# LB 14
|
||||
$SP+ $CM* $OP;
|
||||
|
||||
# LB 15
|
||||
$SP+ $CM* $QU;
|
||||
|
||||
# LB 16
|
||||
$SP+ $CM* ($CL | $CP);
|
||||
|
||||
# LB 17
|
||||
$SP+ $CM* $B2;
|
||||
|
||||
# LB 21
|
||||
$CM* ($HY | $BA | $HH) $CM* $HL;
|
||||
|
||||
# LB 25
|
||||
($CM* ($IS | $SY))+ $CM* $NU;
|
||||
($CL | $CP) $CM* ($NU | $IS | $SY);
|
||||
|
||||
# LB 30
|
||||
($CM* $RI)+;
|
||||
|
||||
# For dictionary-based break
|
||||
$dictionary $dictionary;
|
||||
|
|
|
@ -50,7 +50,6 @@ $CloseEx = $Close ($Extend | $Format)*;
|
|||
## -------------------------------------------------
|
||||
|
||||
!!chain;
|
||||
!!forward;
|
||||
|
||||
# Rule 3 - break after separators. Keep CR/LF together.
|
||||
#
|
||||
|
@ -82,32 +81,3 @@ $ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower;
|
|||
#Rule 12
|
||||
[[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* .;
|
||||
[[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep $LF $CR {eof}] | $CR $LF){100};
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_reverse;
|
||||
|
||||
$SpEx_R = ($Extend | $Format)* $Sp;
|
||||
$ATermEx_R = ($Extend | $Format)* $ATerm;
|
||||
$STermEx_R = ($Extend | $Format)* $STerm;
|
||||
$CloseEx_R = ($Extend | $Format)* $Close;
|
||||
|
||||
[{bof}] (.? | $LF $CR) [^$Sep $CR $LF]* [$Sep $CR $LF {eof}] ($SpEx_R* $CloseEx_R* ($STermEx_R | $ATermEx_R))*;
|
||||
#.*;
|
||||
|
||||
# Explanation for this rule:
|
||||
#
|
||||
# It needs to back over
|
||||
# The $Sep at which we probably begin
|
||||
# All of the non $Sep chars leading to the preceding $Sep
|
||||
# The preceding $Sep, which will be the second one that the rule matches.
|
||||
# Any immediately preceding STerm or ATerm sequences. We need to see these
|
||||
# to get the correct rule status when moving forwards again.
|
||||
#
|
||||
# [{bof}] inhibits rule chaining. Without this, the rule would loop on itself and match
|
||||
# the entire string. TODO: can {bof} be replaced with ^ ?
|
||||
#
|
||||
# (.? | $LF $CR) Match one $Sep instance. Use .? rather than $Sep because the position might be
|
||||
# at the beginning of the string at this point, and we don't want to fail.
|
||||
# Can only use {eof} once, and it is used later.
|
||||
#
|
||||
|
|
|
@ -51,7 +51,6 @@ $CloseEx = $Close ($Extend | $Format)*;
|
|||
## -------------------------------------------------
|
||||
|
||||
!!chain;
|
||||
!!forward;
|
||||
|
||||
# Rule 3 - break after separators. Keep CR/LF together.
|
||||
#
|
||||
|
@ -83,40 +82,3 @@ $ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower;
|
|||
#Rule 12
|
||||
[[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* .;
|
||||
[[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep $LF $CR {eof}] | $CR $LF){100};
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_reverse;
|
||||
|
||||
$SpEx_R = ($Extend | $Format)* $Sp;
|
||||
$ATermEx_R = ($Extend | $Format)* $ATerm;
|
||||
$STermEx_R = ($Extend | $Format)* $STerm;
|
||||
$CloseEx_R = ($Extend | $Format)* $Close;
|
||||
|
||||
#
|
||||
# Reverse rules.
|
||||
# For now, use the old style inexact reverse rules, which are easier
|
||||
# to write, but less efficient.
|
||||
# TODO: exact reverse rules. It appears that exact reverse rules
|
||||
# may require improving support for look-ahead breaks in the
|
||||
# builder. Needs more investigation.
|
||||
#
|
||||
|
||||
[{bof}] (.? | $LF $CR) [^$Sep $CR $LF]* [$Sep $CR $LF {eof}] ($SpEx_R* $CloseEx_R* ($STermEx_R | $ATermEx_R))*;
|
||||
|
||||
# Explanation for this rule:
|
||||
#
|
||||
# It needs to back over
|
||||
# The $Sep at which we probably begin
|
||||
# All of the non $Sep chars leading to the preceding $Sep
|
||||
# The preceding $Sep, which will be the second one that the rule matches.
|
||||
# Any immediately preceding STerm or ATerm sequences. We need to see these
|
||||
# to get the correct rule status when moving forwards again.
|
||||
#
|
||||
# [{bof}] inhibits rule chaining. Without this, the rule would loop on itself and match
|
||||
# the entire string.
|
||||
#
|
||||
# (.? | $LF $CR) Match one $Sep instance. Use .? rather than $Sep because the position might be
|
||||
# at the beginning of the string at this point, and we don't want to fail.
|
||||
# Can only use {eof} once, and it is used later.
|
||||
#
|
||||
|
|
|
@ -13,8 +13,6 @@ $CaseIgnorable = [[:Mn:][:Me:][:Cf:][:Lm:][:Sk:] \u0027 \u00AD \u2019];
|
|||
$Cased = [[:Upper_Case:][:Lower_Case:][:Lt:] - $CaseIgnorable];
|
||||
$NotCased = [[^ $Cased] - $CaseIgnorable];
|
||||
|
||||
!!forward;
|
||||
|
||||
# If the iterator begins on a CaseIgnorable, advance it past it/them.
|
||||
# This can occur at the start-of-text, or after application of the
|
||||
# safe-reverse rule.
|
||||
|
@ -26,12 +24,3 @@ $NotCased = [[^ $Cased] - $CaseIgnorable];
|
|||
# the uncased characters following the word.
|
||||
|
||||
$Cased ($Cased | $CaseIgnorable)* ($NotCased | $CaseIgnorable)*;
|
||||
|
||||
|
||||
!!safe_reverse;
|
||||
|
||||
# Safe Reverse: the exact forward rule must not start in the middle
|
||||
# of a word, so the safe reverse skips over any Cased characters,
|
||||
# leaving it just before the start of a word.
|
||||
|
||||
($Cased | $CaseIgnorable)*;
|
||||
|
|
|
@ -97,9 +97,6 @@ $IdeographicEx = $Ideographic ($Extend | $Format | $ZWJ)*;
|
|||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!forward;
|
||||
|
||||
|
||||
# Rule 3 - CR x LF
|
||||
#
|
||||
$CR $LF;
|
||||
|
@ -197,27 +194,3 @@ $KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji foun
|
|||
# Rule 999
|
||||
# Match a single code point if no other rule applies.
|
||||
.;
|
||||
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_reverse;
|
||||
|
||||
# rule 3
|
||||
($Extend | $Format | $ZWJ)+ .?;
|
||||
|
||||
# rule 6
|
||||
($MidLetter | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* ($Hebrew_Letter | $ALetterPlus);
|
||||
|
||||
# rule 7b
|
||||
$Double_Quote ($Format | $Extend | $ZWJ)* $Hebrew_Letter;
|
||||
|
||||
|
||||
# rule 11
|
||||
($MidNum | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* $Numeric;
|
||||
|
||||
# rule 13c
|
||||
$Regional_Indicator ($Format | $Extend | $ZWJ)* $Regional_Indicator;
|
||||
|
||||
# For dictionary-based break
|
||||
$dictionary $dictionary;
|
||||
|
|
|
@ -97,9 +97,6 @@ $IdeographicEx = $Ideographic ($Extend | $Format | $ZWJ)*;
|
|||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!forward;
|
||||
|
||||
|
||||
# Rule 3 - CR x LF
|
||||
#
|
||||
$CR $LF;
|
||||
|
@ -197,27 +194,3 @@ $KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji foun
|
|||
# Rule 999
|
||||
# Match a single code point if no other rule applies.
|
||||
.;
|
||||
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_reverse;
|
||||
|
||||
# rule 3
|
||||
($Extend | $Format | $ZWJ)+ .?;
|
||||
|
||||
# rule 6
|
||||
($MidLetter | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* ($Hebrew_Letter | $ALetterPlus);
|
||||
|
||||
# rule 7b
|
||||
$Double_Quote ($Format | $Extend | $ZWJ)* $Hebrew_Letter;
|
||||
|
||||
|
||||
# rule 11
|
||||
($MidNum | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* $Numeric;
|
||||
|
||||
# rule 13c
|
||||
$Regional_Indicator ($Format | $Extend | $ZWJ)* $Regional_Indicator;
|
||||
|
||||
# For dictionary-based break
|
||||
$dictionary $dictionary;
|
||||
|
|
Loading…
Add table
Reference in a new issue