mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-10 07:39:16 +00:00
ICU-20303 Break Iterator, improve handling of look-ahead rules.
- Merge the look-ahead results slots used when multiple rules share a common accepting state. - Sequentially number the look-ahead result slot. Will eventually allow replacing the runtime map with an array. - Inhibit chaining out of look-ahead rules. This could never actually happen; when a hard break rule matches, the engine is stopped immediately, but the state table was being constructed as if it could happen. Reduces table size for line break rules. - Remove incorrect handling of fAccepting and fLookAhead fields of a state table row when removing duplicate states. Look-ahead slot number was being mis-interpreted as a state number.
This commit is contained in:
parent
7917df1e80
commit
faa2f9f9e1
13 changed files with 350 additions and 134 deletions
|
@ -883,9 +883,15 @@ int32_t RuleBasedBreakIterator::handleNext() {
|
|||
return lookaheadResult;
|
||||
}
|
||||
}
|
||||
|
||||
// If we are at the position of the '/' in a look-ahead (hard break) rule;
|
||||
// record the current position, to be returned later, if the full rule matches.
|
||||
// TODO: Move this check before the previous check of fAccepting.
|
||||
// This would enable hard-break rules with no following context.
|
||||
// But there are line break test failures when trying this. Investigate.
|
||||
// Issue ICU-20837
|
||||
int16_t rule = row->fLookAhead;
|
||||
if (rule != 0) {
|
||||
// At the position of a '/' in a look-ahead match. Record it.
|
||||
int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
|
||||
lookAheadMatches.setPosition(rule, pos);
|
||||
}
|
||||
|
|
|
@ -1274,6 +1274,10 @@ void RBBIRuleScanner::scanSet() {
|
|||
|
||||
}
|
||||
|
||||
int32_t RBBIRuleScanner::numRules() {
|
||||
return fRuleNum;
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|
||||
|
|
|
@ -73,6 +73,8 @@ public:
|
|||
// reverse rules,
|
||||
// and a list of UnicodeSets encountered.
|
||||
|
||||
int32_t numRules(); // Return the number of rules that have been seen.
|
||||
|
||||
/**
|
||||
* Return a rules string without unnecessary
|
||||
* characters.
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
#include "unicode/unistr.h"
|
||||
#include "rbbitblb.h"
|
||||
#include "rbbirb.h"
|
||||
#include "rbbiscan.h"
|
||||
#include "rbbisetb.h"
|
||||
#include "rbbidata.h"
|
||||
#include "cstring.h"
|
||||
|
@ -52,6 +53,7 @@ RBBITableBuilder::~RBBITableBuilder() {
|
|||
}
|
||||
delete fDStates;
|
||||
delete fSafeTable;
|
||||
delete fLookAheadRuleMap;
|
||||
}
|
||||
|
||||
|
||||
|
@ -121,7 +123,7 @@ void RBBITableBuilder::buildForwardTable() {
|
|||
}
|
||||
cn->fLeftChild = fTree;
|
||||
fTree->fParent = cn;
|
||||
cn->fRightChild = new RBBINode(RBBINode::endMark);
|
||||
RBBINode *endMarkerNode = cn->fRightChild = new RBBINode(RBBINode::endMark);
|
||||
// Delete and exit if memory allocation failed.
|
||||
if (cn->fRightChild == NULL) {
|
||||
*fStatus = U_MEMORY_ALLOCATION_ERROR;
|
||||
|
@ -164,7 +166,7 @@ void RBBITableBuilder::buildForwardTable() {
|
|||
// For "chained" rules, modify the followPos sets
|
||||
//
|
||||
if (fRB->fChainRules) {
|
||||
calcChainedFollowPos(fTree);
|
||||
calcChainedFollowPos(fTree, endMarkerNode);
|
||||
}
|
||||
|
||||
//
|
||||
|
@ -178,6 +180,7 @@ void RBBITableBuilder::buildForwardTable() {
|
|||
// Build the DFA state transition tables.
|
||||
//
|
||||
buildStateTable();
|
||||
mapLookAheadRules();
|
||||
flagAcceptingStates();
|
||||
flagLookAheadStates();
|
||||
flagTaggedStates();
|
||||
|
@ -401,19 +404,13 @@ void RBBITableBuilder::addRuleRootNodes(UVector *dest, RBBINode *node) {
|
|||
// to implement rule chaining. NOT described by Aho
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
|
||||
void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree, RBBINode *endMarkNode) {
|
||||
|
||||
UVector endMarkerNodes(*fStatus);
|
||||
UVector leafNodes(*fStatus);
|
||||
int32_t i;
|
||||
|
||||
if (U_FAILURE(*fStatus)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// get a list of all endmarker nodes.
|
||||
tree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
|
||||
|
||||
// get a list all leaf nodes
|
||||
tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
|
||||
if (U_FAILURE(*fStatus)) {
|
||||
|
@ -442,28 +439,26 @@ void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
|
|||
int32_t startNodeIx;
|
||||
|
||||
for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
|
||||
RBBINode *tNode = (RBBINode *)leafNodes.elementAt(endNodeIx);
|
||||
RBBINode *endNode = NULL;
|
||||
RBBINode *endNode = (RBBINode *)leafNodes.elementAt(endNodeIx);
|
||||
|
||||
// Identify leaf nodes that correspond to overall rule match positions.
|
||||
// These include an endMarkerNode in their followPos sets.
|
||||
for (i=0; i<endMarkerNodes.size(); i++) {
|
||||
if (tNode->fFollowPos->contains(endMarkerNodes.elementAt(i))) {
|
||||
endNode = tNode;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (endNode == NULL) {
|
||||
// node wasn't an end node. Try again with the next.
|
||||
// These include the endMarkNode in their followPos sets.
|
||||
//
|
||||
// Note: do not consider other end marker nodes, those that are added to
|
||||
// look-ahead rules. These can't chain; a match immediately stops
|
||||
// further matching. This leaves exactly one end marker node, the one
|
||||
// at the end of the complete tree.
|
||||
|
||||
if (!endNode->fFollowPos->contains(endMarkNode)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// We've got a node that can end a match.
|
||||
|
||||
// Line Break Specific hack: If this node's val correspond to the $CM char class,
|
||||
// don't chain from it.
|
||||
// TODO: Add rule syntax for this behavior, get specifics out of here and
|
||||
// into the rule file.
|
||||
// !!LBCMNoChain implementation: If this node's val correspond to
|
||||
// the Line Break $CM char class, don't chain from it.
|
||||
// TODO: Remove this. !!LBCMNoChain is deprecated, and is not used
|
||||
// by any of the standard ICU rules.
|
||||
if (fRB->fLBCMNoChain) {
|
||||
UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
|
||||
if (c != -1) {
|
||||
|
@ -475,7 +470,6 @@ void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
// Now iterate over the nodes that can start a match, looking for ones
|
||||
// with the same char class as our ending node.
|
||||
RBBINode *startNode;
|
||||
|
@ -705,6 +699,77 @@ ExitBuildSTdeleteall:
|
|||
}
|
||||
|
||||
|
||||
/**
|
||||
* mapLookAheadRules
|
||||
*
|
||||
*/
|
||||
void RBBITableBuilder::mapLookAheadRules() {
|
||||
fLookAheadRuleMap = new UVector32(fRB->fScanner->numRules() + 1, *fStatus);
|
||||
if (fLookAheadRuleMap == nullptr) {
|
||||
*fStatus = U_MEMORY_ALLOCATION_ERROR;
|
||||
}
|
||||
if (U_FAILURE(*fStatus)) {
|
||||
return;
|
||||
}
|
||||
fLookAheadRuleMap->setSize(fRB->fScanner->numRules() + 1);
|
||||
int32_t laSlotsInUse = 0;
|
||||
|
||||
for (int32_t n=0; n<fDStates->size(); n++) {
|
||||
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
|
||||
int32_t laSlotForState = 0;
|
||||
|
||||
// Establish the look-ahead slot for this state, if the state covers
|
||||
// any look-ahead nodes - corresponding to the '/' in look-ahead rules.
|
||||
|
||||
// If any of the look-ahead nodes already have a slot assigned, use it,
|
||||
// otherwise assign a new one.
|
||||
|
||||
bool sawLookAheadNode = false;
|
||||
for (int32_t ipos=0; ipos<sd->fPositions->size(); ++ipos) {
|
||||
RBBINode *node = static_cast<RBBINode *>(sd->fPositions->elementAt(ipos));
|
||||
if (node->fType != RBBINode::NodeType::lookAhead) {
|
||||
continue;
|
||||
}
|
||||
sawLookAheadNode = true;
|
||||
int32_t ruleNum = node->fVal; // Set when rule was originally parsed.
|
||||
U_ASSERT(ruleNum < fLookAheadRuleMap->size());
|
||||
U_ASSERT(ruleNum > 0);
|
||||
int32_t laSlot = fLookAheadRuleMap->elementAti(ruleNum);
|
||||
if (laSlot != 0) {
|
||||
if (laSlotForState == 0) {
|
||||
laSlotForState = laSlot;
|
||||
} else {
|
||||
// TODO: figure out if this can fail, change to setting an error code if so.
|
||||
U_ASSERT(laSlot == laSlotForState);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!sawLookAheadNode) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (laSlotForState == 0) {
|
||||
laSlotForState = ++laSlotsInUse;
|
||||
}
|
||||
|
||||
// For each look ahead node covered by this state,
|
||||
// set the mapping from the node's rule number to the look ahead slot.
|
||||
// There can be multiple nodes/rule numbers going to the same la slot.
|
||||
|
||||
for (int32_t ipos=0; ipos<sd->fPositions->size(); ++ipos) {
|
||||
RBBINode *node = static_cast<RBBINode *>(sd->fPositions->elementAt(ipos));
|
||||
if (node->fType != RBBINode::NodeType::lookAhead) {
|
||||
continue;
|
||||
}
|
||||
int32_t ruleNum = node->fVal; // Set when rule was originally parsed.
|
||||
int32_t existingVal = fLookAheadRuleMap->elementAti(ruleNum);
|
||||
(void)existingVal;
|
||||
U_ASSERT(existingVal == 0 || existingVal == laSlotForState);
|
||||
fLookAheadRuleMap->setElementAt(laSlotForState, ruleNum);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
|
@ -744,28 +809,19 @@ void RBBITableBuilder::flagAcceptingStates() {
|
|||
|
||||
if (sd->fAccepting==0) {
|
||||
// State hasn't been marked as accepting yet. Do it now.
|
||||
sd->fAccepting = endMarker->fVal;
|
||||
sd->fAccepting = fLookAheadRuleMap->elementAti(endMarker->fVal);
|
||||
if (sd->fAccepting == 0) {
|
||||
sd->fAccepting = -1;
|
||||
}
|
||||
}
|
||||
if (sd->fAccepting==-1 && endMarker->fVal != 0) {
|
||||
// Both lookahead and non-lookahead accepting for this state.
|
||||
// Favor the look-ahead. Expedient for line break.
|
||||
// TODO: need a more elegant resolution for conflicting rules.
|
||||
sd->fAccepting = endMarker->fVal;
|
||||
// Favor the look-ahead, because a look-ahead match needs to
|
||||
// immediately stop the run-time engine. First match, not longest.
|
||||
sd->fAccepting = fLookAheadRuleMap->elementAti(endMarker->fVal);
|
||||
}
|
||||
// implicit else:
|
||||
// if sd->fAccepting already had a value other than 0 or -1, leave it be.
|
||||
|
||||
// If the end marker node is from a look-ahead rule, set
|
||||
// the fLookAhead field for this state also.
|
||||
if (endMarker->fLookAheadEnd) {
|
||||
// TODO: don't change value if already set?
|
||||
// TODO: allow for more than one active look-ahead rule in engine.
|
||||
// Make value here an index to a side array in engine?
|
||||
sd->fLookAhead = sd->fAccepting;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -792,11 +848,20 @@ void RBBITableBuilder::flagLookAheadStates() {
|
|||
}
|
||||
for (i=0; i<lookAheadNodes.size(); i++) {
|
||||
lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i);
|
||||
U_ASSERT(lookAheadNode->fType == RBBINode::NodeType::lookAhead);
|
||||
|
||||
for (n=0; n<fDStates->size(); n++) {
|
||||
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
|
||||
if (sd->fPositions->indexOf(lookAheadNode) >= 0) {
|
||||
sd->fLookAhead = lookAheadNode->fVal;
|
||||
int32_t positionsIdx = sd->fPositions->indexOf(lookAheadNode);
|
||||
if (positionsIdx >= 0) {
|
||||
U_ASSERT(lookAheadNode == sd->fPositions->elementAt(positionsIdx));
|
||||
int32_t lookaheadSlot = fLookAheadRuleMap->elementAti(lookAheadNode->fVal);
|
||||
U_ASSERT(sd->fLookAhead == 0 || sd->fLookAhead == lookaheadSlot);
|
||||
// if (sd->fLookAhead != 0 && sd->fLookAhead != lookaheadSlot) {
|
||||
// printf("%s:%d Bingo. sd->fLookAhead:%d lookaheadSlot:%d\n",
|
||||
// __FILE__, __LINE__, sd->fLookAhead, lookaheadSlot);
|
||||
// }
|
||||
sd->fLookAhead = lookaheadSlot;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1204,16 +1269,6 @@ void RBBITableBuilder::removeState(IntPair duplStates) {
|
|||
}
|
||||
sd->fDtran->setElementAt(newVal, col);
|
||||
}
|
||||
if (sd->fAccepting == duplState) {
|
||||
sd->fAccepting = keepState;
|
||||
} else if (sd->fAccepting > duplState) {
|
||||
sd->fAccepting--;
|
||||
}
|
||||
if (sd->fLookAhead == duplState) {
|
||||
sd->fLookAhead = keepState;
|
||||
} else if (sd->fLookAhead > duplState) {
|
||||
sd->fLookAhead--;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -91,9 +91,10 @@ private:
|
|||
void calcFirstPos(RBBINode *n);
|
||||
void calcLastPos(RBBINode *n);
|
||||
void calcFollowPos(RBBINode *n);
|
||||
void calcChainedFollowPos(RBBINode *n);
|
||||
void calcChainedFollowPos(RBBINode *n, RBBINode *endMarkNode);
|
||||
void bofFixup();
|
||||
void buildStateTable();
|
||||
void mapLookAheadRules();
|
||||
void flagAcceptingStates();
|
||||
void flagLookAheadStates();
|
||||
void flagTaggedStates();
|
||||
|
@ -175,6 +176,9 @@ private:
|
|||
/** Synthesized safe table, UVector of UnicodeString, one string per table row. */
|
||||
UVector *fSafeTable;
|
||||
|
||||
/** Map from rule number (fVal in look ahead nodes) to sequential lookahead index. */
|
||||
UVector32 *fLookAheadRuleMap = nullptr;
|
||||
|
||||
|
||||
RBBITableBuilder(const RBBITableBuilder &other); // forbid copying of this class
|
||||
RBBITableBuilder &operator=(const RBBITableBuilder &other); // forbid copying of this class
|
||||
|
|
|
@ -125,6 +125,7 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
|
|||
TESTCASE_AUTO(TestBug13447);
|
||||
TESTCASE_AUTO(TestReverse);
|
||||
TESTCASE_AUTO(TestBug13692);
|
||||
TESTCASE_AUTO(TestDebugRules);
|
||||
TESTCASE_AUTO_END;
|
||||
}
|
||||
|
||||
|
@ -4778,6 +4779,19 @@ void RBBITest::TestBug13692() {
|
|||
assertSuccess(WHERE, status);
|
||||
}
|
||||
|
||||
|
||||
void RBBITest::TestProperties() {
|
||||
UErrorCode errorCode = U_ZERO_ERROR;
|
||||
UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
|
||||
if (!prependSet.isEmpty()) {
|
||||
errln(
|
||||
"[:GCB=Prepend:] is not empty any more. "
|
||||
"Uncomment relevant lines in source/data/brkitr/char.txt and "
|
||||
"change this test to the opposite condition.");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// TestDebug - A place-holder test for debugging purposes.
|
||||
// For putting in fragments of other tests that can be invoked
|
||||
|
@ -4796,15 +4810,56 @@ void RBBITest::TestDebug(void) {
|
|||
assertSuccess(WHERE, status);
|
||||
}
|
||||
|
||||
void RBBITest::TestProperties() {
|
||||
UErrorCode errorCode = U_ZERO_ERROR;
|
||||
UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
|
||||
if (!prependSet.isEmpty()) {
|
||||
errln(
|
||||
"[:GCB=Prepend:] is not empty any more. "
|
||||
"Uncomment relevant lines in source/data/brkitr/char.txt and "
|
||||
"change this test to the opposite condition.");
|
||||
|
||||
//
|
||||
// TestDebugRules A stub test for use in debugging rule compilation problems.
|
||||
// Can be freely altered as needed or convenient.
|
||||
// Leave disabled - #ifdef'ed out - when not activley debugging. The rule source
|
||||
// data files may not be available in all environments.
|
||||
// Any permanent test cases should be moved to rbbitst.txt
|
||||
// (see Bug 20303 in that file, for example), or to another test function in this file.
|
||||
//
|
||||
void RBBITest::TestDebugRules() {
|
||||
#if 0
|
||||
const char16_t *rules = u""
|
||||
"!!quoted_literals_only; \n"
|
||||
"!!chain; \n"
|
||||
"!!lookAheadHardBreak; \n"
|
||||
" \n"
|
||||
// "[a] / ; \n"
|
||||
"[a] [b] / [c] [d]; \n"
|
||||
"[a] [b] / [c] [d] {100}; \n"
|
||||
"[x] [a] [b] / [c] [d] {100}; \n"
|
||||
"[a] [b] [c] / [d] {100}; \n"
|
||||
//" [c] [d] / [e] [f]; \n"
|
||||
//"[a] [b] / [c]; \n"
|
||||
;
|
||||
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
CharString path(pathToDataDirectory(), status);
|
||||
path.appendPathPart("brkitr", status);
|
||||
path.appendPathPart("rules", status);
|
||||
path.appendPathPart("line.txt", status);
|
||||
int len;
|
||||
std::unique_ptr<UChar []> testFile(ReadAndConvertFile(path.data(), len, "UTF-8", status));
|
||||
if (!assertSuccess(WHERE, status)) {
|
||||
return;
|
||||
}
|
||||
|
||||
UParseError pe;
|
||||
// rules = testFile.get();
|
||||
RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rules, pe, status);
|
||||
|
||||
if (!assertSuccess(WHERE, status)) {
|
||||
delete bi;
|
||||
return;
|
||||
}
|
||||
// bi->dumpTables();
|
||||
|
||||
delete bi;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
||||
#endif // #if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
|
|
@ -82,6 +82,7 @@ public:
|
|||
void TestReverse();
|
||||
void TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi);
|
||||
void TestBug13692();
|
||||
void TestDebugRules();
|
||||
|
||||
void TestDebug();
|
||||
void TestProperties();
|
||||
|
|
20
icu4c/source/test/testdata/rbbitst.txt
vendored
20
icu4c/source/test/testdata/rbbitst.txt
vendored
|
@ -1991,3 +1991,23 @@ Bangkok)•</data>
|
|||
•
|
||||
•より<400>詳しい<400>こと<400>を<400>お<400>知<400>り<400>に<400>なり<400>たい<400>方<400>は<400>、•Glossary<200>,• •Technical<200> •Introduction<200> •および<400> •Useful<200> •Resources<200>を<400>ご<400>参照<400>くだ<400>さい<400>。•
|
||||
•</data>
|
||||
|
||||
|
||||
#
|
||||
# Bug 20303 Multiple Look-ahead rules with similar contexts.
|
||||
# Check that samples of such rules are being handled correctly.
|
||||
#
|
||||
|
||||
<rules>
|
||||
!!forward;
|
||||
!!quoted_literals_only;
|
||||
!!chain;
|
||||
[a] [b] / [c] [d];
|
||||
[a] [b] / [c] [d] {100};
|
||||
[a] [b] / [e] [f] {200};
|
||||
[a] [b] / [e] [g] {300};
|
||||
[a] [b] [c] [h] {400};
|
||||
[x] [a] [b] / [c] [d] {500};
|
||||
[y] [a] [b] [c] [d] {600};
|
||||
</rules>
|
||||
<data>•ab<100>c•d•ab<200>e•f•ab<300>e•g•abch<400>xab<500>c•d•yabcd<600></data>
|
||||
|
|
|
@ -73,7 +73,7 @@ class RBBIRuleScanner {
|
|||
RBBISymbolTable fSymbolTable; // symbol table, holds definitions of
|
||||
// $variable symbols.
|
||||
|
||||
HashMap<String, RBBISetTableEl> fSetTable = new HashMap<String, RBBISetTableEl>(); // UnicocodeSet hash table, holds indexes to
|
||||
HashMap<String, RBBISetTableEl> fSetTable = new HashMap<>(); // UnicocodeSet hash table, holds indexes to
|
||||
// the sets created while parsing rules.
|
||||
// The key is the string used for creating
|
||||
// the set.
|
||||
|
@ -933,7 +933,7 @@ class RBBIRuleScanner {
|
|||
// Perform any action specified by this row in the state table.
|
||||
if (doParseActions(tableEl.fAction) == false) {
|
||||
// Break out of the state machine loop if the
|
||||
// the action signalled some kind of error, or
|
||||
// the action signaled some kind of error, or
|
||||
// the action was to exit, occurs on normal end-of-rules-input.
|
||||
break;
|
||||
}
|
||||
|
@ -1070,7 +1070,7 @@ class RBBIRuleScanner {
|
|||
error(RBBIRuleBuilder.U_BRK_RULE_EMPTY_SET);
|
||||
}
|
||||
|
||||
// Advance the RBBI parse postion over the UnicodeSet pattern.
|
||||
// Advance the RBBI parse position over the UnicodeSet pattern.
|
||||
// Don't just set fScanIndex because the line/char positions maintained
|
||||
// for error reporting would be thrown off.
|
||||
i = pos.getIndex();
|
||||
|
@ -1089,12 +1089,17 @@ class RBBIRuleScanner {
|
|||
n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
|
||||
// findSetFor() serves several purposes here:
|
||||
// - Adopts storage for the UnicodeSet, will be responsible for deleting.
|
||||
// - Mantains collection of all sets in use, needed later for establishing
|
||||
// - Maintains collection of all sets in use, needed later for establishing
|
||||
// character categories for run time engine.
|
||||
// - Eliminates mulitiple instances of the same set.
|
||||
// - Eliminates multiple instances of the same set.
|
||||
// - Creates a new uset node if necessary (if this isn't a duplicate.)
|
||||
findSetFor(n.fText, n, uset);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the number of rules that have been seen.
|
||||
*/
|
||||
int numRules() {
|
||||
return fRuleNum;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -53,8 +53,8 @@ class RBBITableBuilder {
|
|||
// in RBBITableBuilder.fDStates
|
||||
|
||||
RBBIStateDescriptor(int maxInputSymbol) {
|
||||
fTagVals = new TreeSet<Integer>();
|
||||
fPositions = new HashSet<RBBINode>();
|
||||
fTagVals = new TreeSet<>();
|
||||
fPositions = new HashSet<>();
|
||||
fDtran = new int[maxInputSymbol+1]; // fDtran needs to be pre-sized.
|
||||
// It is indexed by input symbols, and will
|
||||
// hold the next state number for each
|
||||
|
@ -74,6 +74,9 @@ class RBBITableBuilder {
|
|||
/** Synthesized safe table, a List of row arrays. */
|
||||
private List<short[]> fSafeTable;
|
||||
|
||||
/** Map from rule number (fVal in look ahead nodes) to sequential lookahead index. */
|
||||
int[] fLookAheadRuleMap;
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// Constructor for RBBITableBuilder.
|
||||
|
@ -84,7 +87,7 @@ class RBBITableBuilder {
|
|||
RBBITableBuilder(RBBIRuleBuilder rb, int rootNodeIx) {
|
||||
fRootIx = rootNodeIx;
|
||||
fRB = rb;
|
||||
fDStates = new ArrayList<RBBIStateDescriptor>();
|
||||
fDStates = new ArrayList<>();
|
||||
}
|
||||
|
||||
|
||||
|
@ -137,7 +140,7 @@ class RBBITableBuilder {
|
|||
RBBINode cn = new RBBINode(RBBINode.opCat);
|
||||
cn.fLeftChild = fRB.fTreeRoots[fRootIx];
|
||||
fRB.fTreeRoots[fRootIx].fParent = cn;
|
||||
cn.fRightChild = new RBBINode(RBBINode.endMark);
|
||||
RBBINode endMarkerNode = cn.fRightChild = new RBBINode(RBBINode.endMark);
|
||||
cn.fRightChild.fParent = cn;
|
||||
fRB.fTreeRoots[fRootIx] = cn;
|
||||
|
||||
|
@ -172,7 +175,7 @@ class RBBITableBuilder {
|
|||
// For "chained" rules, modify the followPos sets
|
||||
//
|
||||
if (fRB.fChainRules) {
|
||||
calcChainedFollowPos(fRB.fTreeRoots[fRootIx]);
|
||||
calcChainedFollowPos(fRB.fTreeRoots[fRootIx], endMarkerNode);
|
||||
}
|
||||
|
||||
//
|
||||
|
@ -186,6 +189,7 @@ class RBBITableBuilder {
|
|||
// Build the DFA state transition tables.
|
||||
//
|
||||
buildStateTable();
|
||||
mapLookAheadRules();
|
||||
flagAcceptingStates();
|
||||
flagLookAheadStates();
|
||||
flagTaggedStates();
|
||||
|
@ -390,13 +394,9 @@ class RBBITableBuilder {
|
|||
// to implement rule chaining. NOT described by Aho
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
void calcChainedFollowPos(RBBINode tree) {
|
||||
void calcChainedFollowPos(RBBINode tree, RBBINode endMarkNode) {
|
||||
|
||||
List<RBBINode> endMarkerNodes = new ArrayList<RBBINode>();
|
||||
List<RBBINode> leafNodes = new ArrayList<RBBINode>();
|
||||
|
||||
// get a list of all endmarker nodes.
|
||||
tree.findNodes(endMarkerNodes, RBBINode.endMark);
|
||||
List<RBBINode> leafNodes = new ArrayList<>();
|
||||
|
||||
// get a list all leaf nodes
|
||||
tree.findNodes(leafNodes, RBBINode.leafChar);
|
||||
|
@ -405,10 +405,10 @@ class RBBITableBuilder {
|
|||
// with inbound chaining enabled, which is the union of the
|
||||
// firstPosition sets from each of the rule root nodes.
|
||||
|
||||
List<RBBINode> ruleRootNodes = new ArrayList<RBBINode>();
|
||||
List<RBBINode> ruleRootNodes = new ArrayList<>();
|
||||
addRuleRootNodes(ruleRootNodes, tree);
|
||||
|
||||
Set<RBBINode> matchStartNodes = new HashSet<RBBINode>();
|
||||
Set<RBBINode> matchStartNodes = new HashSet<>();
|
||||
for (RBBINode node: ruleRootNodes) {
|
||||
if (node.fChainIn) {
|
||||
matchStartNodes.addAll(node.fFirstPosSet);
|
||||
|
@ -417,28 +417,26 @@ class RBBITableBuilder {
|
|||
|
||||
// Iterate over all leaf nodes,
|
||||
//
|
||||
for (RBBINode tNode : leafNodes) {
|
||||
RBBINode endNode = null;
|
||||
for (RBBINode endNode : leafNodes) {
|
||||
|
||||
// Identify leaf nodes that correspond to overall rule match positions.
|
||||
// These include an endMarkerNode in their followPos sets.
|
||||
for (RBBINode endMarkerNode : endMarkerNodes) {
|
||||
if (tNode.fFollowPos.contains(endMarkerNode)) {
|
||||
endNode = tNode;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (endNode == null) {
|
||||
// node wasn't an end node. Try again with the next.
|
||||
// These include the endMarkNode in their followPos sets.
|
||||
//
|
||||
// Note: do not consider other end marker nodes, those that are added to
|
||||
// look-ahead rules. These can't chain; a match immediately stops
|
||||
// further matching. This leaves exactly one end marker node, the one
|
||||
// at the end of the complete tree.
|
||||
|
||||
if (!endNode.fFollowPos.contains(endMarkNode)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// We've got a node that can end a match.
|
||||
|
||||
// Line Break Specific hack: If this node's val correspond to the $CM char class,
|
||||
// don't chain from it.
|
||||
// TODO: Add rule syntax for this behavior, get specifics out of here and
|
||||
// into the rule file.
|
||||
// !!LBCMNoChain implementation: If this node's val correspond to
|
||||
// the Line Break $CM char class, don't chain from it.
|
||||
// TODO: Remove this. !!LBCMNoChain is deprecated, and is not used
|
||||
// by any of the standard ICU rules.
|
||||
if (fRB.fLBCMNoChain) {
|
||||
int c = this.fRB.fSetBuilder.getFirstChar(endNode.fVal);
|
||||
if (c != -1) {
|
||||
|
@ -571,7 +569,7 @@ class RBBITableBuilder {
|
|||
for (RBBINode p : T.fPositions) {
|
||||
if ((p.fType == RBBINode.leafChar) && (p.fVal == a)) {
|
||||
if (U == null) {
|
||||
U = new HashSet<RBBINode>();
|
||||
U = new HashSet<>();
|
||||
}
|
||||
U.addAll(p.fFollowPos);
|
||||
}
|
||||
|
@ -610,7 +608,66 @@ class RBBITableBuilder {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* mapLookAheadRules
|
||||
*
|
||||
*/
|
||||
void mapLookAheadRules() {
|
||||
fLookAheadRuleMap = new int[fRB.fScanner.numRules() + 1];
|
||||
int laSlotsInUse = 0;
|
||||
|
||||
for (RBBIStateDescriptor sd: fDStates) {
|
||||
int laSlotForState = 0;
|
||||
|
||||
// Establish the look-ahead slot for this state, if the state covers
|
||||
// any look-ahead nodes - corresponding to the '/' in look-ahead rules.
|
||||
|
||||
// If any of the look-ahead nodes already have a slot assigned, use it,
|
||||
// otherwise assign a new one.
|
||||
|
||||
boolean sawLookAheadNode = false;
|
||||
for (RBBINode node: sd.fPositions) {
|
||||
if (node.fType != RBBINode.lookAhead) {
|
||||
continue;
|
||||
}
|
||||
sawLookAheadNode = true;
|
||||
int ruleNum = node.fVal; // Set when rule was originally parsed.
|
||||
assert(ruleNum < fLookAheadRuleMap.length);
|
||||
assert(ruleNum > 0);
|
||||
int laSlot = fLookAheadRuleMap[ruleNum];
|
||||
if (laSlot != 0) {
|
||||
if (laSlotForState == 0) {
|
||||
laSlotForState = laSlot;
|
||||
} else {
|
||||
// TODO: figure out if this can fail, change to setting an error code if so.
|
||||
assert(laSlot == laSlotForState);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!sawLookAheadNode) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (laSlotForState == 0) {
|
||||
laSlotForState = ++laSlotsInUse;
|
||||
}
|
||||
|
||||
// For each look ahead node covered by this state,
|
||||
// set the mapping from the node's rule number to the look ahead slot.
|
||||
// There can be multiple nodes/rule numbers going to the same la slot.
|
||||
|
||||
for (RBBINode node: sd.fPositions) {
|
||||
if (node.fType != RBBINode.lookAhead) {
|
||||
continue;
|
||||
}
|
||||
int ruleNum = node.fVal; // Set when rule was originally parsed.
|
||||
int existingVal = fLookAheadRuleMap[ruleNum];
|
||||
assert(existingVal == 0 || existingVal == laSlotForState);
|
||||
fLookAheadRuleMap[ruleNum] = laSlotForState;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
|
@ -622,7 +679,7 @@ class RBBITableBuilder {
|
|||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
void flagAcceptingStates() {
|
||||
List<RBBINode> endMarkerNodes = new ArrayList<RBBINode>();
|
||||
List<RBBINode> endMarkerNodes = new ArrayList<>();
|
||||
RBBINode endMarker;
|
||||
int i;
|
||||
int n;
|
||||
|
@ -640,29 +697,20 @@ class RBBITableBuilder {
|
|||
// If no other value was specified, force it to -1.
|
||||
|
||||
if (sd.fAccepting==0) {
|
||||
// State hasn't been marked as accepting yet. Do it now.
|
||||
sd.fAccepting = endMarker.fVal;
|
||||
// State hasn't been marked as accepting yet. Do it now.
|
||||
sd.fAccepting = fLookAheadRuleMap[endMarker.fVal];
|
||||
if (sd.fAccepting == 0) {
|
||||
sd.fAccepting = -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (sd.fAccepting==-1 && endMarker.fVal != 0) {
|
||||
// Both lookahead and non-lookahead accepting for this state.
|
||||
// Favor the look-ahead. Expedient for line break.
|
||||
// TODO: need a more elegant resolution for conflicting rules.
|
||||
sd.fAccepting = endMarker.fVal;
|
||||
}
|
||||
// implicit else:
|
||||
// if sd.fAccepting already had a value other than 0 or -1, leave it be.
|
||||
|
||||
// If the end marker node is from a look-ahead rule, set
|
||||
// the fLookAhead field for this state also.
|
||||
if (endMarker.fLookAheadEnd) {
|
||||
// TODO: don't change value if already set?
|
||||
// TODO: allow for more than one active look-ahead rule in engine.
|
||||
// Make value here an index to a side array in engine?
|
||||
sd.fLookAhead = sd.fAccepting;
|
||||
// Both lookahead and non-lookahead accepting for this state.
|
||||
// Favor the look-ahead, because a look-ahead match needs to
|
||||
// immediately stop the run-time engine. First match, not longest.
|
||||
sd.fAccepting = fLookAheadRuleMap[endMarker.fVal];
|
||||
}
|
||||
// implicit else:
|
||||
// if sd.fAccepting already had a value other than 0 or -1, leave it be.
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -675,7 +723,7 @@ class RBBITableBuilder {
|
|||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
void flagLookAheadStates() {
|
||||
List<RBBINode> lookAheadNodes = new ArrayList<RBBINode>();
|
||||
List<RBBINode> lookAheadNodes = new ArrayList<>();
|
||||
RBBINode lookAheadNode;
|
||||
int i;
|
||||
int n;
|
||||
|
@ -683,11 +731,12 @@ class RBBITableBuilder {
|
|||
fRB.fTreeRoots[fRootIx].findNodes(lookAheadNodes, RBBINode.lookAhead);
|
||||
for (i=0; i<lookAheadNodes.size(); i++) {
|
||||
lookAheadNode = lookAheadNodes.get(i);
|
||||
|
||||
for (n=0; n<fDStates.size(); n++) {
|
||||
RBBIStateDescriptor sd = fDStates.get(n);
|
||||
if (sd.fPositions.contains(lookAheadNode)) {
|
||||
sd.fLookAhead = lookAheadNode.fVal;
|
||||
int lookaheadSlot = fLookAheadRuleMap[lookAheadNode.fVal];
|
||||
assert(sd.fLookAhead == 0 || sd.fLookAhead == lookaheadSlot);
|
||||
sd.fLookAhead = lookaheadSlot;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -702,7 +751,7 @@ class RBBITableBuilder {
|
|||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
void flagTaggedStates() {
|
||||
List<RBBINode> tagNodes = new ArrayList<RBBINode>();
|
||||
List<RBBINode> tagNodes = new ArrayList<>();
|
||||
RBBINode tagNode;
|
||||
int i;
|
||||
int n;
|
||||
|
@ -765,10 +814,10 @@ class RBBITableBuilder {
|
|||
fRB.fRuleStatusVals.add(Integer.valueOf(1)); // Num of statuses in group
|
||||
fRB.fRuleStatusVals.add(Integer.valueOf(0)); // and our single status of zero
|
||||
|
||||
SortedSet<Integer> s0 = new TreeSet<Integer>();
|
||||
SortedSet<Integer> s0 = new TreeSet<>();
|
||||
Integer izero = Integer.valueOf(0);
|
||||
fRB.fStatusSets.put(s0, izero);
|
||||
SortedSet<Integer> s1 = new TreeSet<Integer>();
|
||||
SortedSet<Integer> s1 = new TreeSet<>();
|
||||
s1.add(izero);
|
||||
fRB.fStatusSets.put(s0, izero);
|
||||
}
|
||||
|
@ -986,16 +1035,6 @@ class RBBITableBuilder {
|
|||
}
|
||||
sd.fDtran[col] = newVal;
|
||||
}
|
||||
if (sd.fAccepting == duplState) {
|
||||
sd.fAccepting = keepState;
|
||||
} else if (sd.fAccepting > duplState) {
|
||||
sd.fAccepting--;
|
||||
}
|
||||
if (sd.fLookAhead == duplState) {
|
||||
sd.fLookAhead = keepState;
|
||||
} else if (sd.fLookAhead > duplState) {
|
||||
sd.fLookAhead--;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1167,7 +1206,7 @@ class RBBITableBuilder {
|
|||
// fLookAhead, etc. are not needed for the safe table, and are omitted at this stage of building.
|
||||
|
||||
assert(fSafeTable == null);
|
||||
fSafeTable = new ArrayList<short[]>();
|
||||
fSafeTable = new ArrayList<>();
|
||||
for (int row=0; row<numCharClasses + 2; ++row) {
|
||||
fSafeTable.add(new short[numCharClasses]);
|
||||
}
|
||||
|
|
|
@ -937,9 +937,14 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
|||
}
|
||||
}
|
||||
|
||||
// If we are at the position of the '/' in a look-ahead (hard break) rule;
|
||||
// record the current position, to be returned later, if the full rule matches.
|
||||
// TODO: Move this check before the previous check of fAccepting.
|
||||
// This would enable hard-break rules with no following context.
|
||||
// But there are line break test failures when trying this. Investigate.
|
||||
// Issue ICU-20837
|
||||
int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
|
||||
if (rule != 0) {
|
||||
// At the position of a '/' in a look-ahead match. Record it.
|
||||
int pos = text.getIndex();
|
||||
if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
|
||||
// The iterator has been left in the middle of a surrogate pair.
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:f9b73d720421a85704fc64aa0949c94d52e450a44af96c715881e9e6ab0fa3e6
|
||||
size 12998988
|
||||
oid sha256:6b14ef66277d196e8b01365e9426a1585ea25a9cc346b4a454db2ecc157aed41
|
||||
size 12995957
|
||||
|
|
|
@ -1991,3 +1991,23 @@ Bangkok)•</data>
|
|||
•
|
||||
•より<400>詳しい<400>こと<400>を<400>お<400>知<400>り<400>に<400>なり<400>たい<400>方<400>は<400>、•Glossary<200>,• •Technical<200> •Introduction<200> •および<400> •Useful<200> •Resources<200>を<400>ご<400>参照<400>くだ<400>さい<400>。•
|
||||
•</data>
|
||||
|
||||
|
||||
#
|
||||
# Bug 20303 Multiple Look-ahead rules with similar contexts.
|
||||
# Check that samples of such rules are being handled correctly.
|
||||
#
|
||||
|
||||
<rules>
|
||||
!!forward;
|
||||
!!quoted_literals_only;
|
||||
!!chain;
|
||||
[a] [b] / [c] [d];
|
||||
[a] [b] / [c] [d] {100};
|
||||
[a] [b] / [e] [f] {200};
|
||||
[a] [b] / [e] [g] {300};
|
||||
[a] [b] [c] [h] {400};
|
||||
[x] [a] [b] / [c] [d] {500};
|
||||
[y] [a] [b] [c] [d] {600};
|
||||
</rules>
|
||||
<data>•ab<100>c•d•ab<200>e•f•ab<300>e•g•abch<400>xab<500>c•d•yabcd<600></data>
|
||||
|
|
Loading…
Add table
Reference in a new issue