ICU-20303 Break Iterator, improve handling of look-ahead rules.

- Merge the look-ahead results slots used when multiple rules share a common accepting state.
- Sequentially number the look-ahead result slot. Will eventually allow replacing the runtime map with an array.
- Inhibit chaining out of look-ahead rules. This could never actually happen; when a hard break
  rule matches, the engine is stopped immediately, but the state table was being constructed
  as if it could  happen. Reduces table size for line break rules.
- Remove incorrect handling of fAccepting and fLookAhead fields of a state table row
  when removing duplicate states. Look-ahead slot number was being mis-interpreted as a state number.
This commit is contained in:
Andy Heninger 2019-08-22 17:48:36 -07:00
parent 7917df1e80
commit faa2f9f9e1
13 changed files with 350 additions and 134 deletions

View file

@ -883,9 +883,15 @@ int32_t RuleBasedBreakIterator::handleNext() {
return lookaheadResult;
}
}
// If we are at the position of the '/' in a look-ahead (hard break) rule;
// record the current position, to be returned later, if the full rule matches.
// TODO: Move this check before the previous check of fAccepting.
// This would enable hard-break rules with no following context.
// But there are line break test failures when trying this. Investigate.
// Issue ICU-20837
int16_t rule = row->fLookAhead;
if (rule != 0) {
// At the position of a '/' in a look-ahead match. Record it.
int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
lookAheadMatches.setPosition(rule, pos);
}

View file

@ -1274,6 +1274,10 @@ void RBBIRuleScanner::scanSet() {
}
int32_t RBBIRuleScanner::numRules() {
return fRuleNum;
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

View file

@ -73,6 +73,8 @@ public:
// reverse rules,
// and a list of UnicodeSets encountered.
int32_t numRules(); // Return the number of rules that have been seen.
/**
* Return a rules string without unnecessary
* characters.

View file

@ -18,6 +18,7 @@
#include "unicode/unistr.h"
#include "rbbitblb.h"
#include "rbbirb.h"
#include "rbbiscan.h"
#include "rbbisetb.h"
#include "rbbidata.h"
#include "cstring.h"
@ -52,6 +53,7 @@ RBBITableBuilder::~RBBITableBuilder() {
}
delete fDStates;
delete fSafeTable;
delete fLookAheadRuleMap;
}
@ -121,7 +123,7 @@ void RBBITableBuilder::buildForwardTable() {
}
cn->fLeftChild = fTree;
fTree->fParent = cn;
cn->fRightChild = new RBBINode(RBBINode::endMark);
RBBINode *endMarkerNode = cn->fRightChild = new RBBINode(RBBINode::endMark);
// Delete and exit if memory allocation failed.
if (cn->fRightChild == NULL) {
*fStatus = U_MEMORY_ALLOCATION_ERROR;
@ -164,7 +166,7 @@ void RBBITableBuilder::buildForwardTable() {
// For "chained" rules, modify the followPos sets
//
if (fRB->fChainRules) {
calcChainedFollowPos(fTree);
calcChainedFollowPos(fTree, endMarkerNode);
}
//
@ -178,6 +180,7 @@ void RBBITableBuilder::buildForwardTable() {
// Build the DFA state transition tables.
//
buildStateTable();
mapLookAheadRules();
flagAcceptingStates();
flagLookAheadStates();
flagTaggedStates();
@ -401,19 +404,13 @@ void RBBITableBuilder::addRuleRootNodes(UVector *dest, RBBINode *node) {
// to implement rule chaining. NOT described by Aho
//
//-----------------------------------------------------------------------------
void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree, RBBINode *endMarkNode) {
UVector endMarkerNodes(*fStatus);
UVector leafNodes(*fStatus);
int32_t i;
if (U_FAILURE(*fStatus)) {
return;
}
// get a list of all endmarker nodes.
tree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
// get a list all leaf nodes
tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
if (U_FAILURE(*fStatus)) {
@ -442,28 +439,26 @@ void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
int32_t startNodeIx;
for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
RBBINode *tNode = (RBBINode *)leafNodes.elementAt(endNodeIx);
RBBINode *endNode = NULL;
RBBINode *endNode = (RBBINode *)leafNodes.elementAt(endNodeIx);
// Identify leaf nodes that correspond to overall rule match positions.
// These include an endMarkerNode in their followPos sets.
for (i=0; i<endMarkerNodes.size(); i++) {
if (tNode->fFollowPos->contains(endMarkerNodes.elementAt(i))) {
endNode = tNode;
break;
}
}
if (endNode == NULL) {
// node wasn't an end node. Try again with the next.
// These include the endMarkNode in their followPos sets.
//
// Note: do not consider other end marker nodes, those that are added to
// look-ahead rules. These can't chain; a match immediately stops
// further matching. This leaves exactly one end marker node, the one
// at the end of the complete tree.
if (!endNode->fFollowPos->contains(endMarkNode)) {
continue;
}
// We've got a node that can end a match.
// Line Break Specific hack: If this node's val correspond to the $CM char class,
// don't chain from it.
// TODO: Add rule syntax for this behavior, get specifics out of here and
// into the rule file.
// !!LBCMNoChain implementation: If this node's val correspond to
// the Line Break $CM char class, don't chain from it.
// TODO: Remove this. !!LBCMNoChain is deprecated, and is not used
// by any of the standard ICU rules.
if (fRB->fLBCMNoChain) {
UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
if (c != -1) {
@ -475,7 +470,6 @@ void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
}
}
// Now iterate over the nodes that can start a match, looking for ones
// with the same char class as our ending node.
RBBINode *startNode;
@ -705,6 +699,77 @@ ExitBuildSTdeleteall:
}
/**
* mapLookAheadRules
*
*/
void RBBITableBuilder::mapLookAheadRules() {
fLookAheadRuleMap = new UVector32(fRB->fScanner->numRules() + 1, *fStatus);
if (fLookAheadRuleMap == nullptr) {
*fStatus = U_MEMORY_ALLOCATION_ERROR;
}
if (U_FAILURE(*fStatus)) {
return;
}
fLookAheadRuleMap->setSize(fRB->fScanner->numRules() + 1);
int32_t laSlotsInUse = 0;
for (int32_t n=0; n<fDStates->size(); n++) {
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
int32_t laSlotForState = 0;
// Establish the look-ahead slot for this state, if the state covers
// any look-ahead nodes - corresponding to the '/' in look-ahead rules.
// If any of the look-ahead nodes already have a slot assigned, use it,
// otherwise assign a new one.
bool sawLookAheadNode = false;
for (int32_t ipos=0; ipos<sd->fPositions->size(); ++ipos) {
RBBINode *node = static_cast<RBBINode *>(sd->fPositions->elementAt(ipos));
if (node->fType != RBBINode::NodeType::lookAhead) {
continue;
}
sawLookAheadNode = true;
int32_t ruleNum = node->fVal; // Set when rule was originally parsed.
U_ASSERT(ruleNum < fLookAheadRuleMap->size());
U_ASSERT(ruleNum > 0);
int32_t laSlot = fLookAheadRuleMap->elementAti(ruleNum);
if (laSlot != 0) {
if (laSlotForState == 0) {
laSlotForState = laSlot;
} else {
// TODO: figure out if this can fail, change to setting an error code if so.
U_ASSERT(laSlot == laSlotForState);
}
}
}
if (!sawLookAheadNode) {
continue;
}
if (laSlotForState == 0) {
laSlotForState = ++laSlotsInUse;
}
// For each look ahead node covered by this state,
// set the mapping from the node's rule number to the look ahead slot.
// There can be multiple nodes/rule numbers going to the same la slot.
for (int32_t ipos=0; ipos<sd->fPositions->size(); ++ipos) {
RBBINode *node = static_cast<RBBINode *>(sd->fPositions->elementAt(ipos));
if (node->fType != RBBINode::NodeType::lookAhead) {
continue;
}
int32_t ruleNum = node->fVal; // Set when rule was originally parsed.
int32_t existingVal = fLookAheadRuleMap->elementAti(ruleNum);
(void)existingVal;
U_ASSERT(existingVal == 0 || existingVal == laSlotForState);
fLookAheadRuleMap->setElementAt(laSlotForState, ruleNum);
}
}
}
//-----------------------------------------------------------------------------
//
@ -744,28 +809,19 @@ void RBBITableBuilder::flagAcceptingStates() {
if (sd->fAccepting==0) {
// State hasn't been marked as accepting yet. Do it now.
sd->fAccepting = endMarker->fVal;
sd->fAccepting = fLookAheadRuleMap->elementAti(endMarker->fVal);
if (sd->fAccepting == 0) {
sd->fAccepting = -1;
}
}
if (sd->fAccepting==-1 && endMarker->fVal != 0) {
// Both lookahead and non-lookahead accepting for this state.
// Favor the look-ahead. Expedient for line break.
// TODO: need a more elegant resolution for conflicting rules.
sd->fAccepting = endMarker->fVal;
// Favor the look-ahead, because a look-ahead match needs to
// immediately stop the run-time engine. First match, not longest.
sd->fAccepting = fLookAheadRuleMap->elementAti(endMarker->fVal);
}
// implicit else:
// if sd->fAccepting already had a value other than 0 or -1, leave it be.
// If the end marker node is from a look-ahead rule, set
// the fLookAhead field for this state also.
if (endMarker->fLookAheadEnd) {
// TODO: don't change value if already set?
// TODO: allow for more than one active look-ahead rule in engine.
// Make value here an index to a side array in engine?
sd->fLookAhead = sd->fAccepting;
}
}
}
}
@ -792,11 +848,20 @@ void RBBITableBuilder::flagLookAheadStates() {
}
for (i=0; i<lookAheadNodes.size(); i++) {
lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i);
U_ASSERT(lookAheadNode->fType == RBBINode::NodeType::lookAhead);
for (n=0; n<fDStates->size(); n++) {
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
if (sd->fPositions->indexOf(lookAheadNode) >= 0) {
sd->fLookAhead = lookAheadNode->fVal;
int32_t positionsIdx = sd->fPositions->indexOf(lookAheadNode);
if (positionsIdx >= 0) {
U_ASSERT(lookAheadNode == sd->fPositions->elementAt(positionsIdx));
int32_t lookaheadSlot = fLookAheadRuleMap->elementAti(lookAheadNode->fVal);
U_ASSERT(sd->fLookAhead == 0 || sd->fLookAhead == lookaheadSlot);
// if (sd->fLookAhead != 0 && sd->fLookAhead != lookaheadSlot) {
// printf("%s:%d Bingo. sd->fLookAhead:%d lookaheadSlot:%d\n",
// __FILE__, __LINE__, sd->fLookAhead, lookaheadSlot);
// }
sd->fLookAhead = lookaheadSlot;
}
}
}
@ -1204,16 +1269,6 @@ void RBBITableBuilder::removeState(IntPair duplStates) {
}
sd->fDtran->setElementAt(newVal, col);
}
if (sd->fAccepting == duplState) {
sd->fAccepting = keepState;
} else if (sd->fAccepting > duplState) {
sd->fAccepting--;
}
if (sd->fLookAhead == duplState) {
sd->fLookAhead = keepState;
} else if (sd->fLookAhead > duplState) {
sd->fLookAhead--;
}
}
}

View file

@ -91,9 +91,10 @@ private:
void calcFirstPos(RBBINode *n);
void calcLastPos(RBBINode *n);
void calcFollowPos(RBBINode *n);
void calcChainedFollowPos(RBBINode *n);
void calcChainedFollowPos(RBBINode *n, RBBINode *endMarkNode);
void bofFixup();
void buildStateTable();
void mapLookAheadRules();
void flagAcceptingStates();
void flagLookAheadStates();
void flagTaggedStates();
@ -175,6 +176,9 @@ private:
/** Synthesized safe table, UVector of UnicodeString, one string per table row. */
UVector *fSafeTable;
/** Map from rule number (fVal in look ahead nodes) to sequential lookahead index. */
UVector32 *fLookAheadRuleMap = nullptr;
RBBITableBuilder(const RBBITableBuilder &other); // forbid copying of this class
RBBITableBuilder &operator=(const RBBITableBuilder &other); // forbid copying of this class

View file

@ -125,6 +125,7 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
TESTCASE_AUTO(TestBug13447);
TESTCASE_AUTO(TestReverse);
TESTCASE_AUTO(TestBug13692);
TESTCASE_AUTO(TestDebugRules);
TESTCASE_AUTO_END;
}
@ -4778,6 +4779,19 @@ void RBBITest::TestBug13692() {
assertSuccess(WHERE, status);
}
void RBBITest::TestProperties() {
UErrorCode errorCode = U_ZERO_ERROR;
UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
if (!prependSet.isEmpty()) {
errln(
"[:GCB=Prepend:] is not empty any more. "
"Uncomment relevant lines in source/data/brkitr/char.txt and "
"change this test to the opposite condition.");
}
}
//
// TestDebug - A place-holder test for debugging purposes.
// For putting in fragments of other tests that can be invoked
@ -4796,15 +4810,56 @@ void RBBITest::TestDebug(void) {
assertSuccess(WHERE, status);
}
void RBBITest::TestProperties() {
UErrorCode errorCode = U_ZERO_ERROR;
UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
if (!prependSet.isEmpty()) {
errln(
"[:GCB=Prepend:] is not empty any more. "
"Uncomment relevant lines in source/data/brkitr/char.txt and "
"change this test to the opposite condition.");
//
// TestDebugRules A stub test for use in debugging rule compilation problems.
// Can be freely altered as needed or convenient.
// Leave disabled - #ifdef'ed out - when not activley debugging. The rule source
// data files may not be available in all environments.
// Any permanent test cases should be moved to rbbitst.txt
// (see Bug 20303 in that file, for example), or to another test function in this file.
//
void RBBITest::TestDebugRules() {
#if 0
const char16_t *rules = u""
"!!quoted_literals_only; \n"
"!!chain; \n"
"!!lookAheadHardBreak; \n"
" \n"
// "[a] / ; \n"
"[a] [b] / [c] [d]; \n"
"[a] [b] / [c] [d] {100}; \n"
"[x] [a] [b] / [c] [d] {100}; \n"
"[a] [b] [c] / [d] {100}; \n"
//" [c] [d] / [e] [f]; \n"
//"[a] [b] / [c]; \n"
;
UErrorCode status = U_ZERO_ERROR;
CharString path(pathToDataDirectory(), status);
path.appendPathPart("brkitr", status);
path.appendPathPart("rules", status);
path.appendPathPart("line.txt", status);
int len;
std::unique_ptr<UChar []> testFile(ReadAndConvertFile(path.data(), len, "UTF-8", status));
if (!assertSuccess(WHERE, status)) {
return;
}
UParseError pe;
// rules = testFile.get();
RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rules, pe, status);
if (!assertSuccess(WHERE, status)) {
delete bi;
return;
}
// bi->dumpTables();
delete bi;
#endif
}
#endif // #if !UCONFIG_NO_BREAK_ITERATION

View file

@ -82,6 +82,7 @@ public:
void TestReverse();
void TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi);
void TestBug13692();
void TestDebugRules();
void TestDebug();
void TestProperties();

View file

@ -1991,3 +1991,23 @@ Bangkok)•</data>
•より<400>詳しい<400>こと<400>を<400>お<400>知<400>り<400>に<400>なり<400>たい<400>方<400>は<400>、•Glossary<200>,• •Technical<200> •Introduction<200> •および<400> •Useful<200> •Resources<200>を<400>ご<400>参照<400>くだ<400>さい<400>。•
•</data>
#
# Bug 20303 Multiple Look-ahead rules with similar contexts.
# Check that samples of such rules are being handled correctly.
#
<rules>
!!forward;
!!quoted_literals_only;
!!chain;
[a] [b] / [c] [d];
[a] [b] / [c] [d] {100};
[a] [b] / [e] [f] {200};
[a] [b] / [e] [g] {300};
[a] [b] [c] [h] {400};
[x] [a] [b] / [c] [d] {500};
[y] [a] [b] [c] [d] {600};
</rules>
<data>•ab<100>c•d•ab<200>e•f•ab<300>e•g•abch<400>xab<500>c•d•yabcd<600></data>

View file

@ -73,7 +73,7 @@ class RBBIRuleScanner {
RBBISymbolTable fSymbolTable; // symbol table, holds definitions of
// $variable symbols.
HashMap<String, RBBISetTableEl> fSetTable = new HashMap<String, RBBISetTableEl>(); // UnicocodeSet hash table, holds indexes to
HashMap<String, RBBISetTableEl> fSetTable = new HashMap<>(); // UnicocodeSet hash table, holds indexes to
// the sets created while parsing rules.
// The key is the string used for creating
// the set.
@ -933,7 +933,7 @@ class RBBIRuleScanner {
// Perform any action specified by this row in the state table.
if (doParseActions(tableEl.fAction) == false) {
// Break out of the state machine loop if the
// the action signalled some kind of error, or
// the action signaled some kind of error, or
// the action was to exit, occurs on normal end-of-rules-input.
break;
}
@ -1070,7 +1070,7 @@ class RBBIRuleScanner {
error(RBBIRuleBuilder.U_BRK_RULE_EMPTY_SET);
}
// Advance the RBBI parse postion over the UnicodeSet pattern.
// Advance the RBBI parse position over the UnicodeSet pattern.
// Don't just set fScanIndex because the line/char positions maintained
// for error reporting would be thrown off.
i = pos.getIndex();
@ -1089,12 +1089,17 @@ class RBBIRuleScanner {
n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
// findSetFor() serves several purposes here:
// - Adopts storage for the UnicodeSet, will be responsible for deleting.
// - Mantains collection of all sets in use, needed later for establishing
// - Maintains collection of all sets in use, needed later for establishing
// character categories for run time engine.
// - Eliminates mulitiple instances of the same set.
// - Eliminates multiple instances of the same set.
// - Creates a new uset node if necessary (if this isn't a duplicate.)
findSetFor(n.fText, n, uset);
}
/**
* @return the number of rules that have been seen.
*/
int numRules() {
return fRuleNum;
}
}

View file

@ -53,8 +53,8 @@ class RBBITableBuilder {
// in RBBITableBuilder.fDStates
RBBIStateDescriptor(int maxInputSymbol) {
fTagVals = new TreeSet<Integer>();
fPositions = new HashSet<RBBINode>();
fTagVals = new TreeSet<>();
fPositions = new HashSet<>();
fDtran = new int[maxInputSymbol+1]; // fDtran needs to be pre-sized.
// It is indexed by input symbols, and will
// hold the next state number for each
@ -74,6 +74,9 @@ class RBBITableBuilder {
/** Synthesized safe table, a List of row arrays. */
private List<short[]> fSafeTable;
/** Map from rule number (fVal in look ahead nodes) to sequential lookahead index. */
int[] fLookAheadRuleMap;
//-----------------------------------------------------------------------------
//
// Constructor for RBBITableBuilder.
@ -84,7 +87,7 @@ class RBBITableBuilder {
RBBITableBuilder(RBBIRuleBuilder rb, int rootNodeIx) {
fRootIx = rootNodeIx;
fRB = rb;
fDStates = new ArrayList<RBBIStateDescriptor>();
fDStates = new ArrayList<>();
}
@ -137,7 +140,7 @@ class RBBITableBuilder {
RBBINode cn = new RBBINode(RBBINode.opCat);
cn.fLeftChild = fRB.fTreeRoots[fRootIx];
fRB.fTreeRoots[fRootIx].fParent = cn;
cn.fRightChild = new RBBINode(RBBINode.endMark);
RBBINode endMarkerNode = cn.fRightChild = new RBBINode(RBBINode.endMark);
cn.fRightChild.fParent = cn;
fRB.fTreeRoots[fRootIx] = cn;
@ -172,7 +175,7 @@ class RBBITableBuilder {
// For "chained" rules, modify the followPos sets
//
if (fRB.fChainRules) {
calcChainedFollowPos(fRB.fTreeRoots[fRootIx]);
calcChainedFollowPos(fRB.fTreeRoots[fRootIx], endMarkerNode);
}
//
@ -186,6 +189,7 @@ class RBBITableBuilder {
// Build the DFA state transition tables.
//
buildStateTable();
mapLookAheadRules();
flagAcceptingStates();
flagLookAheadStates();
flagTaggedStates();
@ -390,13 +394,9 @@ class RBBITableBuilder {
// to implement rule chaining. NOT described by Aho
//
//-----------------------------------------------------------------------------
void calcChainedFollowPos(RBBINode tree) {
void calcChainedFollowPos(RBBINode tree, RBBINode endMarkNode) {
List<RBBINode> endMarkerNodes = new ArrayList<RBBINode>();
List<RBBINode> leafNodes = new ArrayList<RBBINode>();
// get a list of all endmarker nodes.
tree.findNodes(endMarkerNodes, RBBINode.endMark);
List<RBBINode> leafNodes = new ArrayList<>();
// get a list all leaf nodes
tree.findNodes(leafNodes, RBBINode.leafChar);
@ -405,10 +405,10 @@ class RBBITableBuilder {
// with inbound chaining enabled, which is the union of the
// firstPosition sets from each of the rule root nodes.
List<RBBINode> ruleRootNodes = new ArrayList<RBBINode>();
List<RBBINode> ruleRootNodes = new ArrayList<>();
addRuleRootNodes(ruleRootNodes, tree);
Set<RBBINode> matchStartNodes = new HashSet<RBBINode>();
Set<RBBINode> matchStartNodes = new HashSet<>();
for (RBBINode node: ruleRootNodes) {
if (node.fChainIn) {
matchStartNodes.addAll(node.fFirstPosSet);
@ -417,28 +417,26 @@ class RBBITableBuilder {
// Iterate over all leaf nodes,
//
for (RBBINode tNode : leafNodes) {
RBBINode endNode = null;
for (RBBINode endNode : leafNodes) {
// Identify leaf nodes that correspond to overall rule match positions.
// These include an endMarkerNode in their followPos sets.
for (RBBINode endMarkerNode : endMarkerNodes) {
if (tNode.fFollowPos.contains(endMarkerNode)) {
endNode = tNode;
break;
}
}
if (endNode == null) {
// node wasn't an end node. Try again with the next.
// These include the endMarkNode in their followPos sets.
//
// Note: do not consider other end marker nodes, those that are added to
// look-ahead rules. These can't chain; a match immediately stops
// further matching. This leaves exactly one end marker node, the one
// at the end of the complete tree.
if (!endNode.fFollowPos.contains(endMarkNode)) {
continue;
}
// We've got a node that can end a match.
// Line Break Specific hack: If this node's val correspond to the $CM char class,
// don't chain from it.
// TODO: Add rule syntax for this behavior, get specifics out of here and
// into the rule file.
// !!LBCMNoChain implementation: If this node's val correspond to
// the Line Break $CM char class, don't chain from it.
// TODO: Remove this. !!LBCMNoChain is deprecated, and is not used
// by any of the standard ICU rules.
if (fRB.fLBCMNoChain) {
int c = this.fRB.fSetBuilder.getFirstChar(endNode.fVal);
if (c != -1) {
@ -571,7 +569,7 @@ class RBBITableBuilder {
for (RBBINode p : T.fPositions) {
if ((p.fType == RBBINode.leafChar) && (p.fVal == a)) {
if (U == null) {
U = new HashSet<RBBINode>();
U = new HashSet<>();
}
U.addAll(p.fFollowPos);
}
@ -610,7 +608,66 @@ class RBBITableBuilder {
}
}
/**
* mapLookAheadRules
*
*/
void mapLookAheadRules() {
fLookAheadRuleMap = new int[fRB.fScanner.numRules() + 1];
int laSlotsInUse = 0;
for (RBBIStateDescriptor sd: fDStates) {
int laSlotForState = 0;
// Establish the look-ahead slot for this state, if the state covers
// any look-ahead nodes - corresponding to the '/' in look-ahead rules.
// If any of the look-ahead nodes already have a slot assigned, use it,
// otherwise assign a new one.
boolean sawLookAheadNode = false;
for (RBBINode node: sd.fPositions) {
if (node.fType != RBBINode.lookAhead) {
continue;
}
sawLookAheadNode = true;
int ruleNum = node.fVal; // Set when rule was originally parsed.
assert(ruleNum < fLookAheadRuleMap.length);
assert(ruleNum > 0);
int laSlot = fLookAheadRuleMap[ruleNum];
if (laSlot != 0) {
if (laSlotForState == 0) {
laSlotForState = laSlot;
} else {
// TODO: figure out if this can fail, change to setting an error code if so.
assert(laSlot == laSlotForState);
}
}
}
if (!sawLookAheadNode) {
continue;
}
if (laSlotForState == 0) {
laSlotForState = ++laSlotsInUse;
}
// For each look ahead node covered by this state,
// set the mapping from the node's rule number to the look ahead slot.
// There can be multiple nodes/rule numbers going to the same la slot.
for (RBBINode node: sd.fPositions) {
if (node.fType != RBBINode.lookAhead) {
continue;
}
int ruleNum = node.fVal; // Set when rule was originally parsed.
int existingVal = fLookAheadRuleMap[ruleNum];
assert(existingVal == 0 || existingVal == laSlotForState);
fLookAheadRuleMap[ruleNum] = laSlotForState;
}
}
}
//-----------------------------------------------------------------------------
//
@ -622,7 +679,7 @@ class RBBITableBuilder {
//
//-----------------------------------------------------------------------------
void flagAcceptingStates() {
List<RBBINode> endMarkerNodes = new ArrayList<RBBINode>();
List<RBBINode> endMarkerNodes = new ArrayList<>();
RBBINode endMarker;
int i;
int n;
@ -640,29 +697,20 @@ class RBBITableBuilder {
// If no other value was specified, force it to -1.
if (sd.fAccepting==0) {
// State hasn't been marked as accepting yet. Do it now.
sd.fAccepting = endMarker.fVal;
// State hasn't been marked as accepting yet. Do it now.
sd.fAccepting = fLookAheadRuleMap[endMarker.fVal];
if (sd.fAccepting == 0) {
sd.fAccepting = -1;
}
}
}
if (sd.fAccepting==-1 && endMarker.fVal != 0) {
// Both lookahead and non-lookahead accepting for this state.
// Favor the look-ahead. Expedient for line break.
// TODO: need a more elegant resolution for conflicting rules.
sd.fAccepting = endMarker.fVal;
}
// implicit else:
// if sd.fAccepting already had a value other than 0 or -1, leave it be.
// If the end marker node is from a look-ahead rule, set
// the fLookAhead field for this state also.
if (endMarker.fLookAheadEnd) {
// TODO: don't change value if already set?
// TODO: allow for more than one active look-ahead rule in engine.
// Make value here an index to a side array in engine?
sd.fLookAhead = sd.fAccepting;
// Both lookahead and non-lookahead accepting for this state.
// Favor the look-ahead, because a look-ahead match needs to
// immediately stop the run-time engine. First match, not longest.
sd.fAccepting = fLookAheadRuleMap[endMarker.fVal];
}
// implicit else:
// if sd.fAccepting already had a value other than 0 or -1, leave it be.
}
}
}
@ -675,7 +723,7 @@ class RBBITableBuilder {
//
//-----------------------------------------------------------------------------
void flagLookAheadStates() {
List<RBBINode> lookAheadNodes = new ArrayList<RBBINode>();
List<RBBINode> lookAheadNodes = new ArrayList<>();
RBBINode lookAheadNode;
int i;
int n;
@ -683,11 +731,12 @@ class RBBITableBuilder {
fRB.fTreeRoots[fRootIx].findNodes(lookAheadNodes, RBBINode.lookAhead);
for (i=0; i<lookAheadNodes.size(); i++) {
lookAheadNode = lookAheadNodes.get(i);
for (n=0; n<fDStates.size(); n++) {
RBBIStateDescriptor sd = fDStates.get(n);
if (sd.fPositions.contains(lookAheadNode)) {
sd.fLookAhead = lookAheadNode.fVal;
int lookaheadSlot = fLookAheadRuleMap[lookAheadNode.fVal];
assert(sd.fLookAhead == 0 || sd.fLookAhead == lookaheadSlot);
sd.fLookAhead = lookaheadSlot;
}
}
}
@ -702,7 +751,7 @@ class RBBITableBuilder {
//
//-----------------------------------------------------------------------------
void flagTaggedStates() {
List<RBBINode> tagNodes = new ArrayList<RBBINode>();
List<RBBINode> tagNodes = new ArrayList<>();
RBBINode tagNode;
int i;
int n;
@ -765,10 +814,10 @@ class RBBITableBuilder {
fRB.fRuleStatusVals.add(Integer.valueOf(1)); // Num of statuses in group
fRB.fRuleStatusVals.add(Integer.valueOf(0)); // and our single status of zero
SortedSet<Integer> s0 = new TreeSet<Integer>();
SortedSet<Integer> s0 = new TreeSet<>();
Integer izero = Integer.valueOf(0);
fRB.fStatusSets.put(s0, izero);
SortedSet<Integer> s1 = new TreeSet<Integer>();
SortedSet<Integer> s1 = new TreeSet<>();
s1.add(izero);
fRB.fStatusSets.put(s0, izero);
}
@ -986,16 +1035,6 @@ class RBBITableBuilder {
}
sd.fDtran[col] = newVal;
}
if (sd.fAccepting == duplState) {
sd.fAccepting = keepState;
} else if (sd.fAccepting > duplState) {
sd.fAccepting--;
}
if (sd.fLookAhead == duplState) {
sd.fLookAhead = keepState;
} else if (sd.fLookAhead > duplState) {
sd.fLookAhead--;
}
}
}
@ -1167,7 +1206,7 @@ class RBBITableBuilder {
// fLookAhead, etc. are not needed for the safe table, and are omitted at this stage of building.
assert(fSafeTable == null);
fSafeTable = new ArrayList<short[]>();
fSafeTable = new ArrayList<>();
for (int row=0; row<numCharClasses + 2; ++row) {
fSafeTable.add(new short[numCharClasses]);
}

View file

@ -937,9 +937,14 @@ public class RuleBasedBreakIterator extends BreakIterator {
}
}
// If we are at the position of the '/' in a look-ahead (hard break) rule;
// record the current position, to be returned later, if the full rule matches.
// TODO: Move this check before the previous check of fAccepting.
// This would enable hard-break rules with no following context.
// But there are line break test failures when trying this. Investigate.
// Issue ICU-20837
int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
if (rule != 0) {
// At the position of a '/' in a look-ahead match. Record it.
int pos = text.getIndex();
if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
// The iterator has been left in the middle of a surrogate pair.

View file

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f9b73d720421a85704fc64aa0949c94d52e450a44af96c715881e9e6ab0fa3e6
size 12998988
oid sha256:6b14ef66277d196e8b01365e9426a1585ea25a9cc346b4a454db2ecc157aed41
size 12995957

View file

@ -1991,3 +1991,23 @@ Bangkok)•</data>
•より<400>詳しい<400>こと<400>を<400>お<400>知<400>り<400>に<400>なり<400>たい<400>方<400>は<400>、•Glossary<200>,• •Technical<200> •Introduction<200> •および<400> •Useful<200> •Resources<200>を<400>ご<400>参照<400>くだ<400>さい<400>。•
•</data>
#
# Bug 20303 Multiple Look-ahead rules with similar contexts.
# Check that samples of such rules are being handled correctly.
#
<rules>
!!forward;
!!quoted_literals_only;
!!chain;
[a] [b] / [c] [d];
[a] [b] / [c] [d] {100};
[a] [b] / [e] [f] {200};
[a] [b] / [e] [g] {300};
[a] [b] [c] [h] {400};
[x] [a] [b] / [c] [d] {500};
[y] [a] [b] [c] [d] {600};
</rules>
<data>•ab<100>c•d•ab<200>e•f•ab<300>e•g•abch<400>xab<500>c•d•yabcd<600></data>