diff --git a/icu4c/source/common/rbbi.cpp b/icu4c/source/common/rbbi.cpp index 93b4c2e9247..cc79b3b8a39 100644 --- a/icu4c/source/common/rbbi.cpp +++ b/icu4c/source/common/rbbi.cpp @@ -155,7 +155,7 @@ void RuleBasedBreakIterator::init() { fText = NULL; fData = NULL; fCharMappings = NULL; - fLastBreakStatus = 0; + fLastBreakTag = 0; fDictionaryCharCount = 0; if (debugInitDone == FALSE) { @@ -489,11 +489,14 @@ int32_t RuleBasedBreakIterator::handleNext(void) { int32_t lookaheadResult = 0; // begin in state 1 - int32_t state = START_STATE; + int32_t state = START_STATE; int16_t category; - UChar32 c = fText->current32(); + UChar32 c = fText->current32(); RBBIStateTableRow *row; int32_t lookaheadStatus = 0; + int32_t lookaheadTag = 0; + + fLastBreakTag = 0; row = (RBBIStateTableRow *) (fData->fForwardTable->fTableData + (fData->fForwardTable->fRowLen * state)); @@ -550,10 +553,13 @@ int32_t RuleBasedBreakIterator::handleNext(void) { goto continueOn; } - if (row->fAccepting != 0 && row->fLookAhead == 0) { + if (row->fAccepting == -1) { // Match found, common case, no lookahead involved. - result = fText->getIndex(); - lookaheadStatus = 0; // clear out any pending look-ahead matches. + // (It's possible that some lookahead rule matched here also, + // but since there's an unconditional match, we'll favor that.) + result = fText->getIndex(); + lookaheadStatus = 0; // clear out any pending look-ahead matches. + fLastBreakTag = row->fTag; // Remember the break status (tag) value. goto continueOn; } @@ -566,6 +572,7 @@ int32_t RuleBasedBreakIterator::handleNext(void) { if (r > result) { lookaheadResult = r; lookaheadStatus = row->fLookAhead; + lookaheadTag = row->fTag; } goto continueOn; } @@ -576,7 +583,8 @@ int32_t RuleBasedBreakIterator::handleNext(void) { if (lookaheadResult > result) { assert(row->fAccepting == lookaheadStatus); // TODO: handle this case // of overlapping lookahead matches. 
- result = lookaheadResult; + result = lookaheadResult; + fLastBreakTag = lookaheadTag; lookaheadStatus = 0; } goto continueOn; @@ -631,6 +639,7 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) { int32_t result = fText->getIndex(); int32_t lookaheadStatus = 0; int32_t lookaheadResult = 0; + int32_t lookaheadTag = 0; UChar32 c = fText->current32(); RBBIStateTableRow *row; @@ -685,7 +694,7 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) { goto continueOn; } - if (row->fAccepting != 0 && row->fLookAhead == 0) { + if (row->fAccepting == -1) { // Match found, common case, no lookahead involved. result = fText->getIndex(); lookaheadStatus = 0; // clear out any pending look-ahead matches. @@ -694,13 +703,14 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) { if (row->fAccepting == 0 && row->fLookAhead != 0) { // Lookahead match point. Remember it, but only if no other rule - // has unconditinally matched to this point. + // has unconditionally matched to this point. // TODO: handle case where there's a pending match from a different rule // where lookaheadStatus != 0 && lookaheadStatus != row->fLookAhead. int32_t r = fText->getIndex(); if (r > result) { lookaheadResult = r; lookaheadStatus = row->fLookAhead; + lookaheadTag = row->fTag; } goto continueOn; } @@ -711,7 +721,8 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) { if (lookaheadResult > result) { assert(row->fAccepting == lookaheadStatus); // TODO: handle this case // of overlapping lookahead matches. 
- result = lookaheadResult; + result = lookaheadResult; + fLastBreakTag = lookaheadTag; lookaheadStatus = 0; } goto continueOn; @@ -752,8 +763,8 @@ RuleBasedBreakIterator::reset() // getRuleStatus() // //------------------------------------------------------------------------------- -int16_t RuleBasedBreakIterator::getRuleStatus() const { - return fLastBreakStatus; +int32_t RuleBasedBreakIterator::getRuleStatus() const { + return fLastBreakTag; } @@ -764,13 +775,13 @@ int16_t RuleBasedBreakIterator::getRuleStatus() const { // for standard iterator types. // //------------------------------------------------------------------------------- -const uint8_t *RuleBasedBreakIterator::getFlattenedData(uint32_t *length) { +const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) { const uint8_t *retPtr = NULL; - *length = 0; + length = 0; if (fData != NULL) { retPtr = (const uint8_t *)fData->fHeader; - *length = fData->fHeader->fLength; + length = fData->fHeader->fLength; } return retPtr; } diff --git a/icu4c/source/common/rbbirb.cpp b/icu4c/source/common/rbbirb.cpp index 7e4b8e3bd43..3fa930b9c77 100644 --- a/icu4c/source/common/rbbirb.cpp +++ b/icu4c/source/common/rbbirb.cpp @@ -164,10 +164,12 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() { +//---------------------------------------------------------------------------------------- // -// RulesBasedBreakIterator, construct from source rules that are passed in -// in a UnicodeString +// createRuleBasedBreakIterator construct from source rules that are passed in +// in a UnicodeString // +//---------------------------------------------------------------------------------------- BreakIterator * RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules, UParseError &parseError, diff --git a/icu4c/source/common/rbbirpt.h b/icu4c/source/common/rbbirpt.h index 0caf8f671b4..f02493fb318 100644 --- a/icu4c/source/common/rbbirpt.h +++ b/icu4c/source/common/rbbirpt.h @@ -101,7 +101,7 @@ struct 
RBBIRuleTableEl gRuleParseStateTable[] = { , {doExprCatOperator, 36 /*$*/, 12,0, FALSE} // 30 , {doExprCatOperator, 46 /*.*/, 12,0, FALSE} // 31 , {doExprCatOperator, 47 /*/*/, 37,0, FALSE} // 32 - , {doExprCatOperator, 123 /*{*/, 49,0, FALSE} // 33 + , {doExprCatOperator, 123 /*{*/, 49,0, TRUE} // 33 , {doExprOrOperator, 124 /*|*/, 12,0, TRUE} // 34 , {doExprRParen, 41 /*)*/, 255,0, TRUE} // 35 , {doExprFinished, 255, 255,0, FALSE} // 36 diff --git a/icu4c/source/common/rbbirpt.txt b/icu4c/source/common/rbbirpt.txt index 9969cc6ddde..960da36f20e 100644 --- a/icu4c/source/common/rbbirpt.txt +++ b/icu4c/source/common/rbbirpt.txt @@ -129,7 +129,7 @@ expr-cont: '$' term doExprCatOperator '.' term doExprCatOperator '/' look-ahead doExprCatOperator - '{' tag-open doExprCatOperator + '{' n tag-open doExprCatOperator '|' n term doExprOrOperator ')' n pop doExprRParen default pop doExprFinished diff --git a/icu4c/source/common/rbbiscan.cpp b/icu4c/source/common/rbbiscan.cpp index 728d948651d..099109c4fe8 100644 --- a/icu4c/source/common/rbbiscan.cpp +++ b/icu4c/source/common/rbbiscan.cpp @@ -443,7 +443,7 @@ UBool RBBIRuleScanner::doParseActions(EParseAction action, case doStartTagValue: // Scanned a '{', the opening delimiter for a tag value within a rule. 
n = pushNewNode(RBBINode::tag); - n->fVal = 0; + n->fVal = 0; n->fFirstPos = fScanIndex; n->fLastPos = fNextIndex; break; @@ -451,13 +451,15 @@ UBool RBBIRuleScanner::doParseActions(EParseAction action, case doTagDigit: // Just scanned a decimal digit that's part of a tag value { + n = fNodeStack[fNodeStackPtr]; uint32_t v = u_charDigitValue(fC.fChar); assert(v >= 0); - n->fVal *= v; + n->fVal = n->fVal*10 + v; break; } case doTagValue: + n = fNodeStack[fNodeStackPtr]; n->fLastPos = fNextIndex; fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); break; @@ -952,6 +954,19 @@ void RBBIRuleScanner::parse() { } + // + // If there were NO user specified reverse rules, set up the equivalent of ".*;" + // + if (fRB->fReverseTree == NULL) { + fRB->fReverseTree = pushNewNode(RBBINode::opStar); + RBBINode *operand = pushNewNode(RBBINode::setRef); + findSetFor(kAny, operand); + fRB->fReverseTree->fLeftChild = operand; + operand->fParent = fRB->fReverseTree; + fNodeStackPtr -= 2; + } + + // // Parsing of the input RBBI rules is complete. // We now have a parse tree for the rule expressions diff --git a/icu4c/source/common/rbbitblb.cpp b/icu4c/source/common/rbbitblb.cpp index fe422c3210e..9368765a3ab 100644 --- a/icu4c/source/common/rbbitblb.cpp +++ b/icu4c/source/common/rbbitblb.cpp @@ -111,6 +111,7 @@ void RBBITableBuilder::build() { buildStateTable(); flagAcceptingStates(); flagLookAheadStates(); + flagTaggedStates(); if (fRB->fDebugEnv && strstr(fRB->fDebugEnv, "states")) {printStates();}; } diff --git a/icu4c/source/common/unicode/rbbi.h b/icu4c/source/common/unicode/rbbi.h index 70bba5429b8..0560124a81a 100644 --- a/icu4c/source/common/unicode/rbbi.h +++ b/icu4c/source/common/unicode/rbbi.h @@ -201,7 +201,7 @@ protected: // RBBIDataWrapper *fData; UTrie *fCharMappings; - int16_t fLastBreakStatus; + int32_t fLastBreakTag; // Rule {tag} value for the most recent match. 
// // Counter for the number of characters encountered with the "dictionary" @@ -414,7 +414,7 @@ protected: * within brackets, {123}, for example. For rules that do not specify a * status, a default value of 0 is returned. */ - virtual int16_t getRuleStatus() const; + virtual int32_t getRuleStatus() const; /** * Returns a unique class ID POLYMORPHICALLY. Pure virtual override. @@ -446,17 +446,20 @@ protected: /** - * Return the flattened form of compiled break rules, + * Return the binary form of compiled break rules, * which can then be used to create a new break iterator at some * time in the future. Creating a break iterator in this way * is much faster than building one from the source form of the * break rules. * - * @return A pointer to the flattened rule data. The storage + * The binary data can only be used with the same version of ICU + * and on the same platform type (processor endian-ness). + * + * @return A pointer to the binary (compiled) rule data. The storage * belongs to the RulesBasedBreakIterator object, no the * caller, and must not be modified or deleted. 
*/ - virtual const uint8_t *getFlattenedData(uint32_t *length); + virtual const uint8_t *getBinaryRules(uint32_t &length); #ifdef RBBI_DEBUG diff --git a/icu4c/source/test/intltest/rbbiapts.cpp b/icu4c/source/test/intltest/rbbiapts.cpp index 34fbd382fbd..f5e9429da2b 100644 --- a/icu4c/source/test/intltest/rbbiapts.cpp +++ b/icu4c/source/test/intltest/rbbiapts.cpp @@ -610,6 +610,7 @@ void RBBIAPITest::TestBuilder() { bi->setText(testString1); doBoundaryTest(*bi, testString1, bounds1); } + delete bi; } diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index 07fc1f19174..0efb4a4f722 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -348,7 +348,7 @@ void RBBITest::TestDefaultRuleBasedWordIteration() // delete rbbi; } //-------------------------------------------------------------------- -//tests default rules based word iteration +//tests default rules based sentence iteration //-------------------------------------------------------------------- static const UChar kParagraphSeparator[] = {0x2029, 0}; static const UChar kLineSeparator[] = {0x2028, 0}; @@ -766,6 +766,53 @@ void RBBITest::TestTitleBreak() delete titleData; } + +//----------------------------------------------------------------------------------- +// +// Test for status {tag} return value from break rules. +// TODO: a more thorough test. 
+// +//----------------------------------------------------------------------------------- +void RBBITest::TestStatusReturn() { + UnicodeString rulesString1 = "$Letters = [:L:];\n" + "$Numbers = [:N:];\n" + "$Letters+{1};\n" + "$Numbers+{2};\n" + "Help\\ {4}/me\\!;\n" + "[^$Letters $Numbers];\n" + "!.*;\n"; + UnicodeString testString1 = "abc123..abc Help me Help me!"; + // 01234567890123456789012345678 + int32_t bounds1[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1}; + int32_t brkStatus[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1}; + + UErrorCode status=U_ZERO_ERROR; + UParseError parseError; + + RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status); + if(U_FAILURE(status)) { + errln("FAIL : in construction"); + } else { + int32_t pos; + int32_t i = 0; + bi->setText(testString1); + for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) { + if (pos != bounds1[i]) { + errln("FAIL: expected break at %d, got %d\n", bounds1[i], pos); + break; + } + + int tag = bi->getRuleStatus(); + if (tag != brkStatus[i]) { + errln("FAIL: break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag); + break; + } + i++; + } + } + delete bi; +} + /* //Bug: if there is no word break before and after danda when it is followed by a space void RBBITest::TestDanda() @@ -1039,6 +1086,8 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha if(exec) TestHindiWordBreak(); break; case 6: name = "TestTitleBreak"; if(exec) TestTitleBreak(); break; + case 7: name = "TestStatusReturn"; + if(exec) TestStatusReturn(); break; // case 6: name = "TestDanda()"; // if(exec) TestDanda(); break; diff --git a/icu4c/source/test/intltest/rbbitst.h b/icu4c/source/test/intltest/rbbitst.h index 7ca529ad946..0eefc3a3431 100644 --- a/icu4c/source/test/intltest/rbbitst.h +++ b/icu4c/source/test/intltest/rbbitst.h @@ -55,6 +55,12 @@ public: * Tests Title Case break iteration **/ void TestTitleBreak(void); + + /** + * 
Tests rule status return values + **/ + void TestStatusReturn(); + /** * Test Hindi Danda i.e make sure we have a break point before and after danda **/ diff --git a/icu4c/source/tools/genbrk/genbrk.cpp b/icu4c/source/tools/genbrk/genbrk.cpp index 117505df8ca..bbcb69bd8f6 100644 --- a/icu4c/source/tools/genbrk/genbrk.cpp +++ b/icu4c/source/tools/genbrk/genbrk.cpp @@ -191,7 +191,7 @@ int main(int argc, char **argv) { // uint32_t outDataSize; const uint8_t *outData; - outData = bi->getFlattenedData(&outDataSize); + outData = bi->getBinaryRules(outDataSize); //