From 840affc2004abd16703ae60e290cf91c24b24d18 Mon Sep 17 00:00:00 2001 From: Andy Heninger Date: Thu, 4 Dec 2003 02:12:42 +0000 Subject: [PATCH] ICU-2924 rbbi builder, better handling of !!lookAheadHardBreak option X-SVN-Rev: 13994 --- icu4c/source/common/rbbi.cpp | 11 +++++---- icu4c/source/common/rbbidata.cpp | 42 +++++++++++++++----------------- icu4c/source/common/rbbidata.h | 28 ++++++++++----------- icu4c/source/common/rbbirb.cpp | 1 + icu4c/source/common/rbbirb.h | 4 +++ icu4c/source/common/rbbiscan.cpp | 13 +++++----- icu4c/source/common/rbbitblb.cpp | 13 +++++++--- icu4c/source/common/rbbitblb.h | 2 ++ 8 files changed, 61 insertions(+), 53 deletions(-) diff --git a/icu4c/source/common/rbbi.cpp b/icu4c/source/common/rbbi.cpp index ca4336d2c92..e3a98f313c8 100644 --- a/icu4c/source/common/rbbi.cpp +++ b/icu4c/source/common/rbbi.cpp @@ -998,10 +998,11 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) UChar32 c = fText->previous32(); // previous character int32_t result = fText->getIndex(); - int32_t lookaheadStatus = 0;//[] = {0, 0, 0, 0, 0}; - int32_t lookaheadResult = 0;//[] = {0, 0, 0, 0, 0}; - int32_t lookaheadTag = 0;//[] = {0, 0, 0, 0, 0}; - /*int32_t lookaheadCount = 0;*/ + int32_t lookaheadStatus = 0; + int32_t lookaheadResult = 0; + int32_t lookaheadTag = 0; + UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0; + RBBIStateTableRow *row; row = (RBBIStateTableRow *) @@ -1081,7 +1082,7 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) /// syn wee todo hard coded for line breaks stuff /// needs to provide a tag in rules to ensure a stop. 
- if (fData->fLookAheadHardBreak == TRUE) { + if (lookAheadHardBreak) { fText->setIndex(result); return result; } diff --git a/icu4c/source/common/rbbidata.cpp b/icu4c/source/common/rbbidata.cpp index ee5ab96c68c..c0bb041b577 100644 --- a/icu4c/source/common/rbbidata.cpp +++ b/icu4c/source/common/rbbidata.cpp @@ -107,15 +107,6 @@ void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) { fRefCount = 1; - /// todo: maybe add this formally to the builder - UnicodeString hardbreak = UNICODE_STRING_SIMPLE("!!lookAheadHardBreak"); - if (fRuleString.indexOf(hardbreak) >= 0) { - fLookAheadHardBreak = TRUE; - } - else { - fLookAheadHardBreak = FALSE; - } - #ifdef RBBI_DEBUG char *debugEnv = getenv("U_RBBIDEBUG"); if (debugEnv && uprv_strstr(debugEnv, "data")) {this->printData();} @@ -356,37 +347,44 @@ ubrk_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outD uprv_memset(outBytes, 0, length); } - // Forward state table. Two int32_t vars at the start, then all int16_ts. + // + // Each state table begins with several 32 bit fields. Calculate the size + // in bytes of these. + // + RBBIStateTable *stp = NULL; + int32_t topSize = (char *)stp->fTableData - (char *)stp; + + // Forward state table. tableStartOffset = ds->readUInt32(rbbiDH->fFTable); tableLength = ds->readUInt32(rbbiDH->fFTableLen); - ds->swapArray32(ds, inBytes+tableStartOffset, 8, + ds->swapArray32(ds, inBytes+tableStartOffset, topSize, outBytes+tableStartOffset, status); - ds->swapArray16(ds, inBytes+tableStartOffset+8, tableLength-8, - outBytes+tableStartOffset+8, status); + ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize, + outBytes+tableStartOffset+topSize, status); // Reverse state table. Same layout as forward table, above. 
tableStartOffset = ds->readUInt32(rbbiDH->fRTable); tableLength = ds->readUInt32(rbbiDH->fRTableLen); - ds->swapArray32(ds, inBytes+tableStartOffset, 8, + ds->swapArray32(ds, inBytes+tableStartOffset, topSize, outBytes+tableStartOffset, status); - ds->swapArray16(ds, inBytes+tableStartOffset+8, tableLength-8, - outBytes+tableStartOffset+8, status); + ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize, + outBytes+tableStartOffset+topSize, status); // Safe Forward state table. Same layout as forward table, above. tableStartOffset = ds->readUInt32(rbbiDH->fSFTable); tableLength = ds->readUInt32(rbbiDH->fSFTableLen); - ds->swapArray32(ds, inBytes+tableStartOffset, 8, + ds->swapArray32(ds, inBytes+tableStartOffset, topSize, outBytes+tableStartOffset, status); - ds->swapArray16(ds, inBytes+tableStartOffset+8, tableLength-8, - outBytes+tableStartOffset+8, status); + ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize, + outBytes+tableStartOffset+topSize, status); // Safe Reverse state table. Same layout as forward table, above. 
tableStartOffset = ds->readUInt32(rbbiDH->fSRTable); tableLength = ds->readUInt32(rbbiDH->fSRTableLen); - ds->swapArray32(ds, inBytes+tableStartOffset, 8, + ds->swapArray32(ds, inBytes+tableStartOffset, topSize, outBytes+tableStartOffset, status); - ds->swapArray16(ds, inBytes+tableStartOffset+8, tableLength-8, - outBytes+tableStartOffset+8, status); + ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize, + outBytes+tableStartOffset+topSize, status); // Trie table for character categories utrie_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen), diff --git a/icu4c/source/common/rbbidata.h b/icu4c/source/common/rbbidata.h index 94a9c6a9dce..68fbc283a4d 100644 --- a/icu4c/source/common/rbbidata.h +++ b/icu4c/source/common/rbbidata.h @@ -53,9 +53,9 @@ ubrk_swap(const UDataSwapper *ds, U_NAMESPACE_BEGIN -/* */ -/* The following structs map exactly onto the raw data from ICU common data file. */ -/* */ +/* + * The following structs map exactly onto the raw data from ICU common data file. + */ struct RBBIDataHeader { uint32_t fMagic; /* == 0xbla0 */ uint32_t fVersion; /* == 1 */ @@ -108,13 +108,19 @@ struct RBBIStateTableRow { struct RBBIStateTable { - uint32_t fNumStates; /* Number of states. */ - uint32_t fRowLen; /* Length of a state table row, in bytes. */ - char fTableData[4]; /* First RBBIStateTableRow begins here. */ - /* (making it char[] simplifies ugly address */ + uint32_t fNumStates; /* Number of states. */ + uint32_t fRowLen; /* Length of a state table row, in bytes. */ + uint32_t fFlags; /* Option Flags for this state table */ + uint32_t fReserved; /* reserved */ + char fTableData[4]; /* First RBBIStateTableRow begins here. */ + /* (making it char[] simplifies ugly address */ /* arithmetic for indexing variable length rows.) 
*/ }; +typedef enum { + RBBI_LOOKAHEAD_HARD_BREAK = 1 +} RBBIStateTableFlags; + /* */ /* The reference counting wrapper class */ @@ -145,14 +151,6 @@ public: const UChar *fRuleSource; UTrie fTrie; - /* if fLookAheadHardBreak is true, we will break at the first lookahead match */ - /* the search does not go on further to look for a longer match */ - /* this also allows breaks at both ends of the string */ - /* e.g. rule "ABC / D; ABCDE" and */ - /* text "ABCD ABCDE ABC" will give breaks at */ - /* 01234567890123 */ - /* {0, 3, 4, 5, 8, 9, 10, 11, 14} */ - UBool fLookAheadHardBreak; private: int32_t fRefCount; diff --git a/icu4c/source/common/rbbirb.cpp b/icu4c/source/common/rbbirb.cpp index ec824661531..d3c588f5ebb 100644 --- a/icu4c/source/common/rbbirb.cpp +++ b/icu4c/source/common/rbbirb.cpp @@ -66,6 +66,7 @@ RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules, fSafeRevTables = NULL; fChainRules = FALSE; fLBCMNoChain = FALSE; + fLookAheadHardBreak = FALSE; UErrorCode oldstatus = status; diff --git a/icu4c/source/common/rbbirb.h b/icu4c/source/common/rbbirb.h index 63d59d4f5e2..aa7c0ec6715 100644 --- a/icu4c/source/common/rbbirb.h +++ b/icu4c/source/common/rbbirb.h @@ -134,6 +134,10 @@ public: UBool fLBCMNoChain; // True: suppress chaining of rules on // chars with LineBreak property == CM. + UBool fLookAheadHardBreak; // True: Look ahead matches cause an + // immediate break, no continuing for the + // longest match. + RBBISetBuilder *fSetBuilder; // Set and Character Category builder. UVector *fUSetNodes; // Vector of all uset nodes. 
diff --git a/icu4c/source/common/rbbiscan.cpp b/icu4c/source/common/rbbiscan.cpp index d840665d50e..6e8251b2cd0 100644 --- a/icu4c/source/common/rbbiscan.cpp +++ b/icu4c/source/common/rbbiscan.cpp @@ -32,6 +32,7 @@ #include "rbbirb.h" #include "rbbinode.h" #include "rbbiscan.h" +#include "rbbitblb.h" #include "uassert.h" @@ -473,17 +474,15 @@ UBool RBBIRuleScanner::doParseActions(EParseAction action) } else if (opt == "LBCMNoChain") { fRB->fLBCMNoChain = TRUE; } else if (opt == "forward") { - fRB->fDefaultTree = &fRB->fForwardTree; + fRB->fDefaultTree = &fRB->fForwardTree; } else if (opt == "reverse") { - fRB->fDefaultTree = &fRB->fReverseTree; + fRB->fDefaultTree = &fRB->fReverseTree; } else if (opt == "safe_forward") { - fRB->fDefaultTree = &fRB->fSafeFwdTree; + fRB->fDefaultTree = &fRB->fSafeFwdTree; } else if (opt == "safe_reverse") { - fRB->fDefaultTree = &fRB->fSafeRevTree; + fRB->fDefaultTree = &fRB->fSafeRevTree; } else if (opt == "lookAheadHardBreak") { - // at the moment do nothing for this - // the code is handled in rbbi.cpp - // todo: think about how to handle this + fRB->fLookAheadHardBreak = TRUE; } else { error(U_BRK_UNRECOGNIZED_OPTION); } diff --git a/icu4c/source/common/rbbitblb.cpp b/icu4c/source/common/rbbitblb.cpp index 218c917b87e..d1feaeaae31 100644 --- a/icu4c/source/common/rbbitblb.cpp +++ b/icu4c/source/common/rbbitblb.cpp @@ -25,10 +25,10 @@ U_NAMESPACE_BEGIN RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode) : fTree(*rootNode) { - fRB = rb; - fStatus = fRB->fStatus; - UErrorCode status = U_ZERO_ERROR; - fDStates = new UVector(status); + fRB = rb; + fStatus = fRB->fStatus; + UErrorCode status = U_ZERO_ERROR; + fDStates = new UVector(status); if (U_FAILURE(*fStatus)) { return; } @@ -805,6 +805,11 @@ void RBBITableBuilder::exportTable(void *where) { table->fRowLen = sizeof(RBBIStateTableRow) + sizeof(uint16_t) * (fRB->fSetBuilder->getNumCharCategories() - 2); table->fNumStates = fDStates->size(); + table->fFlags = 
0; + if (fRB->fLookAheadHardBreak) { + table->fFlags |= RBBI_LOOKAHEAD_HARD_BREAK; + } + table->fReserved = 0; for (state=0; state<table->fNumStates; state++) { RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state); diff --git a/icu4c/source/common/rbbitblb.h b/icu4c/source/common/rbbitblb.h index 4f72c93e7de..effae884fc9 100644 --- a/icu4c/source/common/rbbitblb.h +++ b/icu4c/source/common/rbbitblb.h @@ -44,6 +44,7 @@ public: // Sufficient memory must exist at // the specified location. + private: void calcNullable(RBBINode *n); void calcFirstPos(RBBINode *n); @@ -76,6 +77,7 @@ private: // Index is state number // Contents are RBBIStateDescriptor pointers. + RBBITableBuilder(const RBBITableBuilder &other); // forbid copying of this class RBBITableBuilder &operator=(const RBBITableBuilder &other); // forbid copying of this class };