ICU-13541 Improve RuleBasedBreakIterator construction time, patch from grhoten.

X-SVN-Rev: 40789
2025-04-08 06:53:45 +00:00 · 2018-01-19 22:30:56 +00:00 · 2018-01-19 22:30:56 +00:00 · ac0972f12c
commit ac0972f12c
parent a3dca5a303
4 changed files with 50 additions and 65 deletions
--- a/icu4c/source/common/rbbi.cpp
+++ b/icu4c/source/common/rbbi.cpp
@ -64,7 +64,9 @@ UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
 * Constructs a RuleBasedBreakIterator that uses the already-created
 * tables object that is passed in as a parameter.
 */
-RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status) {
+RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
+ : fSCharIter(UnicodeString())
+{
    init(status);
    fData = new RBBIDataWrapper(data, status); // status checked in constructor
    if (U_FAILURE(status)) {return;}
@ -80,7 +82,9 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode
 //
 RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules,
                       uint32_t       ruleLength,
-                       UErrorCode     &status) {
+                       UErrorCode     &status)
+ : fSCharIter(UnicodeString())
+{
    init(status);
    if (U_FAILURE(status)) {
        return;
@ -110,6 +114,7 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules,
 //
 //-------------------------------------------------------------------------------
 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
+ : fSCharIter(UnicodeString())
 {
    init(status);
    fData = new RBBIDataWrapper(udm, status); // status checked in constructor
@ -130,6 +135,7 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &sta
 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString  &rules,
                                                UParseError          &parseError,
                                                UErrorCode           &status)
+ : fSCharIter(UnicodeString())
 {
    init(status);
    if (U_FAILURE(status)) {return;}
@ -152,7 +158,9 @@ RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString  &rules,
 //                           Used when creating a RuleBasedBreakIterator from a set
 //                           of rules.
 //-------------------------------------------------------------------------------
-RuleBasedBreakIterator::RuleBasedBreakIterator() {
+RuleBasedBreakIterator::RuleBasedBreakIterator()
+ : fSCharIter(UnicodeString())
+{
    UErrorCode status = U_ZERO_ERROR;
    init(status);
 }
@ -165,7 +173,8 @@ RuleBasedBreakIterator::RuleBasedBreakIterator() {
 //
 //-------------------------------------------------------------------------------
 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
-: BreakIterator(other)
+: BreakIterator(other),
+  fSCharIter(UnicodeString())
 {
    UErrorCode status = U_ZERO_ERROR;
    this->init(status);
@ -177,15 +186,11 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& oth
 * Destructor
 */
 RuleBasedBreakIterator::~RuleBasedBreakIterator() {
-    if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
+    if (fCharIter != &fSCharIter) {
        // fCharIter was adopted from the outside.
        delete fCharIter;
    }
    fCharIter = NULL;
-    delete fSCharIter;
-    fSCharIter = NULL;
-    delete fDCharIter;
-    fDCharIter = NULL;

    utext_close(fText);

@ -226,17 +231,21 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
    UErrorCode status = U_ZERO_ERROR;
    fText = utext_clone(fText, that.fText, FALSE, TRUE, &status);

-    if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
+    if (fCharIter != &fSCharIter) {
        delete fCharIter;
    }
    fCharIter = NULL;

-    if (that.fCharIter != NULL ) {
+    if (that.fCharIter != NULL && that.fCharIter != &that.fSCharIter) {
        // This is a little bit tricky - it will intially appear that
        //  this->fCharIter is adopted, even if that->fCharIter was
        //  not adopted.  That's ok.
        fCharIter = that.fCharIter->clone();
    }
+    fSCharIter = that.fSCharIter;
+    if (fCharIter == NULL) {
+        fCharIter = &fSCharIter;
+    }

    if (fData != NULL) {
        fData->removeReference();
@ -271,8 +280,6 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
 void RuleBasedBreakIterator::init(UErrorCode &status) {
    fText                 = NULL;
    fCharIter             = NULL;
-    fSCharIter            = NULL;
-    fDCharIter            = NULL;
    fData                 = NULL;
    fPosition             = 0;
    fRuleStatusIndex      = 0;
@ -393,20 +400,13 @@ void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
    //   Return one over an empty string instead - this is the closest
    //   we can come to signaling a failure.
    //   (GetText() is obsolete, this failure is sort of OK)
-    if (fDCharIter == NULL) {
-        static const UChar c = 0;
-        fDCharIter = new UCharCharacterIterator(&c, 0);
-        if (fDCharIter == NULL) {
-            status = U_MEMORY_ALLOCATION_ERROR;
-            return;
-        }
-    }
+    fSCharIter.setText(UnicodeString());

-    if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
+    if (fCharIter != &fSCharIter) {
        // existing fCharIter was adopted from the outside.  Delete it now.
        delete fCharIter;
    }
-    fCharIter = fDCharIter;
+    fCharIter = &fSCharIter;

    this->first();
 }
@ -439,7 +439,7 @@ void
 RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
    // If we are holding a CharacterIterator adopted from a
    //   previous call to this function, delete it now.
-    if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
+    if (fCharIter != &fSCharIter) {
        delete fCharIter;
    }

@ -473,17 +473,13 @@ RuleBasedBreakIterator::setText(const UnicodeString& newText) {
    //   Needed in case someone calls getText().
    //  Can not, unfortunately, do this lazily on the (probably never)
    //  call to getText(), because getText is const.
-    if (fSCharIter == NULL) {
-        fSCharIter = new StringCharacterIterator(newText);
-    } else {
-        fSCharIter->setText(newText);
-    }
+    fSCharIter.setText(newText);

-    if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
+    if (fCharIter != &fSCharIter) {
        // old fCharIter was adopted from the outside.  Delete it.
        delete fCharIter;
    }
-    fCharIter = fSCharIter;
+    fCharIter = &fSCharIter;

    this->first();
 }
--- a/icu4c/source/common/rbbi_cache.cpp
+++ b/icu4c/source/common/rbbi_cache.cpp
@ -26,14 +26,11 @@ U_NAMESPACE_BEGIN
 */

 RuleBasedBreakIterator::DictionaryCache::DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status) :
-        fBI(bi), fBreaks(NULL), fPositionInCache(-1),
+        fBI(bi), fBreaks(status), fPositionInCache(-1),
        fStart(0), fLimit(0), fFirstRuleStatusIndex(0), fOtherRuleStatusIndex(0) {
-    fBreaks = new UVector32(status);
 }

 RuleBasedBreakIterator::DictionaryCache::~DictionaryCache() {
-    delete fBreaks;
-    fBreaks = NULL;
 }

 void RuleBasedBreakIterator::DictionaryCache::reset() {
@ -42,7 +39,7 @@ void RuleBasedBreakIterator::DictionaryCache::reset() {
    fLimit = 0;
    fFirstRuleStatusIndex = 0;
    fOtherRuleStatusIndex = 0;
-    fBreaks->removeAllElements();
+    fBreaks.removeAllElements();
 }

 UBool RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos, int32_t *result, int32_t *statusIndex) {
@ -54,13 +51,13 @@ UBool RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos, int32_
    // Sequential iteration, move from previous boundary to the following

    int32_t r = 0;
-    if (fPositionInCache >= 0 && fPositionInCache < fBreaks->size() && fBreaks->elementAti(fPositionInCache) == fromPos) {
+    if (fPositionInCache >= 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAti(fPositionInCache) == fromPos) {
        ++fPositionInCache;
-        if (fPositionInCache >= fBreaks->size()) {
+        if (fPositionInCache >= fBreaks.size()) {
            fPositionInCache = -1;
            return FALSE;
        }
-        r = fBreaks->elementAti(fPositionInCache);
+        r = fBreaks.elementAti(fPositionInCache);
        U_ASSERT(r > fromPos);
        *result = r;
        *statusIndex = fOtherRuleStatusIndex;
@ -69,8 +66,8 @@ UBool RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos, int32_

    // Random indexing. Linear search for the boundary following the given position.

-    for (fPositionInCache = 0; fPositionInCache < fBreaks->size(); ++fPositionInCache) {
-        r= fBreaks->elementAti(fPositionInCache);
+    for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) {
+        r= fBreaks.elementAti(fPositionInCache);
        if (r > fromPos) {
            *result = r;
            *statusIndex = fOtherRuleStatusIndex;
@ -90,16 +87,16 @@ UBool RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos, int32_
    }

    if (fromPos == fLimit) {
-        fPositionInCache = fBreaks->size() - 1;
+        fPositionInCache = fBreaks.size() - 1;
        if (fPositionInCache >= 0) {
-            U_ASSERT(fBreaks->elementAti(fPositionInCache) == fromPos);
+            U_ASSERT(fBreaks.elementAti(fPositionInCache) == fromPos);
        }
    }

    int32_t r;
-    if (fPositionInCache > 0 && fPositionInCache < fBreaks->size() && fBreaks->elementAti(fPositionInCache) == fromPos) {
+    if (fPositionInCache > 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAti(fPositionInCache) == fromPos) {
        --fPositionInCache;
-        r = fBreaks->elementAti(fPositionInCache);
+        r = fBreaks.elementAti(fPositionInCache);
        U_ASSERT(r < fromPos);
        *result = r;
        *statusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
@ -111,8 +108,8 @@ UBool RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos, int32_
        return FALSE;
    }

-    for (fPositionInCache = fBreaks->size()-1; fPositionInCache >= 0; --fPositionInCache) {
-        r = fBreaks->elementAti(fPositionInCache);
+    for (fPositionInCache = fBreaks.size()-1; fPositionInCache >= 0; --fPositionInCache) {
+        r = fBreaks.elementAti(fPositionInCache);
        if (r < fromPos) {
            *result = r;
            *statusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
@ -168,7 +165,7 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo
        // Ask the language object if there are any breaks. It will add them to the cache and
        // leave the text pointer on the other side of its range, ready to search for the next one.
        if (lbe != NULL) {
-            foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBI->fBreakType, *fBreaks);
+            foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBI->fBreakType, fBreaks);
        }

        // Reload the loop variables for the next go-round
@ -182,21 +179,21 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo

    // printf("foundBreakCount = %d\n", foundBreakCount);
    if (foundBreakCount > 0) {
-        U_ASSERT(foundBreakCount == fBreaks->size());
-        if (startPos < fBreaks->elementAti(0)) {
+        U_ASSERT(foundBreakCount == fBreaks.size());
+        if (startPos < fBreaks.elementAti(0)) {
            // The dictionary did not place a boundary at the start of the segment of text.
            // Add one now. This should not commonly happen, but it would be easy for interactions
            // of the rules for dictionary segments and the break engine implementations to
            // inadvertently cause it. Cover it here, just in case.
-            fBreaks->insertElementAt(startPos, 0, status);
+            fBreaks.insertElementAt(startPos, 0, status);
        }
-        if (endPos > fBreaks->peeki()) {
-            fBreaks->push(endPos, status);
+        if (endPos > fBreaks.peeki()) {
+            fBreaks.push(endPos, status);
        }
        fPositionInCache = 0;
        // Note: Dictionary matching may extend beyond the original limit.
-        fStart = fBreaks->elementAti(0);
-        fLimit = fBreaks->peeki();
+        fStart = fBreaks.elementAti(0);
+        fLimit = fBreaks.peeki();
    } else {
        // there were no language-based breaks, even though the segment contained
        // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache
--- a/icu4c/source/common/rbbi_cache.h
+++ b/icu4c/source/common/rbbi_cache.h
@ -56,7 +56,7 @@ class RuleBasedBreakIterator::DictionaryCache: public UMemory {

    RuleBasedBreakIterator *fBI;
    
-    UVector32          *fBreaks;                // A vector containing the boundaries.
+    UVector32           fBreaks;                // A vector containing the boundaries.
    int32_t             fPositionInCache;       // Index in fBreaks of last boundary returned by following()
                                                //    or preceding(). Optimizes sequential access.
    int32_t             fStart;                 // Text position of first boundary in cache.
--- a/icu4c/source/common/unicode/rbbi.h
+++ b/icu4c/source/common/unicode/rbbi.h
@ -29,7 +29,6 @@
 #include "unicode/udata.h"
 #include "unicode/parseerr.h"
 #include "unicode/schriter.h"
-#include "unicode/uchriter.h"

 U_NAMESPACE_BEGIN

@ -72,14 +71,7 @@ private:
     *    a characterIterator that wraps that data.  Needed only for the
     *    implementation of getText(), a backwards compatibility issue.
     */
-    StringCharacterIterator *fSCharIter;
-
-    /**
-     *  When the input text is provided by a UText, this
-     *    dummy CharacterIterator over an empty string will
-     *    be returned from getText()
-     */
-    UCharCharacterIterator *fDCharIter;
+    StringCharacterIterator fSCharIter;

    /**
     * The rule data for this BreakIterator instance