ICU-13541 Improve RuleBasedBreakIterator construction time, patch from grhoten.

X-SVN-Rev: 40789
This commit is contained in:
Andy Heninger 2018-01-19 22:30:56 +00:00
parent a3dca5a303
commit ac0972f12c
4 changed files with 50 additions and 65 deletions

View file

@ -64,7 +64,9 @@ UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
* Constructs a RuleBasedBreakIterator that uses the already-created
* tables object that is passed in as a parameter.
*/
RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status) {
RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
: fSCharIter(UnicodeString())
{
init(status);
fData = new RBBIDataWrapper(data, status); // status checked in constructor
if (U_FAILURE(status)) {return;}
@ -80,7 +82,9 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode
//
RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules,
uint32_t ruleLength,
UErrorCode &status) {
UErrorCode &status)
: fSCharIter(UnicodeString())
{
init(status);
if (U_FAILURE(status)) {
return;
@ -110,6 +114,7 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules,
//
//-------------------------------------------------------------------------------
RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
: fSCharIter(UnicodeString())
{
init(status);
fData = new RBBIDataWrapper(udm, status); // status checked in constructor
@ -130,6 +135,7 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &sta
RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules,
UParseError &parseError,
UErrorCode &status)
: fSCharIter(UnicodeString())
{
init(status);
if (U_FAILURE(status)) {return;}
@ -152,7 +158,9 @@ RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules,
// Used when creating a RuleBasedBreakIterator from a set
// of rules.
//-------------------------------------------------------------------------------
RuleBasedBreakIterator::RuleBasedBreakIterator() {
RuleBasedBreakIterator::RuleBasedBreakIterator()
: fSCharIter(UnicodeString())
{
UErrorCode status = U_ZERO_ERROR;
init(status);
}
@ -165,7 +173,8 @@ RuleBasedBreakIterator::RuleBasedBreakIterator() {
//
//-------------------------------------------------------------------------------
RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
: BreakIterator(other)
: BreakIterator(other),
fSCharIter(UnicodeString())
{
UErrorCode status = U_ZERO_ERROR;
this->init(status);
@ -177,15 +186,11 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& oth
* Destructor
*/
RuleBasedBreakIterator::~RuleBasedBreakIterator() {
if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
if (fCharIter != &fSCharIter) {
// fCharIter was adopted from the outside.
delete fCharIter;
}
fCharIter = NULL;
delete fSCharIter;
fSCharIter = NULL;
delete fDCharIter;
fDCharIter = NULL;
utext_close(fText);
@ -226,17 +231,21 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
UErrorCode status = U_ZERO_ERROR;
fText = utext_clone(fText, that.fText, FALSE, TRUE, &status);
if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
if (fCharIter != &fSCharIter) {
delete fCharIter;
}
fCharIter = NULL;
if (that.fCharIter != NULL ) {
if (that.fCharIter != NULL && that.fCharIter != &that.fSCharIter) {
// This is a little bit tricky - it will initially appear that
// this->fCharIter is adopted, even if that->fCharIter was
// not adopted. That's ok.
fCharIter = that.fCharIter->clone();
}
fSCharIter = that.fSCharIter;
if (fCharIter == NULL) {
fCharIter = &fSCharIter;
}
if (fData != NULL) {
fData->removeReference();
@ -271,8 +280,6 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
void RuleBasedBreakIterator::init(UErrorCode &status) {
fText = NULL;
fCharIter = NULL;
fSCharIter = NULL;
fDCharIter = NULL;
fData = NULL;
fPosition = 0;
fRuleStatusIndex = 0;
@ -393,20 +400,13 @@ void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
// Return one over an empty string instead - this is the closest
// we can come to signaling a failure.
// (GetText() is obsolete, this failure is sort of OK)
if (fDCharIter == NULL) {
static const UChar c = 0;
fDCharIter = new UCharCharacterIterator(&c, 0);
if (fDCharIter == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
}
fSCharIter.setText(UnicodeString());
if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
if (fCharIter != &fSCharIter) {
// existing fCharIter was adopted from the outside. Delete it now.
delete fCharIter;
}
fCharIter = fDCharIter;
fCharIter = &fSCharIter;
this->first();
}
@ -439,7 +439,7 @@ void
RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
// If we are holding a CharacterIterator adopted from a
// previous call to this function, delete it now.
if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
if (fCharIter != &fSCharIter) {
delete fCharIter;
}
@ -473,17 +473,13 @@ RuleBasedBreakIterator::setText(const UnicodeString& newText) {
// Needed in case someone calls getText().
// Can not, unfortunately, do this lazily on the (probably never)
// call to getText(), because getText is const.
if (fSCharIter == NULL) {
fSCharIter = new StringCharacterIterator(newText);
} else {
fSCharIter->setText(newText);
}
fSCharIter.setText(newText);
if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
if (fCharIter != &fSCharIter) {
// old fCharIter was adopted from the outside. Delete it.
delete fCharIter;
}
fCharIter = fSCharIter;
fCharIter = &fSCharIter;
this->first();
}

View file

@ -26,14 +26,11 @@ U_NAMESPACE_BEGIN
*/
RuleBasedBreakIterator::DictionaryCache::DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status) :
fBI(bi), fBreaks(NULL), fPositionInCache(-1),
fBI(bi), fBreaks(status), fPositionInCache(-1),
fStart(0), fLimit(0), fFirstRuleStatusIndex(0), fOtherRuleStatusIndex(0) {
fBreaks = new UVector32(status);
}
RuleBasedBreakIterator::DictionaryCache::~DictionaryCache() {
delete fBreaks;
fBreaks = NULL;
}
void RuleBasedBreakIterator::DictionaryCache::reset() {
@ -42,7 +39,7 @@ void RuleBasedBreakIterator::DictionaryCache::reset() {
fLimit = 0;
fFirstRuleStatusIndex = 0;
fOtherRuleStatusIndex = 0;
fBreaks->removeAllElements();
fBreaks.removeAllElements();
}
UBool RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos, int32_t *result, int32_t *statusIndex) {
@ -54,13 +51,13 @@ UBool RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos, int32_
// Sequential iteration, move from previous boundary to the following
int32_t r = 0;
if (fPositionInCache >= 0 && fPositionInCache < fBreaks->size() && fBreaks->elementAti(fPositionInCache) == fromPos) {
if (fPositionInCache >= 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAti(fPositionInCache) == fromPos) {
++fPositionInCache;
if (fPositionInCache >= fBreaks->size()) {
if (fPositionInCache >= fBreaks.size()) {
fPositionInCache = -1;
return FALSE;
}
r = fBreaks->elementAti(fPositionInCache);
r = fBreaks.elementAti(fPositionInCache);
U_ASSERT(r > fromPos);
*result = r;
*statusIndex = fOtherRuleStatusIndex;
@ -69,8 +66,8 @@ UBool RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos, int32_
// Random indexing. Linear search for the boundary following the given position.
for (fPositionInCache = 0; fPositionInCache < fBreaks->size(); ++fPositionInCache) {
r= fBreaks->elementAti(fPositionInCache);
for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) {
r= fBreaks.elementAti(fPositionInCache);
if (r > fromPos) {
*result = r;
*statusIndex = fOtherRuleStatusIndex;
@ -90,16 +87,16 @@ UBool RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos, int32_
}
if (fromPos == fLimit) {
fPositionInCache = fBreaks->size() - 1;
fPositionInCache = fBreaks.size() - 1;
if (fPositionInCache >= 0) {
U_ASSERT(fBreaks->elementAti(fPositionInCache) == fromPos);
U_ASSERT(fBreaks.elementAti(fPositionInCache) == fromPos);
}
}
int32_t r;
if (fPositionInCache > 0 && fPositionInCache < fBreaks->size() && fBreaks->elementAti(fPositionInCache) == fromPos) {
if (fPositionInCache > 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAti(fPositionInCache) == fromPos) {
--fPositionInCache;
r = fBreaks->elementAti(fPositionInCache);
r = fBreaks.elementAti(fPositionInCache);
U_ASSERT(r < fromPos);
*result = r;
*statusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
@ -111,8 +108,8 @@ UBool RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos, int32_
return FALSE;
}
for (fPositionInCache = fBreaks->size()-1; fPositionInCache >= 0; --fPositionInCache) {
r = fBreaks->elementAti(fPositionInCache);
for (fPositionInCache = fBreaks.size()-1; fPositionInCache >= 0; --fPositionInCache) {
r = fBreaks.elementAti(fPositionInCache);
if (r < fromPos) {
*result = r;
*statusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
@ -168,7 +165,7 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo
// Ask the language object if there are any breaks. It will add them to the cache and
// leave the text pointer on the other side of its range, ready to search for the next one.
if (lbe != NULL) {
foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBI->fBreakType, *fBreaks);
foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBI->fBreakType, fBreaks);
}
// Reload the loop variables for the next go-round
@ -182,21 +179,21 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo
// printf("foundBreakCount = %d\n", foundBreakCount);
if (foundBreakCount > 0) {
U_ASSERT(foundBreakCount == fBreaks->size());
if (startPos < fBreaks->elementAti(0)) {
U_ASSERT(foundBreakCount == fBreaks.size());
if (startPos < fBreaks.elementAti(0)) {
// The dictionary did not place a boundary at the start of the segment of text.
// Add one now. This should not commonly happen, but it would be easy for interactions
// of the rules for dictionary segments and the break engine implementations to
// inadvertently cause it. Cover it here, just in case.
fBreaks->insertElementAt(startPos, 0, status);
fBreaks.insertElementAt(startPos, 0, status);
}
if (endPos > fBreaks->peeki()) {
fBreaks->push(endPos, status);
if (endPos > fBreaks.peeki()) {
fBreaks.push(endPos, status);
}
fPositionInCache = 0;
// Note: Dictionary matching may extend beyond the original limit.
fStart = fBreaks->elementAti(0);
fLimit = fBreaks->peeki();
fStart = fBreaks.elementAti(0);
fLimit = fBreaks.peeki();
} else {
// there were no language-based breaks, even though the segment contained
// dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache

View file

@ -56,7 +56,7 @@ class RuleBasedBreakIterator::DictionaryCache: public UMemory {
RuleBasedBreakIterator *fBI;
UVector32 *fBreaks; // A vector containing the boundaries.
UVector32 fBreaks; // A vector containing the boundaries.
int32_t fPositionInCache; // Index in fBreaks of last boundary returned by following()
// or preceding(). Optimizes sequential access.
int32_t fStart; // Text position of first boundary in cache.

View file

@ -29,7 +29,6 @@
#include "unicode/udata.h"
#include "unicode/parseerr.h"
#include "unicode/schriter.h"
#include "unicode/uchriter.h"
U_NAMESPACE_BEGIN
@ -72,14 +71,7 @@ private:
* a characterIterator that wraps that data. Needed only for the
* implementation of getText(), a backwards compatibility issue.
*/
StringCharacterIterator *fSCharIter;
/**
* When the input text is provided by a UText, this
* dummy CharacterIterator over an empty string will
* be returned from getText()
*/
UCharCharacterIterator *fDCharIter;
StringCharacterIterator fSCharIter;
/**
* The rule data for this BreakIterator instance