mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 06:53:45 +00:00
ICU-13541 Improve RuleBasedBreakIterator construction time, patch from grhoten.
X-SVN-Rev: 40789
This commit is contained in:
parent
a3dca5a303
commit
ac0972f12c
4 changed files with 50 additions and 65 deletions
|
@ -64,7 +64,9 @@ UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
|
|||
* Constructs a RuleBasedBreakIterator that uses the already-created
|
||||
* tables object that is passed in as a parameter.
|
||||
*/
|
||||
RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status) {
|
||||
RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
|
||||
: fSCharIter(UnicodeString())
|
||||
{
|
||||
init(status);
|
||||
fData = new RBBIDataWrapper(data, status); // status checked in constructor
|
||||
if (U_FAILURE(status)) {return;}
|
||||
|
@ -80,7 +82,9 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode
|
|||
//
|
||||
RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules,
|
||||
uint32_t ruleLength,
|
||||
UErrorCode &status) {
|
||||
UErrorCode &status)
|
||||
: fSCharIter(UnicodeString())
|
||||
{
|
||||
init(status);
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
|
@ -110,6 +114,7 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules,
|
|||
//
|
||||
//-------------------------------------------------------------------------------
|
||||
RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
|
||||
: fSCharIter(UnicodeString())
|
||||
{
|
||||
init(status);
|
||||
fData = new RBBIDataWrapper(udm, status); // status checked in constructor
|
||||
|
@ -130,6 +135,7 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &sta
|
|||
RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules,
|
||||
UParseError &parseError,
|
||||
UErrorCode &status)
|
||||
: fSCharIter(UnicodeString())
|
||||
{
|
||||
init(status);
|
||||
if (U_FAILURE(status)) {return;}
|
||||
|
@ -152,7 +158,9 @@ RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules,
|
|||
// Used when creating a RuleBasedBreakIterator from a set
|
||||
// of rules.
|
||||
//-------------------------------------------------------------------------------
|
||||
RuleBasedBreakIterator::RuleBasedBreakIterator() {
|
||||
RuleBasedBreakIterator::RuleBasedBreakIterator()
|
||||
: fSCharIter(UnicodeString())
|
||||
{
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
init(status);
|
||||
}
|
||||
|
@ -165,7 +173,8 @@ RuleBasedBreakIterator::RuleBasedBreakIterator() {
|
|||
//
|
||||
//-------------------------------------------------------------------------------
|
||||
RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
|
||||
: BreakIterator(other)
|
||||
: BreakIterator(other),
|
||||
fSCharIter(UnicodeString())
|
||||
{
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
this->init(status);
|
||||
|
@ -177,15 +186,11 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& oth
|
|||
* Destructor
|
||||
*/
|
||||
RuleBasedBreakIterator::~RuleBasedBreakIterator() {
|
||||
if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
|
||||
if (fCharIter != &fSCharIter) {
|
||||
// fCharIter was adopted from the outside.
|
||||
delete fCharIter;
|
||||
}
|
||||
fCharIter = NULL;
|
||||
delete fSCharIter;
|
||||
fSCharIter = NULL;
|
||||
delete fDCharIter;
|
||||
fDCharIter = NULL;
|
||||
|
||||
utext_close(fText);
|
||||
|
||||
|
@ -226,17 +231,21 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
|
|||
UErrorCode status = U_ZERO_ERROR;
|
||||
fText = utext_clone(fText, that.fText, FALSE, TRUE, &status);
|
||||
|
||||
if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
|
||||
if (fCharIter != &fSCharIter) {
|
||||
delete fCharIter;
|
||||
}
|
||||
fCharIter = NULL;
|
||||
|
||||
if (that.fCharIter != NULL ) {
|
||||
if (that.fCharIter != NULL && that.fCharIter != &that.fSCharIter) {
|
||||
// This is a little bit tricky - it will initially appear that
|
||||
// this->fCharIter is adopted, even if that->fCharIter was
|
||||
// not adopted. That's ok.
|
||||
fCharIter = that.fCharIter->clone();
|
||||
}
|
||||
fSCharIter = that.fSCharIter;
|
||||
if (fCharIter == NULL) {
|
||||
fCharIter = &fSCharIter;
|
||||
}
|
||||
|
||||
if (fData != NULL) {
|
||||
fData->removeReference();
|
||||
|
@ -271,8 +280,6 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
|
|||
void RuleBasedBreakIterator::init(UErrorCode &status) {
|
||||
fText = NULL;
|
||||
fCharIter = NULL;
|
||||
fSCharIter = NULL;
|
||||
fDCharIter = NULL;
|
||||
fData = NULL;
|
||||
fPosition = 0;
|
||||
fRuleStatusIndex = 0;
|
||||
|
@ -393,20 +400,13 @@ void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
|
|||
// Return one over an empty string instead - this is the closest
|
||||
// we can come to signaling a failure.
|
||||
// (GetText() is obsolete, this failure is sort of OK)
|
||||
if (fDCharIter == NULL) {
|
||||
static const UChar c = 0;
|
||||
fDCharIter = new UCharCharacterIterator(&c, 0);
|
||||
if (fDCharIter == NULL) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
}
|
||||
fSCharIter.setText(UnicodeString());
|
||||
|
||||
if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
|
||||
if (fCharIter != &fSCharIter) {
|
||||
// existing fCharIter was adopted from the outside. Delete it now.
|
||||
delete fCharIter;
|
||||
}
|
||||
fCharIter = fDCharIter;
|
||||
fCharIter = &fSCharIter;
|
||||
|
||||
this->first();
|
||||
}
|
||||
|
@ -439,7 +439,7 @@ void
|
|||
RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
|
||||
// If we are holding a CharacterIterator adopted from a
|
||||
// previous call to this function, delete it now.
|
||||
if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
|
||||
if (fCharIter != &fSCharIter) {
|
||||
delete fCharIter;
|
||||
}
|
||||
|
||||
|
@ -473,17 +473,13 @@ RuleBasedBreakIterator::setText(const UnicodeString& newText) {
|
|||
// Needed in case someone calls getText().
|
||||
// Can not, unfortunately, do this lazily on the (probably never)
|
||||
// call to getText(), because getText is const.
|
||||
if (fSCharIter == NULL) {
|
||||
fSCharIter = new StringCharacterIterator(newText);
|
||||
} else {
|
||||
fSCharIter->setText(newText);
|
||||
}
|
||||
fSCharIter.setText(newText);
|
||||
|
||||
if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
|
||||
if (fCharIter != &fSCharIter) {
|
||||
// old fCharIter was adopted from the outside. Delete it.
|
||||
delete fCharIter;
|
||||
}
|
||||
fCharIter = fSCharIter;
|
||||
fCharIter = &fSCharIter;
|
||||
|
||||
this->first();
|
||||
}
|
||||
|
|
|
@ -26,14 +26,11 @@ U_NAMESPACE_BEGIN
|
|||
*/
|
||||
|
||||
RuleBasedBreakIterator::DictionaryCache::DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status) :
|
||||
fBI(bi), fBreaks(NULL), fPositionInCache(-1),
|
||||
fBI(bi), fBreaks(status), fPositionInCache(-1),
|
||||
fStart(0), fLimit(0), fFirstRuleStatusIndex(0), fOtherRuleStatusIndex(0) {
|
||||
fBreaks = new UVector32(status);
|
||||
}
|
||||
|
||||
RuleBasedBreakIterator::DictionaryCache::~DictionaryCache() {
|
||||
delete fBreaks;
|
||||
fBreaks = NULL;
|
||||
}
|
||||
|
||||
void RuleBasedBreakIterator::DictionaryCache::reset() {
|
||||
|
@ -42,7 +39,7 @@ void RuleBasedBreakIterator::DictionaryCache::reset() {
|
|||
fLimit = 0;
|
||||
fFirstRuleStatusIndex = 0;
|
||||
fOtherRuleStatusIndex = 0;
|
||||
fBreaks->removeAllElements();
|
||||
fBreaks.removeAllElements();
|
||||
}
|
||||
|
||||
UBool RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos, int32_t *result, int32_t *statusIndex) {
|
||||
|
@ -54,13 +51,13 @@ UBool RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos, int32_
|
|||
// Sequential iteration, move from previous boundary to the following
|
||||
|
||||
int32_t r = 0;
|
||||
if (fPositionInCache >= 0 && fPositionInCache < fBreaks->size() && fBreaks->elementAti(fPositionInCache) == fromPos) {
|
||||
if (fPositionInCache >= 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAti(fPositionInCache) == fromPos) {
|
||||
++fPositionInCache;
|
||||
if (fPositionInCache >= fBreaks->size()) {
|
||||
if (fPositionInCache >= fBreaks.size()) {
|
||||
fPositionInCache = -1;
|
||||
return FALSE;
|
||||
}
|
||||
r = fBreaks->elementAti(fPositionInCache);
|
||||
r = fBreaks.elementAti(fPositionInCache);
|
||||
U_ASSERT(r > fromPos);
|
||||
*result = r;
|
||||
*statusIndex = fOtherRuleStatusIndex;
|
||||
|
@ -69,8 +66,8 @@ UBool RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos, int32_
|
|||
|
||||
// Random indexing. Linear search for the boundary following the given position.
|
||||
|
||||
for (fPositionInCache = 0; fPositionInCache < fBreaks->size(); ++fPositionInCache) {
|
||||
r= fBreaks->elementAti(fPositionInCache);
|
||||
for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) {
|
||||
r= fBreaks.elementAti(fPositionInCache);
|
||||
if (r > fromPos) {
|
||||
*result = r;
|
||||
*statusIndex = fOtherRuleStatusIndex;
|
||||
|
@ -90,16 +87,16 @@ UBool RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos, int32_
|
|||
}
|
||||
|
||||
if (fromPos == fLimit) {
|
||||
fPositionInCache = fBreaks->size() - 1;
|
||||
fPositionInCache = fBreaks.size() - 1;
|
||||
if (fPositionInCache >= 0) {
|
||||
U_ASSERT(fBreaks->elementAti(fPositionInCache) == fromPos);
|
||||
U_ASSERT(fBreaks.elementAti(fPositionInCache) == fromPos);
|
||||
}
|
||||
}
|
||||
|
||||
int32_t r;
|
||||
if (fPositionInCache > 0 && fPositionInCache < fBreaks->size() && fBreaks->elementAti(fPositionInCache) == fromPos) {
|
||||
if (fPositionInCache > 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAti(fPositionInCache) == fromPos) {
|
||||
--fPositionInCache;
|
||||
r = fBreaks->elementAti(fPositionInCache);
|
||||
r = fBreaks.elementAti(fPositionInCache);
|
||||
U_ASSERT(r < fromPos);
|
||||
*result = r;
|
||||
*statusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
|
||||
|
@ -111,8 +108,8 @@ UBool RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos, int32_
|
|||
return FALSE;
|
||||
}
|
||||
|
||||
for (fPositionInCache = fBreaks->size()-1; fPositionInCache >= 0; --fPositionInCache) {
|
||||
r = fBreaks->elementAti(fPositionInCache);
|
||||
for (fPositionInCache = fBreaks.size()-1; fPositionInCache >= 0; --fPositionInCache) {
|
||||
r = fBreaks.elementAti(fPositionInCache);
|
||||
if (r < fromPos) {
|
||||
*result = r;
|
||||
*statusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
|
||||
|
@ -168,7 +165,7 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo
|
|||
// Ask the language object if there are any breaks. It will add them to the cache and
|
||||
// leave the text pointer on the other side of its range, ready to search for the next one.
|
||||
if (lbe != NULL) {
|
||||
foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBI->fBreakType, *fBreaks);
|
||||
foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBI->fBreakType, fBreaks);
|
||||
}
|
||||
|
||||
// Reload the loop variables for the next go-round
|
||||
|
@ -182,21 +179,21 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo
|
|||
|
||||
// printf("foundBreakCount = %d\n", foundBreakCount);
|
||||
if (foundBreakCount > 0) {
|
||||
U_ASSERT(foundBreakCount == fBreaks->size());
|
||||
if (startPos < fBreaks->elementAti(0)) {
|
||||
U_ASSERT(foundBreakCount == fBreaks.size());
|
||||
if (startPos < fBreaks.elementAti(0)) {
|
||||
// The dictionary did not place a boundary at the start of the segment of text.
|
||||
// Add one now. This should not commonly happen, but it would be easy for interactions
|
||||
// of the rules for dictionary segments and the break engine implementations to
|
||||
// inadvertently cause it. Cover it here, just in case.
|
||||
fBreaks->insertElementAt(startPos, 0, status);
|
||||
fBreaks.insertElementAt(startPos, 0, status);
|
||||
}
|
||||
if (endPos > fBreaks->peeki()) {
|
||||
fBreaks->push(endPos, status);
|
||||
if (endPos > fBreaks.peeki()) {
|
||||
fBreaks.push(endPos, status);
|
||||
}
|
||||
fPositionInCache = 0;
|
||||
// Note: Dictionary matching may extend beyond the original limit.
|
||||
fStart = fBreaks->elementAti(0);
|
||||
fLimit = fBreaks->peeki();
|
||||
fStart = fBreaks.elementAti(0);
|
||||
fLimit = fBreaks.peeki();
|
||||
} else {
|
||||
// there were no language-based breaks, even though the segment contained
|
||||
// dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache
|
||||
|
|
|
@ -56,7 +56,7 @@ class RuleBasedBreakIterator::DictionaryCache: public UMemory {
|
|||
|
||||
RuleBasedBreakIterator *fBI;
|
||||
|
||||
UVector32 *fBreaks; // A vector containing the boundaries.
|
||||
UVector32 fBreaks; // A vector containing the boundaries.
|
||||
int32_t fPositionInCache; // Index in fBreaks of last boundary returned by following()
|
||||
// or preceding(). Optimizes sequential access.
|
||||
int32_t fStart; // Text position of first boundary in cache.
|
||||
|
|
|
@ -29,7 +29,6 @@
|
|||
#include "unicode/udata.h"
|
||||
#include "unicode/parseerr.h"
|
||||
#include "unicode/schriter.h"
|
||||
#include "unicode/uchriter.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
|
@ -72,14 +71,7 @@ private:
|
|||
* a characterIterator that wraps that data. Needed only for the
|
||||
* implementation of getText(), a backwards compatibility issue.
|
||||
*/
|
||||
StringCharacterIterator *fSCharIter;
|
||||
|
||||
/**
|
||||
* When the input text is provided by a UText, this
|
||||
* dummy CharacterIterator over an empty string will
|
||||
* be returned from getText()
|
||||
*/
|
||||
UCharCharacterIterator *fDCharIter;
|
||||
StringCharacterIterator fSCharIter;
|
||||
|
||||
/**
|
||||
* The rule data for this BreakIterator instance
|
||||
|
|
Loading…
Add table
Reference in a new issue