ICU-21180 RuleBasedBreakIterator, refactor init.

In class RuleBasedBreakIterator, refactor how object initialization is handled
by the various constructors, taking advantage of C++11's ability to directly
initialize data members in the class declaration.

This will simplify ongoing maintenance of the code by eliminating the need
to keep initialization lists synchronized with the class data members.
This is being done now in preparation for additional changes to fix problems
with the handling of memory allocation failures.
This commit is contained in:
Andy Heninger 2022-10-20 20:30:24 -07:00
parent 2d19377a8d
commit 67a7e2caf0
3 changed files with 71 additions and 104 deletions

View file

@ -63,9 +63,8 @@ UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
* tables object that is passed in as a parameter.
*/
RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
: fSCharIter(UnicodeString())
: RuleBasedBreakIterator(status)
{
init(status);
fData = new RBBIDataWrapper(data, status); // status checked in constructor
if (U_FAILURE(status)) {return;}
if(fData == nullptr) {
@ -102,9 +101,8 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UBool isPhraseB
RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules,
uint32_t ruleLength,
UErrorCode &status)
: fSCharIter(UnicodeString())
: RuleBasedBreakIterator(status)
{
init(status);
if (U_FAILURE(status)) {
return;
}
@ -141,9 +139,8 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules,
//
//-------------------------------------------------------------------------------
RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
: fSCharIter(UnicodeString())
: RuleBasedBreakIterator(status)
{
init(status);
fData = new RBBIDataWrapper(udm, status); // status checked in constructor
if (U_FAILURE(status)) {return;}
if(fData == nullptr) {
@ -170,9 +167,8 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &sta
RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules,
UParseError &parseError,
UErrorCode &status)
: fSCharIter(UnicodeString())
: RuleBasedBreakIterator(status)
{
init(status);
if (U_FAILURE(status)) {return;}
RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status);
@ -194,10 +190,35 @@ RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules,
// of rules.
//-------------------------------------------------------------------------------
RuleBasedBreakIterator::RuleBasedBreakIterator()
: fSCharIter(UnicodeString())
: RuleBasedBreakIterator(fErrorCode)
{
UErrorCode status = U_ZERO_ERROR;
init(status);
}
/**
* Simple Constructor with an error code.
* Handles common initialization for all other constructors, which delegate
* to this one before doing their own work.
*
* Opens fText over an empty buffer and creates the dictionary and break
* caches. If any of these steps leaves status in a failure state, the
* failure is saved in fErrorCode and the cache members are not assigned.
*
* @param status  In/out error code. Passed to utext_openUChars() and to both
*                cache constructors; checked once after all three steps.
*/
RuleBasedBreakIterator::RuleBasedBreakIterator(UErrorCode &status) {
// Open fText over an empty buffer (null pointer, length 0).
utext_openUChars(&fText, nullptr, 0, &status);
// Hold the new caches in local holders until all steps are known to have
// succeeded; presumably LocalPointer releases them on the early return
// below and flags a null allocation in status -- confirm against the
// LocalPointer documentation.
LocalPointer<DictionaryCache> lpDictionaryCache(new DictionaryCache(this, status), status);
LocalPointer<BreakCache> lpBreakCache(new BreakCache(this, status), status);
if (U_FAILURE(status)) {
// Remember the failure on the object itself, then bail out without
// touching fDictionaryCache / fBreakCache.
fErrorCode = status;
return;
}
// Success: transfer ownership of both caches to the member pointers.
fDictionaryCache = lpDictionaryCache.orphan();
fBreakCache = lpBreakCache.orphan();
#ifdef RBBI_DEBUG
// One-time check of the U_RBBIDEBUG environment variable; tracing is
// enabled when its value contains "trace". Not reached on the failure
// path above.
static UBool debugInitDone = false;
if (debugInitDone == false) {
char *debugEnv = getenv("U_RBBIDEBUG");
if (debugEnv && uprv_strstr(debugEnv, "trace")) {
gTrace = true;
}
debugInitDone = true;
}
#endif
}
@ -208,11 +229,8 @@ RuleBasedBreakIterator::RuleBasedBreakIterator()
//
//-------------------------------------------------------------------------------
RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
: BreakIterator(other),
fSCharIter(UnicodeString())
: RuleBasedBreakIterator()
{
UErrorCode status = U_ZERO_ERROR;
this->init(status);
*this = other;
}
@ -315,58 +333,6 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
return *this;
}
//-----------------------------------------------------------------------------
//
// init() Shared initialization routine. Used by all the constructors.
// Initializes all fields, leaving the object in a consistent state.
//
// The scalar and pointer fields and fText are reset unconditionally,
// even when status already holds a failure on entry. Only opening
// fText, allocating the two caches and the debug-trace check are
// skipped in that case.
//
// @param status In/out error code. Set to U_MEMORY_ALLOCATION_ERROR
// when either cache allocation yields nullptr and no other
// failure has been recorded.
//
//-----------------------------------------------------------------------------
void RuleBasedBreakIterator::init(UErrorCode &status) {
// Put every plain field into its empty/zero state first.
fCharIter = nullptr;
fData = nullptr;
fPosition = 0;
fRuleStatusIndex = 0;
fDone = false;
fDictionaryCharCount = 0;
fLanguageBreakEngines = nullptr;
fUnhandledBreakEngine = nullptr;
fBreakCache = nullptr;
fDictionaryCache = nullptr;
fLookAheadMatches = nullptr;
fIsPhraseBreaking = false;
// Note: IBM xlC is unable to assign or initialize member fText from UTEXT_INITIALIZER.
// fText = UTEXT_INITIALIZER;
// Workaround: copy a statically initialized UText over fText byte-wise.
static const UText initializedUText = UTEXT_INITIALIZER;
uprv_memcpy(&fText, &initializedUText, sizeof(UText));
// Fields are now in a known state; do nothing further if the caller
// passed in a failure.
if (U_FAILURE(status)) {
return;
}
// Open fText over an empty buffer (null pointer, length 0).
utext_openUChars(&fText, nullptr, 0, &status);
// Both caches are given a pointer to this iterator and the shared status.
fDictionaryCache = new DictionaryCache(this, status);
fBreakCache = new BreakCache(this, status);
// Report an allocation failure only if no other error is already recorded.
if (U_SUCCESS(status) && (fDictionaryCache == nullptr || fBreakCache == nullptr)) {
status = U_MEMORY_ALLOCATION_ERROR;
}
#ifdef RBBI_DEBUG
// One-time check of the U_RBBIDEBUG environment variable; tracing is
// enabled when its value contains "trace".
static UBool debugInitDone = false;
if (debugInitDone == false) {
char *debugEnv = getenv("U_RBBIDEBUG");
if (debugEnv && uprv_strstr(debugEnv, "trace")) {
gTrace = true;
}
debugInitDone = true;
}
#endif
}
//-----------------------------------------------------------------------------
//
// clone - Returns a newly-constructed RuleBasedBreakIterator with the same
@ -447,7 +413,7 @@ void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
// Return one over an empty string instead - this is the closest
// we can come to signaling a failure.
// (GetText() is obsolete, this failure is sort of OK)
fSCharIter.setText(UnicodeString());
fSCharIter.setText(u"", 0);
if (fCharIter != &fSCharIter) {
// existing fCharIter was adopted from the outside. Delete it now.
@ -520,7 +486,7 @@ RuleBasedBreakIterator::setText(const UnicodeString& newText) {
// Needed in case someone calls getText().
// Can not, unfortunately, do this lazily on the (probably never)
// call to getText(), because getText is const.
fSCharIter.setText(newText);
fSCharIter.setText(newText.getBuffer(), newText.length());
if (fCharIter != &fSCharIter) {
// old fCharIter was adopted from the outside. Delete it.

View file

@ -61,7 +61,7 @@ private:
* The UText through which this BreakIterator accesses the text
* @internal (private)
*/
UText fText;
UText fText = UTEXT_INITIALIZER;
#ifndef U_HIDE_INTERNAL_API
public:
@ -71,32 +71,38 @@ public:
* Not for general use; Public only for testing purposes.
* @internal
*/
RBBIDataWrapper *fData;
RBBIDataWrapper *fData = nullptr;
private:
/**
* The saved error code associated with this break iterator.
* This is the value to be returned by copyErrorTo().
*/
UErrorCode fErrorCode = U_ZERO_ERROR;
/**
* The current position of the iterator. Pinned, 0 <= fPosition <= text.length.
* Never has the value UBRK_DONE (-1).
*/
int32_t fPosition;
int32_t fPosition = 0;
/**
* TODO:
*/
int32_t fRuleStatusIndex;
int32_t fRuleStatusIndex = 0;
/**
* Cache of previously determined boundary positions.
*/
class BreakCache;
BreakCache *fBreakCache;
BreakCache *fBreakCache = nullptr;
/**
* Cache of boundary positions within a region of text that has been
* sub-divided by dictionary based breaking.
*/
class DictionaryCache;
DictionaryCache *fDictionaryCache;
DictionaryCache *fDictionaryCache = nullptr;
/**
*
@ -105,7 +111,7 @@ private:
* handle a given character.
* @internal (private)
*/
UStack *fLanguageBreakEngines;
UStack *fLanguageBreakEngines = nullptr;
/**
*
@ -114,43 +120,43 @@ private:
* LanguageBreakEngine.
* @internal (private)
*/
UnhandledEngine *fUnhandledBreakEngine;
UnhandledEngine *fUnhandledBreakEngine = nullptr;
/**
* Counter for the number of characters encountered with the "dictionary"
* flag set.
* @internal (private)
*/
uint32_t fDictionaryCharCount;
uint32_t fDictionaryCharCount = 0;
/**
* A character iterator that refers to the same text as the UText, above.
* Only included for compatibility with old API, which was based on CharacterIterators.
* Value may be adopted from outside, or one of fSCharIter or fDCharIter, below.
*/
CharacterIterator *fCharIter;
CharacterIterator *fCharIter = &fSCharIter;
/**
* When the input text is provided by a UnicodeString, this will point to
* a characterIterator that wraps that data. Needed only for the
* implementation of getText(), a backwards compatibility issue.
*/
StringCharacterIterator fSCharIter;
UCharCharacterIterator fSCharIter {u"", 0};
/**
* True when iteration has run off the end, and iterator functions should return UBRK_DONE.
*/
UBool fDone;
bool fDone = false;
/**
* Array of look-ahead tentative results.
*/
int32_t *fLookAheadMatches;
int32_t *fLookAheadMatches = nullptr;
/**
* A flag to indicate if phrase based breaking is enabled.
*/
UBool fIsPhraseBreaking;
UBool fIsPhraseBreaking = false;
//=======================================================================
// constructors
@ -188,10 +194,19 @@ private:
/** @internal */
friend class BreakIterator;
/**
* Default constructor with an error code parameter.
* Apart from error handling, identical to the default constructor.
* Internally, handles common initialization for other constructors.
* @internal (private)
*/
RuleBasedBreakIterator(UErrorCode &status);
public:
/** Default constructor. Creates an empty shell of an iterator, with no
* rules or text to iterate over. Object can subsequently be assigned to.
* rules or text to iterate over. Object can subsequently be assigned to,
* but is otherwise unusable.
* @stable ICU 2.2
*/
RuleBasedBreakIterator();
@ -289,7 +304,9 @@ public:
* @return true if both BreakIterators are not same.
* @stable ICU 2.0
*/
inline bool operator!=(const BreakIterator& that) const;
inline bool operator!=(const BreakIterator& that) const {
return !operator==(that);
}
/**
* Returns a newly-constructed RuleBasedBreakIterator with the same
@ -335,8 +352,7 @@ public:
* </p>
* <p>
* When the break iterator is operating on text supplied via a UText,
* this function will fail. Lacking any way to signal failures, it
* returns an CharacterIterator containing no text.
* this function will fail, returning a CharacterIterator containing no text.
* The function getUText() provides similar functionality,
* is reliable, and is more efficient.
* </p>
@ -648,12 +664,6 @@ private:
//=======================================================================
// implementation
//=======================================================================
/**
* Common initialization function, used by constructors and bufferClone.
* @internal (private)
*/
void init(UErrorCode &status);
/**
* Iterate backwards from an arbitrary position in the input text using the
* synthesized Safe Reverse rules.
@ -726,16 +736,6 @@ private:
#endif /* U_HIDE_INTERNAL_API */
};
//------------------------------------------------------------------------------
//
// Inline Functions Definitions ...
//
//------------------------------------------------------------------------------
// Inequality is defined purely as the negation of operator== applied to the
// same argument, so the two operators always give opposite answers.
inline bool RuleBasedBreakIterator::operator!=(const BreakIterator& that) const {
return !operator==(that);
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

View file

@ -5231,7 +5231,7 @@ void RBBITest::TestTraceCreateBreakEngine(void) {
// To word break the following text, BreakIterator will create 5 dictionary
// break engine internally.
brkitr->setText(
UnicodeString text(
u"test "
u"測試 " // Hani
u"សាកល្បង " // Khmr
@ -5240,6 +5240,7 @@ void RBBITest::TestTraceCreateBreakEngine(void) {
u"ทดสอบ " // Thai
u"test "
);
brkitr->setText(text);
// Loop through all the text.
while (brkitr->next() > 0) ;