ICU-5170 improved rbbi getText(), returning a characterIterator, behavior, as discussed with Markus on the mail list.

X-SVN-Rev: 19620
This commit is contained in:
Andy Heninger 2006-05-08 16:08:53 +00:00
parent 79df33682b
commit eda5a86809
2 changed files with 83 additions and 260 deletions

View file

@ -16,6 +16,7 @@
#include "unicode/rbbi.h"
#include "unicode/schriter.h"
#include "unicode/uchriter.h"
#include "unicode/udata.h"
#include "unicode/uclean.h"
#include "rbbidata.h"
@ -39,243 +40,6 @@ static const int16_t STOP_STATE = 0; // The state-transition value indicati
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
//-------------------------------------------------------------------------------
//
// UText functions As a temporary implementation, create a type of CharacterIterator
// that works over UText, and let the RBBI engine continue to
// work on CharacterIterator, which it always has.
//
// The permanent solution is to rework the RBBI engine to use
// UText directly, which will be more efficient for all input
// sources.
//
// This CharacterIterator implementation over UText is not complete,
// it has only what is needed for RBBI, and is not intended
// to ever become public.
//
// TODO: Move this to some other file.
//
//-------------------------------------------------------------------------------
class CharacterIteratorUT: public CharacterIterator {
public:
CharacterIteratorUT(UText *ut);
virtual ~CharacterIteratorUT();
virtual CharacterIterator *clone() const;
virtual UBool operator==(const ForwardCharacterIterator& that) const;
virtual UChar setIndex(int32_t position);
virtual UChar32 previous32(void);
virtual UChar32 next32(void);
virtual UBool hasNext();
virtual UChar32 current32(void) const;
virtual UBool hasPrevious();
virtual int32_t move(int32_t delta, EOrigin origin);
virtual int32_t move32(int32_t, EOrigin);
virtual void getText(UnicodeString &);
static UClassID getStaticClassID(void);
virtual UClassID getDynamicClassID(void) const;
UText *fText;
virtual void resetTo(const UText *ut, UErrorCode *status);
private:
CharacterIteratorUT();
// The following functions are not needed by RBBI,
// but are pure virtual in CharacterIterator, so must be defined.
// Only stubs are provided in this implementation.
// TODO: need them all if we are going to really support RBBI::getText().
virtual int32_t hashCode(void) const {U_ASSERT(FALSE); return 0;};
virtual UChar nextPostInc(void) {U_ASSERT(FALSE); return 0;};
virtual UChar32 next32PostInc(void) {U_ASSERT(FALSE); return 0;};
virtual UChar first(void) {U_ASSERT(FALSE); return 0;};
virtual UChar32 first32(void) {U_ASSERT(FALSE); return 0;};
virtual UChar last(void) {U_ASSERT(FALSE); return 0;};
virtual UChar32 last32(void) {U_ASSERT(FALSE); return 0;};
virtual UChar32 setIndex32(int32_t) {U_ASSERT(FALSE); return 0;};
virtual UChar current(void) const {U_ASSERT(FALSE); return 0;};
virtual UChar next(void) {U_ASSERT(FALSE); return 0;};
virtual UChar previous(void) {U_ASSERT(FALSE); return 0;};
};
//
// The following fields are inherited from CharacterIterator.
// This implementation __MUST__ keep them current because of non-virtual inline
// functions defined in CharacterIterator.
// int32_t textLength; // length of the text.
// int32_t pos; // current index position
// int32_t begin; // starting index. Always 0 for us.
// int32_t end; // ending index
//
// CharacterIterator was designed assuming that utf-16 indexing would be used,
// but native indexing will pass through OK. This partial implementation only
// provides the '32' flavored code point access, not UChar access.
//
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CharacterIteratorUT)
CharacterIteratorUT::CharacterIteratorUT(UText *ut) {
fText = 0;
textLength = 0;
pos = 0;
begin = 0;
end = 0;
if (ut == NULL) {
return;
}
UErrorCode status = U_ZERO_ERROR;
fText = utext_clone(NULL, ut, FALSE, TRUE, &status); // Shallow, Read-only clone.
if (fText != NULL) {
// Set the inherited CharacterItertor fields
textLength = (int32_t)utext_nativeLength(ut);
end = textLength;
}
}
CharacterIteratorUT::CharacterIteratorUT() {
fText = NULL;
textLength = 0;
pos = 0;
begin = 0;
end = 0;
}
CharacterIteratorUT::~CharacterIteratorUT() {
utext_close(fText);
}
CharacterIterator *CharacterIteratorUT::clone() const {
CharacterIteratorUT *result = new CharacterIteratorUT(this->fText);
return result;
}
UBool CharacterIteratorUT::operator==(const ForwardCharacterIterator& that) const {
if (this->getDynamicClassID() != that.getDynamicClassID()) {
return FALSE;
}
const CharacterIteratorUT *realThat = (const CharacterIteratorUT *)&that;
UBool result = this->fText->context == realThat->fText->context;
return result;
}
UChar CharacterIteratorUT::setIndex(int32_t position) {
pos = position;
if (pos < 0) {
pos = 0;
} else if (pos > end) {
pos = end;
}
utext_setNativeIndex(fText, pos);
pos = (int32_t)utext_getNativeIndex(fText); // because utext snaps to code point boundary.
return 0x0000ffff; // RBBI doesn't use return value, and UText can't return a UChar easily.
}
UChar32 CharacterIteratorUT::previous32(void) {
UChar32 result = UTEXT_PREVIOUS32(fText);
pos = (int32_t)utext_getNativeIndex(fText); // TODO: maybe optimize common case?
if (result < 0) {
result = 0x0000ffff;
}
return result;
}
UChar32 CharacterIteratorUT::next32(void) {
// TODO: optimize.
UTEXT_NEXT32(fText);
pos = (int32_t)utext_getNativeIndex(fText);
UChar32 result = UTEXT_NEXT32(fText);
if (result < 0) {
result = 0x0000ffff;
} else {
UTEXT_PREVIOUS32(fText);
}
return result;
}
UBool CharacterIteratorUT::hasNext() {
// What would really be best for RBBI is a hasNext32()
UBool result = TRUE;
if (pos >= end) {
result = FALSE;
}
return result;
}
UChar32 CharacterIteratorUT::current32(void) const {
UChar32 result = utext_current32(fText);
if (result < 0) {
result = 0x0000ffff;
}
return result;
}
UBool CharacterIteratorUT::hasPrevious() {
UBool result = pos > 0;
return result;
}
int32_t CharacterIteratorUT::move(int32_t delta, EOrigin origin) {
// only needed for the inherited inline implementation of setToStart().
int32_t result = pos;
switch (origin) {
case kStart:
result = delta;
break;
case kCurrent:
result = pos + delta;
break;
case kEnd:
result = end + delta;
break;
default:
U_ASSERT(FALSE);
}
utext_setNativeIndex(fText, result);
pos = (int32_t)utext_getNativeIndex(fText); // align to cp boundary
return result;
}
int32_t CharacterIteratorUT::move32(int32_t amt, EOrigin origin) {
switch (origin) {
case kCurrent:
utext_moveIndex32(fText, amt);
break;
default:
// don't bother with kStart, kEnd. Not Used by break iteration.
U_ASSERT(FALSE);
}
pos = (int32_t)utext_getNativeIndex(fText);
return pos;
}
void CharacterIteratorUT::resetTo(const UText *ut, UErrorCode *status) {
// Reset this CharacterIteratorUT to use a new UText.
fText = utext_clone(fText, ut, FALSE, TRUE, status);
utext_setNativeIndex(fText, 0);
textLength = (int32_t)utext_nativeLength(fText);
pos = 0;
end = textLength;
}
void CharacterIteratorUT::getText(UnicodeString &dest) {
dest.truncate(0);
for (UChar32 c=utext_next32From(fText, 0); c!=U_SENTINEL; c=utext_next32(fText)) {
dest.append(c);
}
}
//=======================================================================
// constructors
//=======================================================================
@ -368,8 +132,16 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& oth
* Destructor
*/
RuleBasedBreakIterator::~RuleBasedBreakIterator() {
delete fCharIter;
if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
// fCharIter was adopted from the outside.
delete fCharIter;
}
fCharIter = NULL;
delete fSCharIter;
fCharIter = NULL;
delete fDCharIter;
fDCharIter = NULL;
utext_close(fText);
if (fData != NULL) {
@ -408,11 +180,19 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
// TODO: clone fLanguageBreakEngines from "that"
UErrorCode status = U_ZERO_ERROR;
fText = utext_clone(fText, that.fText, FALSE, TRUE, &status);
if (that.fCharIter != NULL) {
if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
delete fCharIter;
}
fCharIter = NULL;
if (that.fCharIter != NULL ) {
// This is a little bit tricky - it will intially appear that
// this->fCharIter is adopted, even if that->fCharIter was
// not adopted. That's ok.
fCharIter = that.fCharIter->clone();
}
if (fData != NULL) {
fData->removeReference();
fData = NULL;
@ -438,6 +218,8 @@ void RuleBasedBreakIterator::init() {
UErrorCode status = U_ZERO_ERROR;
fText = utext_openUChars(NULL, NULL, 0, &status);
fCharIter = NULL;
fSCharIter = NULL;
fDCharIter = NULL;
fData = NULL;
fLastRuleStatusIndex = 0;
fLastStatusIndexValid = TRUE;
@ -524,8 +306,24 @@ void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
}
reset();
fText = utext_clone(fText, ut, FALSE, TRUE, &status);
delete fCharIter; // A CharacterIterator will be lazily created if user asks.
fCharIter = NULL; // It's obsolete, usually wont be needed.
// Set up a dummy CharacterIterator to be returned if anyone
// calls getText(). With input from UText, there is no reasonable
// way to return a characterIterator over the actual input text.
// Return one over an empty string instead - this is the closest
// we can come to signaling a failure.
// (GetText() is obsolete, this failure is sort of OK)
if (fDCharIter == NULL) {
static UChar c = 0;
fDCharIter = new UCharCharacterIterator(&c, 0);
}
if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
// existing fCharIter was adopted from the outside. Delete it now.
delete fCharIter;
}
fCharIter = fDCharIter;
this->first();
}
@ -563,17 +361,10 @@ RuleBasedBreakIterator::getRules() const {
/**
* Return a CharacterIterator over the text being analyzed.
* @return An iterator over the text being analyzed.
*/
CharacterIterator&
RuleBasedBreakIterator::getText() const {
RuleBasedBreakIterator* nonConstThis = (RuleBasedBreakIterator*)this;
//
if (nonConstThis->fCharIter == NULL) {
nonConstThis->fCharIter = new CharacterIteratorUT(fText);
}
return *nonConstThis->fCharIter;
return *fCharIter;
}
/**
@ -583,7 +374,12 @@ RuleBasedBreakIterator::getText() const {
*/
void
RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
delete fCharIter;
// If we are holding a CharacterIterator adopted from a
// previous call to this function, delete it now.
if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
delete fCharIter;
}
fCharIter = newText;
UErrorCode status = U_ZERO_ERROR;
reset();
@ -607,8 +403,23 @@ RuleBasedBreakIterator::setText(const UnicodeString& newText) {
UErrorCode status = U_ZERO_ERROR;
reset();
fText = utext_openConstUnicodeString(fText, &newText, &status);
delete fCharIter;
fCharIter = NULL;
// Set up a character iterator on the string.
// Needed in case someone calls getText().
// Can not, unfortunately, do this lazily on the (probably never)
// call to getText(), because getText is const.
if (fSCharIter == NULL) {
fSCharIter = new StringCharacterIterator(newText);
} else {
fSCharIter->setText(newText);
}
if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
// old fCharIter was adopted from the outside. Delete it.
delete fCharIter;
}
fCharIter = fSCharIter;
this->first();
}

View file

@ -26,6 +26,8 @@
#include "unicode/brkiter.h"
#include "unicode/udata.h"
#include "unicode/parseerr.h"
#include "unicode/schriter.h"
#include "unicode/uchriter.h"
struct UTrie;
@ -44,6 +46,7 @@ struct RBBIStateTable;
/**
*
* A subclass of BreakIterator whose behavior is specified using a list of rules.
@ -70,11 +73,25 @@ protected:
/**
* A character iterator that refers to the same text as the UText, above.
* Lazily created when requested by a caller.
* Only included for compatibility with old API, which was based on CharacterIterators.
* Value may be adopted from outside, or one of fSCharIter or fDCharIter, below.
*/
CharacterIterator *fCharIter;
/**
* When the input text is provided by a UnicodeString, this will point to
* a characterIterator that wraps that data. Needed only for the
* implementation of getText(), a backwards compatibility issue.
*/
StringCharacterIterator *fSCharIter;
/**
* When the input text is provided by a UText, this
* dummy CharacterIterator over an empty string will
* be returned from getText()
*/
UCharCharacterIterator *fDCharIter;
/**
* The rule data for this BreakIterator instance
* @internal
@ -296,14 +313,9 @@ public:
* the break iterator. If concurrent operation is needed, clone the
* returned character iterator first and operate on the clone.
* <p/>
* This function is not thread safe. Two threads must not make concurrent
* calls to BreakIterator::getText(). This is an exception to the general
* rules for thread safety in ICU, which are that const functions are
* thread safe.
* <p/>
* When the break iterator is operating on text supplied via a UText,
* this function will fail. Lacking any way to signal failures, it
* returns an CharacterIterator containing no text in this case.
* returns an CharacterIterator containing no text.
* The function getUText() provides similar functionality,
* is reliable, and is more efficient.
*