From 0f8bc50e815c0e79fb9f603e8705d6c9327d83e0 Mon Sep 17 00:00:00 2001 From: Andy Heninger Date: Sun, 26 Jun 2005 21:31:36 +0000 Subject: [PATCH] ICU-3944 text access, RBBI access. Fix extra space allocation problem X-SVN-Rev: 18060 --- icu4c/source/common/rbbi.cpp | 233 +++++++++++++++++++++++- icu4c/source/common/utext.cpp | 8 +- icu4c/source/test/intltest/rbbiapts.cpp | 53 ++++++ 3 files changed, 288 insertions(+), 6 deletions(-) diff --git a/icu4c/source/common/rbbi.cpp b/icu4c/source/common/rbbi.cpp index d7aec8bccbf..986d077da0e 100644 --- a/icu4c/source/common/rbbi.cpp +++ b/icu4c/source/common/rbbi.cpp @@ -330,7 +330,8 @@ int32_t RuleBasedBreakIterator::first(void) { if (fText == NULL) return BreakIterator::DONE; - fText->first(); + //fText->first(); + fText->setToStart(); return fText->getIndex(); } @@ -1350,17 +1351,239 @@ UBool RuleBasedBreakIterator::isDictionaryChar(UChar32 c) { //------------------------------------------------------------------------------- // -// UText functions +// UText functions As a temporary implementation, create a type of CharacterIterator +// that works over UText, and let the RBBI engine continue to +// work on CharacterIterator, which it always has. +// +// The permanent solution is to rework the RBBI engine to use +// UText directly, which will be more efficient for all input +// sources. +// +// This CharacterIterator implementation over UText is not complete, +// it has only what is needed for RBBI, and is not intended +// to ever become public. 
//
 //-------------------------------------------------------------------------------
+
+class CharacterIteratorUT: public CharacterIterator {   // CharacterIterator view over a UText, used only by RBBI
+public:
+    explicit CharacterIteratorUT(UText *ut);             // explicit: no silent UText* -> iterator conversion
+    virtual ~CharacterIteratorUT();                      // closes fUText
+
+    virtual CharacterIterator *clone() const;
+    virtual UBool              operator==(const ForwardCharacterIterator& that) const;
+    virtual UChar              setIndex(int32_t position);   // returned value is a placeholder (0xffff)
+    virtual UChar32            previous32(void);
+    virtual UChar32            next32(void);
+    virtual UBool              hasNext();
+    virtual UChar32            current32(void) const;
+    virtual UBool              hasPrevious();
+    virtual int32_t            move(int32_t delta, EOrigin origin);
+    static  UClassID           getStaticClassID(void);
+    virtual UClassID           getDynamicClassID(void) const;
+
+    UText *fUText;   // closed by the destructor. NOTE(review): copy ctor/assignment are not suppressed - confirm never copied
+    virtual void resetTo(const UText *ut, UErrorCode *status);   // re-point this iterator at a different UText
+
+private:
+    CharacterIteratorUT();   // builds an empty iterator (fUText == NULL); used by clone()
+
+    // The following functions are not needed by RBBI,
+    //   but are pure virtual in CharacterIterator, so must be defined.
+    //   Only stubs are provided; each one asserts if it is ever reached.
+    virtual int32_t  hashCode(void) const                       {U_ASSERT(FALSE); return 0;}
+    virtual UChar    nextPostInc(void)                          {U_ASSERT(FALSE); return 0;}
+    virtual UChar32  next32PostInc(void)                        {U_ASSERT(FALSE); return 0;}
+    virtual UChar    first(void)                                {U_ASSERT(FALSE); return 0;}
+    virtual UChar32  first32(void)                              {U_ASSERT(FALSE); return 0;}
+    virtual UChar    last(void)                                 {U_ASSERT(FALSE); return 0;}
+    virtual UChar32  last32(void)                               {U_ASSERT(FALSE); return 0;}
+    virtual UChar32  setIndex32(int32_t position)               {U_ASSERT(FALSE); return 0;}
+    virtual UChar    current(void) const                        {U_ASSERT(FALSE); return 0;}
+    virtual UChar    next(void)                                 {U_ASSERT(FALSE); return 0;}
+    virtual UChar    previous(void)                             {U_ASSERT(FALSE); return 0;}
+    virtual int32_t  move32(int32_t delta, EOrigin origin)      {U_ASSERT(FALSE); return 0;}
+    virtual void     getText(UnicodeString& result)             {U_ASSERT(FALSE);}
+};
+
+
+
+//
+//  The following fields are inherited from CharacterIterator.
+//   This implementation __MUST__ keep them current because of non-virtual inline
+//   functions defined in CharacterIterator.
+//       int32_t  textLength;  // length of the text.
+//       int32_t  pos;         // current index position
+//       int32_t  begin;       // starting index.  Always 0 for us.
+//       int32_t  end;         // ending index
+//
+//   CharacterIterator was designed assuming that utf-16 indexing would be used,
+//   but native indexing will pass through OK.  This partial implementation only
+//   provides the '32' flavored code point access, not UChar access.
+//
+
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CharacterIteratorUT);
+
+CharacterIteratorUT::CharacterIteratorUT(UText *ut) {
+    UErrorCode status = U_ZERO_ERROR;   // NOTE(review): never inspected; a failed clone goes unreported
+    fUText = utext_clone(NULL, ut, FALSE, &status);   // iterate over our own clone, not the caller's UText
+
+    // Set the inherited CharacterIterator fields
+    textLength = utext_nativeLength(ut);
+    pos = 0;
+    begin = 0;
+    end = textLength;
+}
+
+CharacterIteratorUT::CharacterIteratorUT() {   // empty iterator: no text, all inherited fields zero
+    fUText = NULL;
+    textLength = 0;
+    pos = 0;
+    begin = 0;
+    end = 0;
+}
+
+CharacterIteratorUT::~CharacterIteratorUT() {
+    utext_close(fUText);
+}
+
+
+CharacterIterator *CharacterIteratorUT::clone() const {   // copy starts at index 0; NULL if allocation fails
+    UErrorCode status = U_ZERO_ERROR;
+    CharacterIteratorUT *result = new CharacterIteratorUT();
+    if (result == NULL) { return NULL; }   // do not dereference a failed allocation
+    result->fUText = utext_clone(NULL, fUText, TRUE, &status);
+    if (U_SUCCESS(status)) {               // on failure the copy keeps the empty-iterator field values
+        result->textLength = utext_nativeLength(fUText);
+        result->end = result->textLength;  // read the copy's own length, not this->textLength
+        result->pos = result->begin = 0;
+    }
+    return result;
+}
+
+UBool CharacterIteratorUT::operator==(const ForwardCharacterIterator& that) const {
+    if (this->getDynamicClassID() != that.getDynamicClassID()) {
+        return FALSE;
+    }
+    const CharacterIteratorUT *realThat = (const CharacterIteratorUT *)&that;   // safe: class IDs matched above
+    UBool result = this->fUText->context == realThat->fUText->context;   // compares context pointers only, not positions
+    return result;
+}
+
+UChar CharacterIteratorUT::setIndex(int32_t position) {
+    pos = position;
+    if (pos > end) {     // only the upper bound is pinned here
+        pos = end;
+    }
+    utext_setNativeIndex(fUText, pos);
+    return 0xffff;       // RBBI doesn't use return value, and UText can't return a UChar easily.
+}
+
+UChar32 CharacterIteratorUT::previous32(void) {
+    UChar32 result = UTEXT_PREVIOUS32(fUText);
+    pos = utext_getNativeIndex(fUText);   // TODO:  maybe optimize common case?
+    if (result < 0) {
+        result = 0x0000ffff;              // negative sentinel is reported to the caller as 0xffff
+    }
+    return result;
+}
+
+UChar32 CharacterIteratorUT::next32(void) {
+    // TODO:  optimize.
+    UTEXT_NEXT32(fUText);                 // step over the current code point
+    pos = utext_getNativeIndex(fUText);
+    UChar32 result = UTEXT_NEXT32(fUText);   // read the code point at the new position ...
+    if (result < 0) {
+        result = 0x0000ffff;
+    } else {
+        UTEXT_PREVIOUS32(fUText);         // ... and step back so the UText index matches pos again
+    }
+    return result;
+}
+
+UBool CharacterIteratorUT::hasNext() {
+    // What would really be best for RBBI is a hasNext32()
+    UBool result = TRUE;
+    if (pos >= end-1) {   // NOTE(review): already FALSE at pos == end-1, not only at pos == end - confirm intended
+        result = FALSE;
+    }
+    return result;
+}
+
+UChar32 CharacterIteratorUT::current32(void) const {
+    UChar32 result = utext_current32(fUText);
+    if (result < 0) {
+        result = 0x0000ffff;              // negative sentinel is reported to the caller as 0xffff
+    }
+    return result;
+}
+
+UBool CharacterIteratorUT::hasPrevious() {
+    UBool result = pos > 0;
+    return result;
+}
+
+int32_t CharacterIteratorUT::move(int32_t delta, EOrigin origin) {
+    // only needed for the inherited inline implementation of setToStart().
+    int32_t result = pos;
+    switch (origin) {
+case kStart:
+    result = delta;
+    break;
+case kCurrent:
+    result = pos + delta;
+    break;
+case kEnd:
+    result = end + delta;
+    break;
+default:
+    U_ASSERT(FALSE);
+    }
+    utext_setNativeIndex(fUText, result);
+    pos = utext_getNativeIndex(fUText);  // align to cp boundary
+    return pos;                          // report where we actually ended up, not the raw requested index
+}
+
+
+
+void CharacterIteratorUT::resetTo(const UText *ut, UErrorCode *status) {
+    // Reset this CharacterIteratorUT to use a new UText, reusing the existing fUText as the clone destination.
+    fUText = utext_clone(fUText, ut, FALSE, status);
+    utext_setNativeIndex(fUText, 0);
+    textLength = utext_nativeLength(fUText);
+    pos = 0;
+    end = textLength;                    // begin is left alone; it is always 0 for this class
+}
+
 void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
-    // TODO:  implement this.
+    if (U_FAILURE(status)) {
+        return;
+    }
+    reset();
+    if (fText != NULL &&
+        fText->getDynamicClassID() == CharacterIteratorUT::getStaticClassID())
+    {
+        // The break iterator is already using a UText based character iterator.
+        // Copy the new UText into the existing character iterator's UText.
+        CharacterIteratorUT *utcr = (CharacterIteratorUT *)fText;
+        utcr->resetTo(ut, &status);
+    } else {
+        delete fText;
+        fText = new CharacterIteratorUT(ut);   // NOTE(review): a NULL result is not reported through status
+    }
+    this->first();                             // position at the start of the new text
 }
 
 UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const {
-    // TODO:  implement this.
-    return fillIn;
+    UText *result = NULL;                      // stays NULL unless the text was set from a UText
+    if (U_SUCCESS(status) && fText!=NULL &&
+        fText->getDynamicClassID() == CharacterIteratorUT::getStaticClassID())
+    {
+        CharacterIteratorUT *utcr = (CharacterIteratorUT *)fText;
+        result = utext_clone(fillIn, utcr->fUText, FALSE, &status);   // clone into the caller's fillIn (was always NULL)
+    }
+    return result;
 }
 
 
 
diff --git a/icu4c/source/common/utext.cpp b/icu4c/source/common/utext.cpp
index e458ddbae5e..1f890ea4140 100644
--- a/icu4c/source/common/utext.cpp
+++ b/icu4c/source/common/utext.cpp
@@ -113,6 +113,11 @@ utext_setNativeIndex(UText *ut, int32_t index) {
 U_DRAFT UChar32 U_EXPORT2
 utext_current32(UText *ut) {
     UChar32  c = U_SENTINEL;
+    if (ut->chunk.offset==ut->chunk.length) {
+        // Current position is just off the end of the chunk.
+        // Can also happen at startup, with a zero length chunk at zero offset.
+ ut->access(ut, ut->chunk.nativeLimit, TRUE, &ut->chunk); + } if (ut->chunk.offset < ut->chunk.length) { c = ut->chunk.contents[ut->chunk.offset]; if (U16_IS_SURROGATE(c)) { @@ -429,7 +434,7 @@ utext_setup(UText *ut, int32_t extraSpace, UErrorCode *status) { *ut = emptyText; ut->flags |= UTEXT_HEAP_ALLOCATED; if (spaceRequired>0) { - ut->extraSize = spaceRequired; + ut->extraSize = extraSpace; ut->pExtra = &((ExtendedUText *)ut)->extension; } } @@ -461,6 +466,7 @@ utext_setup(UText *ut, int32_t extraSpace, UErrorCode *status) { *status = U_MEMORY_ALLOCATION_ERROR; } else { ut->extraSize = extraSpace; + ut->flags |= UTEXT_EXTRA_HEAP_ALLOCATED; } } } diff --git a/icu4c/source/test/intltest/rbbiapts.cpp b/icu4c/source/test/intltest/rbbiapts.cpp index ef44f4b5cc7..1ed7b9eb013 100644 --- a/icu4c/source/test/intltest/rbbiapts.cpp +++ b/icu4c/source/test/intltest/rbbiapts.cpp @@ -21,6 +21,7 @@ #include "rbbidata.h" #include "cstring.h" #include "unicode/ustring.h" +#include "unicode/utext.h" /** * API Test the RuleBasedBreakIterator class @@ -292,6 +293,58 @@ void RBBIAPITest::TestGetSetAdoptText() errln((UnicodeString)"ERROR:4 error in adoptText "); } + // UText API + // + // Quick test to see if UText is working at all. 
+ // + const char *s1 = "hello world"; + const char *s2 = "see ya"; + // 012345678901 + + status = U_ZERO_ERROR; + UText *ut = utext_openUTF8(NULL, s1, -1, &status); + wordIter1->setText(ut, status); + TEST_ASSERT_SUCCESS(status); + + int32_t pos; + pos = wordIter1->first(); + TEST_ASSERT(pos==0); + pos = wordIter1->next(); + TEST_ASSERT(pos==5); + pos = wordIter1->next(); + TEST_ASSERT(pos==6); + pos = wordIter1->next(); + TEST_ASSERT(pos==11); + pos = wordIter1->next(); + TEST_ASSERT(pos==UBRK_DONE); + + status = U_ZERO_ERROR; + UText *ut2 = utext_openUTF8(NULL, s2, -1, &status); + TEST_ASSERT_SUCCESS(status); + wordIter1->setText(ut2, status); + TEST_ASSERT_SUCCESS(status); + + pos = wordIter1->first(); + TEST_ASSERT(pos==0); + pos = wordIter1->next(); + TEST_ASSERT(pos==3); + pos = wordIter1->next(); + TEST_ASSERT(pos==4); + + pos = wordIter1->last(); + TEST_ASSERT(pos==6); + pos = wordIter1->previous(); + TEST_ASSERT(pos==4); + pos = wordIter1->previous(); + TEST_ASSERT(pos==3); + pos = wordIter1->previous(); + TEST_ASSERT(pos==0); + pos = wordIter1->previous(); + TEST_ASSERT(pos==UBRK_DONE); + + utext_close(ut); + utext_close(ut2); + delete wordIter1; delete charIter1; delete rb;