mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 22:15:31 +00:00
ICU-3944 text access, work in progress
X-SVN-Rev: 17988
This commit is contained in:
parent
9fc80fe9b2
commit
32b19f04b2
9 changed files with 303 additions and 187 deletions
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999-2004 IBM Corp. All rights reserved.
|
||||
* Copyright (C) 1999-2005 IBM Corp. All rights reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 12/1/99 rgillam Complete port from Java.
|
||||
|
@ -37,7 +37,7 @@ RuleBasedBreakIterator() {
|
|||
|
||||
|
||||
DictionaryBasedBreakIterator::DictionaryBasedBreakIterator(UDataMemory* rbbiData,
|
||||
const char* dictionaryFilename,
|
||||
const char* dictionaryFilename,
|
||||
UErrorCode& status)
|
||||
: RuleBasedBreakIterator(rbbiData, status)
|
||||
{
|
||||
|
@ -143,7 +143,7 @@ DictionaryBasedBreakIterator::previous()
|
|||
reset();
|
||||
int32_t result = RuleBasedBreakIterator::previous();
|
||||
if (cachedBreakPositions != NULL) {
|
||||
for (positionInCache=0;
|
||||
for (positionInCache=0;
|
||||
cachedBreakPositions[positionInCache] != result;
|
||||
positionInCache++);
|
||||
U_ASSERT(positionInCache < numCachedBreakPositions);
|
||||
|
@ -334,7 +334,7 @@ BreakIterator * DictionaryBasedBreakIterator::createBufferClone(void *stackBuff
|
|||
}
|
||||
|
||||
//
|
||||
// If user buffer size is zero this is a preflight operation to
|
||||
// If user buffer size is zero this is a preflight operation to
|
||||
// obtain the needed buffer size, allowing for worst case misalignment.
|
||||
//
|
||||
if (bufferSize == 0) {
|
||||
|
@ -367,7 +367,7 @@ BreakIterator * DictionaryBasedBreakIterator::createBufferClone(void *stackBuff
|
|||
}
|
||||
|
||||
//
|
||||
// Initialize the clone object.
|
||||
// Initialize the clone object.
|
||||
// TODO: using an overloaded C++ "operator new" to directly initialize the
|
||||
// copy in the user's buffer would be better, but it doesn't seem
|
||||
// to get along with namespaces. Investigate why.
|
||||
|
@ -383,7 +383,7 @@ BreakIterator * DictionaryBasedBreakIterator::createBufferClone(void *stackBuff
|
|||
if (status != U_SAFECLONE_ALLOCATED_WARNING) {
|
||||
clone->fBufferClone = TRUE;
|
||||
}
|
||||
return clone;
|
||||
return clone;
|
||||
}
|
||||
|
||||
|
||||
|
@ -405,15 +405,15 @@ DictionaryBasedBreakIterator::divideUpDictionaryRange(int32_t startPos, int32_t
|
|||
// that needs to be kept with the word). Seek from the beginning of the
|
||||
// range to the first dictionary character
|
||||
fText->setIndex(startPos);
|
||||
UChar c = fText->current();
|
||||
UChar32 c = fText->current32();
|
||||
while (isDictionaryChar(c) == FALSE) {
|
||||
c = fText->next();
|
||||
c = fText->next32();
|
||||
}
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
return; // UStack below overwrites the status error codes
|
||||
}
|
||||
|
||||
|
||||
// initialize. We maintain two stacks: currentBreakPositions contains
|
||||
// the list of break positions that will be returned if we successfully
|
||||
// finish traversing the whole range now. possibleBreakPositions lists
|
||||
|
@ -429,9 +429,9 @@ DictionaryBasedBreakIterator::divideUpDictionaryRange(int32_t startPos, int32_t
|
|||
// further, this saves us from having to follow each possible path
|
||||
// through the text all the way to the error (hopefully avoiding many
|
||||
// future recursive calls as well).
|
||||
// there can be only one kind of error in UStack and UVector, so we'll
|
||||
// there can be only one kind of error in UStack and UVector, so we'll
|
||||
// just let the error fall through
|
||||
UStack currentBreakPositions(status);
|
||||
UStack currentBreakPositions(status);
|
||||
UStack possibleBreakPositions(status);
|
||||
UVector wrongBreakPositions(status);
|
||||
|
||||
|
@ -456,8 +456,15 @@ DictionaryBasedBreakIterator::divideUpDictionaryRange(int32_t startPos, int32_t
|
|||
return;
|
||||
}
|
||||
// initialize (we always exit the loop with a break statement)
|
||||
c = fText->current();
|
||||
c = fText->current32();
|
||||
for (;;) {
|
||||
// The dictionary implementation doesn't do supplementary chars.
|
||||
// Put them through as an unpaired surrogate, which
|
||||
// will end any dictionary match in progress.
|
||||
// With any luck, this dictionary implementation will be retired soon.
|
||||
if (c>0x10000) {
|
||||
c = 0xd800;
|
||||
}
|
||||
|
||||
// if we can transition to state "-1" from our current state, we're
|
||||
// on the last character of a legal word. Push that position onto
|
||||
|
@ -470,7 +477,7 @@ DictionaryBasedBreakIterator::divideUpDictionaryRange(int32_t startPos, int32_t
|
|||
}
|
||||
|
||||
// look up the new state to transition to in the dictionary
|
||||
state = fTables->fDictionary->at(state, c);
|
||||
state = fTables->fDictionary->at(state, (UChar)c);
|
||||
|
||||
// if the character we're sitting on causes us to transition to
|
||||
// the "end of word" state, then it was a non-dictionary character
|
||||
|
@ -515,7 +522,7 @@ DictionaryBasedBreakIterator::divideUpDictionaryRange(int32_t startPos, int32_t
|
|||
possibleBreakPositions.peeki())) {
|
||||
possibleBreakPositions.popi();
|
||||
}
|
||||
|
||||
|
||||
// if we've used up all possible break-position combinations, there's
|
||||
// an error or an unknown word in the text. In this case, we start
|
||||
// over, treating the farthest character we've reached as the beginning
|
||||
|
@ -532,7 +539,8 @@ DictionaryBasedBreakIterator::divideUpDictionaryRange(int32_t startPos, int32_t
|
|||
}
|
||||
bestBreakPositions.removeAllElements();
|
||||
if (farthestEndPoint < endPos) {
|
||||
fText->setIndex(farthestEndPoint + 1);
|
||||
fText->setIndex(farthestEndPoint);
|
||||
fText->next32();
|
||||
}
|
||||
else {
|
||||
break;
|
||||
|
@ -547,7 +555,7 @@ DictionaryBasedBreakIterator::divideUpDictionaryRange(int32_t startPos, int32_t
|
|||
return;
|
||||
}
|
||||
}
|
||||
fText->next();
|
||||
fText->next32();
|
||||
currentBreakPositions.push(fText->getIndex(), status);
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
|
@ -574,7 +582,7 @@ DictionaryBasedBreakIterator::divideUpDictionaryRange(int32_t startPos, int32_t
|
|||
|
||||
// re-sync "c" for the next go-round, and drop out of the loop if
|
||||
// we've made it off the end of the range
|
||||
c = fText->current();
|
||||
c = fText->current32();
|
||||
if (fText->getIndex() >= endPos) {
|
||||
break;
|
||||
}
|
||||
|
@ -583,7 +591,7 @@ DictionaryBasedBreakIterator::divideUpDictionaryRange(int32_t startPos, int32_t
|
|||
// if we didn't hit any exceptional conditions on this last iteration,
|
||||
// just advance to the next character and loop
|
||||
else {
|
||||
c = fText->next();
|
||||
c = fText->next32();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -1348,6 +1348,21 @@ UBool RuleBasedBreakIterator::isDictionaryChar(UChar32 c) {
|
|||
}
|
||||
|
||||
|
||||
//-------------------------------------------------------------------------------
|
||||
//
|
||||
// UText functions
|
||||
//
|
||||
//-------------------------------------------------------------------------------
|
||||
void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
|
||||
// TODO: implement this.
|
||||
}
|
||||
|
||||
|
||||
UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const {
|
||||
// TODO: implement this.
|
||||
return fillIn;
|
||||
}
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*****************************************************************************************
|
||||
* Copyright (C) 1996-2004, International Business Machines
|
||||
* Copyright (C) 1996-2005, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -70,14 +70,16 @@ ubrk_open(UBreakIteratorType type,
|
|||
return 0;
|
||||
}
|
||||
|
||||
UCharCharacterIterator *iter = 0;
|
||||
iter = new UCharCharacterIterator(text, textLength);
|
||||
if(iter == 0) {
|
||||
*status = U_MEMORY_ALLOCATION_ERROR;
|
||||
delete result;
|
||||
return 0;
|
||||
if (text != NULL) {
|
||||
UCharCharacterIterator *iter = 0;
|
||||
iter = new UCharCharacterIterator(text, textLength);
|
||||
if(iter == 0) {
|
||||
*status = U_MEMORY_ALLOCATION_ERROR;
|
||||
delete result;
|
||||
return 0;
|
||||
}
|
||||
result->adoptText(iter);
|
||||
}
|
||||
result->adoptText(iter);
|
||||
|
||||
return (UBreakIterator*)result;
|
||||
}
|
||||
|
@ -186,6 +188,19 @@ ubrk_setText(UBreakIterator* bi,
|
|||
}
|
||||
}
|
||||
|
||||
U_DRAFT void U_EXPORT2
|
||||
ubrk_setUText(UBreakIterator *bi,
|
||||
UText *text,
|
||||
UErrorCode *status)
|
||||
{
|
||||
BreakIterator *brit = (BreakIterator *)bi;
|
||||
brit->setText(text, *status);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
ubrk_current(const UBreakIterator *bi)
|
||||
{
|
||||
|
@ -273,8 +288,8 @@ ubrk_getRuleStatusVec(UBreakIterator *bi, int32_t *fillInVec, int32_t capacity,
|
|||
|
||||
|
||||
U_CAPI const char* U_EXPORT2
|
||||
ubrk_getLocaleByType(const UBreakIterator *bi,
|
||||
ULocDataLocaleType type,
|
||||
ubrk_getLocaleByType(const UBreakIterator *bi,
|
||||
ULocDataLocaleType type,
|
||||
UErrorCode* status)
|
||||
{
|
||||
if (bi == NULL) {
|
||||
|
|
|
@ -261,6 +261,23 @@ public:
|
|||
*/
|
||||
virtual const CharacterIterator& getText(void) const = 0;
|
||||
|
||||
|
||||
/**
|
||||
* Get a UText for the text being analyzed.
|
||||
* The returned UText is a shallow clone of the UText used internally
|
||||
* by the break iterator implementation. It can safely be used to
|
||||
* access the text without impacting any break iterator operations,
|
||||
* but the underlying text itself must not be altered.
|
||||
*
|
||||
* @param fillIn A UText to be filled in. If NULL, a new UText will be
|
||||
* allocated to hold the result.
|
||||
* @param status Receives any error codes.
|
||||
* @return The current UText for this break iterator. If an input
|
||||
* UText was provided, it will always be returned.
|
||||
* @draft ICU 3.4
|
||||
*/
|
||||
virtual UText *getUText(UText *fillIn, UErrorCode &status) const = 0;
|
||||
|
||||
/**
|
||||
* Change the text over which this operates. The text boundary is
|
||||
* reset to the start.
|
||||
|
@ -270,12 +287,19 @@ public:
|
|||
virtual void setText(const UnicodeString &text) = 0;
|
||||
|
||||
/**
|
||||
* Change the text over which this operates. The boundary iteration position is
|
||||
* reset to the start.
|
||||
* Reset the break iterator to operate over the text represented by
|
||||
* the UText. The iterator position is reset to the start.
|
||||
*
|
||||
* This function makes a shallow clone of the supplied UText. This means
|
||||
* that the caller is free to immediately close or otherwise reuse the
|
||||
* UText that was passed as a parameter, but that the underlying text itself
|
||||
* must not be altered while being referenced by the break iterator.
|
||||
*
|
||||
* @param text The UText used to change the text.
|
||||
* @stable ICU 2.0
|
||||
* @param status Receives any error codes.
|
||||
* @draft ICU 3.4
|
||||
*/
|
||||
//virtual void setText(UText &text) = 0;
|
||||
virtual void setText(UText *text, UErrorCode &status) = 0;
|
||||
|
||||
/**
|
||||
* Change the text over which this operates. The text boundary is
|
||||
|
|
|
@ -17,10 +17,10 @@
|
|||
#include "unicode/utypes.h"
|
||||
|
||||
/**
|
||||
* \file
|
||||
* \file
|
||||
* \brief C++ API: Rule Based Break Iterator
|
||||
*/
|
||||
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
#include "unicode/brkiter.h"
|
||||
|
@ -242,6 +242,22 @@ public:
|
|||
virtual const CharacterIterator& getText(void) const;
|
||||
|
||||
|
||||
/**
|
||||
* Get a UText for the text being analyzed.
|
||||
* The returned UText is a shallow clone of the UText used internally
|
||||
* by the break iterator implementation. It can safely be used to
|
||||
* access the text without impacting any break iterator operations,
|
||||
* but the underlying text itself must not be altered.
|
||||
*
|
||||
* @param fillIn A UText to be filled in. If NULL, a new UText will be
|
||||
* allocated to hold the result.
|
||||
* @param status Receives any error codes.
|
||||
* @return The current UText for this break iterator. If an input
|
||||
* UText was provided, it will always be returned.
|
||||
* @draft ICU 3.4
|
||||
*/
|
||||
virtual UText *getUText(UText *fillIn, UErrorCode &status) const;
|
||||
|
||||
/**
|
||||
* Set the iterator to analyze a new piece of text. This function resets
|
||||
* the current iteration position to the beginning of the text.
|
||||
|
@ -259,6 +275,21 @@ public:
|
|||
*/
|
||||
virtual void setText(const UnicodeString& newText);
|
||||
|
||||
/**
|
||||
* Reset the break iterator to operate over the text represented by
|
||||
* the UText. The iterator position is reset to the start.
|
||||
*
|
||||
* This function makes a shallow clone of the supplied UText. This means
|
||||
* that the caller is free to immediately close or otherwise reuse the
|
||||
* UText that was passed as a parameter, but that the underlying text itself
|
||||
* must not be altered while being referenced by the break iterator.
|
||||
*
|
||||
* @param text The UText used to change the text.
|
||||
* @param status Receives any error codes.
|
||||
* @draft ICU 3.4
|
||||
*/
|
||||
virtual void setText(UText *text, UErrorCode &status);
|
||||
|
||||
/**
|
||||
* Sets the current iteration position to the beginning of the text.
|
||||
* (i.e., the CharacterIterator's starting offset).
|
||||
|
|
|
@ -8,6 +8,7 @@
|
|||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uloc.h"
|
||||
#include "unicode/utext.h"
|
||||
|
||||
/**
|
||||
* A text-break iterator.
|
||||
|
@ -392,6 +393,21 @@ ubrk_setText(UBreakIterator* bi,
|
|||
int32_t textLength,
|
||||
UErrorCode* status);
|
||||
|
||||
|
||||
/**
|
||||
* Sets an existing iterator to point to a new piece of text
|
||||
* @param bi The iterator to use
|
||||
* @param text The text to be set
|
||||
* @param status The error code
|
||||
* @draft ICU 3.4
|
||||
*/
|
||||
U_DRAFT void U_EXPORT2
|
||||
ubrk_setUText(UBreakIterator* bi,
|
||||
UText* text,
|
||||
UErrorCode* status);
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Determine the most recently-returned text boundary.
|
||||
*
|
||||
|
|
|
@ -329,7 +329,7 @@ utext_isLengthExpensive(const UText *ut);
|
|||
*
|
||||
* This function is roughly equivalent to the sequence
|
||||
* utext_setIndex(index);
|
||||
* utext_current();
|
||||
* utext_current32();
|
||||
* (There is a difference if the index is out of bounds by being less than zero)
|
||||
*
|
||||
* @param ut the text to be accessed
|
||||
|
@ -354,7 +354,7 @@ utext_char32At(UText *ut, int32_t nativeIndex);
|
|||
* @draft ICU 3.4
|
||||
*/
|
||||
U_DRAFT UChar32 U_EXPORT2
|
||||
utext_current(UText *ut);
|
||||
utext_current32(UText *ut);
|
||||
|
||||
|
||||
/**
|
||||
|
@ -750,32 +750,32 @@ enum {
|
|||
* For example, byte indexes into UTF-8 text or UTF-32 indexes into UTF-32 text.
|
||||
* @draft ICU 3.4
|
||||
*/
|
||||
UTEXT_PROVIDER_NON_UTF16_INDEXES,
|
||||
UTEXT_PROVIDER_NON_UTF16_INDEXES = 0,
|
||||
/**
|
||||
* It is potentially time consuming for the provider to determine the length of the text.
|
||||
* @draft ICU 3.4
|
||||
*/
|
||||
UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE,
|
||||
UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE = 1,
|
||||
/**
|
||||
* Text chunks remain valid and usable until the text object is modified or
|
||||
* deleted, not just until the next time the access() function is called
|
||||
* (which is the default).
|
||||
* @draft ICU 3.4
|
||||
*/
|
||||
UTEXT_PROVIDER_STABLE_CHUNKS,
|
||||
UTEXT_PROVIDER_STABLE_CHUNKS = 2,
|
||||
/**
|
||||
* The provider supports modifying the text via the replace() and copy()
|
||||
* functions.
|
||||
* @see Replaceable
|
||||
* @draft ICU 3.4
|
||||
*/
|
||||
UTEXT_PROVIDER_WRITABLE,
|
||||
UTEXT_PROVIDER_WRITABLE = 3,
|
||||
/**
|
||||
* There is meta data associated with the text.
|
||||
* @see Replaceable::hasMetaData()
|
||||
* @draft ICU 3.4
|
||||
*/
|
||||
UTEXT_PROVIDER_HAS_META_DATA
|
||||
UTEXT_PROVIDER_HAS_META_DATA = 4
|
||||
};
|
||||
|
||||
/**
|
||||
|
|
|
@ -101,7 +101,7 @@ utext_setIndex(UText *ut, int32_t index) {
|
|||
if (index>ut->chunk.nativeStart && index < ut->chunk.nativeLimit) {
|
||||
UChar c = ut->chunk.contents[ut->chunk.offset];
|
||||
if (U16_TRAIL(c)) {
|
||||
utext_current(ut); // force index to the start of the curent code point.
|
||||
utext_current32(ut); // force index to the start of the curent code point.
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -111,7 +111,7 @@ utext_setIndex(UText *ut, int32_t index) {
|
|||
|
||||
|
||||
U_DRAFT UChar32 U_EXPORT2
|
||||
utext_current(UText *ut) {
|
||||
utext_current32(UText *ut) {
|
||||
UChar32 c = U_SENTINEL;
|
||||
if (ut->chunk.offset < ut->chunk.length) {
|
||||
c = ut->chunk.contents[ut->chunk.offset];
|
||||
|
@ -161,7 +161,7 @@ utext_next32(UText *ut) {
|
|||
if (U16_IS_SURROGATE(c)) {
|
||||
// looking at a surrogate. Could be unpaired, need to be careful.
|
||||
// Speed doesn't matter, will be very rare.
|
||||
c = utext_current(ut);
|
||||
c = utext_current32(ut);
|
||||
if (U_IS_SUPPLEMENTARY(c)) {
|
||||
offset++;
|
||||
}
|
||||
|
@ -192,7 +192,7 @@ utext_previous32(UText *ut) {
|
|||
if (U16_IS_SURROGATE(c)) {
|
||||
// Note that utext_current32() will move the chunk offset to the lead surrogate
|
||||
// if we come in referring to trail half of a surrogate pair.
|
||||
c = utext_current(ut);
|
||||
c = utext_current32(ut);
|
||||
}
|
||||
|
||||
prev32_return:
|
||||
|
@ -224,7 +224,7 @@ utext_next32From(UText *ut, int32_t index) {
|
|||
// Surrogate code unit. Could be pointing at either half of a pair, or at
|
||||
// an unpaired surrogate. Let utext_current32() do the work. Speed doesn't matter.
|
||||
chunk->offset = offset;
|
||||
c = utext_current(ut);
|
||||
c = utext_current32(ut);
|
||||
if (U_IS_SUPPLEMENTARY(c)) {
|
||||
offset++;
|
||||
}
|
||||
|
@ -257,8 +257,8 @@ utext_previous32From(UText *ut, int32_t index) {
|
|||
c = chunk->contents[offset];
|
||||
chunk->offset = offset;
|
||||
if (U16_IS_SURROGATE(c)) {
|
||||
c = utext_current(ut); // get supplementary char if not unpaired surrogate,
|
||||
// and adjust offset to start.
|
||||
c = utext_current32(ut); // get supplementary char if not unpaired surrogate,
|
||||
// and adjust offset to start.
|
||||
}
|
||||
prev32return:
|
||||
return c;
|
||||
|
@ -911,7 +911,6 @@ U_CDECL_END
|
|||
//
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
#if 0 // initially commented out to reduce testing
|
||||
|
||||
/*
|
||||
* TODO: use a flag in RepText to support readonly strings?
|
||||
|
@ -922,124 +921,159 @@ U_CDECL_END
|
|||
// to allow for possible trimming for code point boundaries
|
||||
enum { REP_TEXT_CHUNK_SIZE=10 };
|
||||
|
||||
struct RepText : public UText {
|
||||
/* chunk UChars */
|
||||
UChar s[REP_TEXT_CHUNK_SIZE];
|
||||
struct ReplExtra {
|
||||
/*
|
||||
* Chunk UChars.
|
||||
* +1 to simplify filling with surrogate pair at the end.
|
||||
*/
|
||||
UChar s[REP_TEXT_CHUNK_SIZE+1];
|
||||
};
|
||||
|
||||
|
||||
U_CDECL_BEGIN
|
||||
|
||||
static UText * U_CALLCONV
|
||||
repTextClone(const UText *t) {
|
||||
RepText *t2=(RepText *)uprv_malloc(sizeof(RepText));
|
||||
if(t2!=NULL) {
|
||||
*t2=*(const RepText *)t;
|
||||
t2->context=((const Replaceable *)t->context)->clone();
|
||||
if(t2->context==NULL) {
|
||||
uprv_free(t2);
|
||||
t2=NULL;
|
||||
}
|
||||
repTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status) {
|
||||
// First do a generic shallow clone. Does everything needed for the UText struct itself.
|
||||
dest = noopTextClone(dest, src, deep, status);
|
||||
|
||||
if (deep && U_SUCCESS(*status)) {
|
||||
const Replaceable *replSrc = (const Replaceable *)src->context;
|
||||
dest->context = replSrc->clone();
|
||||
}
|
||||
return t2;
|
||||
return dest;
|
||||
}
|
||||
|
||||
static int32_t U_CALLCONV
|
||||
repTextGetProperties(UText *t) {
|
||||
int32_t props=I32_FLAG(UTEXT_PROVIDER_WRITABLE);
|
||||
if(((const Replaceable *)((const RepText *)t)->context)->hasMetaData()) {
|
||||
props|=I32_FLAG(UTEXT_PROVIDER_HAS_META_DATA);
|
||||
}
|
||||
return props;
|
||||
}
|
||||
|
||||
|
||||
static int32_t U_CALLCONV
|
||||
repTextLength(UText *t) {
|
||||
return ((const Replaceable *)((const RepText *)t)->context)->length();
|
||||
repTextLength(UText *ut) {
|
||||
const Replaceable *replSrc = (const Replaceable *)ut->context;
|
||||
int32_t len = replSrc->length();
|
||||
return len;
|
||||
}
|
||||
|
||||
static int32_t U_CALLCONV
|
||||
repTextAccess(UText *t, int32_t index, UBool forward, UTextChunk *chunk) {
|
||||
RepText *rt=(RepText *)t;
|
||||
const Replaceable *rep=(const Replaceable *)rt->context;
|
||||
int32_t start, limit, length=rep->length();
|
||||
int32_t chunkStart, chunkLength, chunkOffset;
|
||||
|
||||
static UBool U_CALLCONV
|
||||
repTextAccess(UText *ut, int32_t index, UBool forward, UTextChunk *chunk) {
|
||||
const Replaceable *rep=(const Replaceable *)ut->context;
|
||||
int32_t start; // index of the start of the chunk to be loaded
|
||||
int32_t limit; // index of the end+1 of the chunk to be loaded.
|
||||
int32_t length=rep->length(); // Full length of the input text (bigger than a chunk)
|
||||
|
||||
|
||||
/*
|
||||
* Compute start/limit boundaries around index, for a segment of text
|
||||
* to be extracted.
|
||||
* The segment will be trimmed to not include halves of surrogate pairs.
|
||||
* To allow for the possibility that our user gave an index to the trailing
|
||||
* half of a surrogate pair, we must request one extra preceding UChar when
|
||||
* going in the forward direction. This will ensure that the buffer has the
|
||||
* entire code point at the specified index.
|
||||
*/
|
||||
if(forward) {
|
||||
if(length<=index) {
|
||||
return -1;
|
||||
|
||||
if (index>=ut->chunk.nativeStart && index<ut->chunk.nativeLimit) {
|
||||
// Buffer already contains the requested position.
|
||||
ut->chunk.offset = index - ut->chunk.nativeStart;
|
||||
return TRUE;
|
||||
}
|
||||
limit=index+REP_TEXT_CHUNK_SIZE-1;
|
||||
if(limit>length) {
|
||||
limit=length;
|
||||
if (index>=length && ut->chunk.nativeLimit==length) {
|
||||
// Request for end of string, and buffer already extends up to it.
|
||||
// Can't get the data, but don't change the buffer.
|
||||
ut->chunk.offset = length - ut->chunk.nativeStart;
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
if (index<0) {
|
||||
index = 0;
|
||||
}
|
||||
ut->chunk.nativeLimit = index + REP_TEXT_CHUNK_SIZE - 1;
|
||||
// Going forward, so we want to have the buffer with stuff at and beyond
|
||||
// the requested index. The -1 gets us one code point before the
|
||||
// requested index also, to handle the case of the index being on
|
||||
// a trail surrogate of a surrogate pair.
|
||||
if(ut->chunk.nativeLimit > length) {
|
||||
ut->chunk.nativeLimit = length;
|
||||
}
|
||||
// unless buffer ran off end, start is index-1.
|
||||
ut->chunk.nativeStart = ut->chunk.nativeLimit - REP_TEXT_CHUNK_SIZE;
|
||||
if(ut->chunk.nativeStart < 0) {
|
||||
ut->chunk.nativeStart = 0;
|
||||
}
|
||||
} else {
|
||||
// Reverse iteration. Fill buffer with data preceding the requested index.
|
||||
if(index<0) {
|
||||
index = 0;
|
||||
}
|
||||
if (index>ut->chunk.nativeStart && index<=ut->chunk.nativeLimit) {
|
||||
// Requested position already in buffer.
|
||||
ut->chunk.offset = index - ut->chunk.nativeStart;
|
||||
return TRUE;
|
||||
}
|
||||
if (index==0 && ut->chunk.nativeStart==0) {
|
||||
// Request for start, buffer already begins at start.
|
||||
// No data, but keep the buffer as is.
|
||||
ut->chunk.offset = 0;
|
||||
return FALSE;
|
||||
}
|
||||
limit = index;
|
||||
if (limit>length) {
|
||||
limit = length;
|
||||
}
|
||||
start=limit-REP_TEXT_CHUNK_SIZE;
|
||||
if(start<0) {
|
||||
start=0;
|
||||
}
|
||||
} else {
|
||||
if(index<0) {
|
||||
return -1;
|
||||
}
|
||||
start=index-REP_TEXT_CHUNK_SIZE+1;
|
||||
if(start<0) {
|
||||
start=0;
|
||||
}
|
||||
limit=start+REP_TEXT_CHUNK_SIZE;
|
||||
if(length<limit) {
|
||||
limit=length;
|
||||
}
|
||||
}
|
||||
UnicodeString buffer(rt->s, 0, REP_TEXT_CHUNK_SIZE); // writable alias
|
||||
rep->extractBetween(start, limit, buffer);
|
||||
ReplExtra *ex = (ReplExtra *)ut->pExtra;
|
||||
// UnicodeString with its buffer a writable alias to the chunk buffer
|
||||
UnicodeString buffer(ex->s, 0 /*buffer length*/, REP_TEXT_CHUNK_SIZE /*buffer capacity*/);
|
||||
rep->extractBetween(ut->chunk.nativeStart, ut->chunk.nativeLimit, buffer);
|
||||
|
||||
chunkStart=0;
|
||||
chunkLength=limit-start;
|
||||
chunkOffset=index-start;
|
||||
ut->chunk.contents = ex->s;
|
||||
ut->chunk.length = ut->chunk.nativeLimit - ut->chunk.nativeStart;
|
||||
ut->chunk.offset = index - ut->chunk.nativeStart;
|
||||
|
||||
// trim contents for code point boundaries
|
||||
if(0<start && U16_IS_TRAIL(rt->s[chunkStart])) {
|
||||
++chunkStart;
|
||||
--chunkLength;
|
||||
++start;
|
||||
}
|
||||
if(limit<length && U16_IS_LEAD(rt->s[chunkStart+chunkLength-1])) {
|
||||
--chunkLength;
|
||||
--limit;
|
||||
// Surrogate pairs from the input text must not span chunk boundaries.
|
||||
// If end of chunk could be the start of a surrogate, trim it off.
|
||||
if (ut->chunk.nativeLimit < length &&
|
||||
U16_IS_LEAD(ex->s[ut->chunk.length-1])) {
|
||||
ut->chunk.length--;
|
||||
}
|
||||
|
||||
|
||||
// if the first UChar in the chunk could be the trailing half of a surrogate pair,
|
||||
// trim it off.
|
||||
if(ut->chunk.nativeStart>0 && U16_IS_TRAIL(ex->s[0])) {
|
||||
++(ut->chunk.contents);
|
||||
--(ut->chunk.length);
|
||||
--(ut->chunk.offset);
|
||||
}
|
||||
|
||||
// adjust the index/chunkOffset to a code point boundary
|
||||
U16_SET_CP_START(rt->s, chunkStart, chunkOffset);
|
||||
U16_SET_CP_START(ut->chunk.contents, 0, ut->chunk.offset);
|
||||
|
||||
chunk->contents=rt->s+chunkStart;
|
||||
chunk->length=chunkLength;
|
||||
chunk->start=start;
|
||||
chunk->limit=limit;
|
||||
chunk->nonUTF16Indexes=FALSE;
|
||||
return chunkOffset; // chunkOffset corresponding to index
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
|
||||
|
||||
static int32_t U_CALLCONV
|
||||
repTextExtract(UText *t,
|
||||
repTextExtract(UText *ut,
|
||||
int32_t start, int32_t limit,
|
||||
UChar *dest, int32_t destCapacity,
|
||||
UErrorCode *pErrorCode) {
|
||||
RepText *rt=(RepText *)t;
|
||||
const Replaceable *rep=(const Replaceable *)rt->context;
|
||||
UErrorCode *status) {
|
||||
const Replaceable *rep=(const Replaceable *)ut->context;
|
||||
int32_t length=rep->length();
|
||||
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
if(U_FAILURE(*status)) {
|
||||
return 0;
|
||||
}
|
||||
if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
|
||||
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
*status=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
}
|
||||
if(start<0 || start>limit || length<limit) {
|
||||
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
*status=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
}
|
||||
length=limit-start;
|
||||
|
@ -1048,28 +1082,27 @@ repTextExtract(UText *t,
|
|||
}
|
||||
UnicodeString buffer(dest, 0, destCapacity); // writable alias
|
||||
rep->extractBetween(start, limit, buffer);
|
||||
return u_terminateUChars(dest, destCapacity, length, pErrorCode);
|
||||
return u_terminateUChars(dest, destCapacity, length, status);
|
||||
}
|
||||
|
||||
static int32_t U_CALLCONV
|
||||
repTextReplace(UText *t,
|
||||
repTextReplace(UText *ut,
|
||||
int32_t start, int32_t limit,
|
||||
const UChar *src, int32_t length,
|
||||
UTextChunk *chunk,
|
||||
UErrorCode *pErrorCode) {
|
||||
RepText *rt=(RepText *)t;
|
||||
Replaceable *rep=(Replaceable *)rt->context;
|
||||
UErrorCode *status) {
|
||||
Replaceable *rep=(Replaceable *)ut->context;
|
||||
int32_t oldLength;
|
||||
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
if(U_FAILURE(*status)) {
|
||||
return 0;
|
||||
}
|
||||
if(src==NULL && length!=0) {
|
||||
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
*status=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return 0;
|
||||
}
|
||||
oldLength=rep->length(); // will subtract from new length
|
||||
if(start<0 || start>limit || oldLength<limit) {
|
||||
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
*status=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
}
|
||||
// prepare
|
||||
|
@ -1082,24 +1115,22 @@ repTextReplace(UText *t,
|
|||
}
|
||||
|
||||
static void U_CALLCONV
|
||||
repTextCopy(UText *t,
|
||||
repTextCopy(UText *ut,
|
||||
int32_t start, int32_t limit,
|
||||
int32_t destIndex,
|
||||
UBool move,
|
||||
UTextChunk *chunk,
|
||||
UErrorCode *pErrorCode) {
|
||||
RepText *rt=(RepText *)t;
|
||||
Replaceable *rep=(Replaceable *)rt->context;
|
||||
UErrorCode *status) {
|
||||
Replaceable *rep=(Replaceable *)ut->context;
|
||||
int32_t length=rep->length();
|
||||
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
if(U_FAILURE(*status)) {
|
||||
return;
|
||||
}
|
||||
if( start<0 || start>limit || length<limit ||
|
||||
destIndex<0 || length<destIndex ||
|
||||
(start<destIndex && destIndex<limit)
|
||||
) {
|
||||
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
*status=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return;
|
||||
}
|
||||
if(move) {
|
||||
|
@ -1118,61 +1149,37 @@ repTextCopy(UText *t,
|
|||
// never invalidate the chunk because we have a copy of the characters
|
||||
}
|
||||
|
||||
static const UText repText={
|
||||
NULL, NULL, NULL, NULL,
|
||||
(int32_t)sizeof(UText), 0, 0, 0,
|
||||
repTextClone,
|
||||
repTextGetProperties,
|
||||
repTextLength,
|
||||
repTextAccess,
|
||||
repTextExtract,
|
||||
repTextReplace,
|
||||
repTextCopy,
|
||||
NULL, // mapOffsetToNative
|
||||
NULL // mapIndexToUTF16
|
||||
};
|
||||
|
||||
|
||||
U_DRAFT UText * U_EXPORT2
|
||||
utext_openReplaceable(Replaceable *rep, UErrorCode *pErrorCode) {
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
utext_openReplaceable(UText *ut, Replaceable *rep, UErrorCode *status) {
|
||||
if(U_FAILURE(*status)) {
|
||||
return NULL;
|
||||
}
|
||||
if(rep==NULL) {
|
||||
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
*status=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return NULL;
|
||||
}
|
||||
RepText *rt=(RepText *)uprv_malloc(sizeof(RepText));
|
||||
if(rt==NULL) {
|
||||
*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
|
||||
return NULL;
|
||||
ut = utext_setup(ut, sizeof(ReplExtra), status);
|
||||
|
||||
ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_WRITABLE);
|
||||
if(rep->hasMetaData()) {
|
||||
ut->providerProperties |=I32_FLAG(UTEXT_PROVIDER_HAS_META_DATA);
|
||||
}
|
||||
*((UText *)rt)=repText;
|
||||
rt->context=rep;
|
||||
return rt;
|
||||
|
||||
ut->clone = noopTextClone;
|
||||
ut->length = repTextLength;
|
||||
ut->access = repTextAccess;
|
||||
ut->extract = repTextExtract;
|
||||
ut->replace = repTextReplace;
|
||||
ut->copy = repTextCopy;
|
||||
|
||||
ut->context=rep;
|
||||
return ut;
|
||||
}
|
||||
|
||||
U_DRAFT void U_EXPORT2
|
||||
utext_closeReplaceable(UText *t) {
|
||||
if(t!=NULL) {
|
||||
uprv_free((RepText *)t);
|
||||
}
|
||||
}
|
||||
|
||||
U_DRAFT void U_EXPORT2
|
||||
utext_resetReplaceable(UText *t, Replaceable *rep, UErrorCode *pErrorCode) {
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return;
|
||||
}
|
||||
if(rep==NULL) {
|
||||
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
}
|
||||
RepText *rt=(RepText *)t;
|
||||
rt->context=rep;
|
||||
}
|
||||
U_CDECL_END
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -128,17 +128,17 @@ static void TestAPI(void) {
|
|||
c = utext_char32At(uta, 0);
|
||||
TEST_ASSERT(c==uString[0]);
|
||||
|
||||
c = utext_current(uta);
|
||||
c = utext_current32(uta);
|
||||
TEST_ASSERT(c==uString[0]);
|
||||
|
||||
c = utext_next32(uta);
|
||||
TEST_ASSERT(c==uString[0]);
|
||||
c = utext_current(uta);
|
||||
c = utext_current32(uta);
|
||||
TEST_ASSERT(c==uString[1]);
|
||||
|
||||
c = utext_previous32(uta);
|
||||
TEST_ASSERT(c==uString[0]);
|
||||
c = utext_current(uta);
|
||||
c = utext_current32(uta);
|
||||
TEST_ASSERT(c==uString[0]);
|
||||
|
||||
c = utext_next32From(uta, 1);
|
||||
|
@ -170,7 +170,7 @@ static void TestAPI(void) {
|
|||
utext_setIndex(uta, 0);
|
||||
c = UTEXT_NEXT32(uta);
|
||||
TEST_ASSERT(c==uString[0]);
|
||||
c = utext_current(uta);
|
||||
c = utext_current32(uta);
|
||||
TEST_ASSERT(c==uString[1]);
|
||||
|
||||
c = UTEXT_PREVIOUS32(uta);
|
||||
|
|
Loading…
Add table
Reference in a new issue