ICU-2397 add UTF-8 UCharIterator

X-SVN-Rev: 10830
This commit is contained in:
Markus Scherer 2003-01-10 01:51:12 +00:00
parent 7a0647cb87
commit 10394346b1
2 changed files with 328 additions and 1 deletions

View file

@ -19,6 +19,7 @@
#include "unicode/chariter.h"
#include "unicode/rep.h"
#include "unicode/uiter.h"
#include "cstring.h"
U_CDECL_BEGIN
@ -106,7 +107,7 @@ stringIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origi
pos=iter->length+delta;
break;
default:
return -1; /* Error */
return -1; /* Error */
}
if(pos<iter->start) {
@ -361,6 +362,307 @@ uiter_setReplaceable(UCharIterator *iter, const Replaceable *rep) {
}
}
/* UCharIterator implementation for UTF-8 strings --------------------------- */
/*
* Possible, and probably necessary only for an implementation for arbitrary
* converters:
* Maintain a buffer (ring buffer?) for a piece of converted 16-bit text.
* This would require turning reservedFn into a close function and
* introducing a uiter_close(iter).
*/
/*
* NOTE(review): not referenced by the UTF-8 iterator code in this section;
* presumably the capacity of the converter buffer sketched in the preceding
* comment -- confirm before relying on or removing it.
*/
#define UITER_CNV_CAPACITY 16
/*
* Minimal implementation:
* Maintain a single-UChar buffer for an additional surrogate.
* The caller must not modify start and limit because they are used internally.
*
* Use UCharIterator fields as follows:
* context pointer to UTF-8 string
* length UTF-16 length of the string; -1 until lazy evaluation
* start current UTF-8 index
* index current UTF-16 index
* limit UTF-8 length of the string
* reservedField supplementary code point
*
* Since UCharIterator delivers 16-bit code units, the iteration can be
* currently in the middle of the byte sequence for a supplementary code point.
* In this case, reservedField will contain that code point and start will
* point to after the corresponding byte sequence.
* Otherwise, reservedField will be 0.
*/
/*
* Possible optimization for NUL-terminated UTF-8 and UTF-16 strings:
* Add implementations that do not call strlen() for iteration but check for NUL.
*/
/*
 * getIndex() implementation for the UTF-8 UCharIterator.
 *
 * Returns a UTF-16 index relative to the requested origin:
 *   UITER_ZERO, UITER_START    always 0
 *   UITER_CURRENT              the current UTF-16 index (iter->index)
 *   UITER_LIMIT, UITER_LENGTH  the UTF-16 length of the whole string
 * Any other origin yields -1.
 *
 * The UTF-16 length is evaluated lazily: while iter->length is negative it is
 * computed by counting the remaining code points from the current UTF-8
 * offset (iter->start) and is then cached in iter->length.
 * Apart from caching the length, this function does not modify the iterator;
 * in particular it leaves reservedField alone so that an iterator positioned
 * between the two surrogates of a supplementary code point stays there.
 */
static int32_t U_CALLCONV
utf8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
    switch(origin) {
    case UITER_ZERO:
    case UITER_START:
        return 0;
    case UITER_CURRENT:
        return iter->index;
    case UITER_LIMIT:
    case UITER_LENGTH:
        if(iter->length<0) {
            const uint8_t *s=(const uint8_t *)iter->context;
            UChar32 c;
            int32_t i=iter->start;       /* UTF-8 offset to count from */
            int32_t limit=iter->limit;   /* UTF-8 length */
            int32_t length=iter->index;  /* UTF-16 units before the current position */

            if(iter->reservedField!=0) {
                /*
                 * Between surrogates: iter->start is already behind the
                 * supplementary code point, so only its trail surrogate
                 * still needs to be counted. Do not reset reservedField:
                 * querying the length must not change the position.
                 */
                ++length;
            }

            /* count the UTF-16 units from the current position to the end */
            while(i<limit) {
                U8_NEXT(s, i, limit, c);
                /* ill-formed bytes yield c<0 and count as one unit */
                length+= c<=0xffff ? 1 : 2;
            }
            iter->length=length;
        }
        return iter->length;
    default:
        /* not a valid origin */
        /* Should never get here! */
        return -1;
    }
}
/*
 * move() implementation for the UTF-8 UCharIterator.
 *
 * Moves the iterator to the UTF-16 index origin+delta, pinned to the range
 * 0..length, and returns the resulting UTF-16 index.
 * Returns -1 without moving if origin is not a valid UCharIteratorOrigin.
 *
 * The target is reached by walking code points forward or backward from the
 * current UTF-8 offset (or from the start of the string if that is closer).
 * If the target lies between the two surrogates of a supplementary code
 * point, then that code point is stored in reservedField and iter->start is
 * left behind (after) its 4-byte UTF-8 sequence.
 * When forward movement reaches the end of the string, the UTF-16 length is
 * cached in iter->length if it was not known yet.
 */
static int32_t U_CALLCONV
utf8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
    const uint8_t *s;
    UChar32 c;
    int32_t pos; /* requested UTF-16 index */
    int32_t i, limit; /* UTF-8 index & length */

    /* calculate the requested UTF-16 position */
    switch(origin) {
    case UITER_ZERO:
    case UITER_START:
        pos=delta;
        break;
    case UITER_CURRENT:
        pos=iter->index+delta;
        break;
    case UITER_LIMIT:
    case UITER_LENGTH:
        pos=utf8IteratorGetIndex(iter, UITER_LENGTH)+delta;
        break;
    default:
        return -1; /* Error */
    }

    /* shortcuts: pinning to the edges of the string */
    if(pos<=0) {
        iter->index=iter->start=iter->reservedField=0;
        return 0;
    } else if(iter->length>=0 && pos>=iter->length) {
        iter->index=iter->length;
        iter->start=iter->limit;
        iter->reservedField=0;
        return iter->index;
    }

    if(pos<iter->index/2) {
        /* go forward from the start instead of backward from the current index */
        iter->index=iter->start=iter->reservedField=0;
    }
    /* ### TODO: consider going backward from the end in some cases! */

    delta=pos-iter->index;
    if(delta==0) {
        return iter->index; /* nothing to do */
    }

    /* move towards the requested position if possible */
    s=(const uint8_t *)iter->context;
    pos=iter->index;
    i=iter->start;
    limit=iter->limit;
    if(delta>0) {
        /* go forward */
        if(iter->reservedField!=0) {
            /* step over the pending trail surrogate; i is already behind the code point */
            iter->reservedField=0;
            ++pos;
            --delta;
        }
        while(delta>0 && i<limit) {
            U8_NEXT(s, i, limit, c);
            if(c<=0xffff) {
                /* BMP code point (including U+FFFF) or ill-formed byte: one UTF-16 unit */
                ++pos;
                --delta;
            } else if(delta>=2) {
                pos+=2;
                delta-=2;
            } else /* delta==1 */ {
                /* stop in the middle of a supplementary code point */
                iter->reservedField=c;
                ++pos;
                break; /* delta=0; */
            }
        }
        if(i==limit && iter->length<0) {
            /*
             * Reached the end of the bytes: the length is now known.
             * If we stopped between surrogates then the trail surrogate
             * still follows the current position.
             */
            iter->length= iter->reservedField==0 ? pos : pos+1;
        }
    } else /* delta<0 */ {
        /* go backward */
        if(iter->reservedField!=0) {
            iter->reservedField=0;
            i-=4; /* we stayed behind the supplementary code point; go before it now */
            --pos;
            ++delta;
        }
        while(delta<0 && i>0) {
            U8_PREV(s, 0, i, c);
            if(c<=0xffff) {
                /* BMP code point (including U+FFFF) or ill-formed byte: one UTF-16 unit */
                --pos;
                ++delta;
            } else if(delta<=-2) {
                pos-=2;
                delta+=2;
            } else /* delta==-1 */ {
                /* stop in the middle of a supplementary code point */
                i+=4; /* back to behind this supplementary code point for consistent state */
                iter->reservedField=c;
                --pos;
                break; /* delta=0; */
            }
        }
    }

    iter->start=i;
    return iter->index=pos;
}
/*
 * hasNext() implementation for the UTF-8 UCharIterator:
 * there is a next UTF-16 code unit if a trail surrogate is pending in
 * reservedField or if there are UTF-8 bytes left before the limit.
 */
static UBool U_CALLCONV
utf8IteratorHasNext(UCharIterator *iter) {
    UBool trailPending= iter->reservedField!=0;
    UBool bytesLeft= iter->start<iter->limit;

    return trailPending || bytesLeft;
}
/*
 * hasPrevious() implementation for the UTF-8 UCharIterator:
 * there is a previous UTF-16 code unit whenever the current UTF-16 index
 * is greater than zero.
 */
static UBool U_CALLCONV
utf8IteratorHasPrevious(UCharIterator *iter) {
    const int32_t utf16Index=iter->index;

    return utf16Index>0;
}
/*
 * current() implementation for the UTF-8 UCharIterator.
 *
 * Returns the UTF-16 code unit at the current position without moving:
 *   - the trail surrogate if the iterator is between the surrogates of a
 *     supplementary code point (reservedField!=0),
 *   - U_SENTINEL if the UTF-8 offset is at the limit,
 *   - 0xfffd if the bytes at the current offset do not decode (U8_NEXT
 *     delivers a negative value),
 *   - otherwise the BMP code point itself, or the lead surrogate of a
 *     supplementary code point.
 */
static UChar32 U_CALLCONV
utf8IteratorCurrent(UCharIterator *iter) {
    const uint8_t *bytes;
    UChar32 cp;
    int32_t offset;

    if(iter->reservedField!=0) {
        return U16_TRAIL(iter->reservedField);
    }
    if(!(iter->start<iter->limit)) {
        return U_SENTINEL;
    }

    /* decode with a local copy of the offset so that the iterator does not move */
    bytes=(const uint8_t *)iter->context;
    offset=iter->start;
    U8_NEXT(bytes, offset, iter->limit, cp);

    if(cp<0) {
        return 0xfffd;
    }
    if(cp>0xffff) {
        return U16_LEAD(cp);
    }
    return cp;
}
/*
 * next() implementation for the UTF-8 UCharIterator.
 *
 * Returns the UTF-16 code unit at the current position and advances the
 * UTF-16 index by one, or returns U_SENTINEL at the end of the string.
 * For a supplementary code point, the first call returns the lead surrogate
 * and stores the code point in reservedField (the UTF-8 offset is then
 * already behind its byte sequence); the following call returns the trail
 * surrogate and clears reservedField.
 * Bytes that do not decode are delivered as 0xfffd.
 * When the last byte has been consumed and the UTF-16 length is still
 * unknown, the length is cached in iter->length.
 */
static UChar32 U_CALLCONV
utf8IteratorNext(UCharIterator *iter) {
    const uint8_t *bytes;
    UChar32 cp;

    if(iter->reservedField!=0) {
        /* second half of a supplementary code point */
        UChar trail=U16_TRAIL(iter->reservedField);
        iter->reservedField=0;
        ++iter->index;
        return trail;
    }
    if(!(iter->start<iter->limit)) {
        return U_SENTINEL;
    }

    bytes=(const uint8_t *)iter->context;
    U8_NEXT(bytes, iter->start, iter->limit, cp);
    ++iter->index;

    if(iter->length<0 && iter->start==iter->limit) {
        /* just consumed the last bytes: the UTF-16 length is known now */
        if(cp<=0xffff) {
            iter->length=iter->index;
        } else {
            /* the trail surrogate is still to come */
            iter->length=iter->index+1;
        }
    }

    if(cp<0) {
        return 0xfffd;
    }
    if(cp>0xffff) {
        iter->reservedField=cp;
        return U16_LEAD(cp);
    }
    return cp;
}
/*
 * previous() implementation for the UTF-8 UCharIterator.
 *
 * Moves the UTF-16 index back by one and returns the code unit there, or
 * returns U_SENTINEL if the UTF-8 offset is already 0.
 * For a supplementary code point, the first call returns the trail surrogate,
 * stores the code point in reservedField and keeps the UTF-8 offset behind
 * the 4-byte sequence; the following call returns the lead surrogate, clears
 * reservedField and moves the UTF-8 offset before the sequence.
 * Bytes that do not decode are delivered as 0xfffd.
 */
static UChar32 U_CALLCONV
utf8IteratorPrevious(UCharIterator *iter) {
    const uint8_t *bytes;
    UChar32 cp;

    if(iter->reservedField!=0) {
        /* first half of a supplementary code point */
        UChar lead=U16_LEAD(iter->reservedField);
        iter->reservedField=0;
        iter->start-=4; /* the offset stayed behind the code point; move before it now */
        --iter->index;
        return lead;
    }
    if(!(iter->start>0)) {
        return U_SENTINEL;
    }

    bytes=(const uint8_t *)iter->context;
    U8_PREV(bytes, 0, iter->start, cp);
    --iter->index;

    if(cp<0) {
        return 0xfffd;
    }
    if(cp>0xffff) {
        iter->start+=4; /* stay behind this code point while between its surrogates */
        iter->reservedField=cp;
        return U16_TRAIL(cp);
    }
    return cp;
}
/*
* Template UCharIterator for UTF-8 text.
* uiter_setUTF8() copies this structure by value into the caller's iterator
* and then fills in context, limit and length.
* NOTE(review): the six leading zeros presumably initialize the data fields
* (context, length, start, index, limit, reservedField) and the trailing 0
* the reservedFn slot -- confirm against the UCharIterator declaration.
*/
static const UCharIterator utf8Iterator={
0, 0, 0, 0, 0, 0,
utf8IteratorGetIndex,
utf8IteratorMove,
utf8IteratorHasNext,
utf8IteratorHasPrevious,
utf8IteratorCurrent,
utf8IteratorNext,
utf8IteratorPrevious,
0
};
/*
 * Set up iter for iteration over the UTF-8 string s.
 *
 * Does nothing if iter is NULL.
 * If s is NULL or length is less than -1, iter is set to noopIterator.
 * Otherwise iter becomes a copy of utf8Iterator with
 *   context = s
 *   limit   = length in bytes (uprv_strlen(s) if length is -1)
 *   length  = 0 for an empty string, otherwise -1 (UTF-16 length not yet known)
 * The string is not copied.
 */
U_CAPI void U_EXPORT2
uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length) {
    if(iter==0) {
        return;
    }
    if(s==0 || length<-1) {
        *iter=noopIterator;
        return;
    }

    *iter=utf8Iterator;
    iter->context=s;
    if(length<0) {
        /* length==-1: NUL-terminated input */
        iter->limit=(int32_t)uprv_strlen(s);
    } else {
        iter->limit=length;
    }

    /* only the empty string has a UTF-16 length that is known up front */
    if(iter->limit==0) {
        iter->length=0;
    } else {
        iter->length=-1;
    }
}
/* Helper functions --------------------------------------------------------- */
U_CAPI UChar32 U_EXPORT2

View file

@ -395,6 +395,31 @@ uiter_previous32(UCharIterator *iter);
U_CAPI void U_EXPORT2
uiter_setString(UCharIterator *iter, const UChar *s, int32_t length);
/**
* Set up a UCharIterator to iterate over a UTF-8 string.
*
* Sets the UCharIterator function pointers for iteration over the UTF-8 string s
* with UTF-8 iteration boundaries 0 and length.
* The implementation counts the UTF-16 index on the fly and
* lazily evaluates the UTF-16 length of the text.
* The start field is used as the UTF-8 offset, the limit field as the UTF-8 length.
* When the reservedField is not 0, then it contains a supplementary code point
* and the UTF-16 index is between the two corresponding surrogates.
* At that point, the UTF-8 index is behind that code point.
* The caller must not modify start and limit because they are used internally.
*
* The UTF-8 string pointer s is set into UCharIterator.context without copying
* or reallocating the string contents.
*
* If iter is NULL, the function does nothing.
* If s is NULL or length is less than -1, iter is set to the implementation's
* no-op iterator (noopIterator) instead of a UTF-8 iterator.
*
* @param iter UCharIterator structure to be set for iteration
* @param s UTF-8 string to iterate over
* @param length Length of s in bytes, or -1 if NUL-terminated
*
* @see UCharIterator
* @draft ICU 2.6
*/
U_CAPI void U_EXPORT2
uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length);
#ifdef XP_CPLUSPLUS
/**