ICU-1115 add multicharacter strings and iteration to uset API

X-SVN-Rev: 8784
This commit is contained in:
Alan Liu 2002-06-04 23:37:20 +00:00
parent 172c42e123
commit 63cb0eebd2
4 changed files with 287 additions and 69 deletions

View file

@ -822,8 +822,12 @@ public:
virtual UChar32 getRangeEnd(int32_t index) const;
/**
* Serializes this set into an array of 16-bit integers. The array
* has following format (each line is one 16-bit integer):
* Serializes this set into an array of 16-bit integers. Serialization
* (currently) only records the characters in the set; multicharacter
* strings are ignored.
*
* The array has the following format (each line is one 16-bit
* integer):
*
* length = (n+2*m) | (m!=0?0x8000:0)
* bmpLength = n; present if m!=0
@ -896,10 +900,18 @@ public:
private:
static const char fgClassID;
// Private API for the USet API
friend class USetAccess;
int32_t getStringCount() const;
const UnicodeString* getString(int32_t index) const;
private:
static const char fgClassID;
//----------------------------------------------------------------
// RuleBasedTransliterator support
//----------------------------------------------------------------

View file

@ -30,7 +30,7 @@ enum {
/**
* A serialized form of a Unicode set. Limited manipulations are
* possible directly on a serialized set.
* possible directly on a serialized set. See below.
*/
struct USerializedSet {
const uint16_t *array;
@ -39,6 +39,11 @@ struct USerializedSet {
};
typedef struct USerializedSet USerializedSet;
/*********************************************************************
* USet API
*********************************************************************/
/**
* Creates a USet object that contains the range of characters
* start..end, inclusive.
@ -47,16 +52,48 @@ typedef struct USerializedSet USerializedSet;
* @return a newly created USet. The caller must call uset_close() on
* it when done.
*/
U_CAPI USet * U_EXPORT2
U_CAPI USet* U_EXPORT2
uset_open(UChar32 start, UChar32 end);
/**
* Creates a set from the given pattern. See the UnicodeSet class
* description for the syntax of the pattern language.
* @param pattern a string specifying what characters are in the set
* @param patternLength the length of the pattern, or -1 if null
* terminated
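* @param ec the error code
* @return a newly created USet, or NULL if an error occurred. The
* caller must call uset_close() on a non-NULL result when done.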
*/
U_CAPI USet* U_EXPORT2
uset_openPattern(const UChar* pattern, int32_t patternLength,
UErrorCode* ec);
/**
* Disposes of the storage used by a USet object. This function should
* be called exactly once for objects returned by uset_open() or
* uset_openPattern().
* @param set the object to dispose of
*/
U_CAPI void U_EXPORT2
uset_close(USet *set);
uset_close(USet* set);
/**
* Returns a string representation of this set. If the result of
* calling this function is passed to a uset_openPattern(), it
* will produce another set that is equal to this one.
* @param set the set
* @param result the string to receive the rules, may be NULL
* @param resultCapacity the capacity of result, may be 0 if result is NULL
* @param escapeUnprintable if TRUE then convert unprintable
* characters to their hex escape representations, \uxxxx or
* \Uxxxxxxxx. Unprintable characters are those other than
* U+000A, U+0020..U+007E.
* @param ec error code.
* @return length of string, possibly larger than resultCapacity
*/
U_CAPI int32_t U_EXPORT2
uset_toPattern(const USet* set,
UChar* result, int32_t resultCapacity,
UBool escapeUnprintable,
UErrorCode* ec);
/**
* Adds the given character to the given USet. After this call,
@ -65,7 +102,17 @@ uset_close(USet *set);
* @param c the character to add
*/
U_CAPI void U_EXPORT2
uset_add(USet *set, UChar32 c);
uset_add(USet* set, UChar32 c);
/**
* Adds the given string to the given USet. After this call,
* uset_containsString(set, str, strLen) will return TRUE.
* @param set the object to which to add the string
* @param str the string to add
* @param strLen the length of the string or -1 if null terminated.
*/
U_CAPI void U_EXPORT2
uset_addString(USet* set, const UChar* str, int32_t strLen);
/**
* Removes the given character from the given USet. After this call,
@ -74,7 +121,34 @@ uset_add(USet *set, UChar32 c);
* @param c the character to remove
*/
U_CAPI void U_EXPORT2
uset_remove(USet *set, UChar32 c);
uset_remove(USet* set, UChar32 c);
/**
* Removes the given string from the given USet. After this call,
* uset_containsString(set, str, strLen) will return FALSE.
* @param set the object from which to remove the string
* @param str the string to remove
* @param strLen the length of the string or -1 if null terminated.
*/
U_CAPI void U_EXPORT2
uset_removeString(USet* set, const UChar* str, int32_t strLen);
/**
* Inverts this set. This operation modifies this set so that
* its value is its complement. This operation does not affect
* the multicharacter strings, if any.
* @param set the set
*/
U_CAPI void U_EXPORT2
uset_complement(USet* set);
/**
* Removes all of the elements from this set. This set will be
* empty after this call returns.
* @param set the set
*/
U_CAPI void U_EXPORT2
uset_clear(USet* set);
/**
* Returns TRUE if the given USet contains no characters and no
@ -83,7 +157,7 @@ uset_remove(USet *set, UChar32 c);
* @return true if set is empty
*/
U_CAPI UBool U_EXPORT2
uset_isEmpty(const USet *set);
uset_isEmpty(const USet* set);
/**
* Returns TRUE if the given USet contains the given character.
@ -91,7 +165,17 @@ uset_isEmpty(const USet *set);
* @return true if set contains c
*/
U_CAPI UBool U_EXPORT2
uset_contains(const USet *set, UChar32 c);
uset_contains(const USet* set, UChar32 c);
/**
* Returns TRUE if the given USet contains the given string.
* @param set the set
* @param str the string
* @param strLen the length of the string or -1 if null terminated.
* @return true if set contains str
*/
U_CAPI UBool U_EXPORT2
uset_containsString(const USet* set, const UChar* str, int32_t strLen);
/**
* Returns the number of characters and strings contained in the given
@ -104,32 +188,48 @@ U_CAPI int32_t U_EXPORT2
uset_size(const USet* set);
/**
* Returns the number of disjoint ranges of characters contained in
* the given set. Ignores any strings contained in the set.
* Returns the number of items in this set. An item is either a range
* of characters or a single multicharacter string.
* @param set the set
* @return a non-negative integer counting the character ranges
* contained in set
* and/or strings contained in set
*/
U_CAPI int32_t U_EXPORT2
uset_countRanges(const USet *set);
uset_getItemCount(const USet* set);
/**
* Returns a range of characters contained in the given set.
* Returns an item of this set. An item is either a range of
* characters or a single multicharacter string.
* @param set the set
* @param rangeIndex a non-negative integer in the range 0..
* uset_countRanges(set)-1
* @param pStart pointer to variable to receive first character
* @param itemIndex a non-negative integer in the range 0..
* uset_getItemCount(set)-1
* @param start pointer to variable to receive first character
* in range, inclusive
* @param pEnd pointer to variable to receive last character in range,
* @param end pointer to variable to receive last character in range,
* inclusive
* @return true if rangeIndex is value, otherwise false
* @param str buffer to receive the string, may be NULL
* @param strCapacity capacity of str, or 0 if str is NULL
* @param ec error code
* @return the length of the string (>= 2), or 0 if the item is a
* range, in which case it is the range *start..*end, or -1 if
* itemIndex is out of range
*/
U_CAPI UBool U_EXPORT2
uset_getRange(const USet *set, int32_t rangeIndex,
UChar32 *pStart, UChar32 *pEnd);
U_CAPI int32_t U_EXPORT2
uset_getItem(const USet* set, int32_t itemIndex,
UChar32* start, UChar32* end,
UChar* str, int32_t strCapacity,
UErrorCode* ec);
/*********************************************************************
* Serialized set API
*********************************************************************/
/**
* Serializes this set into an array of 16-bit integers. The array
* Serializes this set into an array of 16-bit integers. Serialization
* (currently) only records the characters in the set; multicharacter
* strings are ignored.
*
* The array
* has the following format (each line is one 16-bit integer):
*
* length = (n+2*m) | (m!=0?0x8000:0)
@ -173,7 +273,7 @@ uset_getRange(const USet *set, int32_t rangeIndex,
* than U_BUFFER_OVERFLOW_ERROR.
*/
U_CAPI int32_t U_EXPORT2
uset_serialize(const USet *set, uint16_t *dest, int32_t destCapacity, UErrorCode *pErrorCode);
uset_serialize(const USet* set, uint16_t* dest, int32_t destCapacity, UErrorCode* pErrorCode);
/**
* Given a serialized array, fill in the given serialized set object.
@ -183,14 +283,14 @@ uset_serialize(const USet *set, uint16_t *dest, int32_t destCapacity, UErrorCode
* @return true if the given array is valid, otherwise false
*/
U_CAPI UBool U_EXPORT2
uset_getSerializedSet(USerializedSet *fillSet, const uint16_t *src, int32_t srcLength);
uset_getSerializedSet(USerializedSet* fillSet, const uint16_t* src, int32_t srcLength);
/**
* Set the USerializedSet to contain the given character (and nothing
* else).
*/
U_CAPI void U_EXPORT2
uset_setSerializedToOne(USerializedSet *fillSet, UChar32 c);
uset_setSerializedToOne(USerializedSet* fillSet, UChar32 c);
/**
* Returns TRUE if the given USerializedSet contains the given
@ -199,7 +299,7 @@ uset_setSerializedToOne(USerializedSet *fillSet, UChar32 c);
* @return true if set contains c
*/
U_CAPI UBool U_EXPORT2
uset_serializedContains(const USerializedSet *set, UChar32 c);
uset_serializedContains(const USerializedSet* set, UChar32 c);
/**
* Returns the number of disjoint ranges of characters contained in
@ -210,22 +310,22 @@ uset_serializedContains(const USerializedSet *set, UChar32 c);
* contained in set
*/
U_CAPI int32_t U_EXPORT2
uset_countSerializedRanges(const USerializedSet *set);
uset_getSerializedRangeCount(const USerializedSet* set);
/**
* Returns a range of characters contained in the given serialized
* set.
* @param set the serialized set
* @param rangeIndex a non-negative integer in the range 0..
* uset_countSerializedRanges(set)-1
* uset_getSerializedRangeCount(set)-1
* @param pStart pointer to variable to receive first character
* in range, inclusive
* @param pEnd pointer to variable to receive last character in range,
* inclusive
* @return true if rangeIndex is value, otherwise false
* @return true if rangeIndex is valid, otherwise false
*/
U_CAPI UBool U_EXPORT2
uset_getSerializedRange(const USerializedSet *set, int32_t rangeIndex,
UChar32 *pStart, UChar32 *pEnd);
uset_getSerializedRange(const USerializedSet* set, int32_t rangeIndex,
UChar32* pStart, UChar32* pEnd);
#endif

View file

@ -1364,6 +1364,14 @@ UChar32 UnicodeSet::getRangeEnd(int32_t index) const {
return list[index*2 + 1] - 1;
}
/**
 * Private USet support: reports how many entries the internal
 * 'strings' collection currently holds.
 * @return the value of strings->size()
 */
int32_t UnicodeSet::getStringCount() const {
    int32_t count = strings->size();
    return count;
}
/**
 * Private USet support: fetches the entry at the given position of
 * the internal 'strings' collection, viewed as a UnicodeString.
 * No range check is performed here; 'index' is passed straight to
 * strings->elementAt().
 * @param index position within 'strings'
 * @return the element at that position, cast to const UnicodeString*
 */
const UnicodeString* UnicodeSet::getString(int32_t index) const {
    const UnicodeString* entry =
        (const UnicodeString*) strings->elementAt(index);
    return entry;
}
/**
* Reallocate this object's internal structures to take up the least
* possible space, without changing this object's value.

View file

@ -26,61 +26,155 @@
#include "unicode/uset.h"
#include "unicode/uniset.h"
#define USET_STATIC_CAPACITY 12
#define USET_GROW_DELTA 20
U_NAMESPACE_BEGIN
U_CAPI USet * U_EXPORT2
U_CAPI USet* U_EXPORT2
uset_open(UChar32 start, UChar32 end) {
    // Build the C++ set for start..end, then expose it through the
    // opaque C handle type.
    UnicodeSet* created = new UnicodeSet(start, end);
    return (USet*) created;
}
U_CAPI void U_EXPORT2
uset_close(USet *set) {
delete (UnicodeSet*) set;
/**
 * Creates a set from the given pattern.
 *
 * @param pattern       the pattern text
 * @param patternLength length of pattern, or -1 if it is
 *                      null terminated
 * @param ec            in/out error code; must not be NULL. If it
 *                      already indicates failure on entry, nothing is
 *                      created. Set to U_MEMORY_ALLOCATION_ERROR if
 *                      the set object cannot be allocated.
 * @return the new set as a USet handle, or NULL if ec is NULL or
 *         indicates a failure
 */
U_CAPI USet* U_EXPORT2
uset_openPattern(const UChar* pattern, int32_t patternLength,
                 UErrorCode* ec) {
    // Same entry guard as uset_serialize(): never dereference a NULL
    // error code, and do no work once a failure has been recorded.
    if (ec == NULL || U_FAILURE(*ec)) {
        return NULL;
    }

    UnicodeString pat(patternLength==-1, pattern, patternLength);
    UnicodeSet* set = new UnicodeSet(pat, *ec);

    // A failed allocation must be reported; otherwise the caller gets
    // a NULL handle together with a success code.
    if (set == NULL) {
        *ec = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }

    // The constructor reports pattern errors through ec; do not hand
    // out a set whose construction failed.
    if (U_FAILURE(*ec)) {
        delete set;
        set = NULL;
    }
    return (USet*) set;
}
U_CAPI void U_EXPORT2
uset_add(USet *set, UChar32 c) {
uset_close(USet* set) {
delete (UnicodeSet*) set;
}
U_CAPI int32_t U_EXPORT2
uset_toPattern(const USet* set,
               UChar* result, int32_t resultCapacity,
               UBool escapeUnprintable,
               UErrorCode* ec) {
    const UnicodeSet* source = (const UnicodeSet*) set;

    // Generate the pattern into a temporary string first, then copy
    // it out to the caller's buffer; extract() returns the length.
    UnicodeString rules;
    source->toPattern(rules, escapeUnprintable);

    int32_t length = rules.extract(result, resultCapacity, *ec);
    return length;
}
U_CAPI void U_EXPORT2
uset_add(USet* set, UChar32 c) {
    // The opaque handle is really a UnicodeSet; forward to it.
    UnicodeSet* target = (UnicodeSet*) set;
    target->add(c);
}
U_CAPI void U_EXPORT2
uset_remove(USet *set, UChar32 c) {
uset_addString(USet* set, const UChar* str, int32_t strLen) {
UnicodeString s(strLen==-1, str, strLen);
((UnicodeSet*) set)->add(s);
}
U_CAPI void U_EXPORT2
uset_remove(USet* set, UChar32 c) {
    // The opaque handle is really a UnicodeSet; forward to it.
    UnicodeSet* target = (UnicodeSet*) set;
    target->remove(c);
}
U_CAPI void U_EXPORT2
uset_removeString(USet* set, const UChar* str, int32_t strLen) {
    UnicodeSet* target = (UnicodeSet*) set;

    // Wrap the caller's buffer in a UnicodeString; the first argument
    // tells the constructor whether str is null terminated (strLen == -1).
    UBool isTerminated = (strLen == -1);
    UnicodeString wrapped(isTerminated, str, strLen);

    target->remove(wrapped);
}
U_CAPI void U_EXPORT2
uset_complement(USet* set) {
    // Forward to the no-argument UnicodeSet::complement().
    UnicodeSet* target = (UnicodeSet*) set;
    target->complement();
}
U_CAPI void U_EXPORT2
uset_clear(USet* set) {
    // Forward to UnicodeSet::clear().
    UnicodeSet* target = (UnicodeSet*) set;
    target->clear();
}
U_CAPI UBool U_EXPORT2
uset_isEmpty(const USet *set) {
uset_isEmpty(const USet* set) {
return ((const UnicodeSet*) set)->isEmpty();
}
U_CAPI UBool U_EXPORT2
uset_contains(const USet *set, UChar32 c) {
uset_contains(const USet* set, UChar32 c) {
return ((const UnicodeSet*) set)->contains(c);
}
U_CAPI UBool U_EXPORT2
uset_containsString(const USet* set, const UChar* str, int32_t strLen) {
    const UnicodeSet* source = (const UnicodeSet*) set;

    // Wrap the caller's buffer in a UnicodeString; the first argument
    // tells the constructor whether str is null terminated (strLen == -1).
    UBool isTerminated = (strLen == -1);
    UnicodeString wrapped(isTerminated, str, strLen);

    return source->contains(wrapped);
}
U_CAPI int32_t U_EXPORT2
uset_size(const USet* set) {
    // Forward to UnicodeSet::size() on the underlying object.
    const UnicodeSet* source = (const UnicodeSet*) set;
    return source->size();
}
/**
 * Bridge that lets the C wrapper functions reach the private USet
 * support methods of UnicodeSet. UnicodeSet names this class as a
 * friend, which is more portable than trying to declare extern "C"
 * functions as friends.
 */
class USetAccess {
public:
    // Both forwarders are defined in the class body so the compiler
    // can inline them.

    /** Forwards to the private UnicodeSet::getStringCount(). */
    static int32_t getStringCount(const UnicodeSet& set) {
        return set.getStringCount();
    }

    /** Forwards to the private UnicodeSet::getString() for index i. */
    static const UnicodeString* getString(const UnicodeSet& set,
                                          int32_t i) {
        return set.getString(i);
    }
};
U_CAPI int32_t U_EXPORT2
uset_getRangeCount(const USet *set) {
return ((const UnicodeSet*) set)->getRangeCount();
uset_getItemCount(const USet* uset) {
const UnicodeSet& set = *(const UnicodeSet*)uset;
return set.getRangeCount() + USetAccess::getStringCount(set);
}
U_CAPI UBool U_EXPORT2
uset_getRange(const USet *set, int32_t rangeIndex,
UChar32 *pStart, UChar32 *pEnd) {
if ((uint32_t) rangeIndex >= (uint32_t) uset_getRangeCount(set)) {
return FALSE;
U_CAPI int32_t U_EXPORT2
uset_getItem(const USet* uset, int32_t itemIndex,
UChar32* start, UChar32* end,
UChar* str, int32_t strCapacity,
UErrorCode* ec) {
if (U_FAILURE(*ec)) return 0;
const UnicodeSet& set = *(const UnicodeSet*)uset;
if ((uint32_t) itemIndex < (uint32_t) set.getRangeCount()) {
*start = set.getRangeStart(itemIndex);
*end = set.getRangeEnd(itemIndex);
return 0;
} else {
itemIndex -= set.getRangeCount();
if ((uint32_t) itemIndex < (uint32_t) USetAccess::getStringCount(set)) {
const UnicodeString* s = USetAccess::getString(set, itemIndex);
return s->extract(str, strCapacity, *ec);
}
}
const UnicodeSet* us = (const UnicodeSet*) set;
*pStart = us->getRangeStart(rangeIndex);
*pEnd = us->getRangeEnd(rangeIndex);
return TRUE;
*ec = U_ILLEGAL_ARGUMENT_ERROR;
return -1;
}
//U_CAPI int32_t U_EXPORT2
//uset_getRangeCount(const USet* set) {
// return ((const UnicodeSet*) set)->getRangeCount();
//}
//
//U_CAPI UBool U_EXPORT2
//uset_getRange(const USet* set, int32_t rangeIndex,
// UChar32* pStart, UChar32* pEnd) {
// if ((uint32_t) rangeIndex >= (uint32_t) uset_getRangeCount(set)) {
// return FALSE;
// }
// const UnicodeSet* us = (const UnicodeSet*) set;
// *pStart = us->getRangeStart(rangeIndex);
// *pEnd = us->getRangeEnd(rangeIndex);
// return TRUE;
//}
/*
* Serialize a USet into 16-bit units.
* Store BMP code points as themselves with one 16-bit unit each.
@ -100,16 +194,16 @@ uset_getRange(const USet *set, int32_t rangeIndex,
* - some supplementary: (length|0x8000) (bmpLength<length) BMP, .., BMP, supp-high, supp-low, ..
*/
U_CAPI int32_t U_EXPORT2
uset_serialize(const USet *set, uint16_t *dest, int32_t destCapacity, UErrorCode *ec) {
uset_serialize(const USet* set, uint16_t* dest, int32_t destCapacity, UErrorCode* ec) {
if (ec==NULL || U_FAILURE(*ec)) {
return 0;
}
return ((const UnicodeSet*) set)->serialize(dest, destCapacity, *ec);
return ((const UnicodeSet*) set)->serialize(dest, destCapacity,* ec);
}
U_CAPI UBool U_EXPORT2
uset_getSerializedSet(USerializedSet *fillSet, const uint16_t *src, int32_t srcLength) {
uset_getSerializedSet(USerializedSet* fillSet, const uint16_t* src, int32_t srcLength) {
int32_t length;
if(fillSet==NULL) {
@ -143,7 +237,7 @@ uset_getSerializedSet(USerializedSet *fillSet, const uint16_t *src, int32_t srcL
}
U_CAPI void U_EXPORT2
uset_setSerializedToOne(USerializedSet *fillSet, UChar32 c) {
uset_setSerializedToOne(USerializedSet* fillSet, UChar32 c) {
if(fillSet==NULL || (uint32_t)c>0x10ffff) {
return;
}
@ -176,8 +270,8 @@ uset_setSerializedToOne(USerializedSet *fillSet, UChar32 c) {
}
U_CAPI UBool U_EXPORT2
uset_serializedContains(const USerializedSet *set, UChar32 c) {
const uint16_t *array;
uset_serializedContains(const USerializedSet* set, UChar32 c) {
const uint16_t* array;
if(set==NULL || (uint32_t)c>0x10ffff) {
return FALSE;
@ -203,7 +297,7 @@ uset_serializedContains(const USerializedSet *set, UChar32 c) {
}
U_CAPI int32_t U_EXPORT2
uset_countSerializedRanges(const USerializedSet *set) {
uset_getSerializedRangeCount(const USerializedSet* set) {
if(set==NULL) {
return 0;
}
@ -212,9 +306,9 @@ uset_countSerializedRanges(const USerializedSet *set) {
}
U_CAPI UBool U_EXPORT2
uset_getSerializedRange(const USerializedSet *set, int32_t rangeIndex,
UChar32 *pStart, UChar32 *pEnd) {
const uint16_t *array;
uset_getSerializedRange(const USerializedSet* set, int32_t rangeIndex,
UChar32* pStart, UChar32* pEnd) {
const uint16_t* array;
int32_t bmpLength, length;
if(set==NULL || rangeIndex<0 || pStart==NULL || pEnd==NULL) {
@ -258,11 +352,15 @@ uset_getSerializedRange(const USerializedSet *set, int32_t rangeIndex,
}
}
U_NAMESPACE_END
// TODO Investigate incorporating this code into UnicodeSet to improve
// efficiency.
// ---
// #define USET_GROW_DELTA 20
//
// static U_INLINE int32_t
// findChar(const UChar32 *array, int32_t length, UChar32 c) {
// findChar(const UChar32* array, int32_t length, UChar32 c) {
// int32_t i;
//
// /* check the last range limit first for more efficient appending */
@ -280,7 +378,7 @@ uset_getSerializedRange(const USerializedSet *set, int32_t rangeIndex,
// }
//
// static UBool
// addRemove(USet *set, UChar32 c, int32_t doRemove) {
// addRemove(USet* set, UChar32 c, int32_t doRemove) {
// int32_t i, length, more;
//
// if(set==NULL || (uint32_t)c>0x10ffff) {
@ -338,7 +436,7 @@ uset_getSerializedRange(const USerializedSet *set, int32_t rangeIndex,
// if(length+more>set->capacity) {
// /* reallocate */
// int32_t newCapacity=set->capacity+set->capacity/2+USET_GROW_DELTA;
// UChar32 *newArray=(UChar32 *)uprv_malloc(newCapacity*4);
// UChar32* newArray=(UChar32* )uprv_malloc(newCapacity*4);
// if(newArray==NULL) {
// return FALSE;
// }
@ -364,11 +462,11 @@ uset_getSerializedRange(const USerializedSet *set, int32_t rangeIndex,
// }
//
// U_CAPI UBool U_EXPORT2
// uset_add(USet *set, UChar32 c) {
// uset_add(USet* set, UChar32 c) {
// return addRemove(set, c, 0);
// }
//
// U_CAPI void U_EXPORT2
// uset_remove(USet *set, UChar32 c) {
// uset_remove(USet* set, UChar32 c) {
// addRemove(set, c, 1);
// }