From 6fd8f0e5b70dcf7da0109e16f761065226bb5ea1 Mon Sep 17 00:00:00 2001 From: George Rhoten Date: Tue, 10 Aug 2004 23:22:21 +0000 Subject: [PATCH] ICU-3341 Add some missing uset_* API. X-SVN-Rev: 16142 --- icu4c/source/common/unicode/uniset.h | 2 - icu4c/source/common/unicode/uset.h | 222 +++++++++++++++++++++++++- icu4c/source/common/uset.cpp | 74 +++++++++ icu4c/source/test/cintltst/usettest.c | 73 ++++++++- 4 files changed, 358 insertions(+), 13 deletions(-) diff --git a/icu4c/source/common/unicode/uniset.h b/icu4c/source/common/unicode/uniset.h index bae57622ae4..ea707f9f0f4 100644 --- a/icu4c/source/common/unicode/uniset.h +++ b/icu4c/source/common/unicode/uniset.h @@ -11,9 +11,7 @@ #define UNICODESET_H #include "unicode/unifilt.h" -#include "unicode/utypes.h" #include "unicode/unistr.h" -#include "unicode/uchar.h" #include "unicode/uset.h" U_NAMESPACE_BEGIN diff --git a/icu4c/source/common/unicode/uset.h b/icu4c/source/common/unicode/uset.h index 78b50133d07..e955dc66d90 100644 --- a/icu4c/source/common/unicode/uset.h +++ b/icu4c/source/common/unicode/uset.h @@ -28,6 +28,7 @@ #define __USET_H__ #include "unicode/utypes.h" +#include "unicode/uchar.h" #ifndef UCNV_H struct USet; @@ -154,6 +155,18 @@ uset_openPatternOptions(const UChar* pattern, int32_t patternLength, U_STABLE void U_EXPORT2 uset_close(USet* set); +/** + * Causes the USet object to represent the range start - end. + * If start > end then this USet is set to an empty range. + * @param set the object to set to the given range + * @param start first character in the set, inclusive + * @param end last character in the set, inclusive + * @draft ICU 3.2 + */ +U_DRAFT void U_EXPORT2 +uset_set(USet* set, + UChar32 start, UChar32 end); + /** * Modifies the set to represent the set specified by the given * pattern. 
See the UnicodeSet class description for the syntax of @@ -180,6 +193,83 @@ uset_applyPattern(USet *set, uint32_t options, UErrorCode *status); +/** + * Modifies the set to contain those code points which have the given value + * for the given binary or enumerated property, as returned by + * u_getIntPropertyValue. Prior contents of this set are lost. + * + * @param set the object to contain the code points defined by the property + * + * @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1 + * or UCHAR_INT_START..UCHAR_INT_LIMIT-1 + * or UCHAR_MASK_START..UCHAR_MASK_LIMIT-1. + * + * @param value a value in the range u_getIntPropertyMinValue(prop).. + * u_getIntPropertyMaxValue(prop), with one exception. If prop is + * UCHAR_GENERAL_CATEGORY_MASK, then value should not be a UCharCategory, but + * rather a mask value produced by U_GET_GC_MASK(). This allows grouped + * categories such as [:L:] to be represented. + * + * @param ec error code input/output parameter + * + * @draft ICU 3.2 + */ +U_DRAFT void U_EXPORT2 +uset_applyIntPropertyValue(USet* set, + UProperty prop, int32_t value, UErrorCode* ec); + +/** + * Modifies the set to contain those code points which have the + * given value for the given property. Prior contents of this + * set are lost. + * + * @param set the object to contain the code points defined by the given + * property and value alias + * + * @param prop a string specifying a property alias, either short or long. + * The name is matched loosely. See PropertyAliases.txt for names and a + * description of loose matching. If the value string is empty, then this + * string is interpreted as either a General_Category value alias, a Script + * value alias, a binary property alias, or a special ID. Special IDs are + * matched loosely and correspond to the following sets: + * + * "ANY" = [\\u0000-\\U0010FFFF], + * "ASCII" = [\\u0000-\\u007F]. 
+ * + * @param propLength the length of the prop, or -1 if NUL-terminated + * + * @param value a string specifying a value alias, either short or long. + * The name is matched loosely. See PropertyValueAliases.txt for names + * and a description of loose matching. In addition to aliases listed, + * numeric values and canonical combining classes may be expressed + * numerically, e.g., ("nv", "0.5") or ("ccc", "220"). The value string + * may also be empty. + * + * @param valueLength the length of the value, or -1 if NUL-terminated + * + * @param ec error code input/output parameter + * + * @draft ICU 3.2 + */ +U_DRAFT void U_EXPORT2 +uset_applyPropertyAlias(USet* set, + const UChar *prop, int32_t propLength, + const UChar *value, int32_t valueLength, + UErrorCode* ec); + +/** + * Return true if the given position, in the given pattern, appears + * to be the start of a UnicodeSet pattern. + * + * @param pattern a string specifying the pattern + * @param patternLength the length of the pattern, or -1 if NUL-terminated + * @param pos the given position + * @draft ICU 3.2 + */ +U_DRAFT UBool U_EXPORT2 +uset_resemblesPattern(const UChar *pattern, int32_t patternLength, + int32_t pos); + /** * Returns a string representation of this set. If the result of * calling this function is passed to a uset_openPattern(), it @@ -279,6 +369,59 @@ uset_removeRange(USet* set, UChar32 start, UChar32 end); U_STABLE void U_EXPORT2 uset_removeString(USet* set, const UChar* str, int32_t strLen); +/** + * Removes from this set all of its elements that are contained in the + * specified set. This operation effectively modifies this + * set so that its value is the asymmetric set difference of + * the two sets. 
+ * @param set the object from which the elements are to be removed + * @param remove the object that defines which elements will be + * removed from this set + * @draft ICU 3.2 + */ +U_DRAFT void U_EXPORT2 +uset_removeAll(USet* set, const USet* remove); + +/** + * Retain only the elements in this set that are contained in the + * specified range. If start > end then an empty range is + * retained, leaving the set empty. This is equivalent to + * a boolean logic AND, or a set INTERSECTION. + * + * @param set the object for which to retain only the specified range + * @param start first character, inclusive, of range to be retained + * in this set. + * @param end last character, inclusive, of range to be retained + * in this set. + * @draft ICU 3.2 + */ +U_DRAFT void U_EXPORT2 +uset_retain(USet* set, UChar32 start, UChar32 end); + +/** + * Retains only the elements in this set that are contained in the + * specified set. In other words, removes from this set all of + * its elements that are not contained in the specified set. This + * operation effectively modifies this set so that its value is + * the intersection of the two sets. + * + * @param set the object on which to perform the retain + * @param retain set that defines which elements this set will retain + * @draft ICU 3.2 + */ +U_DRAFT void U_EXPORT2 +uset_retainAll(USet* set, const USet* retain); + +/** + * Reallocate this object's internal structures to take up the least + * possible space, without changing this object's value. + * + * @param set the object on which to perform the compact + * @draft ICU 3.2 + */ +U_DRAFT void U_EXPORT2 +uset_compact(USet* set); + /** * Inverts this set. This operation modifies this set so that * its value is its complement. This operation does not affect @@ -289,6 +432,19 @@ uset_removeString(USet* set, const UChar* str, int32_t strLen); U_STABLE void U_EXPORT2 uset_complement(USet* set); +/** + * Complements in this set all elements contained in the specified + * set. 
Any character in the other set will be removed if it is + * in this set, or will be added if it is not in this set. + * + * @param set the set with which to complement + * @param complement set that defines which elements will be xor'ed + * with this set. + * @draft ICU 3.2 + */ +U_DRAFT void U_EXPORT2 +uset_complementAll(USet* set, const USet* complement); + /** * Removes all of the elements from this set. This set will be * empty after this call returns. @@ -341,6 +497,32 @@ uset_containsRange(const USet* set, UChar32 start, UChar32 end); U_STABLE UBool U_EXPORT2 uset_containsString(const USet* set, const UChar* str, int32_t strLen); +/** + * Returns the index of the given character within this set, where + * the set is ordered by ascending code point. If the character + * is not in this set, return -1. The inverse of this method is + * uset_charAt(). + * @param set the set + * @param c the character to obtain the index for + * @return an index from 0..size()-1, or -1 + * @draft ICU 3.2 + */ +U_DRAFT int32_t U_EXPORT2 +uset_indexOf(const USet* set, UChar32 c); + +/** + * Returns the character at the given index within this set, where + * the set is ordered by ascending code point. If the index is + * out of range, return (UChar32)-1. The inverse of this method is + * uset_indexOf(). + * @param set the set + * @param index an index from 0..size()-1 to obtain the char for + * @return the character at the given index, or (UChar32)-1. + * @draft ICU 3.2 + */ +U_DRAFT UChar32 U_EXPORT2 +uset_charAt(const USet* set, int32_t index); + /** * Returns the number of characters and strings contained in the given * USet. @@ -387,24 +569,48 @@ uset_getItem(const USet* set, int32_t itemIndex, UChar* str, int32_t strCapacity, UErrorCode* ec); -/* TODO: propose the following to the list and make them public */ - /** - * @internal + * Returns true if set1 contains all the characters and strings + * of set2. It answers the question, 'Is set1 a superset of set2?' 
+ * @param set1 set to be checked for containment + * @param set2 set to be checked for containment + * @return true if the test condition is met + * @draft ICU 3.2 */ -U_INTERNAL UBool U_EXPORT2 +U_DRAFT UBool U_EXPORT2 uset_containsAll(const USet* set1, const USet* set2); /** - * @internal + * Returns true if set1 contains none of the characters and strings + * of set2. It answers the question, 'Are set1 and set2 disjoint?' + * @param set1 set to be checked for containment + * @param set2 set to be checked for containment + * @return true if the test condition is met + * @draft ICU 3.2 */ -U_INTERNAL UBool U_EXPORT2 +U_DRAFT UBool U_EXPORT2 uset_containsNone(const USet* set1, const USet* set2); /** - * @internal + * Returns true if set1 contains some of the characters and strings + * of set2. It answers the question, 'Do set1 and set2 have an intersection?' + * @param set1 set to be checked for containment + * @param set2 set to be checked for containment + * @return true if the test condition is met + * @draft ICU 3.2 */ -U_INTERNAL UBool U_EXPORT2 +U_DRAFT UBool U_EXPORT2 +uset_containsSome(const USet* set1, const USet* set2); + +/** + * Returns true if set1 contains all of the characters and strings + * of set2, and vice versa. It answers the question, 'Is set1 equal to set2?' 
+ * @param set1 set to be checked for containment + * @param set2 set to be checked for containment + * @return true if the test condition is met + * @draft ICU 3.2 + */ +U_DRAFT UBool U_EXPORT2 uset_equals(const USet* set1, const USet* set2); /********************************************************************* diff --git a/icu4c/source/common/uset.cpp b/icu4c/source/common/uset.cpp index 58798bcaf22..7a9da1e0167 100644 --- a/icu4c/source/common/uset.cpp +++ b/icu4c/source/common/uset.cpp @@ -79,6 +79,12 @@ uset_close(USet* set) { delete (UnicodeSet*) set; } +U_CAPI void U_EXPORT2 +uset_set(USet* set, + UChar32 start, UChar32 end) { + ((UnicodeSet*) set)->set(start, end); /* forwards to UnicodeSet::set() */ +} + U_CAPI int32_t U_EXPORT2 uset_applyPattern(USet *set, const UChar *pattern, int32_t patternLength, @@ -108,6 +114,35 @@ uset_applyPattern(USet *set, return pos.getIndex(); } +U_CAPI void U_EXPORT2 +uset_applyIntPropertyValue(USet* set, + UProperty prop, int32_t value, UErrorCode* ec) { + ((UnicodeSet*) set)->applyIntPropertyValue(prop, value, *ec); /* ec is dereferenced unconditionally */ +} + +U_CAPI void U_EXPORT2 +uset_applyPropertyAlias(USet* set, + const UChar *prop, int32_t propLength, + const UChar *value, int32_t valueLength, + UErrorCode* ec) { + + UnicodeString p(prop, propLength); /* wrap the C buffers for the C++ API */ + UnicodeString v(value, valueLength); + + ((UnicodeSet*) set)->applyPropertyAlias(p, v, *ec); +} + +U_CAPI UBool U_EXPORT2 +uset_resemblesPattern(const UChar *pattern, int32_t patternLength, + int32_t pos) { + + UnicodeString pat(pattern, patternLength); + + return ((pos+1) < pat.length() && /* at least one more code unit must follow pos */ + pat.charAt(pos) == (UChar)91/* '[' */) || + UnicodeSet::resemblesPattern(pat, pos); /* otherwise defer to the C++ check */ +} + U_CAPI int32_t U_EXPORT2 uset_toPattern(const USet* set, UChar* result, int32_t resultCapacity, @@ -162,11 +197,36 @@ uset_removeString(USet* set, const UChar* str, int32_t strLen) { ((UnicodeSet*) set)->remove(s); } +U_CAPI void U_EXPORT2 +uset_removeAll(USet* set, const USet* remove) { + ((UnicodeSet*) set)->removeAll(*(const UnicodeSet*)remove); +} + +U_CAPI void 
U_EXPORT2 +uset_retain(USet* set, UChar32 start, UChar32 end) { + ((UnicodeSet*) set)->retain(start, end); +} + +U_CAPI void U_EXPORT2 +uset_retainAll(USet* set, const USet* retain) { + ((UnicodeSet*) set)->retainAll(*(const UnicodeSet*)retain); +} + +U_CAPI void U_EXPORT2 +uset_compact(USet* set) { + ((UnicodeSet*) set)->compact(); +} + U_CAPI void U_EXPORT2 uset_complement(USet* set) { ((UnicodeSet*) set)->complement(); } +U_CAPI void U_EXPORT2 +uset_complementAll(USet* set, const USet* complement) { + ((UnicodeSet*) set)->complementAll(*(const UnicodeSet*)complement); +} + U_CAPI void U_EXPORT2 uset_clear(USet* set) { ((UnicodeSet*) set)->clear(); @@ -203,12 +263,26 @@ uset_containsNone(const USet* set1, const USet* set2) { return ((const UnicodeSet*) set1)->containsNone(* (const UnicodeSet*) set2); } +U_CAPI UBool U_EXPORT2 +uset_containsSome(const USet* set1, const USet* set2) { + return ((const UnicodeSet*) set1)->containsSome(* (const UnicodeSet*) set2); +} U_CAPI UBool U_EXPORT2 uset_equals(const USet* set1, const USet* set2) { return *(const UnicodeSet*)set1 == *(const UnicodeSet*)set2; } +U_CAPI int32_t U_EXPORT2 +uset_indexOf(const USet* set, UChar32 c) { + return ((const UnicodeSet*) set)->indexOf(c); /* read-only query: keep the const qualifier */ +} + +U_CAPI UChar32 U_EXPORT2 +uset_charAt(const USet* set, int32_t index) { + return ((const UnicodeSet*) set)->charAt(index); /* read-only query: keep the const qualifier */ +} + U_CAPI int32_t U_EXPORT2 uset_size(const USet* set) { return ((const UnicodeSet*) set)->size(); diff --git a/icu4c/source/test/cintltst/usettest.c b/icu4c/source/test/cintltst/usettest.c index eec7e099e49..802676251ee 100644 --- a/icu4c/source/test/cintltst/usettest.c +++ b/icu4c/source/test/cintltst/usettest.c @@ -57,6 +57,12 @@ static void Testj2269() { static const UChar PAT[] = {91,97,45,99,123,97,98,125,93,0}; /* "[a-c{ab}]" */ static const int32_t PAT_LEN = (sizeof(PAT) / sizeof(PAT[0])) - 1; +static const UChar PAT_lb[] = {0x6C, 0x62, 0}; /* "lb" */ +static const int32_t PAT_lb_LEN = (sizeof(PAT_lb) / sizeof(PAT_lb[0])) - 1; + 
+static const UChar VAL_SP[] = {0x53, 0x50, 0}; /* "SP" */ +static const int32_t VAL_SP_LEN = (sizeof(VAL_SP) / sizeof(VAL_SP[0])) - 1; + static const UChar STR_bc[] = {98,99,0}; /* "bc" */ static const int32_t STR_bc_LEN = (sizeof(STR_bc) / sizeof(STR_bc[0])) - 1; @@ -68,6 +74,7 @@ static const int32_t STR_ab_LEN = (sizeof(STR_ab) / sizeof(STR_ab[0])) - 1; */ static void TestAPI() { USet* set; + USet* set2; UErrorCode ec; /* [] */ @@ -88,6 +95,9 @@ static void TestAPI() { log_data_err("uset_openPattern([a-c{ab}]) failed - %s\n", u_errorName(ec)); return; } + if(!uset_resemblesPattern(PAT, PAT_LEN, 0)) { + log_data_err("uset_resemblesPattern of PAT failed\n"); + } expect(set, "abc{ab}", "def{bc}", &ec); /* [a-d{ab}] */ @@ -116,7 +126,64 @@ static void TestAPI() { uset_removeRange(set, 0x0050, 0x0063); expect(set, "de{bc}", "bcfg{ab}", NULL); + /* [g-l] */ + uset_set(set, 0x0067, 0x006C); + expect(set, "ghijkl", "de{bc}", NULL); + + if (uset_indexOf(set, 0x0067) != 0) { + log_data_err("uset_indexOf failed finding correct index of 'g'\n"); + } + + if (uset_charAt(set, 0) != 0x0067) { + log_data_err("uset_charAt failed finding correct char 'g' at index 0\n"); + } + + /* How to test this one...? 
*/ + uset_compact(set); + + /* [g-i]: keep only g..i out of [g-l] */ + uset_retain(set, 0x0067, 0x0069); + expect(set, "ghi", "dejkl{bc}", NULL); + + /* [0-9A-Fa-f]: binary property UCHAR_ASCII_HEX_DIGIT with value 1 */ + uset_applyIntPropertyValue(set, UCHAR_ASCII_HEX_DIGIT, 1, &ec); + if(U_FAILURE(ec)) { + log_data_err("uset_applyIntPropertyValue([UCHAR_ASCII_HEX_DIGIT]) failed - %s\n", u_errorName(ec)); + return; /* NOTE(review): set is not closed on this early return */ + } + expect(set, "0123456789ABCDEFabcdef", "GHIjkl{bc}", NULL); + + /* []: open a second set, then empty it */ + set2 = uset_open(1, 1); + uset_clear(set2); + + /* [ ]: property alias "lb" with value alias "SP"; NOTE(review): ec is not checked after this call */ + uset_applyPropertyAlias(set2, PAT_lb, PAT_lb_LEN, VAL_SP, VAL_SP_LEN, &ec); + expect(set2, " ", "abcdefghi{bc}", NULL); + + /* set2 = [a-c] */ + uset_set(set2, 0x0061, 0x0063); + /* set = [g-i] */ + uset_set(set, 0x0067, 0x0069); + + /* [a-c g-i]: the two sets are disjoint, so complementing adds a..c to set */ + uset_complementAll(set, set2); + expect(set, "abcghi", "def{bc}", NULL); + + /* [g-i]: take a..c back out of set */ + uset_removeAll(set, set2); + expect(set, "ghi", "abcdef{bc}", NULL); + + /* [a-c g-i]: set2 gains g..i from set */ + uset_addAll(set2, set); + expect(set2, "abcghi", "def{bc}", NULL); + + /* [g-i]: set2 keeps only what set also contains */ + uset_retainAll(set2, set); + expect(set2, "ghi", "abcdef{bc}", NULL); + uset_close(set); + uset_close(set2); } /*------------------------------------------------------------------ @@ -159,7 +226,7 @@ static void expectContainment(const USet* set, const char* list, UBool isIn) { const char* p = list; - UChar ustr[128]; + UChar ustr[4096]; char *pat; UErrorCode ec; int32_t rangeStart = -1, rangeEnd = -1, length; @@ -259,8 +326,8 @@ static char oneUCharToChar(UChar32 c) { static void expectItems(const USet* set, const char* items) { const char* p = items; - UChar ustr[128], itemStr[128]; - char buf[128]; + UChar ustr[4096], itemStr[4096]; + char buf[4096]; char *pat; UErrorCode ec; int32_t expectedSize = 0;