diff --git a/icu4c/source/i18n/i18n.dsp b/icu4c/source/i18n/i18n.dsp index 602560336f3..12fb7746717 100644 --- a/icu4c/source/i18n/i18n.dsp +++ b/icu4c/source/i18n/i18n.dsp @@ -241,6 +241,10 @@ SOURCE=.\unicdcm.cpp # End Source File # Begin Source File +SOURCE=.\uniset.cpp +# End Source File +# Begin Source File + SOURCE=.\unum.cpp # End Source File # Begin Source File @@ -265,7 +269,7 @@ SOURCE=.\brkiter.h InputPath=.\brkiter.h "..\..\include\brkiter.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy brkiter.h ..\..\include + copy brkiter.h ..\..\include # End Custom Build @@ -275,7 +279,7 @@ InputPath=.\brkiter.h InputPath=.\brkiter.h "..\..\include\brkiter.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy brkiter.h ..\..\include + copy brkiter.h ..\..\include # End Custom Build @@ -292,7 +296,7 @@ SOURCE=.\calendar.h InputPath=.\calendar.h "..\..\include\calendar.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy calendar.h ..\..\include + copy calendar.h ..\..\include # End Custom Build @@ -302,7 +306,7 @@ InputPath=.\calendar.h InputPath=.\calendar.h "..\..\include\calendar.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy calendar.h ..\..\include + copy calendar.h ..\..\include # End Custom Build @@ -319,7 +323,7 @@ SOURCE=.\choicfmt.h InputPath=.\choicfmt.h "..\..\include\choicfmt.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy choicfmt.h ..\..\include + copy choicfmt.h ..\..\include # End Custom Build @@ -329,7 +333,7 @@ InputPath=.\choicfmt.h InputPath=.\choicfmt.h "..\..\include\choicfmt.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy choicfmt.h ..\..\include + copy choicfmt.h ..\..\include # End Custom Build @@ -350,7 +354,7 @@ SOURCE=.\coleitr.h InputPath=.\coleitr.h "..\..\include\coleitr.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy coleitr.h ..\..\include + copy coleitr.h ..\..\include # End Custom Build @@ -360,7 +364,7 @@ InputPath=.\coleitr.h InputPath=.\coleitr.h "..\..\include\coleitr.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy coleitr.h ..\..\include + copy 
coleitr.h ..\..\include # End Custom Build @@ -377,7 +381,7 @@ SOURCE=.\coll.h InputPath=.\coll.h "..\..\include\coll.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy coll.h ..\..\include + copy coll.h ..\..\include # End Custom Build @@ -387,7 +391,7 @@ InputPath=.\coll.h InputPath=.\coll.h "..\..\include\coll.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy coll.h ..\..\include + copy coll.h ..\..\include # End Custom Build @@ -408,7 +412,7 @@ SOURCE=.\datefmt.h InputPath=.\datefmt.h "..\..\include\datefmt.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy datefmt.h ..\..\include + copy datefmt.h ..\..\include # End Custom Build @@ -418,7 +422,7 @@ InputPath=.\datefmt.h InputPath=.\datefmt.h "..\..\include\datefmt.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy datefmt.h ..\..\include + copy datefmt.h ..\..\include # End Custom Build @@ -435,7 +439,7 @@ SOURCE=.\dcfmtsym.h InputPath=.\dcfmtsym.h "..\..\include\dcfmtsym.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy dcfmtsym.h ..\..\include + copy dcfmtsym.h ..\..\include # End Custom Build @@ -445,7 +449,7 @@ InputPath=.\dcfmtsym.h InputPath=.\dcfmtsym.h "..\..\include\dcfmtsym.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy dcfmtsym.h ..\..\include + copy dcfmtsym.h ..\..\include # End Custom Build @@ -462,7 +466,7 @@ SOURCE=.\decimfmt.h InputPath=.\decimfmt.h "..\..\include\decimfmt.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy decimfmt.h ..\..\include + copy decimfmt.h ..\..\include # End Custom Build @@ -472,7 +476,7 @@ InputPath=.\decimfmt.h InputPath=.\decimfmt.h "..\..\include\decimfmt.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy decimfmt.h ..\..\include + copy decimfmt.h ..\..\include # End Custom Build @@ -489,7 +493,7 @@ SOURCE=.\dtfmtsym.h InputPath=.\dtfmtsym.h "..\..\include\dtfmtsym.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy dtfmtsym.h ..\..\include + copy dtfmtsym.h ..\..\include # End Custom Build @@ -499,7 +503,7 @@ InputPath=.\dtfmtsym.h InputPath=.\dtfmtsym.h "..\..\include\dtfmtsym.h" : $(SOURCE) 
"$(INTDIR)" "$(OUTDIR)" - copy dtfmtsym.h ..\..\include + copy dtfmtsym.h ..\..\include # End Custom Build @@ -516,7 +520,7 @@ SOURCE=.\fieldpos.h InputPath=.\fieldpos.h "..\..\include\fieldpos.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy fieldpos.h ..\..\include + copy fieldpos.h ..\..\include # End Custom Build @@ -526,7 +530,7 @@ InputPath=.\fieldpos.h InputPath=.\fieldpos.h "..\..\include\fieldpos.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy fieldpos.h ..\..\include + copy fieldpos.h ..\..\include # End Custom Build @@ -543,7 +547,7 @@ SOURCE=.\fmtable.h InputPath=.\fmtable.h "..\..\include\fmtable.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy fmtable.h ..\..\include + copy fmtable.h ..\..\include # End Custom Build @@ -553,7 +557,7 @@ InputPath=.\fmtable.h InputPath=.\fmtable.h "..\..\include\fmtable.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy fmtable.h ..\..\include + copy fmtable.h ..\..\include # End Custom Build @@ -570,7 +574,7 @@ SOURCE=.\format.h InputPath=.\format.h "..\..\include\format.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy format.h ..\..\include + copy format.h ..\..\include # End Custom Build @@ -580,7 +584,7 @@ InputPath=.\format.h InputPath=.\format.h "..\..\include\format.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy format.h ..\..\include + copy format.h ..\..\include # End Custom Build @@ -597,7 +601,7 @@ SOURCE=.\gregocal.h InputPath=.\gregocal.h "..\..\include\gregocal.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy gregocal.h ..\..\include + copy gregocal.h ..\..\include # End Custom Build @@ -607,7 +611,7 @@ InputPath=.\gregocal.h InputPath=.\gregocal.h "..\..\include\gregocal.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy gregocal.h ..\..\include + copy gregocal.h ..\..\include # End Custom Build @@ -628,7 +632,7 @@ SOURCE=.\msgfmt.h InputPath=.\msgfmt.h "..\..\include\msgfmt.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy msgfmt.h ..\..\include + copy msgfmt.h ..\..\include # End Custom Build @@ -638,7 +642,7 @@ 
InputPath=.\msgfmt.h InputPath=.\msgfmt.h "..\..\include\msgfmt.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy msgfmt.h ..\..\include + copy msgfmt.h ..\..\include # End Custom Build @@ -655,7 +659,7 @@ SOURCE=.\numfmt.h InputPath=.\numfmt.h "..\..\include\numfmt.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy numfmt.h ..\..\include + copy numfmt.h ..\..\include # End Custom Build @@ -665,7 +669,7 @@ InputPath=.\numfmt.h InputPath=.\numfmt.h "..\..\include\numfmt.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy numfmt.h ..\..\include + copy numfmt.h ..\..\include # End Custom Build @@ -682,7 +686,7 @@ SOURCE=.\parsepos.h InputPath=.\parsepos.h "..\..\include\parsepos.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy parsepos.h ..\..\include + copy parsepos.h ..\..\include # End Custom Build @@ -692,7 +696,7 @@ InputPath=.\parsepos.h InputPath=.\parsepos.h "..\..\include\parsepos.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy parsepos.h ..\..\include + copy parsepos.h ..\..\include # End Custom Build @@ -713,7 +717,7 @@ SOURCE=.\simpletz.h InputPath=.\simpletz.h "..\..\include\simpletz.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy simpletz.h ..\..\include + copy simpletz.h ..\..\include # End Custom Build @@ -723,7 +727,7 @@ InputPath=.\simpletz.h InputPath=.\simpletz.h "..\..\include\simpletz.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy simpletz.h ..\..\include + copy simpletz.h ..\..\include # End Custom Build @@ -744,7 +748,7 @@ SOURCE=.\smpdtfmt.h InputPath=.\smpdtfmt.h "..\..\include\smpdtfmt.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy smpdtfmt.h ..\..\include + copy smpdtfmt.h ..\..\include # End Custom Build @@ -754,7 +758,7 @@ InputPath=.\smpdtfmt.h InputPath=.\smpdtfmt.h "..\..\include\smpdtfmt.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy smpdtfmt.h ..\..\include + copy smpdtfmt.h ..\..\include # End Custom Build @@ -771,7 +775,7 @@ SOURCE=.\sortkey.h InputPath=.\sortkey.h "..\..\include\sortkey.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy sortkey.h ..\..\include 
+ copy sortkey.h ..\..\include # End Custom Build @@ -781,7 +785,7 @@ InputPath=.\sortkey.h InputPath=.\sortkey.h "..\..\include\sortkey.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy sortkey.h ..\..\include + copy sortkey.h ..\..\include # End Custom Build @@ -806,7 +810,7 @@ SOURCE=.\tblcoll.h InputPath=.\tblcoll.h "..\..\include\tblcoll.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy tblcoll.h ..\..\include + copy tblcoll.h ..\..\include # End Custom Build @@ -816,7 +820,7 @@ InputPath=.\tblcoll.h InputPath=.\tblcoll.h "..\..\include\tblcoll.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy tblcoll.h ..\..\include + copy tblcoll.h ..\..\include # End Custom Build @@ -837,7 +841,7 @@ SOURCE=.\timezone.h InputPath=.\timezone.h "..\..\include\timezone.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy timezone.h ..\..\include + copy timezone.h ..\..\include # End Custom Build @@ -847,7 +851,7 @@ InputPath=.\timezone.h InputPath=.\timezone.h "..\..\include\timezone.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy timezone.h ..\..\include + copy timezone.h ..\..\include # End Custom Build @@ -872,7 +876,7 @@ SOURCE=.\ubrk.h InputPath=.\ubrk.h "..\..\include\ubrk.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy ubrk.h ..\..\include + copy ubrk.h ..\..\include # End Custom Build @@ -882,7 +886,7 @@ InputPath=.\ubrk.h InputPath=.\ubrk.h "..\..\include\ubrk.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy ubrk.h ..\..\include + copy ubrk.h ..\..\include # End Custom Build @@ -899,7 +903,7 @@ SOURCE=.\ucal.h InputPath=.\ucal.h "..\..\include\ucal.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy ucal.h ..\..\include + copy ucal.h ..\..\include # End Custom Build @@ -909,7 +913,7 @@ InputPath=.\ucal.h InputPath=.\ucal.h "..\..\include\ucal.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy ucal.h ..\..\include + copy ucal.h ..\..\include # End Custom Build @@ -926,7 +930,7 @@ SOURCE=.\ucol.h InputPath=.\ucol.h "..\..\include\ucol.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy ucol.h ..\..\include + copy 
ucol.h ..\..\include # End Custom Build @@ -936,7 +940,7 @@ InputPath=.\ucol.h InputPath=.\ucol.h "..\..\include\ucol.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy ucol.h ..\..\include + copy ucol.h ..\..\include # End Custom Build @@ -953,7 +957,7 @@ SOURCE=.\udat.h InputPath=.\udat.h "..\..\include\udat.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy udat.h ..\..\include + copy udat.h ..\..\include # End Custom Build @@ -963,7 +967,7 @@ InputPath=.\udat.h InputPath=.\udat.h "..\..\include\udat.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy udat.h ..\..\include + copy udat.h ..\..\include # End Custom Build @@ -980,7 +984,7 @@ SOURCE=.\umsg.h InputPath=.\umsg.h "..\..\include\umsg.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy umsg.h ..\..\include + copy umsg.h ..\..\include # End Custom Build @@ -990,7 +994,7 @@ InputPath=.\umsg.h InputPath=.\umsg.h "..\..\include\umsg.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy umsg.h ..\..\include + copy umsg.h ..\..\include # End Custom Build @@ -1003,6 +1007,33 @@ SOURCE=.\unicdcm.h # End Source File # Begin Source File +SOURCE=.\uniset.h + +!IF "$(CFG)" == "i18n - Win32 Release" + +# Begin Custom Build +InputPath=.\uniset.h + +"..\..\include\uniset.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" + copy uniset.h ..\..\include + +# End Custom Build + +!ELSEIF "$(CFG)" == "i18n - Win32 Debug" + +# Begin Custom Build +InputPath=.\uniset.h + +"..\..\include\uniset.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" + copy uniset.h ..\..\include + +# End Custom Build + +!ENDIF + +# End Source File +# Begin Source File + SOURCE=.\unum.h !IF "$(CFG)" == "i18n - Win32 Release" @@ -1011,7 +1042,7 @@ SOURCE=.\unum.h InputPath=.\unum.h "..\..\include\unum.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy unum.h ..\..\include + copy unum.h ..\..\include # End Custom Build @@ -1021,7 +1052,7 @@ InputPath=.\unum.h InputPath=.\unum.h "..\..\include\unum.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" - copy unum.h ..\..\include + copy unum.h ..\..\include # End Custom Build 
diff --git a/icu4c/source/i18n/uniset.cpp b/icu4c/source/i18n/uniset.cpp new file mode 100644 index 00000000000..abd0213f6c9 --- /dev/null +++ b/icu4c/source/i18n/uniset.cpp @@ -0,0 +1,1277 @@ +#include "uniset.h" + +/** + * A mutable set of Unicode characters. Objects of this class + * represent character classes used in regular expressions. + * Such classes specify a subset of the set of all Unicode characters, + * which in this implementation is the characters from U+0000 to + * U+FFFF, ignoring surrogates. + * + *

This class supports two APIs. The first is modeled after Java 2's + * java.util.Set interface, although this class does not + * implement that interface. All methods of Set are + * supported, with the modification that they take a character range + * or single character instead of an Object, and they + * take a UnicodeSet instead of a Collection. + * + *

The second API is the + * applyPattern()/toPattern() API from the + * java.text.Format-derived classes. Unlike the + * methods that add characters, add categories, and control the logic + * of the set, the method applyPattern() sets all + * attributes of a UnicodeSet at once, based on a + * string pattern. + * + *

In addition, the set complement operation is supported through + * the complement() method. + * + *

Pattern syntax

+ * + * Patterns are accepted by the constructors and the + * applyPattern() methods and returned by the + * toPattern() method. These patterns follow a syntax + * similar to that employed by version 8 regular expression character + * classes: + * + *
+ * pattern := ('[' '^'? item* ']') | ('[:' '^'? category ':]')
+ * item := char | (char '-' char) | pattern-expr
+ * pattern-expr := pattern | pattern-expr pattern | pattern-expr op pattern
+ * op := '&' | '-'
+ * special := '[' | ']' | '-'
+ * char := any character that is not special | + * ('\' any character) | + * ('\\u' hex hex hex hex)
+ * hex := any hex digit, as defined by Character.digit(c, 16) + *
+ * + *
Legend: + * + * + *
a:=b + * a may be replaced by + * b + *
a? + * zero or one instance of a
+ *
a* + * zero or more instances of a
+ *
a|b + * either a or b
+ *
'a' + * the literal string between the quotes + *
+ *
+ * + * Patterns specify individual characters, ranges of characters, and + * Unicode character categories. When elements are concatenated, they + * specify their union. To complement a set, place a '^' immediately + * after the opening '[' or '[:'. In any other location, '^' has no + * special meaning. + * + *

Ranges are indicated by placing a '-' between two + * characters, as in "a-z". This specifies the range of all + * characters from the left to the right, in Unicode order. If the + * left and right characters are the same, then the range consists of + * just that character. If the left character is greater than the + * right character it is a syntax error. If a '-' occurs as the first + * character after the opening '[' or '[^', or if it occurs as the + * last character before the closing ']', then it is taken as a + * literal. Thus "[a\-b]", "[-ab]", and "[ab-]" all indicate the same + * set of three characters, 'a', 'b', and '-'. + * + *

Sets may be intersected using the '&' operator or the asymmetric + * set difference may be taken using the '-' operator, for example, + * "[[:L:]&[\u0000-\u0FFF]]" indicates the set of all Unicode letters + * with values less than 4096. Operators ('&' and '-') have equal + * precedence and bind left-to-right. Thus + * "[[:L:]-[a-z]-[\u0100-\u01FF]]" is equivalent to + * "[[[:L:]-[a-z]]-[\u0100-\u01FF]]". This only really matters for + * difference; intersection is commutative. + * + * + *
[a]The set containing 'a' + *
[a-z]The set containing 'a' + * through 'z' and all letters in between, in Unicode order + *
[^a-z]The set containing + * all characters but 'a' through 'z', + * that is, U+0000 through 'a'-1 and 'z'+1 through U+FFFF + *
[[pat1][pat2]] + * The union of sets specified by pat1 and pat2 + *
[[pat1]&[pat2]] + * The intersection of sets specified by pat1 and pat2 + *
[[pat1]-[pat2]] + * The asymmetric difference of sets specified by pat1 and + * pat2 + *
[:Lu:] + * The set of characters belonging to the given + * Unicode category, as defined by Character.getType(); in + * this case, Unicode uppercase letters + *
[:L:] + * The set of characters belonging to all Unicode categories + * starting with 'L', that is, [[:Lu:][:Ll:][:Lt:][:Lm:][:Lo:]]. + *
+ * + *

Character categories. + * + * Character categories are specified using the POSIX-like syntax + * '[:Lu:]'. The complement of a category is specified by inserting + * '^' after the opening '[:'. The following category names are + * recognized. Actual determination of category data uses + * Character.getType(), so it reflects the underlying + * implementation used by Character. As of Java 2 and + * JDK 1.1.8, this is Unicode 2.x.x - fill in version here. + * + *

+ * Normative
+ *     Mn = Mark, Non-Spacing
+ *     Mc = Mark, Spacing Combining
+ *     Me = Mark, Enclosing
+ * 
+ *     Nd = Number, Decimal Digit
+ *     Nl = Number, Letter
+ *     No = Number, Other
+ * 
+ *     Zs = Separator, Space
+ *     Zl = Separator, Line
+ *     Zp = Separator, Paragraph
+ * 
+ *     Cc = Other, Control
+ *     Cf = Other, Format
+ *     Cs = Other, Surrogate
+ *     Co = Other, Private Use
+ *     Cn = Other, Not Assigned
+ * 
+ * Informative
+ *     Lu = Letter, Uppercase
+ *     Ll = Letter, Lowercase
+ *     Lt = Letter, Titlecase
+ *     Lm = Letter, Modifier
+ *     Lo = Letter, Other
+ * 
+ *     Pc = Punctuation, Connector
+ *     Pd = Punctuation, Dash
+ *     Ps = Punctuation, Open
+ *     Pe = Punctuation, Close
+ *     Pi = Punctuation, Initial quote
+ *     Pf = Punctuation, Final quote
+ *     Po = Punctuation, Other
+ * 
+ *     Sm = Symbol, Math
+ *     Sc = Symbol, Currency
+ *     Sk = Symbol, Modifier
+ *     So = Symbol, Other
+ * 
+ * *Unsupported by Java (and hence unsupported by UnicodeSet). + * + * @author Alan Liu + * @version $RCSfile: uniset.cpp,v $ $Revision: 1.1 $ $Date: 1999/10/20 22:06:52 $ + */ + +// Note: This mapping is different in ICU and Java +const UnicodeString UnicodeSet::CATEGORY_NAMES( + "CnLuLlLtLmLoMnMeMcNdNlNoZsZlZpCcCfCoCsPdPsPePcPoSmScSkSoPiPf"); + +/** + * A cache mapping character category integers, as returned by + * Character.getType(), to pairs strings. Entries are initially + * null and are created on demand. + */ +UnicodeString* UnicodeSet::CATEGORY_PAIRS_CACHE = + new UnicodeString[Unicode::GENERAL_TYPES_COUNT]; + +//---------------------------------------------------------------- +// Debugging and testing +//---------------------------------------------------------------- + +/** + * Return the representation of this set as a list of character + * ranges. Ranges are listed in ascending Unicode order. For + * example, the set [a-zA-M3] is represented as "33AMaz". + */ +const UnicodeString& UnicodeSet::getPairs() const { + return pairs; +} + +//---------------------------------------------------------------- +// Public API +//---------------------------------------------------------------- + +/** + * Constructs an empty set. + */ +UnicodeSet::UnicodeSet() {} + +/** + * Constructs a set from the given pattern, optionally ignoring + * white space. See the class description for the syntax of the + * pattern language. + * @param pattern a string specifying what characters are in the set + * @param ignoreSpaces if true, all spaces in the + * pattern are ignored, except those preceded by '\\'. Spaces are + * those characters for which Character.isSpaceChar() + * is true. + * @exception IllegalArgumentException if the pattern + * contains a syntax error. 
+ */ +UnicodeSet::UnicodeSet(const UnicodeString& pattern, bool_t ignoreSpaces, + UErrorCode& status) { + applyPattern(pattern, ignoreSpaces, status); +} + +UnicodeSet::UnicodeSet(const UnicodeString& pattern, + UErrorCode& status) { + applyPattern(pattern, status); +} + +/** + * Constructs a set from the given Unicode character category. + * @param category an integer indicating the character category as + * returned by Character.getType(). + * @exception IllegalArgumentException if the given + * category is invalid. + */ +UnicodeSet::UnicodeSet(int8_t category, UErrorCode& status) { + if (U_SUCCESS(status)) { + if (category < 0 || category >= Unicode::GENERAL_TYPES_COUNT) { + status = U_ILLEGAL_ARGUMENT_ERROR; + } else { + pairs = getCategoryPairs(category); + } + } +} + +/** + * Modifies this set to represent the set specified by the given + * pattern, optionally ignoring white space. See the class + * description for the syntax of the pattern language. + * @param pattern a string specifying what characters are in the set + * @param ignoreSpaces if true, all spaces in the + * pattern are ignored. Spaces are those characters for which + * Character.isSpaceChar() is true. + * Characters preceded by '\\' are escaped, losing any special + * meaning they otherwise have. Spaces may be included by + * escaping them. + * @exception IllegalArgumentException if the pattern + * contains a syntax error. + */ +void UnicodeSet::applyPattern(const UnicodeString& pattern, + bool_t ignoreSpaces, + UErrorCode& status) { + if (U_FAILURE(status)) { + return; + } + + ParsePosition pos(0); + UnicodeString* pat = (UnicodeString*) &pattern; + + // To ignore spaces, create a new pattern without spaces. We + // have to process all '\' escapes. If '\' is encountered, + // insert it and the following character (if any -- let parse + // deal with any syntax errors) in the pattern. This allows + // escaped spaces. 
+ if (ignoreSpaces) { + pat = new UnicodeString(); + for (int32_t i=0; iappend(c); + c = pattern.charAt(++i); + // Fall through and append the following char + } + pat->append(c); + } + } + + parse(pairs, *pat, pos, status); + if (pos.getIndex() != pat->length()) { + status = U_ILLEGAL_ARGUMENT_ERROR; + } + if (pat != &pattern) { + delete pat; + } +} + +/** + * Returns a string representation of this set. If the result of + * calling this function is passed to a UnicodeSet constructor, it + * will produce another set that is equal to this one. + */ +UnicodeString& UnicodeSet::toPattern(UnicodeString& result) const { + result.remove().append((UChar)'['); + + // iterate through the ranges in the CharSet + for (int32_t i=0; iInteger.MAX_VALUE elements, returns + * Integer.MAX_VALUE. + * + * @return the number of elements in this set (its cardinality). + */ +int32_t UnicodeSet::size() const { + int32_t n = 0; + for (int32_t i=0; itrue if this set contains no elements. + * + * @return true if this set contains no elements. + */ +bool_t UnicodeSet::isEmpty() const { + return pairs.length() == 0; +} + +/** + * Returns true if this set contains the specified range + * of chars. + * + * @return true if this set contains the specified range + * of chars. + */ +bool_t UnicodeSet::contains(UChar first, UChar last) const { + // Set i to the end of the smallest range such that its end + // point >= last, or pairs.length() if no such range exists. + int32_t i = 1; + while (ipairs.charAt(i)) i+=2; + return i=pairs.charAt(i-1); +} + +/** + * Returns true if this set contains the specified char. + * + * @return true if this set contains the specified char. + */ +bool_t UnicodeSet::contains(UChar c) const { + return contains(c, c); +} + +/** + * Adds the specified range to this set if it is not already + * present. If this set already contains the specified range, + * the call leaves this set unchanged. If last > first + * then an empty range is added, leaving the set unchanged. 
+ * + * @param first first character, inclusive, of range to be added + * to this set. + * @param last last character, inclusive, of range to be added + * to this set. + */ +void UnicodeSet::add(UChar first, UChar last) { + if (first <= last) { + addPair(pairs, first, last); + } +} + +/** + * Adds the specified character to this set if it is not already + * present. If this set already contains the specified character, + * the call leaves this set unchanged. + */ +void UnicodeSet::add(UChar c) { + add(c, c); +} + +/** + * Removes the specified range from this set if it is present. + * The set will not contain the specified range once the call + * returns. If last > first then an empty range is + * removed, leaving the set unchanged. + * + * @param first first character, inclusive, of range to be removed + * from this set. + * @param last last character, inclusive, of range to be removed + * from this set. + */ +void UnicodeSet::remove(UChar first, UChar last) { + if (first <= last) { + removePair(pairs, first, last); + } +} + +/** + * Removes the specified character from this set if it is present. + * The set will not contain the specified range once the call + * returns. + */ +void UnicodeSet::remove(UChar c) { + remove(c, c); +} + +/** + * Returns true if the specified set is a subset + * of this set. + * + * @param c set to be checked for containment in this set. + * @return true if this set contains all of the elements of the + * specified set. + */ +bool_t UnicodeSet::containsAll(const UnicodeSet& c) const { + // The specified set is a subset if all of its pairs are contained + // in this set. + int32_t i = 1; + for (int32_t j=0; j= last, or pairs.length() if no such range + // exists. + while (ipairs.charAt(i)) i+=2; + if (i>pairs.length() || c.pairs.charAt(j) < pairs.charAt(i-1)) { + return FALSE; + } + } + return TRUE; +} + +/** + * Adds all of the elements in the specified set to this set if + * they're not already present. 
This operation effectively + * modifies this set so that its value is the union of the two + * sets. The behavior of this operation is unspecified if the specified + * collection is modified while the operation is in progress. + * + * @param c set whose elements are to be added to this set. + * @see #add(char, char) + */ +void UnicodeSet::addAll(const UnicodeSet& c) { + doUnion(pairs, c.pairs); +} + +/** + * Retains only the elements in this set that are contained in the + * specified set. In other words, removes from this set all of + * its elements that are not contained in the specified set. This + * operation effectively modifies this set so that its value is + * the intersection of the two sets. + * + * @param c set that defines which elements this set will retain. + */ +void UnicodeSet::retainAll(const UnicodeSet& c) { + doIntersection(pairs, c.pairs); +} + +/** + * Removes from this set all of its elements that are contained in the + * specified set. This operation effectively modifies this + * set so that its value is the asymmetric set difference of + * the two sets. + * + * @param c set that defines which elements will be removed from + * this set. + */ +void UnicodeSet::removeAll(const UnicodeSet& c) { + doDifference(pairs, c.pairs); +} + +/** + * Inverts this set. This operation modifies this set so that + * its value is its complement. This is equivalent to the pseudo code: + * this = new CharSet("[\u0000-\uFFFF]").removeAll(this). + */ +void UnicodeSet::complement() { + doComplement(pairs); +} + +/** + * Removes all of the elements from this set. This set will be + * empty after this call returns. + */ +void UnicodeSet::clear() { + pairs.remove(); +} + +/** + * Compares the specified object with this set for equality. 
Returns + * true if the specified object is also a set, the two sets + * have the same size, and every member of the specified set is + * contained in this set (or equivalently, every member of this set is + * contained in the specified set). + * + * @param o Object to be compared for equality with this set. + * @return true if the specified Object is equal to this set. + */ +bool_t UnicodeSet::operator==(const UnicodeSet& o) const { + return pairs == o.pairs; +} + +/** + * Returns the hash code value for this set. + * + * @return the hash code value for this set. + * @see Object#hashCode() + */ +int32_t UnicodeSet::hashCode() const { + return pairs.hashCode(); +} + +//---------------------------------------------------------------- +// Implementation: Pattern parsing +//---------------------------------------------------------------- + +/** + * Parses the given pattern, starting at the given position. The + * character at pattern.charAt(pos.getIndex()) must be '[', or the + * parse fails. Parsing continues until the corresponding closing + * ']'. If a syntax error is encountered between the opening and + * closing brace, the parse fails. Upon return from a U_SUCCESSful + * parse, the ParsePosition is updated to point to the character + * following the closing ']', and a StringBuffer containing a + * pairs list for the parsed pattern is returned. This method calls + * itself recursively to parse embedded subpatterns. + * + * @param pattern the string containing the pattern to be parsed. + * The portion of the string from pos.getIndex(), which must be a + * '[', to the corresponding closing ']', is parsed. + * @param pos upon entry, the position at which to being parsing. + * The character at pattern.charAt(pos.getIndex()) must be a '['. + * Upon return from a U_SUCCESSful parse, pos.getIndex() is either + * the character after the closing ']' of the parsed pattern, or + * pattern.length() if the closing ']' is the last character of + * the pattern string. 
+ * @return a StringBuffer containing a pairs list for the parsed + * substring of pattern + * @exception IllegalArgumentException if the parse fails. + */ +UnicodeString& UnicodeSet::parse(UnicodeString& pairsBuf /*result*/, + const UnicodeString& pattern, + ParsePosition& pos, + UErrorCode& status) { + if (U_FAILURE(status)) { + return pairsBuf; + } + + bool_t invert = FALSE; + pairsBuf.remove(); + + /** + * Nodes: 0 - idle, waiting for '[' + * 10 - like 11, but immediately after "[" or "[^" + * 11 - awaiting x, "]", "[...]", or "[:...:]" + * 21 - after x + * 23 - after x- + * + * The parsing state machine moves from node 0 through zero or more + * other nodes back to node 0, in a U_SUCCESSful parse. + */ + int32_t node = 0; + UChar first = 0; + int32_t i; + /** + * This loop iterates over the characters in the pattern. We + * start at the position specified by pos. We exit the loop + * when either a matching closing ']' is seen, or we read all + * characters of the pattern. + */ + for (i=pos.getIndex(); i= pattern.length()) { + status = U_ILLEGAL_ARGUMENT_ERROR; + return pairsBuf; + } + c = (UChar)0x0000; + for (int32_t j=(++i)+4; i "aq". addPair("ampz", 'n', + * 'o') => "az". + */ +void UnicodeSet::addPair(UnicodeString& pairs, UChar c, UChar d) { + UChar a = 0; + UChar b = 0; + for (int32_t i=0; i "ak". + * removePair("ampz", 'l', 'q') => "akrz". + */ +void UnicodeSet::removePair(UnicodeString& pairs, UChar c, UChar d) { + // Iterate over pairs until we find a pair that overlaps + // with the given range. + for (int32_t i=0; i= a. + // rangeEdited is set to true if we have modified the + // range a-b (the range at i) in place. + bool_t rangeEdited = FALSE; + if (c > a) { + // If c is after a and before b, then we have overlap + // of this sort: a--c==b--d or a--c==d--b, where a-b + // and c-d are the ranges of interest. We need to + // add the range a,c-1. 
+ pairs.setCharAt(i+1, (UChar)(c-1)); + // i is already a + rangeEdited = TRUE; + } + if (d < b) { + // If d is after a and before b, we overlap like this: + // c--a==d--b or a--c==d--b, where a-b is the range at + // i and c-d is the range being removed. We need to + // add the range d+1,b. + if (rangeEdited) { + // Insert {d+1, b} + pairs.insert(i+2, b); // b moves to i+3 by next insert: + pairs.insert(i+2, (UChar)(d+1)); + i += 2; + } else { + pairs.setCharAt(i, (UChar)(d+1)); + // i+1 is already b + rangeEdited = TRUE; + } + } + if (!rangeEdited) { + // If we didn't add any ranges, that means the entire + // range a-b must be deleted, since we have + // c--a==b--d. + pairs.remove(i, 2); + i -= 2; + } + } +} + +//---------------------------------------------------------------- +// Implementation: Fundamental operators +//---------------------------------------------------------------- + +/** + * Changes the pairs list to represent the complement of the set it + * currently represents. The pairs list will be normalized (in + * order and in shortest possible form) if the original pairs list + * was normalized. + */ +void UnicodeSet::doComplement(UnicodeString& pairs) { + if (pairs.length() == 0) { + pairs.append((UChar)0x0000).append((UChar)0xffff); + return; + } + + // Change each end to a start and each start to an end of the + // gaps between the ranges. That is, 3-7 9-12 becomes x-2 8-8 + // 13-x, where 'x' represents a range that must now be fixed + // up. 
+ for (int32_t i=0; i 0 && c1.charAt(i - 1) > ub) + ub = c1.charAt(i - 1); + + // now advance j to the first character that is greater + // that "ub" plus one + while (j < c2.length() && c2.charAt(j) <= ub + 1) + ++j; + + // if j points to the endpoint of a range, update "ub" + // to that character, or if j points to the start of + // a range and the endpoint of the preceding range is + // greater than "ub", update "up" to _that_ character + if (j % 2 == 1) + ub = c2.charAt(j); + else if (j > 0 && c2.charAt(j - 1) > ub) + ub = c2.charAt(j - 1); + } + // when we finally fall out of this loop, we will have stitched + // together a series of ranges that overlap or touch, i and j + // will both point to starting points of ranges, and "ub" will + // be the endpoint of the range we're working on. Write "ub" + // to the result + result.append(ub); + + // loop back around to create the next range in the result + } + + // we fall out to here when we've exhausted all the characters in + // one of the operands. We can append all of the remaining characters + // in the other operand without doing any extra work. + if (i < c1.length()) + result.append(c1, i, LONG_MAX); + if (j < c2.length()) + result.append(c2, j, LONG_MAX); + + c1 = result; +} + +/** + * Given two pairs lists, changes the first in place to represent + * the asymmetric difference of the two sets. + */ +void UnicodeSet::doDifference(UnicodeString& pairs, const UnicodeString& pairs2) { + UnicodeString p2(pairs2); + doComplement(p2); + doIntersection(pairs, p2); +} + +/** + * Given two pairs lists, changes the first in place to represent + * the intersection of the two sets. 
+ */ +void UnicodeSet::doIntersection(UnicodeString& c1, const UnicodeString& c2) { + UnicodeString result; + + int32_t i = 0; + int32_t j = 0; + int32_t oldI; + int32_t oldJ; + + // iterate until we've exhausted one of the operands + while (i < c1.length() && j < c2.length()) { + + // advance j until it points to a character that is larger than + // the one i points to. If this is the beginning of a one- + // character range, advance j to point to the end + if (i < c1.length() && i % 2 == 0) { + while (j < c2.length() && c2.charAt(j) < c1.charAt(i)) + ++j; + if (j < c2.length() && j % 2 == 0 && c2.charAt(j) == c1.charAt(i)) + ++j; + } + + // if j points to the endpoint of a range, save the current + // value of i, then advance i until it reaches a character + // which is larger than the character pointed at + // by j. All of the characters we've advanced over (except + // the one currently pointed to by i) are added to the result + oldI = i; + while (j % 2 == 1 && i < c1.length() && c1.charAt(i) <= c2.charAt(j)) + ++i; + result.append(c1, oldI, i-oldI); + + // if i points to the endpoint of a range, save the current + // value of j, then advance j until it reaches a character + // which is larger than the character pointed at + // by i. 
All of the characters we've advanced over (except + // the one currently pointed to by i) are added to the result + oldJ = j; + while (i % 2 == 1 && j < c2.length() && c2.charAt(j) <= c1.charAt(i)) + ++j; + result.append(c2, oldJ, j-oldJ); + + // advance i until it points to a character larger than j + // If it points at the beginning of a one-character range, + // advance it to the end of that range + if (j < c2.length() && j % 2 == 0) { + while (i < c1.length() && c1.charAt(i) < c2.charAt(j)) + ++i; + if (i < c1.length() && i % 2 == 0 && c2.charAt(j) == c1.charAt(i)) + ++i; + } + } + + c1 = result; +} + +//---------------------------------------------------------------- +// Implementation: Generation of pairs for Unicode categories +//---------------------------------------------------------------- + +/** + * Returns a pairs string for the given category, given its name. + * The category name must be either a two-letter name, such as + * "Lu", or a one letter name, such as "L". One-letter names + * indicate the logical union of all two-letter names that start + * with that letter. Case is significant. If the name starts + * with the character '^' then the complement of the given + * character set is returned. + * + * Although individual categories such as "Lu" are cached, we do + * not currently cache single-letter categories such as "L" or + * complements such as "^Lu" or "^L". It would be easy to cache + * these as well in a hashtable should the need arise. + */ +UnicodeString& UnicodeSet::getCategoryPairs(UnicodeString& result, + const UnicodeString& catName, + UErrorCode& status) { + if (U_FAILURE(status)) { + return result; + } + + // The temporary cat is only really needed if invert is true. + // TO DO: Allocate cat on the heap only if needed. 
+ UnicodeString cat(catName); + bool_t invert = (catName.length() > 1 && + catName.charAt(0) == '^'); + if (invert) { + cat.remove(0, 1); + } + + result.remove(); + + // if we have two characters, search the category map for that + // code and either construct and return a UnicodeSet from the + // data in the category map or throw an exception + if (cat.length() == 2) { + int32_t i = CATEGORY_NAMES.indexOf(cat); + if (i>=0 && i%2==0) { + i /= 2; + result = getCategoryPairs((int8_t)i); + if (!invert) { + return result; + } + } + } else if (cat.length() == 1) { + // if we have one character, search the category map for + // codes beginning with that letter, and union together + // all of the matching sets that we find (or throw an + // exception if there are no matches) + for (int32_t i=0; i= 0) { + pairs.append((UChar)first).append((UChar)last); + } + first = last = i; + } + } + } + if (first >= 0) { + pairs.append((UChar)first).append((UChar)last); + } + } + return CATEGORY_PAIRS_CACHE[cat]; +} + +//---------------------------------------------------------------- +// Implementation: Utility methods +//---------------------------------------------------------------- + +/** + * Returns the character after the given position, or '\uFFFF' if + * there is none. + */ +UChar UnicodeSet::charAfter(const UnicodeString& str, int32_t i) { + return ((++i) < str.length()) ? str.charAt(i) : (UChar)0xFFFF; +} + +/** + * TEMPORARY WORKAROUND UNTIL Unicode::digit() exists. + * Return the digit value of the given UChar, or -1. The radix + * value is ignored for now and hardcoded as 16. 
+ */ +int8_t UnicodeSet::digit(UChar c, int8_t radix) { + int32_t d = Unicode::digitValue(c); + if (d < 0) { + if (c >= (UChar)'a' && c <= (UChar)'f') { + d = c - (UChar)('a' - 10); + } else if (c >= (UChar)'A' && c <= (UChar)'F') { + d = c - (UChar)('A' - 10); + } + } + return (int8_t)d; +} diff --git a/icu4c/source/i18n/uniset.h b/icu4c/source/i18n/uniset.h new file mode 100644 index 00000000000..f3c12148855 --- /dev/null +++ b/icu4c/source/i18n/uniset.h @@ -0,0 +1,587 @@ +#ifndef UNICODESET_H +#define UNICODESET_H + +#include "utypes.h" +#include "unistr.h" +#include "parsepos.h" + +/** + * A mutable set of Unicode characters. Objects of this class + * represent character classes used in regular expressions. + * Such classes specify a subset of the set of all Unicode characters, + * which in this implementation is the characters from U+0000 to + * U+FFFF, ignoring surrogates. + * + *

This class supports two APIs. The first is modeled after Java 2's + * java.util.Set interface, although this class does not + * implement that interface. All methods of Set are + * supported, with the modification that they take a character range + * or single character instead of an Object, and they + * take a UnicodeSet instead of a Collection. + * + *

The second API is the + * applyPattern()/toPattern() API from the + * java.text.Format-derived classes. Unlike the + * methods that add characters, add categories, and control the logic + * of the set, the method applyPattern() sets all + * attributes of a UnicodeSet at once, based on a + * string pattern. + * + *

In addition, the set complement operation is supported through + * the complement() method. + * + *

Pattern syntax

+ * + * Patterns are accepted by the constructors and the + * applyPattern() methods and returned by the + * toPattern() method. These patterns follow a syntax + * similar to that employed by version 8 regular expression character + * classes: + * + *
+ * pattern := ('[' '^'? item* ']') | ('[:' '^'? category ':]')
+ * item := char | (char '-' char) | pattern-expr
+ * pattern-expr := pattern | pattern-expr pattern | pattern-expr op pattern
+ * op := '&' | '-'
+ * special := '[' | ']' | '-'
+ * char := any character that is not special | + * ('\' any character) | + * ('\\u' hex hex hex hex)
+ * hex := any hex digit, as defined by Character.digit(c, 16) + *
+ * + *
Legend: + * + * + *
a:=b + * a may be replaced by + * b + *
a? + * zero or one instance of a
+ *
a* + * zero or more instances of a
+ *
a|b + * either a or b
+ *
'a' + * the literal string between the quotes + *
+ *
+ * + * Patterns specify individual characters, ranges of characters, and + * Unicode character categories. When elements are concatenated, they + * specify their union. To complement a set, place a '^' immediately + * after the opening '[' or '[:'. In any other location, '^' has no + * special meaning. + * + *

Ranges are indicated by placing a '-' between two + * characters, as in "a-z". This specifies the range of all + * characters from the left to the right, in Unicode order. If the + * left and right characters are the same, then the range consists of + * just that character. If the left character is greater than the + * right character it is a syntax error. If a '-' occurs as the first + * character after the opening '[' or '[^', or if it occurs as the + * last character before the closing ']', then it is taken as a + * literal. Thus "[a\-b]", "[-ab]", and "[ab-]" all indicate the same + * set of three characters, 'a', 'b', and '-'. + * + *

Sets may be intersected using the '&' operator or the asymmetric + * set difference may be taken using the '-' operator, for example, + * "[[:L:]&[\u0000-\u0FFF]]" indicates the set of all Unicode letters + * with values less than 4096. Operators ('&' and '-') have equal + * precedence and bind left-to-right. Thus + * "[[:L:]-[a-z]-[\u0100-\u01FF]]" is equivalent to + * "[[[:L:]-[a-z]]-[\u0100-\u01FF]]". This only really matters for + * difference; intersection is commutative. + * + *
[a]The set containing 'a' + *
[a-z]The set containing 'a' + * through 'z' and all letters in between, in Unicode order + *
[^a-z]The set containing + * all characters but 'a' through 'z', + * that is, U+0000 through 'a'-1 and 'z'+1 through U+FFFF + *
[[pat1][pat2]] + * The union of sets specified by pat1 and pat2 + *
[[pat1]&[pat2]] + * The intersection of sets specified by pat1 and pat2 + *
[[pat1]-[pat2]] + * The asymmetric difference of sets specified by pat1 and + * pat2 + *
[:Lu:] + * The set of characters belonging to the given + * Unicode category, as defined by Character.getType(); in + * this case, Unicode uppercase letters + *
[:L:] + * The set of characters belonging to all Unicode categories + * starting with 'L', that is, [[:Lu:][:Ll:][:Lt:][:Lm:][:Lo:]]. + *
+ * + *

Character categories. + * + * Character categories are specified using the POSIX-like syntax + * '[:Lu:]'. The complement of a category is specified by inserting + * '^' after the opening '[:'. The following category names are + * recognized. Actual determination of category data uses + * Character.getType(), so it reflects the underlying + * implementation used by Character. As of Java 2 and + * JDK 1.1.8, this is Unicode 2.x.x - fill in version here. + * + *

+ * Normative
+ *     Mn = Mark, Non-Spacing
+ *     Mc = Mark, Spacing Combining
+ *     Me = Mark, Enclosing
+ * 
+ *     Nd = Number, Decimal Digit
+ *     Nl = Number, Letter
+ *     No = Number, Other
+ * 
+ *     Zs = Separator, Space
+ *     Zl = Separator, Line
+ *     Zp = Separator, Paragraph
+ * 
+ *     Cc = Other, Control
+ *     Cf = Other, Format
+ *     Cs = Other, Surrogate
+ *     Co = Other, Private Use
+ *     Cn = Other, Not Assigned
+ * 
+ * Informative
+ *     Lu = Letter, Uppercase
+ *     Ll = Letter, Lowercase
+ *     Lt = Letter, Titlecase
+ *     Lm = Letter, Modifier
+ *     Lo = Letter, Other
+ * 
+ *     Pc = Punctuation, Connector
+ *     Pd = Punctuation, Dash
+ *     Ps = Punctuation, Open
+ *     Pe = Punctuation, Close
+ *    *Pi = Punctuation, Initial quote
+ *    *Pf = Punctuation, Final quote
+ *     Po = Punctuation, Other
+ * 
+ *     Sm = Symbol, Math
+ *     Sc = Symbol, Currency
+ *     Sk = Symbol, Modifier
+ *     So = Symbol, Other
+ * 
+ * *Unsupported by Java (and hence unsupported by UnicodeSet). + * + * @author Alan Liu + * @version $RCSfile: uniset.h,v $ $Revision: 1.1 $ $Date: 1999/10/20 22:06:52 $ + */ +class U_I18N_API UnicodeSet { + + /** + * The internal representation is a StringBuffer of even length. + * Each pair of characters represents a range that is included in + * the set. A single character c is represented as cc. Thus, the + * ranges in the set are (a,b), a and b inclusive, where a = + * pairs.charAt(i) and b = pairs.charAt(i+1) for all even i, 0 <= + * i <= pairs.length()-2. Pairs are always stored in ascending + * Unicode order. Pairs are always stored in shortest form. For + * example, if the pair "hh", representing the single character + * 'h', is added to the pairs list "agik", representing the ranges + * 'a'-'g' and 'i'-'k', the result is "ak", not "aghhik". + */ + UnicodeString pairs; + + static const UnicodeString CATEGORY_NAMES; + + /** + * A cache mapping character category integers, as returned by + * Character.getType(), to pairs strings. Entries are initially + * null and are created on demand. + */ + static UnicodeString* CATEGORY_PAIRS_CACHE; + + //---------------------------------------------------------------- + // Debugging and testing + //---------------------------------------------------------------- + +public: + + /** + * Return the representation of this set as a list of character + * ranges. Ranges are listed in ascending Unicode order. For + * example, the set [a-zA-M3] is represented as "33AMaz". + */ + const UnicodeString& getPairs() const; + + //---------------------------------------------------------------- + // Public API + //---------------------------------------------------------------- + +public: + + /** + * Constructs an empty set. + */ + UnicodeSet(); + + /** + * Constructs a set from the given pattern. See the class + * description for the syntax of the pattern language. 
+ * @param pattern a string specifying what characters are in the set + * @exception IllegalArgumentException if the pattern + * contains a syntax error. + */ + UnicodeSet(const UnicodeString& pattern, + UErrorCode& status); + + /** + * Constructs a set from the given pattern, optionally ignoring + * white space. See the class description for the syntax of the + * pattern language. + * @param pattern a string specifying what characters are in the set + * @param ignoreSpaces if true, all spaces in the + * pattern are ignored, except those preceded by '\\'. Spaces are + * those characters for which Character.isSpaceChar() + * is true. + * @exception IllegalArgumentException if the pattern + * contains a syntax error. + */ + UnicodeSet(const UnicodeString& pattern, bool_t ignoreSpaces, + UErrorCode& status); + + /** + * Constructs a set from the given Unicode character category. + * @param category an integer indicating the character category as + * returned by Character.getType(). + * @exception IllegalArgumentException if the given + * category is invalid. + */ + UnicodeSet(int8_t category, UErrorCode& status); + + /** + * Modifies this set to represent the set specified by the given + * pattern, optionally ignoring white space. See the class + * description for the syntax of the pattern language. + * @param pattern a string specifying what characters are in the set + * @param ignoreSpaces if true, all spaces in the + * pattern are ignored. Spaces are those characters for which + * Character.isSpaceChar() is true. + * Characters preceded by '\\' are escaped, losing any special + * meaning they otherwise have. Spaces may be included by + * escaping them. + * @exception IllegalArgumentException if the pattern + * contains a syntax error. + */ + virtual void applyPattern(const UnicodeString& pattern, + bool_t ignoreSpaces, + UErrorCode& status); + + /** + * Modifies this set to represent the set specified by the given + * pattern. 
See the class description for the syntax of the pattern + * language. + * @param pattern a string specifying what characters are in the set + * @exception IllegalArgumentException if the pattern + * contains a syntax error. + */ + void applyPattern(const UnicodeString& pattern, + UErrorCode& status); + + /** + * Returns a string representation of this set. If the result of + * calling this function is passed to a UnicodeSet constructor, it + * will produce another set that is equal to this one. + */ + virtual UnicodeString& toPattern(UnicodeString& result) const; + + /** + * Returns the number of elements in this set (its cardinality). If this + * set contains more than Integer.MAX_VALUE elements, returns + * Integer.MAX_VALUE. + * + * @return the number of elements in this set (its cardinality). + */ + virtual int32_t size() const; + + /** + * Returns true if this set contains no elements. + * + * @return true if this set contains no elements. + */ + virtual bool_t isEmpty() const; + + /** + * Returns true if this set contains the specified range + * of chars. + * + * @return true if this set contains the specified range + * of chars. + */ + virtual bool_t contains(UChar first, UChar last) const; + + /** + * Returns true if this set contains the specified char. + * + * @return true if this set contains the specified char. + */ + virtual bool_t contains(UChar c) const; + + /** + * Adds the specified range to this set if it is not already + * present. If this set already contains the specified range, + * the call leaves this set unchanged. If last > first + * then an empty range is added, leaving the set unchanged. + * + * @param first first character, inclusive, of range to be added + * to this set. + * @param last last character, inclusive, of range to be added + * to this set. + */ + virtual void add(UChar first, UChar last); + + /** + * Adds the specified character to this set if it is not already + * present. 
If this set already contains the specified character, + * the call leaves this set unchanged. + */ + virtual void add(UChar c); + + /** + * Removes the specified range from this set if it is present. + * The set will not contain the specified range once the call + * returns. If last > first then an empty range is + * removed, leaving the set unchanged. + * + * @param first first character, inclusive, of range to be removed + * from this set. + * @param last last character, inclusive, of range to be removed + * from this set. + */ + virtual void remove(UChar first, UChar last); + + /** + * Removes the specified character from this set if it is present. + * The set will not contain the specified range once the call + * returns. + */ + virtual void remove(UChar c); + + /** + * Returns true if the specified set is a subset + * of this set. + * + * @param c set to be checked for containment in this set. + * @return true if this set contains all of the elements of the + * specified set. + */ + virtual bool_t containsAll(const UnicodeSet& c) const; + + /** + * Adds all of the elements in the specified set to this set if + * they're not already present. This operation effectively + * modifies this set so that its value is the union of the two + * sets. The behavior of this operation is unspecified if the specified + * collection is modified while the operation is in progress. + * + * @param c set whose elements are to be added to this set. + * @see #add(char, char) + */ + virtual void addAll(const UnicodeSet& c); + + /** + * Retains only the elements in this set that are contained in the + * specified set. In other words, removes from this set all of + * its elements that are not contained in the specified set. This + * operation effectively modifies this set so that its value is + * the intersection of the two sets. + * + * @param c set that defines which elements this set will retain. 
+ */ + virtual void retainAll(const UnicodeSet& c); + + /** + * Removes from this set all of its elements that are contained in the + * specified set. This operation effectively modifies this + * set so that its value is the asymmetric set difference of + * the two sets. + * + * @param c set that defines which elements will be removed from + * this set. + */ + virtual void removeAll(const UnicodeSet& c); + + /** + * Inverts this set. This operation modifies this set so that + * its value is its complement. This is equivalent to the pseudo code: + * this = new CharSet("[\u0000-\uFFFF]").removeAll(this). + */ + virtual void complement(); + + /** + * Removes all of the elements from this set. This set will be + * empty after this call returns. + */ + virtual void clear(); + + /** + * Compares the specified object with this set for equality. Returns + * true if the specified object is also a set, the two sets + * have the same size, and every member of the specified set is + * contained in this set (or equivalently, every member of this set is + * contained in the specified set). + * + * @param o Object to be compared for equality with this set. + * @return true if the specified Object is equal to this set. + */ + virtual bool_t operator==(const UnicodeSet& o) const; + + bool_t operator!=(const UnicodeSet& o) const; + + /** + * Returns the hash code value for this set. + * + * @return the hash code value for this set. + * @see Object#hashCode() + */ + virtual int32_t hashCode() const; + + //---------------------------------------------------------------- + // Implementation: Pattern parsing + //---------------------------------------------------------------- + +private: + + /** + * Parses the given pattern, starting at the given position. The + * character at pattern.charAt(pos.getIndex()) must be '[', or the + * parse fails. Parsing continues until the corresponding closing + * ']'. 
If a syntax error is encountered between the opening and + * closing brace, the parse fails. Upon return from a successful + * parse, the ParsePosition is updated to point to the character + * following the closing ']', and a StringBuffer containing a + * pairs list for the parsed pattern is returned. This method calls + * itself recursively to parse embedded subpatterns. + * + * @param pattern the string containing the pattern to be parsed. + * The portion of the string from pos.getIndex(), which must be a + * '[', to the corresponding closing ']', is parsed. + * @param pos upon entry, the position at which to being parsing. + * The character at pattern.charAt(pos.getIndex()) must be a '['. + * Upon return from a successful parse, pos.getIndex() is either + * the character after the closing ']' of the parsed pattern, or + * pattern.length() if the closing ']' is the last character of + * the pattern string. + * @return a StringBuffer containing a pairs list for the parsed + * substring of pattern + * @exception IllegalArgumentException if the parse fails. + */ + static UnicodeString& parse(UnicodeString& pairsBuf /*result*/, + const UnicodeString& pattern, + ParsePosition& pos, + UErrorCode& status); + + //---------------------------------------------------------------- + // Implementation: Efficient in-place union & difference + //---------------------------------------------------------------- + + /** + * Performs a union operation: adds the range 'c'-'d' to the given + * pairs list. The pairs list is modified in place. The result + * is normalized (in order and as short as possible). For + * example, addPair("am", 'l', 'q') => "aq". addPair("ampz", 'n', + * 'o') => "az". + */ + static void addPair(UnicodeString& pairs, UChar c, UChar d); + + /** + * Performs an asymmetric difference: removes the range 'c'-'d' + * from the pairs list. The pairs list is modified in place. The + * result is normalized (in order and as short as possible). 
For + * example, removePair("am", 'l', 'q') => "ak". + * removePair("ampz", 'l', 'q') => "akrz". + */ + static void removePair(UnicodeString& pairs, UChar c, UChar d); + + //---------------------------------------------------------------- + // Implementation: Fundamental operators + //---------------------------------------------------------------- + + /** + * Changes the pairs list to represent the complement of the set it + * currently represents. The pairs list will be normalized (in + * order and in shortest possible form) if the original pairs list + * was normalized. + */ + static void doComplement(UnicodeString& pairs); + + /** + * Given two pairs lists, changes the first in place to represent + * the union of the two sets. + */ + static void doUnion(UnicodeString& pairs, const UnicodeString& c2); + + /** + * Given two pairs lists, changes the first in place to represent + * the asymmetric difference of the two sets. + */ + static void doDifference(UnicodeString& pairs, const UnicodeString& pairs2); + + /** + * Given two pairs lists, changes the first in place to represent + * the intersection of the two sets. + */ + static void doIntersection(UnicodeString& pairs, const UnicodeString& c2); + + //---------------------------------------------------------------- + // Implementation: Generation of pairs for Unicode categories + //---------------------------------------------------------------- + + /** + * Returns a pairs string for the given category, given its name. + * The category name must be either a two-letter name, such as + * "Lu", or a one letter name, such as "L". One-letter names + * indicate the logical union of all two-letter names that start + * with that letter. Case is significant. If the name starts + * with the character '^' then the complement of the given + * character set is returned. 
+ * + * Although individual categories such as "Lu" are cached, we do + * not currently cache single-letter categories such as "L" or + * complements such as "^Lu" or "^L". It would be easy to cache + * these as well in a hashtable should the need arise. + */ + static UnicodeString& getCategoryPairs(UnicodeString& result, + const UnicodeString& catName, + UErrorCode& status); + + /** + * Returns a pairs string for the given category. This string is + * cached and returned again if this method is called again with + * the same parameter. + */ + static const UnicodeString& getCategoryPairs(int8_t cat); + + //---------------------------------------------------------------- + // Implementation: Utility methods + //---------------------------------------------------------------- + + /** + * Returns the character after the given position, or '\uFFFF' if + * there is none. + */ + static UChar charAfter(const UnicodeString& str, int32_t i); + + /** + * TEMPORARY WORKAROUND UNTIL Unicode::digit() exists. + * Return the digit value of the given UChar, or -1. The radix + * value is ignored for now and hardcoded as 16. 
+ */ + static int8_t digit(UChar c, int8_t radix); +}; + +inline void UnicodeSet::applyPattern(const UnicodeString& pattern, + UErrorCode& status) { + applyPattern(pattern, FALSE, status); +} + +inline bool_t UnicodeSet::operator!=(const UnicodeSet& o) const { + return !operator==(o); +} + +#endif diff --git a/icu4c/source/test/intltest/intltest.dsp b/icu4c/source/test/intltest/intltest.dsp index ce602ccd794..9b6960c043b 100644 --- a/icu4c/source/test/intltest/intltest.dsp +++ b/icu4c/source/test/intltest/intltest.dsp @@ -327,6 +327,10 @@ SOURCE=.\ucdtest.cpp # End Source File # Begin Source File +SOURCE=.\usettest.cpp +# End Source File +# Begin Source File + SOURCE=.\ustrtest.cpp # End Source File # End Group @@ -571,6 +575,14 @@ SOURCE=.\ucdtest.h # End Source File # Begin Source File +SOURCE=..\..\i18n\uniset.h +# End Source File +# Begin Source File + +SOURCE=.\usettest.h +# End Source File +# Begin Source File + SOURCE=.\ustrtest.h # End Source File # End Group diff --git a/icu4c/source/test/intltest/itformat.cpp b/icu4c/source/test/intltest/itformat.cpp index 8f3dae32bc2..8044c231fe0 100644 --- a/icu4c/source/test/intltest/itformat.cpp +++ b/icu4c/source/test/intltest/itformat.cpp @@ -35,15 +35,15 @@ #include "dtfmttst.h" // DateFormatTest #include "tmsgfmt.h" // TestMessageFormat #include "dtfmrgts.h" // DateFormatRegressionTest -#include "msfmrgts.h" // MessageFormatRegressionTest -#include "miscdtfm.h" // DateFormatMiscTests +#include "msfmrgts.h" // MessageFormatRegressionTest +#include "miscdtfm.h" // DateFormatMiscTests #include "nmfmtrt.h" // NumberFormatRoundTripTest #include "numrgts.h" // NumberFormatRegressionTest -#include "dtfmtrtts.h" // DateFormatRoundTripTest -#include "pptest.h" // ParsePositionTest -#include "calregts.h" // CalendarRegressionTest +#include "dtfmtrtts.h" // DateFormatRoundTripTest +#include "pptest.h" // ParsePositionTest +#include "calregts.h" // CalendarRegressionTest #include "tzregts.h" // TimeZoneRegressionTest - 
+#include "usettest.h" // UnicodeSetTest void IntlTestFormat::runIndexedTest( int32_t index, bool_t exec, char* &name, char* par ) { @@ -296,6 +296,15 @@ void IntlTestFormat::runIndexedTest( int32_t index, bool_t exec, char* &name, ch callTest( test, par ); } break; + + case 26: + name = "UnicodeSetTest"; + if (exec) { + logln("UnicodeSetTest---"); logln(); + UnicodeSetTest test; + callTest(test, par); + } + break; /* case 28: name = "DateFormatSymbolsCAPI"; @@ -414,6 +423,7 @@ void IntlTestFormat::runIndexedTest( int32_t index, bool_t exec, char* &name, ch } break; */ + default: name = ""; break; //needed to end loop } if (exec) { diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp new file mode 100644 index 00000000000..c232efdade2 --- /dev/null +++ b/icu4c/source/test/intltest/usettest.cpp @@ -0,0 +1,108 @@ +#include "utypes.h" +#include "usettest.h" +#include "uniset.h" + +#define CASE(id,test) case id: \ + name = #test; \ + if (exec) { \ + logln(#test "---"); \ + logln((UnicodeString)""); \ + test(); \ + } \ + break; + +void +UnicodeSetTest::runIndexedTest(int32_t index, bool_t exec, + char* &name, char* par) { + // if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest"); + switch (index) { + CASE(0,Test1) + + default: name = ""; break; + } +} + +void +UnicodeSetTest::Test1() { + UErrorCode status = U_ZERO_ERROR; + + UnicodeSet set; // Construct empty set + set.add('a', 'z'); + expect(set, "az"); + set.remove('m', 'p'); + expect(set, "alqz"); + set.remove('e', 'g'); + expect(set, "adhlqz"); + set.remove('d', 'i'); + expect(set, "acjlqz"); + set.remove('c', 'r'); + expect(set, "absz"); + set.add('f', 'q'); + expect(set, "abfqsz"); + set.remove('a', 'g'); + expect(set, "hqsz"); + set.remove('a', 'z'); + expect(set, ""); + + set.applyPattern("[[a-m]&[d-z]&[k-y]]", status); + if (U_FAILURE(status)) { + errln("FAIL: Unexpected pattern parse failure"); + return; + } + expect(set, "km"); + + 
set.applyPattern("[[a-z]-[m-y]-[d-r]]", status); + if (U_FAILURE(status)) { + errln("FAIL: Unexpected pattern parse failure"); + return; + } + expect(set, "aczz"); + + set.applyPattern("[a\\-z]", status); + if (U_FAILURE(status)) { + errln("FAIL: Unexpected pattern parse failure"); + return; + } + expect(set, "--aazz"); + set.applyPattern("[-az]", status); + if (U_FAILURE(status)) { + errln("FAIL: Unexpected pattern parse failure"); + return; + } + expect(set, "--aazz"); + set.applyPattern("[az-]", status); + if (U_FAILURE(status)) { + errln("FAIL: Unexpected pattern parse failure"); + return; + } + expect(set, "--aazz"); +} + +void +UnicodeSetTest::expect(const UnicodeSet& set, const UnicodeString& expectedPairs) { + if (set.getPairs() != expectedPairs) { + errln(UnicodeString("FAIL: Expected pair list \"") + + escape(expectedPairs) + "\", got \"" + + escape(set.getPairs()) + '"'); + } +} + +static char toHexString(int32_t i) { return i + (i < 10 ? '0' : ('A' - 10)); } + +UnicodeString +UnicodeSetTest::escape(const UnicodeString& s) { + UnicodeString buf; + for (int32_t i=0; i> 12); + buf += toHexString((c & 0x0F00) >> 8); + buf += toHexString((c & 0x00F0) >> 4); + buf += toHexString(c & 0x000F); + } + } + return buf; +} diff --git a/icu4c/source/test/intltest/usettest.h b/icu4c/source/test/intltest/usettest.h new file mode 100644 index 00000000000..f7ab63a69f2 --- /dev/null +++ b/icu4c/source/test/intltest/usettest.h @@ -0,0 +1,25 @@ +#ifndef _TESTUNISET +#define _TESTUNISET + +#include "utypes.h" +#include "intltest.h" + +class UnicodeSet; +class UnicodeString; + +/** + * UnicodeSet test + */ +class UnicodeSetTest: public IntlTest { + + void runIndexedTest(int32_t index, bool_t exec, char* &name, char* par=NULL); + +private: + + void Test1(); + + void expect(const UnicodeSet& set, const UnicodeString& expectedPairs); + static UnicodeString escape(const UnicodeString& s); +}; + +#endif