From 759772e313cfdf157a942696df7925548c848ffa Mon Sep 17 00:00:00 2001 From: Alan Liu Date: Tue, 26 Oct 1999 17:15:44 +0000 Subject: [PATCH] [ICU-34] Added copy constructor etc. X-Commit-URL: https://ssl.icu-project.org/trac/changeset/121 --- icu4c/source/i18n/uniset.cpp | 272 +++++++++-------------------------- icu4c/source/i18n/uniset.h | 105 +++++++++----- 2 files changed, 135 insertions(+), 242 deletions(-) diff --git a/icu4c/source/i18n/uniset.cpp b/icu4c/source/i18n/uniset.cpp index abd0213f6c9..04314dd5bd3 100644 --- a/icu4c/source/i18n/uniset.cpp +++ b/icu4c/source/i18n/uniset.cpp @@ -1,180 +1,23 @@ +/* +********************************************************************** +* Copyright (C) 1999 Alan Liu and others. All rights reserved. +********************************************************************** +* Date Name Description +* 10/20/99 alan Creation. +********************************************************************** +*/ + #include "uniset.h" +#include "parsepos.h" -/** - * A mutable set of Unicode characters. Objects of this class - * represent character classes used in regular expressions. - * Such classes specify a subset of the set of all Unicode characters, - * which in this implementation is the characters from U+0000 to - * U+FFFF, ignoring surrogates. - * - *

This class supports two APIs. The first is modeled after Java 2's - * java.util.Set interface, although this class does not - * implement that interface. All methods of Set are - * supported, with the modification that they take a character range - * or single character instead of an Object, and they - * take a UnicodeSet instead of a Collection. - * - *

The second API is the - * applyPattern()/toPattern() API from the - * java.text.Format-derived classes. Unlike the - * methods that add characters, add categories, and control the logic - * of the set, the method applyPattern() sets all - * attributes of a UnicodeSet at once, based on a - * string pattern. - * - *

In addition, the set complement operation is supported through - * the complement() method. - * - *

Pattern syntax

- * - * Patterns are accepted by the constructors and the - * applyPattern() methods and returned by the - * toPattern() method. These patterns follow a syntax - * similar to that employed by version 8 regular expression character - * classes: - * - *
- * pattern := ('[' '^'? item* ']') | ('[:' '^'? category ':]')
- * item := char | (char '-' char) | pattern-expr
- * pattern-expr := pattern | pattern-expr pattern | pattern-expr op pattern
- * op := '&' | '-'
- * special := '[' | ']' | '-'
- * char := any character that is not special | - * ('\' any character) | - * ('\\u' hex hex hex hex)
- * hex := any hex digit, as defined by Character.digit(c, 16) - *
- * - *
Legend: - * - * - *
a:=b - * a may be replaced by - * b - *
a? - * zero or one instance of a
- *
a* - * one or more instances of a
- *
a|b - * either a or b
- *
'a' - * the literal string between the quotes - *
- *
- * - * Patterns specify individual characters, ranges of characters, and - * Unicode character categories. When elements are concatenated, they - * specify their union. To complement a set, place a '^' immediately - * after the opening '[' or '[:'. In any other location, '^' has no - * special meaning. - * - *

Ranges are indicated by placing two a '-' between two - * characters, as in "a-z". This specifies the range of all - * characters from the left to the right, in Unicode order. If the - * left and right characters are the same, then the range consists of - * just that character. If the left character is greater than the - * right character it is a syntax error. If a '-' occurs as the first - * character after the opening '[' or '[^', or if it occurs as the - * last character before the closing ']', then it is taken as a - * literal. Thus "[a\-b]", "[-ab]", and "[ab-]" all indicate the same - * set of three characters, 'a', 'b', and '-'. - * - *

Sets may be intersected using the '&' operator or the asymmetric - * set difference may be taken using the '-' operator, for example, - * "[[:L:]&[\u0000-\u0FFF]]" indicates the set of all Unicode letters - * with values less than 4096. Operators ('&' and '|') have equal - * precedence and bind left-to-right. Thus - * "[[:L:]-[a-z]-[\u0100-\u01FF]]" is equivalent to - * "[[[:L:]-[a-z]]-[\u0100-\u01FF]]". This only really matters for - * difference; intersection is commutative. - * - * - *
[a]The set containing 'a' - *
[a-z]The set containing 'a' - * through 'z' and all letters in between, in Unicode order - *
[^a-z]The set containing - * all characters but 'a' through 'z', - * that is, U+0000 through 'a'-1 and 'z'+1 through U+FFFF - *
[[pat1][pat2]] - * The union of sets specified by pat1 and pat2 - *
[[pat1]&[pat2]] - * The intersection of sets specified by pat1 and pat2 - *
[[pat1]-[pat2]] - * The asymmetric difference of sets specified by pat1 and - * pat2 - *
[:Lu:] - * The set of characters belonging to the given - * Unicode category, as defined by Character.getType(); in - * this case, Unicode uppercase letters - *
[:L:] - * The set of characters belonging to all Unicode categories - * starting wih 'L', that is, [[:Lu:][:Ll:][:Lt:][:Lm:][:Lo:]]. - *
- * - *

Character categories. - * - * Character categories are specified using the POSIX-like syntax - * '[:Lu:]'. The complement of a category is specified by inserting - * '^' after the opening '[:'. The following category names are - * recognized. Actual determination of category data uses - * Character.getType(), so it reflects the underlying - * implmementation used by Character. As of Java 2 and - * JDK 1.1.8, this is Unicode 2.x.x - fill in version here. - * - *

- * Normative
- *     Mn = Mark, Non-Spacing
- *     Mc = Mark, Spacing Combining
- *     Me = Mark, Enclosing
- * 
- *     Nd = Number, Decimal Digit
- *     Nl = Number, Letter
- *     No = Number, Other
- * 
- *     Zs = Separator, Space
- *     Zl = Separator, Line
- *     Zp = Separator, Paragraph
- * 
- *     Cc = Other, Control
- *     Cf = Other, Format
- *     Cs = Other, Surrogate
- *     Co = Other, Private Use
- *     Cn = Other, Not Assigned
- * 
- * Informative
- *     Lu = Letter, Uppercase
- *     Ll = Letter, Lowercase
- *     Lt = Letter, Titlecase
- *     Lm = Letter, Modifier
- *     Lo = Letter, Other
- * 
- *     Pc = Punctuation, Connector
- *     Pd = Punctuation, Dash
- *     Ps = Punctuation, Open
- *     Pe = Punctuation, Close
- *     Pi = Punctuation, Initial quote
- *     Pf = Punctuation, Final quote
- *     Po = Punctuation, Other
- * 
- *     Sm = Symbol, Math
- *     Sc = Symbol, Currency
- *     Sk = Symbol, Modifier
- *     So = Symbol, Other
- * 
- * *Unsupported by Java (and hence unsupported by UnicodeSet). - * - * @author Alan Liu - * @version $RCSfile: uniset.cpp,v $ $Revision: 1.1 $ $Date: 1999/10/20 22:06:52 $ - */ - -// Note: This mapping is different in ICU and Java +// N.B.: This mapping is different in ICU and Java const UnicodeString UnicodeSet::CATEGORY_NAMES( "CnLuLlLtLmLoMnMeMcNdNlNoZsZlZpCcCfCoCsPdPsPePcPoSmScSkSoPiPf"); /** * A cache mapping character category integers, as returned by - * Character.getType(), to pairs strings. Entries are initially - * null and are created on demand. + * Unicode::getType(), to pairs strings. Entries are initially + * zero length and are filled in on demand. */ UnicodeString* UnicodeSet::CATEGORY_PAIRS_CACHE = new UnicodeString[Unicode::GENERAL_TYPES_COUNT]; @@ -193,13 +36,13 @@ const UnicodeString& UnicodeSet::getPairs() const { } //---------------------------------------------------------------- -// Public API +// Constructors &c //---------------------------------------------------------------- /** * Constructs an empty set. */ -UnicodeSet::UnicodeSet() {} +UnicodeSet::UnicodeSet() : pairs() {} /** * Constructs a set from the given pattern, optionally ignoring @@ -214,12 +57,12 @@ UnicodeSet::UnicodeSet() {} * contains a syntax error. */ UnicodeSet::UnicodeSet(const UnicodeString& pattern, bool_t ignoreSpaces, - UErrorCode& status) { + UErrorCode& status) : pairs() { applyPattern(pattern, ignoreSpaces, status); } UnicodeSet::UnicodeSet(const UnicodeString& pattern, - UErrorCode& status) { + UErrorCode& status) : pairs() { applyPattern(pattern, status); } @@ -230,7 +73,7 @@ UnicodeSet::UnicodeSet(const UnicodeString& pattern, * @exception IllegalArgumentException if the given * category is invalid. 
*/ -UnicodeSet::UnicodeSet(int8_t category, UErrorCode& status) { +UnicodeSet::UnicodeSet(int8_t category, UErrorCode& status) : pairs() { if (U_SUCCESS(status)) { if (category < 0 || category >= Unicode::GENERAL_TYPES_COUNT) { status = U_ILLEGAL_ARGUMENT_ERROR; @@ -240,6 +83,52 @@ UnicodeSet::UnicodeSet(int8_t category, UErrorCode& status) { } } +/** + * Constructs a set that is identical to the given UnicodeSet. + */ +UnicodeSet::UnicodeSet(const UnicodeSet& o) : pairs(o.pairs) {} + +/** + * Destructs the set. + */ +UnicodeSet::~UnicodeSet() {} + +/** + * Assigns this object to be a copy of another. + */ +UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) { + pairs = o.pairs; + return *this; +} + +/** + * Compares the specified object with this set for equality. Returns + * true if the two sets + * have the same size, and every member of the specified set is + * contained in this set (or equivalently, every member of this set is + * contained in the specified set). + * + * @param o set to be compared for equality with this set. + * @return true if the specified set is equal to this set. + */ +bool_t UnicodeSet::operator==(const UnicodeSet& o) const { + return pairs == o.pairs; +} + +/** + * Returns the hash code value for this set. + * + * @return the hash code value for this set. + * @see Object#hashCode() + */ +int32_t UnicodeSet::hashCode() const { + return pairs.hashCode(); +} + +//---------------------------------------------------------------- +// Public API +//---------------------------------------------------------------- + /** * Modifies this set to represent the set specified by the given * pattern, optionally ignoring white space. 
See the class @@ -302,7 +191,7 @@ void UnicodeSet::applyPattern(const UnicodeString& pattern, UnicodeString& UnicodeSet::toPattern(UnicodeString& result) const { result.remove().append((UChar)'['); - // iterate through the ranges in the CharSet + // iterate through the ranges in the UnicodeSet for (int32_t i=0; iInteger.MAX_VALUE elements, returns - * Integer.MAX_VALUE. + * Returns the number of elements in this set (its cardinality), + * n, where 0 <= n <= 65536. * * @return the number of elements in this set (its cardinality). */ @@ -484,7 +372,7 @@ void UnicodeSet::removeAll(const UnicodeSet& c) { /** * Inverts this set. This operation modifies this set so that * its value is its complement. This is equivalent to the pseudo code: - * this = new CharSet("[\u0000-\uFFFF]").removeAll(this). + * this = new UnicodeSet("[\u0000-\uFFFF]").removeAll(this). */ void UnicodeSet::complement() { doComplement(pairs); @@ -498,30 +386,6 @@ void UnicodeSet::clear() { pairs.remove(); } -/** - * Compares the specified object with this set for equality. Returns - * true if the specified object is also a set, the two sets - * have the same size, and every member of the specified set is - * contained in this set (or equivalently, every member of this set is - * contained in the specified set). - * - * @param o Object to be compared for equality with this set. - * @return true if the specified Object is equal to this set. - */ -bool_t UnicodeSet::operator==(const UnicodeSet& o) const { - return pairs == o.pairs; -} - -/** - * Returns the hash code value for this set. - * - * @return the hash code value for this set. 
- * @see Object#hashCode() - */ -int32_t UnicodeSet::hashCode() const { - return pairs.hashCode(); -} - //---------------------------------------------------------------- // Implementation: Pattern parsing //---------------------------------------------------------------- diff --git a/icu4c/source/i18n/uniset.h b/icu4c/source/i18n/uniset.h index f3c12148855..9e403223d17 100644 --- a/icu4c/source/i18n/uniset.h +++ b/icu4c/source/i18n/uniset.h @@ -1,9 +1,19 @@ +/* +********************************************************************** +* Copyright (C) 1999 Alan Liu and others. All rights reserved. +********************************************************************** +* Date Name Description +* 10/20/99 alan Creation. +********************************************************************** +*/ + #ifndef UNICODESET_H #define UNICODESET_H #include "utypes.h" #include "unistr.h" -#include "parsepos.h" + +class ParsePosition; /** * A mutable set of Unicode characters. Objects of this class @@ -21,7 +31,7 @@ * *

The second API is the * applyPattern()/toPattern() API from the - * java.text.Format-derived classes. Unlike the + * Format-derived classes. Unlike the * methods that add characters, add categories, and control the logic * of the set, the method applyPattern() sets all * attributes of a UnicodeSet at once, based on a @@ -122,9 +132,8 @@ * '[:Lu:]'. The complement of a category is specified by inserting * '^' after the opening '[:'. The following category names are * recognized. Actual determination of category data uses - * Character.getType(), so it reflects the underlying - * implmementation used by Character. As of Java 2 and - * JDK 1.1.8, this is Unicode 2.x.x - fill in version here. + * Unicode::getType(), so it reflects the underlying + * data used by Unicode. * *

  * Normative
@@ -157,8 +166,8 @@
  *     Pd = Punctuation, Dash
  *     Ps = Punctuation, Open
  *     Pe = Punctuation, Close
- *    *Pi = Punctuation, Initial quote
- *    *Pf = Punctuation, Final quote
+ *     Pi = Punctuation, Initial quote
+ *     Pf = Punctuation, Final quote
  *     Po = Punctuation, Other
  * 
  *     Sm = Symbol, Math
@@ -166,15 +175,13 @@
  *     Sk = Symbol, Modifier
  *     So = Symbol, Other
  * 
- * *Unsupported by Java (and hence unsupported by UnicodeSet). * * @author Alan Liu - * @version $RCSfile: uniset.h,v $ $Revision: 1.1 $ $Date: 1999/10/20 22:06:52 $ */ class U_I18N_API UnicodeSet { /** - * The internal representation is a StringBuffer of even length. + * The internal representation is a UnicodeString of even length. * Each pair of characters represents a range that is included in * the set. A single character c is represented as cc. Thus, the * ranges in the set are (a,b), a and b inclusive, where a = @@ -191,8 +198,8 @@ class U_I18N_API UnicodeSet { /** * A cache mapping character category integers, as returned by - * Character.getType(), to pairs strings. Entries are initially - * null and are created on demand. + * Unicode::getType(), to pairs strings. Entries are initially + * zero length and are filled in on demand. */ static UnicodeString* CATEGORY_PAIRS_CACHE; @@ -210,7 +217,7 @@ public: const UnicodeString& getPairs() const; //---------------------------------------------------------------- - // Public API + // Constructors &c //---------------------------------------------------------------- public: @@ -254,6 +261,51 @@ public: */ UnicodeSet(int8_t category, UErrorCode& status); + /** + * Constructs a set that is identical to the given UnicodeSet. + */ + UnicodeSet(const UnicodeSet& o); + + /** + * Destructs the set. + */ + virtual ~UnicodeSet(); + + /** + * Assigns this object to be a copy of another. + */ + UnicodeSet& operator=(const UnicodeSet& o); + + /** + * Compares the specified object with this set for equality. Returns + * true if the two sets + * have the same size, and every member of the specified set is + * contained in this set (or equivalently, every member of this set is + * contained in the specified set). + * + * @param o set to be compared for equality with this set. + * @return true if the specified set is equal to this set. 
+ */ + virtual bool_t operator==(const UnicodeSet& o) const; + + /** + * Compares the specified object with this set for inequality. Returns + * true if the specified set is not equal to this set. + */ + bool_t operator!=(const UnicodeSet& o) const; + + /** + * Returns the hash code value for this set. + * + * @return the hash code value for this set. + * @see Object#hashCode() + */ + virtual int32_t hashCode() const; + + //---------------------------------------------------------------- + // Public API + //---------------------------------------------------------------- + /** * Modifies this set to represent the set specified by the given * pattern, optionally ignoring white space. See the class @@ -291,9 +343,8 @@ public: virtual UnicodeString& toPattern(UnicodeString& result) const; /** - * Returns the number of elements in this set (its cardinality). If this - * set contains more than Integer.MAX_VALUE elements, returns - * Integer.MAX_VALUE. + * Returns the number of elements in this set (its cardinality), + * n, where 0 <= n <= 65536. * * @return the number of elements in this set (its cardinality). */ @@ -419,28 +470,6 @@ public: */ virtual void clear(); - /** - * Compares the specified object with this set for equality. Returns - * true if the specified object is also a set, the two sets - * have the same size, and every member of the specified set is - * contained in this set (or equivalently, every member of this set is - * contained in the specified set). - * - * @param o Object to be compared for equality with this set. - * @return true if the specified Object is equal to this set. - */ - virtual bool_t operator==(const UnicodeSet& o) const; - - bool_t operator!=(const UnicodeSet& o) const; - - /** - * Returns the hash code value for this set. - * - * @return the hash code value for this set. 
- * @see Object#hashCode() - */ - virtual int32_t hashCode() const; - //---------------------------------------------------------------- // Implementation: Pattern parsing //----------------------------------------------------------------