From 759772e313cfdf157a942696df7925548c848ffa Mon Sep 17 00:00:00 2001
From: Alan Liu <alansliu@gmail.com>
Date: Tue, 26 Oct 1999 17:15:44 +0000
Subject: [PATCH] [ICU-34] Added copy constructor etc. X-Commit-URL:
 https://ssl.icu-project.org/trac/changeset/121

---
 icu4c/source/i18n/uniset.cpp | 272 +++++++++--------------------------
 icu4c/source/i18n/uniset.h   | 105 +++++++++-----
 2 files changed, 135 insertions(+), 242 deletions(-)
diff --git a/icu4c/source/i18n/uniset.cpp b/icu4c/source/i18n/uniset.cpp
index abd0213f6c9..04314dd5bd3 100644
--- a/icu4c/source/i18n/uniset.cpp
+++ b/icu4c/source/i18n/uniset.cpp
@@ -1,180 +1,23 @@
+/*
+**********************************************************************
+*   Copyright (C) 1999 Alan Liu and others. All rights reserved.
+**********************************************************************
+*   Date        Name        Description
+*   10/20/99    alan        Creation.
+**********************************************************************
+*/
+
 #include "uniset.h"
+#include "parsepos.h"
 
-/**
- * A mutable set of Unicode characters.  Objects of this class
- * represent <em>character classes</em> used in regular expressions.
- * Such classes specify a subset of the set of all Unicode characters,
- * which in this implementation is the characters from U+0000 to
- * U+FFFF, ignoring surrogates.
- *
- * <p>This class supports two APIs.  The first is modeled after Java 2's
- * <code>java.util.Set</code> interface, although this class does not
- * implement that interface.  All methods of <code>Set</code> are
- * supported, with the modification that they take a character range
- * or single character instead of an <code>Object</code>, and they
- * take a <code>UnicodeSet</code> instead of a <code>Collection</code>.
- *
- * <p>The second API is the
- * <code>applyPattern()</code>/<code>toPattern()</code> API from the
- * <code>java.text.Format</code>-derived classes.  Unlike the
- * methods that add characters, add categories, and control the logic
- * of the set, the method <code>applyPattern()</code> sets all
- * attributes of a <code>UnicodeSet</code> at once, based on a
- * string pattern.
- *
- * <p>In addition, the set complement operation is supported through
- * the <code>complement()</code> method.
- *
- * <p><b>Pattern syntax</b></p>
- *
- * Patterns are accepted by the constructors and the
- * <code>applyPattern()</code> methods and returned by the
- * <code>toPattern()</code> method.  These patterns follow a syntax
- * similar to that employed by version 8 regular expression character
- * classes:
- *
- * <blockquote><code>
- * pattern := ('[' '^'? item* ']') | ('[:' '^'? category ':]')<br>
- * item := char | (char '-' char) | pattern-expr<br>
- * pattern-expr := pattern | pattern-expr pattern | pattern-expr op pattern<br>
- * op := '&' | '-'<br>
- * special := '[' | ']' | '-'<br>
- * char := <em>any character that is not</em> special |
- *        ('\' <em>any character</em>) |
- *        ('\\u' hex hex hex hex)<br>
- * hex := <em>any hex digit, as defined by </em>Character.digit(c, 16)
- * </code>
- *
- * <br>Legend:
- *
- * <table>
- * <tr><td width=20%><code>a:=b</code>
- * <td><code>a</code> may be replaced by
- * <code>b</code>
- * <tr><td><code>a?</code>
- * <td>zero or one instance of <code>a</code><br>
- * <tr><td><code>a*</code>
- * <td>one or more instances of <code>a</code><br>
- * <tr><td><code>a|b</code>
- * <td>either <code>a</code> or <code>b</code><br>
- * <tr><td><code>'a'</code>
- * <td>the literal string between the quotes
- * </table>
- * </blockquote>
- *
- * Patterns specify individual characters, ranges of characters, and
- * Unicode character categories.  When elements are concatenated, they
- * specify their union.  To complement a set, place a '^' immediately
- * after the opening '[' or '[:'.  In any other location, '^' has no
- * special meaning.
- *
- * <p>Ranges are indicated by placing two a '-' between two
- * characters, as in "a-z".  This specifies the range of all
- * characters from the left to the right, in Unicode order.  If the
- * left and right characters are the same, then the range consists of
- * just that character.  If the left character is greater than the
- * right character it is a syntax error.  If a '-' occurs as the first
- * character after the opening '[' or '[^', or if it occurs as the
- * last character before the closing ']', then it is taken as a
- * literal.  Thus "[a\-b]", "[-ab]", and "[ab-]" all indicate the same
- * set of three characters, 'a', 'b', and '-'.
- *
- * <p>Sets may be intersected using the '&' operator or the asymmetric
- * set difference may be taken using the '-' operator, for example,
- * "[[:L:]&[\u0000-\u0FFF]]" indicates the set of all Unicode letters
- * with values less than 4096.  Operators ('&' and '|') have equal
- * precedence and bind left-to-right.  Thus
- * "[[:L:]-[a-z]-[\u0100-\u01FF]]" is equivalent to
- * "[[[:L:]-[a-z]]-[\u0100-\u01FF]]".  This only really matters for
- * difference; intersection is commutative.
- *
- * <table>
- * <tr valign=top><td nowrap><code>[a]</code><td>The set containing 'a'
- * <tr valign=top><td nowrap><code>[a-z]</code><td>The set containing 'a'
- * through 'z' and all letters in between, in Unicode order
- * <tr valign=top><td nowrap><code>[^a-z]</code><td>The set containing
- * all characters but 'a' through 'z',
- * that is, U+0000 through 'a'-1 and 'z'+1 through U+FFFF
- * <tr valign=top><td nowrap><code>[[<em>pat1</em>][<em>pat2</em>]]</code>
- * <td>The union of sets specified by <em>pat1</em> and <em>pat2</em>
- * <tr valign=top><td nowrap><code>[[<em>pat1</em>]&[<em>pat2</em>]]</code>
- * <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em>
- * <tr valign=top><td nowrap><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code>
- * <td>The asymmetric difference of sets specified by <em>pat1</em> and
- * <em>pat2</em>
- * <tr valign=top><td nowrap><code>[:Lu:]</code>
- * <td>The set of characters belonging to the given
- * Unicode category, as defined by <code>Character.getType()</code>; in
- * this case, Unicode uppercase letters
- * <tr valign=top><td nowrap><code>[:L:]</code>
- * <td>The set of characters belonging to all Unicode categories
- * starting wih 'L', that is, <code>[[:Lu:][:Ll:][:Lt:][:Lm:][:Lo:]]</code>.
- * </table>
- *
- * <p><b>Character categories.</b>
- *
- * Character categories are specified using the POSIX-like syntax
- * '[:Lu:]'.  The complement of a category is specified by inserting
- * '^' after the opening '[:'.  The following category names are
- * recognized.  Actual determination of category data uses
- * <code>Character.getType()</code>, so it reflects the underlying
- * implmementation used by <code>Character</code>.  As of Java 2 and
- * JDK 1.1.8, this is Unicode <b>2.x.x - fill in version here</b>.
- *
- * <pre>
- * Normative
- *     Mn = Mark, Non-Spacing
- *     Mc = Mark, Spacing Combining
- *     Me = Mark, Enclosing
- * 
- *     Nd = Number, Decimal Digit
- *     Nl = Number, Letter
- *     No = Number, Other
- * 
- *     Zs = Separator, Space
- *     Zl = Separator, Line
- *     Zp = Separator, Paragraph
- * 
- *     Cc = Other, Control
- *     Cf = Other, Format
- *     Cs = Other, Surrogate
- *     Co = Other, Private Use
- *     Cn = Other, Not Assigned
- * 
- * Informative
- *     Lu = Letter, Uppercase
- *     Ll = Letter, Lowercase
- *     Lt = Letter, Titlecase
- *     Lm = Letter, Modifier
- *     Lo = Letter, Other
- * 
- *     Pc = Punctuation, Connector
- *     Pd = Punctuation, Dash
- *     Ps = Punctuation, Open
- *     Pe = Punctuation, Close
- *     Pi = Punctuation, Initial quote
- *     Pf = Punctuation, Final quote
- *     Po = Punctuation, Other
- * 
- *     Sm = Symbol, Math
- *     Sc = Symbol, Currency
- *     Sk = Symbol, Modifier
- *     So = Symbol, Other
- * </pre>
- * *Unsupported by Java (and hence unsupported by UnicodeSet).
- *
- * @author Alan Liu
- * @version $RCSfile: uniset.cpp,v $ $Revision: 1.1 $ $Date: 1999/10/20 22:06:52 $
- */
-
-// Note: This mapping is different in ICU and Java
+// N.B.: This mapping is different in ICU and Java
 const UnicodeString UnicodeSet::CATEGORY_NAMES(
     "CnLuLlLtLmLoMnMeMcNdNlNoZsZlZpCcCfCoCsPdPsPePcPoSmScSkSoPiPf");
 
 /**
  * A cache mapping character category integers, as returned by
- * Character.getType(), to pairs strings.  Entries are initially
- * null and are created on demand.
+ * Unicode::getType(), to pairs strings.  Entries are initially
+ * zero length and are filled in on demand.
  */
 UnicodeString* UnicodeSet::CATEGORY_PAIRS_CACHE =
      new UnicodeString[Unicode::GENERAL_TYPES_COUNT];
@@ -193,13 +36,13 @@ const UnicodeString& UnicodeSet::getPairs() const {
 }
 
 //----------------------------------------------------------------
-// Public API
+// Constructors &c
 //----------------------------------------------------------------
 
 /**
  * Constructs an empty set.
  */
-UnicodeSet::UnicodeSet() {}
+UnicodeSet::UnicodeSet() : pairs() {}
 
 /**
  * Constructs a set from the given pattern, optionally ignoring
@@ -214,12 +57,12 @@ UnicodeSet::UnicodeSet() {}
  * contains a syntax error.
  */
 UnicodeSet::UnicodeSet(const UnicodeString& pattern, bool_t ignoreSpaces,
-                       UErrorCode& status) {
+                       UErrorCode& status) : pairs() {
     applyPattern(pattern, ignoreSpaces, status);
 }
 
 UnicodeSet::UnicodeSet(const UnicodeString& pattern,
-                       UErrorCode& status) {
+                       UErrorCode& status) : pairs() {
     applyPattern(pattern, status);
 }
 
@@ -230,7 +73,7 @@ UnicodeSet::UnicodeSet(const UnicodeString& pattern,
  * @exception <code>IllegalArgumentException</code> if the given
  * category is invalid.
  */
-UnicodeSet::UnicodeSet(int8_t category, UErrorCode& status) {
+UnicodeSet::UnicodeSet(int8_t category, UErrorCode& status) : pairs() {
     if (U_SUCCESS(status)) {
         if (category < 0 || category >= Unicode::GENERAL_TYPES_COUNT) {
             status = U_ILLEGAL_ARGUMENT_ERROR;
@@ -240,6 +83,52 @@ UnicodeSet::UnicodeSet(int8_t category, UErrorCode& status) {
     }
 }
 
+/**
+ * Constructs a set that is identical to the given UnicodeSet.
+ */
+UnicodeSet::UnicodeSet(const UnicodeSet& o) : pairs(o.pairs) {}
+
+/**
+ * Destructs the set.
+ */
+UnicodeSet::~UnicodeSet() {}
+
+/**
+ * Assigns this object to be a copy of another.
+ */
+UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
+    pairs = o.pairs;
+    return *this;
+}
+
+/**
+ * Compares the specified object with this set for equality.  Returns
+ * <tt>true</tt> if the two sets
+ * have the same size, and every member of the specified set is
+ * contained in this set (or equivalently, every member of this set is
+ * contained in the specified set).
+ *
+ * @param o set to be compared for equality with this set.
+ * @return <tt>true</tt> if the specified set is equal to this set.
+ */
+bool_t UnicodeSet::operator==(const UnicodeSet& o) const {
+    return pairs == o.pairs;
+}
+
+/**
+ * Returns the hash code value for this set.
+ *
+ * @return the hash code value for this set.
+ * @see Object#hashCode()
+ */
+int32_t UnicodeSet::hashCode() const {
+    return pairs.hashCode();
+}
+
+//----------------------------------------------------------------
+// Public API
+//----------------------------------------------------------------
+
 /**
  * Modifies this set to represent the set specified by the given
  * pattern, optionally ignoring white space.  See the class
@@ -302,7 +191,7 @@ void UnicodeSet::applyPattern(const UnicodeString& pattern,
 UnicodeString& UnicodeSet::toPattern(UnicodeString& result) const {
     result.remove().append((UChar)'[');
     
-    // iterate through the ranges in the CharSet
+    // iterate through the ranges in the UnicodeSet
     for (int32_t i=0; i<pairs.length(); i+=2) {
         // for a range with the same beginning and ending point,
         // output that character, otherwise, output the start and
@@ -317,9 +206,8 @@ UnicodeString& UnicodeSet::toPattern(UnicodeString& result) const {
 }
 
 /**
- * Returns the number of elements in this set (its cardinality).  If this
- * set contains more than <tt>Integer.MAX_VALUE</tt> elements, returns
- * <tt>Integer.MAX_VALUE</tt>.
+ * Returns the number of elements in this set (its cardinality),
+ * <em>n</em>, where <code>0 <= </code><em>n</em><code> <= 65536</code>.
  *
  * @return the number of elements in this set (its cardinality).
  */
@@ -484,7 +372,7 @@ void UnicodeSet::removeAll(const UnicodeSet& c) {
 /**
  * Inverts this set.  This operation modifies this set so that
  * its value is its complement.  This is equivalent to the pseudo code:
- * <code>this = new CharSet("[\u0000-\uFFFF]").removeAll(this)</code>.
+ * <code>this = new UnicodeSet("[\u0000-\uFFFF]").removeAll(this)</code>.
  */
 void UnicodeSet::complement() {
     doComplement(pairs);
@@ -498,30 +386,6 @@ void UnicodeSet::clear() {
     pairs.remove();
 }
 
-/**
- * Compares the specified object with this set for equality.  Returns
- * <tt>true</tt> if the specified object is also a set, the two sets
- * have the same size, and every member of the specified set is
- * contained in this set (or equivalently, every member of this set is
- * contained in the specified set).
- *
- * @param o Object to be compared for equality with this set.
- * @return <tt>true</tt> if the specified Object is equal to this set.
- */
-bool_t UnicodeSet::operator==(const UnicodeSet& o) const {
-    return pairs == o.pairs;
-}
-
-/**
- * Returns the hash code value for this set.
- *
- * @return the hash code value for this set.
- * @see Object#hashCode()
- */
-int32_t UnicodeSet::hashCode() const {
-    return pairs.hashCode();
-}
-
 //----------------------------------------------------------------
 // Implementation: Pattern parsing
 //----------------------------------------------------------------
diff --git a/icu4c/source/i18n/uniset.h b/icu4c/source/i18n/uniset.h
index f3c12148855..9e403223d17 100644
--- a/icu4c/source/i18n/uniset.h
+++ b/icu4c/source/i18n/uniset.h
@@ -1,9 +1,19 @@
+/*
+**********************************************************************
+*   Copyright (C) 1999 Alan Liu and others. All rights reserved.
+**********************************************************************
+*   Date        Name        Description
+*   10/20/99    alan        Creation.
+**********************************************************************
+*/
+
 #ifndef UNICODESET_H
 #define UNICODESET_H
 
 #include "utypes.h"
 #include "unistr.h"
-#include "parsepos.h"
+
+class ParsePosition;
 
 /**
  * A mutable set of Unicode characters.  Objects of this class
@@ -21,7 +31,7 @@
  *
  * <p>The second API is the
  * <code>applyPattern()</code>/<code>toPattern()</code> API from the
- * <code>java.text.Format</code>-derived classes.  Unlike the
+ * <code>Format</code>-derived classes.  Unlike the
  * methods that add characters, add categories, and control the logic
  * of the set, the method <code>applyPattern()</code> sets all
  * attributes of a <code>UnicodeSet</code> at once, based on a
@@ -122,9 +132,8 @@
  * '[:Lu:]'.  The complement of a category is specified by inserting
  * '^' after the opening '[:'.  The following category names are
  * recognized.  Actual determination of category data uses
- * <code>Character.getType()</code>, so it reflects the underlying
- * implmementation used by <code>Character</code>.  As of Java 2 and
- * JDK 1.1.8, this is Unicode <b>2.x.x - fill in version here</b>.
+ * <code>Unicode::getType()</code>, so it reflects the underlying
+ * data used by <code>Unicode</code>.
  *
  * <pre>
  * Normative
@@ -157,8 +166,8 @@
  *     Pd = Punctuation, Dash
  *     Ps = Punctuation, Open
  *     Pe = Punctuation, Close
- *    *Pi = Punctuation, Initial quote
- *    *Pf = Punctuation, Final quote
+ *     Pi = Punctuation, Initial quote
+ *     Pf = Punctuation, Final quote
  *     Po = Punctuation, Other
  * 
  *     Sm = Symbol, Math
@@ -166,15 +175,13 @@
  *     Sk = Symbol, Modifier
  *     So = Symbol, Other
  * </pre>
- * *Unsupported by Java (and hence unsupported by UnicodeSet).
  *
  * @author Alan Liu
- * @version $RCSfile: uniset.h,v $ $Revision: 1.1 $ $Date: 1999/10/20 22:06:52 $
  */
 class U_I18N_API UnicodeSet {
 
     /**
-     * The internal representation is a StringBuffer of even length.
+     * The internal representation is a UnicodeString of even length.
      * Each pair of characters represents a range that is included in
      * the set.  A single character c is represented as cc.  Thus, the
      * ranges in the set are (a,b), a and b inclusive, where a =
@@ -191,8 +198,8 @@ class U_I18N_API UnicodeSet {
 
     /**
      * A cache mapping character category integers, as returned by
-     * Character.getType(), to pairs strings.  Entries are initially
-     * null and are created on demand.
+     * Unicode::getType(), to pairs strings.  Entries are initially
+     * zero length and are filled in on demand.
      */
     static UnicodeString* CATEGORY_PAIRS_CACHE;
 
@@ -210,7 +217,7 @@ public:
     const UnicodeString& getPairs() const;
 
     //----------------------------------------------------------------
-    // Public API
+    // Constructors &c
     //----------------------------------------------------------------
 
 public:
@@ -254,6 +261,51 @@ public:
      */
     UnicodeSet(int8_t category, UErrorCode& status);
 
+    /**
+     * Constructs a set that is identical to the given UnicodeSet.
+     */
+    UnicodeSet(const UnicodeSet& o);
+
+    /**
+     * Destructs the set.
+     */
+    virtual ~UnicodeSet();
+
+    /**
+     * Assigns this object to be a copy of another.
+     */
+    UnicodeSet& operator=(const UnicodeSet& o);
+
+    /**
+     * Compares the specified object with this set for equality.  Returns
+     * <tt>true</tt> if the two sets
+     * have the same size, and every member of the specified set is
+     * contained in this set (or equivalently, every member of this set is
+     * contained in the specified set).
+     *
+     * @param o set to be compared for equality with this set.
+     * @return <tt>true</tt> if the specified set is equal to this set.
+     */
+    virtual bool_t operator==(const UnicodeSet& o) const;
+
+    /**
+     * Compares the specified object with this set for inequality.  Returns
+     * <tt>true</tt> if the specified set is not equal to this set.
+     */
+    bool_t operator!=(const UnicodeSet& o) const;
+
+    /**
+     * Returns the hash code value for this set.
+     *
+     * @return the hash code value for this set.
+     * @see Object#hashCode()
+     */
+    virtual int32_t hashCode() const;
+
+    //----------------------------------------------------------------
+    // Public API
+    //----------------------------------------------------------------
+
     /**
      * Modifies this set to represent the set specified by the given
      * pattern, optionally ignoring white space.  See the class
@@ -291,9 +343,8 @@ public:
     virtual UnicodeString& toPattern(UnicodeString& result) const;
 
     /**
-     * Returns the number of elements in this set (its cardinality).  If this
-     * set contains more than <tt>Integer.MAX_VALUE</tt> elements, returns
-     * <tt>Integer.MAX_VALUE</tt>.
+     * Returns the number of elements in this set (its cardinality),
+     * <em>n</em>, where <code>0 <= </code><em>n</em><code> <= 65536</code>.
      *
      * @return the number of elements in this set (its cardinality).
      */
@@ -419,28 +470,6 @@ public:
      */
     virtual void clear();
 
-    /**
-     * Compares the specified object with this set for equality.  Returns
-     * <tt>true</tt> if the specified object is also a set, the two sets
-     * have the same size, and every member of the specified set is
-     * contained in this set (or equivalently, every member of this set is
-     * contained in the specified set).
-     *
-     * @param o Object to be compared for equality with this set.
-     * @return <tt>true</tt> if the specified Object is equal to this set.
-     */
-    virtual bool_t operator==(const UnicodeSet& o) const;
-
-    bool_t operator!=(const UnicodeSet& o) const;
-
-    /**
-     * Returns the hash code value for this set.
-     *
-     * @return the hash code value for this set.
-     * @see Object#hashCode()
-     */
-    virtual int32_t hashCode() const;
-
     //----------------------------------------------------------------
     // Implementation: Pattern parsing
     //----------------------------------------------------------------

`a:=b`	`a` may be replaced by `b`
`a?`	zero or one instance of `a`
`a*`	one or more instances of `a`
`a|b`	either `a` or `b`
`'a'`	the literal string between the quotes
`[a]`	The set containing 'a'
`[a-z]`	The set containing 'a' through 'z' and all letters in between, in Unicode order
`[^a-z]`	The set containing all characters but 'a' through 'z', that is, U+0000 through 'a'-1 and 'z'+1 through U+FFFF
`[[pat1][pat2]]`	The union of sets specified by pat1 and pat2
`[[pat1]&[pat2]]`	The intersection of sets specified by pat1 and pat2
`[[pat1]-[pat2]]`	The asymmetric difference of sets specified by pat1 and pat2
`[:Lu:]`	The set of characters belonging to the given Unicode category, as defined by `Character.getType()`; in this case, Unicode uppercase letters
`[:L:]`	The set of characters belonging to all Unicode categories starting with 'L', that is, `[[:Lu:][:Ll:][:Lt:][:Lm:][:Lo:]]`.