Added copy constructor etc.

X-Trac-URL: https://ssl.icu-project.org/trac/ticket/34



git-svn-id: http://source.icu-project.org/repos/icu/icu/trunk@121 0a8b766c-62c9-45f9-954d-7f2943c9ead0
This commit is contained in:
Alan Liu 1999-10-26 17:15:44 +00:00
parent 371542c71e
commit 6849df4a1b
2 changed files with 135 additions and 242 deletions

View file

@ -1,180 +1,23 @@
/*
**********************************************************************
* Copyright (C) 1999 Alan Liu and others. All rights reserved.
**********************************************************************
* Date Name Description
* 10/20/99 alan Creation.
**********************************************************************
*/
#include "uniset.h"
#include "parsepos.h"
/**
* A mutable set of Unicode characters. Objects of this class
* represent <em>character classes</em> used in regular expressions.
* Such classes specify a subset of the set of all Unicode characters,
* which in this implementation is the characters from U+0000 to
* U+FFFF, ignoring surrogates.
*
* <p>This class supports two APIs. The first is modeled after Java 2's
* <code>java.util.Set</code> interface, although this class does not
* implement that interface. All methods of <code>Set</code> are
* supported, with the modification that they take a character range
* or single character instead of an <code>Object</code>, and they
* take a <code>UnicodeSet</code> instead of a <code>Collection</code>.
*
* <p>The second API is the
* <code>applyPattern()</code>/<code>toPattern()</code> API from the
* <code>java.text.Format</code>-derived classes. Unlike the
* methods that add characters, add categories, and control the logic
* of the set, the method <code>applyPattern()</code> sets all
* attributes of a <code>UnicodeSet</code> at once, based on a
* string pattern.
*
* <p>In addition, the set complement operation is supported through
* the <code>complement()</code> method.
*
* <p><b>Pattern syntax</b></p>
*
* Patterns are accepted by the constructors and the
* <code>applyPattern()</code> methods and returned by the
* <code>toPattern()</code> method. These patterns follow a syntax
* similar to that employed by version 8 regular expression character
* classes:
*
* <blockquote><code>
* pattern := ('[' '^'? item* ']') | ('[:' '^'? category ':]')<br>
* item := char | (char '-' char) | pattern-expr<br>
* pattern-expr := pattern | pattern-expr pattern | pattern-expr op pattern<br>
* op := '&' | '-'<br>
* special := '[' | ']' | '-'<br>
* char := <em>any character that is not</em> special |
* ('\' <em>any character</em>) |
* ('\\u' hex hex hex hex)<br>
* hex := <em>any hex digit, as defined by </em>Character.digit(c, 16)
* </code>
*
* <br>Legend:
*
* <table>
* <tr><td width=20%><code>a:=b</code>
* <td><code>a</code> may be replaced by
* <code>b</code>
* <tr><td><code>a?</code>
* <td>zero or one instance of <code>a</code><br>
* <tr><td><code>a*</code>
* <td>zero or more instances of <code>a</code><br>
* <tr><td><code>a|b</code>
* <td>either <code>a</code> or <code>b</code><br>
* <tr><td><code>'a'</code>
* <td>the literal string between the quotes
* </table>
* </blockquote>
*
* Patterns specify individual characters, ranges of characters, and
* Unicode character categories. When elements are concatenated, they
* specify their union. To complement a set, place a '^' immediately
* after the opening '[' or '[:'. In any other location, '^' has no
* special meaning.
*
* <p>Ranges are indicated by placing a '-' between two
* characters, as in "a-z". This specifies the range of all
* characters from the left to the right, in Unicode order. If the
* left and right characters are the same, then the range consists of
* just that character. If the left character is greater than the
* right character it is a syntax error. If a '-' occurs as the first
* character after the opening '[' or '[^', or if it occurs as the
* last character before the closing ']', then it is taken as a
* literal. Thus "[a\-b]", "[-ab]", and "[ab-]" all indicate the same
* set of three characters, 'a', 'b', and '-'.
*
* <p>Sets may be intersected using the '&' operator or the asymmetric
* set difference may be taken using the '-' operator, for example,
* "[[:L:]&[\u0000-\u0FFF]]" indicates the set of all Unicode letters
* with values less than 4096. Operators ('&' and '-') have equal
* precedence and bind left-to-right. Thus
* "[[:L:]-[a-z]-[\u0100-\u01FF]]" is equivalent to
* "[[[:L:]-[a-z]]-[\u0100-\u01FF]]". This only really matters for
* difference; intersection is commutative.
*
* <table>
* <tr valign=top><td nowrap><code>[a]</code><td>The set containing 'a'
* <tr valign=top><td nowrap><code>[a-z]</code><td>The set containing 'a'
* through 'z' and all letters in between, in Unicode order
* <tr valign=top><td nowrap><code>[^a-z]</code><td>The set containing
* all characters but 'a' through 'z',
* that is, U+0000 through 'a'-1 and 'z'+1 through U+FFFF
* <tr valign=top><td nowrap><code>[[<em>pat1</em>][<em>pat2</em>]]</code>
* <td>The union of sets specified by <em>pat1</em> and <em>pat2</em>
* <tr valign=top><td nowrap><code>[[<em>pat1</em>]&[<em>pat2</em>]]</code>
* <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em>
* <tr valign=top><td nowrap><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code>
* <td>The asymmetric difference of sets specified by <em>pat1</em> and
* <em>pat2</em>
* <tr valign=top><td nowrap><code>[:Lu:]</code>
* <td>The set of characters belonging to the given
* Unicode category, as defined by <code>Character.getType()</code>; in
* this case, Unicode uppercase letters
* <tr valign=top><td nowrap><code>[:L:]</code>
* <td>The set of characters belonging to all Unicode categories
* starting with 'L', that is, <code>[[:Lu:][:Ll:][:Lt:][:Lm:][:Lo:]]</code>.
* </table>
*
* <p><b>Character categories.</b>
*
* Character categories are specified using the POSIX-like syntax
* '[:Lu:]'. The complement of a category is specified by inserting
* '^' after the opening '[:'. The following category names are
* recognized. Actual determination of category data uses
* <code>Character.getType()</code>, so it reflects the underlying
* implementation used by <code>Character</code>. As of Java 2 and
* JDK 1.1.8, this is Unicode <b>2.x.x - fill in version here</b>.
*
* <pre>
* Normative
* Mn = Mark, Non-Spacing
* Mc = Mark, Spacing Combining
* Me = Mark, Enclosing
*
* Nd = Number, Decimal Digit
* Nl = Number, Letter
* No = Number, Other
*
* Zs = Separator, Space
* Zl = Separator, Line
* Zp = Separator, Paragraph
*
* Cc = Other, Control
* Cf = Other, Format
* Cs = Other, Surrogate
* Co = Other, Private Use
* Cn = Other, Not Assigned
*
* Informative
* Lu = Letter, Uppercase
* Ll = Letter, Lowercase
* Lt = Letter, Titlecase
* Lm = Letter, Modifier
* Lo = Letter, Other
*
* Pc = Punctuation, Connector
* Pd = Punctuation, Dash
* Ps = Punctuation, Open
* Pe = Punctuation, Close
* Pi = Punctuation, Initial quote
* Pf = Punctuation, Final quote
* Po = Punctuation, Other
*
* Sm = Symbol, Math
* Sc = Symbol, Currency
* Sk = Symbol, Modifier
* So = Symbol, Other
* </pre>
* *Unsupported by Java (and hence unsupported by UnicodeSet).
*
* @author Alan Liu
* @version $RCSfile: uniset.cpp,v $ $Revision: 1.1 $ $Date: 1999/10/20 22:06:52 $
*/
// Note: This mapping is different in ICU and Java
// N.B.: This mapping is different in ICU and Java
// Concatenation of the two-letter Unicode general category abbreviations
// ("Cn", "Lu", "Ll", ...), one pair of letters per category.
// NOTE(review): presumably ordered to match the category values returned by
// Unicode::getType(), so that a category can be looked up at offset
// 2 * category -- confirm against the code that reads this string.
const UnicodeString UnicodeSet::CATEGORY_NAMES(
"CnLuLlLtLmLoMnMeMcNdNlNoZsZlZpCcCfCoCsPdPsPePcPoSmScSkSoPiPf");
/**
* A cache mapping character category integers, as returned by
* Character.getType(), to pairs strings. Entries are initially
* null and are created on demand.
* Unicode::getType(), to pairs strings. Entries are initially
* zero length and are filled in on demand.
*/
// Array with one UnicodeString slot per general category
// (Unicode::GENERAL_TYPES_COUNT entries), allocated with new[] when this
// static is initialized.
// NOTE(review): no matching delete[] is visible in this excerpt -- confirm
// whether the array is intentionally kept for the life of the process.
UnicodeString* UnicodeSet::CATEGORY_PAIRS_CACHE =
new UnicodeString[Unicode::GENERAL_TYPES_COUNT];
@ -193,13 +36,13 @@ const UnicodeString& UnicodeSet::getPairs() const {
}
//----------------------------------------------------------------
// Public API
// Constructors &c
//----------------------------------------------------------------
/**
* Constructs an empty set.
*/
UnicodeSet::UnicodeSet() {}
UnicodeSet::UnicodeSet() : pairs() {}
/**
* Constructs a set from the given pattern, optionally ignoring
@ -214,12 +57,12 @@ UnicodeSet::UnicodeSet() {}
* contains a syntax error.
*/
UnicodeSet::UnicodeSet(const UnicodeString& pattern, bool_t ignoreSpaces,
UErrorCode& status) {
UErrorCode& status) : pairs() {
applyPattern(pattern, ignoreSpaces, status);
}
UnicodeSet::UnicodeSet(const UnicodeString& pattern,
UErrorCode& status) {
UErrorCode& status) : pairs() {
applyPattern(pattern, status);
}
@ -230,7 +73,7 @@ UnicodeSet::UnicodeSet(const UnicodeString& pattern,
* @exception <code>IllegalArgumentException</code> if the given
* category is invalid.
*/
UnicodeSet::UnicodeSet(int8_t category, UErrorCode& status) {
UnicodeSet::UnicodeSet(int8_t category, UErrorCode& status) : pairs() {
if (U_SUCCESS(status)) {
if (category < 0 || category >= Unicode::GENERAL_TYPES_COUNT) {
status = U_ILLEGAL_ARGUMENT_ERROR;
@ -240,6 +83,52 @@ UnicodeSet::UnicodeSet(int8_t category, UErrorCode& status) {
}
}
/**
* Constructs a set that is identical to the given UnicodeSet.
* The new set's pairs string is copy-constructed from the pairs
* string of <code>o</code>.
*
* @param o the set to copy.
*/
UnicodeSet::UnicodeSet(const UnicodeSet& o) : pairs(o.pairs) {}
/**
* Destructs the set. The body is empty; no explicit cleanup is
* performed here.
*/
UnicodeSet::~UnicodeSet() {}
/**
* Makes this set a copy of another set by assigning the other set's
* pairs string to this set's pairs string.
*
* @param o the set whose contents are copied into this set.
* @return a reference to this set.
*/
UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
    UnicodeSet& self = *this;
    self.pairs = o.pairs;
    return self;
}
/**
* Tests this set against another set for equality. Two sets are
* equal when they have the same size and every member of one is a
* member of the other. The test is carried out by comparing the
* pairs strings of the two sets.
*
* @param o set to be compared for equality with this set.
* @return <tt>true</tt> if the specified set is equal to this set.
*/
bool_t UnicodeSet::operator==(const UnicodeSet& o) const {
    const bool_t samePairs = (pairs == o.pairs);
    return samePairs;
}
/**
* Computes the hash code for this set by delegating to the
* <code>hashCode()</code> of its pairs string.
*
* @return the hash code value for this set.
*/
int32_t UnicodeSet::hashCode() const {
    const int32_t pairsHash = pairs.hashCode();
    return pairsHash;
}
//----------------------------------------------------------------
// Public API
//----------------------------------------------------------------
/**
* Modifies this set to represent the set specified by the given
* pattern, optionally ignoring white space. See the class
@ -302,7 +191,7 @@ void UnicodeSet::applyPattern(const UnicodeString& pattern,
UnicodeString& UnicodeSet::toPattern(UnicodeString& result) const {
result.remove().append((UChar)'[');
// iterate through the ranges in the CharSet
// iterate through the ranges in the UnicodeSet
for (int32_t i=0; i<pairs.length(); i+=2) {
// for a range with the same beginning and ending point,
// output that character, otherwise, output the start and
@ -317,9 +206,8 @@ UnicodeString& UnicodeSet::toPattern(UnicodeString& result) const {
}
/**
* Returns the number of elements in this set (its cardinality). If this
* set contains more than <tt>Integer.MAX_VALUE</tt> elements, returns
* <tt>Integer.MAX_VALUE</tt>.
* Returns the number of elements in this set (its cardinality),
* <em>n</em>, where <code>0 <= </code><em>n</em><code> <= 65536</code>.
*
* @return the number of elements in this set (its cardinality).
*/
@ -484,7 +372,7 @@ void UnicodeSet::removeAll(const UnicodeSet& c) {
/**
* Inverts this set. This operation modifies this set so that
* its value is its complement. This is equivalent to the pseudo code:
* <code>this = new CharSet("[\u0000-\uFFFF]").removeAll(this)</code>.
* <code>this = new UnicodeSet("[\u0000-\uFFFF]").removeAll(this)</code>.
*/
void UnicodeSet::complement() {
doComplement(pairs);
@ -498,30 +386,6 @@ void UnicodeSet::clear() {
pairs.remove();
}
/**
* Tests this set against another set for equality. Two sets are
* equal when they have the same size and every member of one is a
* member of the other. The test is carried out by comparing the
* pairs strings of the two sets.
*
* @param o set to be compared for equality with this set.
* @return <tt>true</tt> if the specified set is equal to this set.
*/
bool_t UnicodeSet::operator==(const UnicodeSet& o) const {
    const bool_t samePairs = (pairs == o.pairs);
    return samePairs;
}
/**
* Computes the hash code for this set by delegating to the
* <code>hashCode()</code> of its pairs string.
*
* @return the hash code value for this set.
*/
int32_t UnicodeSet::hashCode() const {
    const int32_t pairsHash = pairs.hashCode();
    return pairsHash;
}
//----------------------------------------------------------------
// Implementation: Pattern parsing
//----------------------------------------------------------------

View file

@ -1,9 +1,19 @@
/*
**********************************************************************
* Copyright (C) 1999 Alan Liu and others. All rights reserved.
**********************************************************************
* Date Name Description
* 10/20/99 alan Creation.
**********************************************************************
*/
#ifndef UNICODESET_H
#define UNICODESET_H
#include "utypes.h"
#include "unistr.h"
#include "parsepos.h"
class ParsePosition;
/**
* A mutable set of Unicode characters. Objects of this class
@ -21,7 +31,7 @@
*
* <p>The second API is the
* <code>applyPattern()</code>/<code>toPattern()</code> API from the
* <code>java.text.Format</code>-derived classes. Unlike the
* <code>Format</code>-derived classes. Unlike the
* methods that add characters, add categories, and control the logic
* of the set, the method <code>applyPattern()</code> sets all
* attributes of a <code>UnicodeSet</code> at once, based on a
@ -122,9 +132,8 @@
* '[:Lu:]'. The complement of a category is specified by inserting
* '^' after the opening '[:'. The following category names are
* recognized. Actual determination of category data uses
* <code>Character.getType()</code>, so it reflects the underlying
* implementation used by <code>Character</code>. As of Java 2 and
* JDK 1.1.8, this is Unicode <b>2.x.x - fill in version here</b>.
* <code>Unicode::getType()</code>, so it reflects the underlying
* data used by <code>Unicode</code>.
*
* <pre>
* Normative
@ -157,8 +166,8 @@
* Pd = Punctuation, Dash
* Ps = Punctuation, Open
* Pe = Punctuation, Close
* *Pi = Punctuation, Initial quote
* *Pf = Punctuation, Final quote
* Pi = Punctuation, Initial quote
* Pf = Punctuation, Final quote
* Po = Punctuation, Other
*
* Sm = Symbol, Math
@ -166,15 +175,13 @@
* Sk = Symbol, Modifier
* So = Symbol, Other
* </pre>
* *Unsupported by Java (and hence unsupported by UnicodeSet).
*
* @author Alan Liu
* @version $RCSfile: uniset.h,v $ $Revision: 1.1 $ $Date: 1999/10/20 22:06:52 $
*/
class U_I18N_API UnicodeSet {
/**
* The internal representation is a StringBuffer of even length.
* The internal representation is a UnicodeString of even length.
* Each pair of characters represents a range that is included in
* the set. A single character c is represented as cc. Thus, the
* ranges in the set are (a,b), a and b inclusive, where a =
@ -191,8 +198,8 @@ class U_I18N_API UnicodeSet {
/**
* A cache mapping character category integers, as returned by
* Character.getType(), to pairs strings. Entries are initially
* null and are created on demand.
* Unicode::getType(), to pairs strings. Entries are initially
* zero length and are filled in on demand.
*/
static UnicodeString* CATEGORY_PAIRS_CACHE;
@ -210,7 +217,7 @@ public:
const UnicodeString& getPairs() const;
//----------------------------------------------------------------
// Public API
// Constructors &c
//----------------------------------------------------------------
public:
@ -254,6 +261,51 @@ public:
*/
UnicodeSet(int8_t category, UErrorCode& status);
/**
* Constructs a set that is identical to the given UnicodeSet.
*/
UnicodeSet(const UnicodeSet& o);
/**
* Destructs the set.
*/
virtual ~UnicodeSet();
/**
* Assigns this object to be a copy of another.
*/
UnicodeSet& operator=(const UnicodeSet& o);
/**
* Compares the specified object with this set for equality. Returns
* <tt>true</tt> if the two sets
* have the same size, and every member of the specified set is
* contained in this set (or equivalently, every member of this set is
* contained in the specified set).
*
* @param o set to be compared for equality with this set.
* @return <tt>true</tt> if the specified set is equal to this set.
*/
virtual bool_t operator==(const UnicodeSet& o) const;
/**
* Compares the specified set with this set for inequality. Returns
* <tt>true</tt> if the specified set is not equal to this set.
*/
bool_t operator!=(const UnicodeSet& o) const;
/**
* Returns the hash code value for this set.
*
* @return the hash code value for this set.
* @see Object#hashCode()
*/
virtual int32_t hashCode() const;
//----------------------------------------------------------------
// Public API
//----------------------------------------------------------------
/**
* Modifies this set to represent the set specified by the given
* pattern, optionally ignoring white space. See the class
@ -291,9 +343,8 @@ public:
virtual UnicodeString& toPattern(UnicodeString& result) const;
/**
* Returns the number of elements in this set (its cardinality). If this
* set contains more than <tt>Integer.MAX_VALUE</tt> elements, returns
* <tt>Integer.MAX_VALUE</tt>.
* Returns the number of elements in this set (its cardinality),
* <em>n</em>, where <code>0 <= </code><em>n</em><code> <= 65536</code>.
*
* @return the number of elements in this set (its cardinality).
*/
@ -419,28 +470,6 @@ public:
*/
virtual void clear();
/**
* Compares the specified object with this set for equality. Returns
* <tt>true</tt> if the specified object is also a set, the two sets
* have the same size, and every member of the specified set is
* contained in this set (or equivalently, every member of this set is
* contained in the specified set).
*
* @param o Object to be compared for equality with this set.
* @return <tt>true</tt> if the specified Object is equal to this set.
*/
virtual bool_t operator==(const UnicodeSet& o) const;
bool_t operator!=(const UnicodeSet& o) const;
/**
* Returns the hash code value for this set.
*
* @return the hash code value for this set.
* @see Object#hashCode()
*/
virtual int32_t hashCode() const;
//----------------------------------------------------------------
// Implementation: Pattern parsing
//----------------------------------------------------------------