diff --git a/source/i18n/uniset.cpp b/source/i18n/uniset.cpp index abd0213f6c9..04314dd5bd3 100644 --- a/source/i18n/uniset.cpp +++ b/source/i18n/uniset.cpp @@ -1,180 +1,23 @@ +/* +********************************************************************** +* Copyright (C) 1999 Alan Liu and others. All rights reserved. +********************************************************************** +* Date Name Description +* 10/20/99 alan Creation. +********************************************************************** +*/ + #include "uniset.h" +#include "parsepos.h" -/** - * A mutable set of Unicode characters. Objects of this class - * represent character classes used in regular expressions. - * Such classes specify a subset of the set of all Unicode characters, - * which in this implementation is the characters from U+0000 to - * U+FFFF, ignoring surrogates. - * - *
This class supports two APIs. The first is modeled after Java 2's
- * java.util.Set
interface, although this class does not
- * implement that interface. All methods of Set
are
- * supported, with the modification that they take a character range
- * or single character instead of an Object
, and they
- * take a UnicodeSet
instead of a Collection
.
- *
- *
The second API is the
- * applyPattern()
/toPattern()
API from the
- * java.text.Format
-derived classes. Unlike the
- * methods that add characters, add categories, and control the logic
- * of the set, the method applyPattern()
sets all
- * attributes of a UnicodeSet
at once, based on a
- * string pattern.
- *
- *
In addition, the set complement operation is supported through
- * the complement()
method.
- *
- *
Pattern syntax
- * - * Patterns are accepted by the constructors and the - *applyPattern()
methods and returned by the
- * toPattern()
method. These patterns follow a syntax
- * similar to that employed by version 8 regular expression character
- * classes:
- *
- * - * - * Patterns specify individual characters, ranges of characters, and - * Unicode character categories. When elements are concatenated, they - * specify their union. To complement a set, place a '^' immediately - * after the opening '[' or '[:'. In any other location, '^' has no - * special meaning. - * - *- * pattern := ('[' '^'? item* ']') | ('[:' '^'? category ':]')
- * - *
- * item := char | (char '-' char) | pattern-expr
- * pattern-expr := pattern | pattern-expr pattern | pattern-expr op pattern
- * op := '&' | '-'
- * special := '[' | ']' | '-'
- * char := any character that is not special | - * ('\' any character) | - * ('\\u' hex hex hex hex)
- * hex := any hex digit, as defined by Character.digit(c, 16) - *
Legend: - * - *- *
- *a:=b
- *a
may be replaced by - *b
- *a?
- *zero or one instance of a
- *a*
- *one or more instances of a
- *a|b
- *either a
orb
- *'a'
- *the literal string between the quotes - *
Ranges are indicated by placing two a '-' between two - * characters, as in "a-z". This specifies the range of all - * characters from the left to the right, in Unicode order. If the - * left and right characters are the same, then the range consists of - * just that character. If the left character is greater than the - * right character it is a syntax error. If a '-' occurs as the first - * character after the opening '[' or '[^', or if it occurs as the - * last character before the closing ']', then it is taken as a - * literal. Thus "[a\-b]", "[-ab]", and "[ab-]" all indicate the same - * set of three characters, 'a', 'b', and '-'. - * - *
Sets may be intersected using the '&' operator or the asymmetric - * set difference may be taken using the '-' operator, for example, - * "[[:L:]&[\u0000-\u0FFF]]" indicates the set of all Unicode letters - * with values less than 4096. Operators ('&' and '|') have equal - * precedence and bind left-to-right. Thus - * "[[:L:]-[a-z]-[\u0100-\u01FF]]" is equivalent to - * "[[[:L:]-[a-z]]-[\u0100-\u01FF]]". This only really matters for - * difference; intersection is commutative. - * - *
[a] | The set containing 'a' - * |
[a-z] | The set containing 'a' - * through 'z' and all letters in between, in Unicode order - * |
[^a-z] | The set containing - * all characters but 'a' through 'z', - * that is, U+0000 through 'a'-1 and 'z'+1 through U+FFFF - * |
[[pat1][pat2]]
- * | The union of sets specified by pat1 and pat2 - * |
[[pat1]&[pat2]]
- * | The intersection of sets specified by pat1 and pat2 - * |
[[pat1]-[pat2]]
- * | The asymmetric difference of sets specified by pat1 and - * pat2 - * |
[:Lu:]
- * | The set of characters belonging to the given
- * Unicode category, as defined by Character.getType() ; in
- * this case, Unicode uppercase letters
- * |
[:L:]
- * | The set of characters belonging to all Unicode categories
- * starting wih 'L', that is, [[:Lu:][:Ll:][:Lt:][:Lm:][:Lo:]] .
- * |
Character categories.
- *
- * Character categories are specified using the POSIX-like syntax
- * '[:Lu:]'. The complement of a category is specified by inserting
- * '^' after the opening '[:'. The following category names are
- * recognized. Actual determination of category data uses
- * Character.getType()
, so it reflects the underlying
- * implmementation used by Character
. As of Java 2 and
- * JDK 1.1.8, this is Unicode 2.x.x - fill in version here.
- *
- *
- * Normative - * Mn = Mark, Non-Spacing - * Mc = Mark, Spacing Combining - * Me = Mark, Enclosing - * - * Nd = Number, Decimal Digit - * Nl = Number, Letter - * No = Number, Other - * - * Zs = Separator, Space - * Zl = Separator, Line - * Zp = Separator, Paragraph - * - * Cc = Other, Control - * Cf = Other, Format - * Cs = Other, Surrogate - * Co = Other, Private Use - * Cn = Other, Not Assigned - * - * Informative - * Lu = Letter, Uppercase - * Ll = Letter, Lowercase - * Lt = Letter, Titlecase - * Lm = Letter, Modifier - * Lo = Letter, Other - * - * Pc = Punctuation, Connector - * Pd = Punctuation, Dash - * Ps = Punctuation, Open - * Pe = Punctuation, Close - * Pi = Punctuation, Initial quote - * Pf = Punctuation, Final quote - * Po = Punctuation, Other - * - * Sm = Symbol, Math - * Sc = Symbol, Currency - * Sk = Symbol, Modifier - * So = Symbol, Other - *- * *Unsupported by Java (and hence unsupported by UnicodeSet). - * - * @author Alan Liu - * @version $RCSfile: uniset.cpp,v $ $Revision: 1.1 $ $Date: 1999/10/20 22:06:52 $ - */ - -// Note: This mapping is different in ICU and Java +// N.B.: This mapping is different in ICU and Java const UnicodeString UnicodeSet::CATEGORY_NAMES( "CnLuLlLtLmLoMnMeMcNdNlNoZsZlZpCcCfCoCsPdPsPePcPoSmScSkSoPiPf"); /** * A cache mapping character category integers, as returned by - * Character.getType(), to pairs strings. Entries are initially - * null and are created on demand. + * Unicode::getType(), to pairs strings. Entries are initially + * zero length and are filled in on demand. */ UnicodeString* UnicodeSet::CATEGORY_PAIRS_CACHE = new UnicodeString[Unicode::GENERAL_TYPES_COUNT]; @@ -193,13 +36,13 @@ const UnicodeString& UnicodeSet::getPairs() const { } //---------------------------------------------------------------- -// Public API +// Constructors &c //---------------------------------------------------------------- /** * Constructs an empty set. 
*/ -UnicodeSet::UnicodeSet() {} +UnicodeSet::UnicodeSet() : pairs() {} /** * Constructs a set from the given pattern, optionally ignoring @@ -214,12 +57,12 @@ UnicodeSet::UnicodeSet() {} * contains a syntax error. */ UnicodeSet::UnicodeSet(const UnicodeString& pattern, bool_t ignoreSpaces, - UErrorCode& status) { + UErrorCode& status) : pairs() { applyPattern(pattern, ignoreSpaces, status); } UnicodeSet::UnicodeSet(const UnicodeString& pattern, - UErrorCode& status) { + UErrorCode& status) : pairs() { applyPattern(pattern, status); } @@ -230,7 +73,7 @@ UnicodeSet::UnicodeSet(const UnicodeString& pattern, * @exception
IllegalArgumentException
if the given
* category is invalid.
*/
-UnicodeSet::UnicodeSet(int8_t category, UErrorCode& status) {
+UnicodeSet::UnicodeSet(int8_t category, UErrorCode& status) : pairs() {
if (U_SUCCESS(status)) {
if (category < 0 || category >= Unicode::GENERAL_TYPES_COUNT) {
status = U_ILLEGAL_ARGUMENT_ERROR;
@@ -240,6 +83,52 @@ UnicodeSet::UnicodeSet(int8_t category, UErrorCode& status) {
}
}
+/**
+ * Constructs a set that is identical to the given UnicodeSet by copying o.pairs.
+ */
+UnicodeSet::UnicodeSet(const UnicodeSet& o) : pairs(o.pairs) {}
+
+/**
+ * Destructs the set. The body is empty; no explicit cleanup is performed here.
+ */
+UnicodeSet::~UnicodeSet() {}
+
+/**
+ * Assigns this object to be a copy of another by copying o.pairs; returns *this.
+ */
+UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
+ pairs = o.pairs;
+ return *this;
+}
+
+/**
+ * Compares the specified set with this set for equality. Returns
+ * true if the two sets
+ * have the same size, and every member of the specified set is
+ * contained in this set (or equivalently, every member of this set is
+ * contained in the specified set).
+ *
+ * @param o set to be compared for equality with this set.
+ * @return true if the specified set is equal to this set.
+ */
+bool_t UnicodeSet::operator==(const UnicodeSet& o) const {
+ return pairs == o.pairs; // implemented by comparing the two pairs strings
+}
+
+/**
+ * Returns the hash code value for this set.
+ *
+ * @return the hash code value for this set, as computed by pairs.hashCode().
+ * @see UnicodeString::hashCode()
+ */
+int32_t UnicodeSet::hashCode() const {
+ return pairs.hashCode();
+}
+
+//----------------------------------------------------------------
+// Public API
+//----------------------------------------------------------------
+
/**
* Modifies this set to represent the set specified by the given
* pattern, optionally ignoring white space. See the class
@@ -302,7 +191,7 @@ void UnicodeSet::applyPattern(const UnicodeString& pattern,
UnicodeString& UnicodeSet::toPattern(UnicodeString& result) const {
result.remove().append((UChar)'[');
- // iterate through the ranges in the CharSet
+ // iterate through the ranges in the UnicodeSet
for (int32_t i=0; i0 <=
n <= 65536
.
*
* @return the number of elements in this set (its cardinality).
*/
@@ -484,7 +372,7 @@ void UnicodeSet::removeAll(const UnicodeSet& c) {
/**
* Inverts this set. This operation modifies this set so that
* its value is its complement. This is equivalent to the pseudo code:
- * this = new CharSet("[\u0000-\uFFFF]").removeAll(this)
.
+ * this = new UnicodeSet("[\u0000-\uFFFF]").removeAll(this)
.
*/
void UnicodeSet::complement() {
doComplement(pairs);
@@ -498,30 +386,6 @@ void UnicodeSet::clear() {
pairs.remove();
}
-/**
- * Compares the specified object with this set for equality. Returns
- * true if the specified object is also a set, the two sets
- * have the same size, and every member of the specified set is
- * contained in this set (or equivalently, every member of this set is
- * contained in the specified set).
- *
- * @param o Object to be compared for equality with this set.
- * @return true if the specified Object is equal to this set.
- */
-bool_t UnicodeSet::operator==(const UnicodeSet& o) const {
- return pairs == o.pairs;
-}
-
-/**
- * Returns the hash code value for this set.
- *
- * @return the hash code value for this set.
- * @see Object#hashCode()
- */
-int32_t UnicodeSet::hashCode() const {
- return pairs.hashCode();
-}
-
//----------------------------------------------------------------
// Implementation: Pattern parsing
//----------------------------------------------------------------
diff --git a/source/i18n/uniset.h b/source/i18n/uniset.h
index f3c12148855..9e403223d17 100644
--- a/source/i18n/uniset.h
+++ b/source/i18n/uniset.h
@@ -1,9 +1,19 @@
+/*
+**********************************************************************
+* Copyright (C) 1999 Alan Liu and others. All rights reserved.
+**********************************************************************
+* Date Name Description
+* 10/20/99 alan Creation.
+**********************************************************************
+*/
+
#ifndef UNICODESET_H
#define UNICODESET_H
#include "utypes.h"
#include "unistr.h"
-#include "parsepos.h"
+
+class ParsePosition;
/**
* A mutable set of Unicode characters. Objects of this class
@@ -21,7 +31,7 @@
*
* The second API is the
* applyPattern()
/toPattern()
API from the
- * java.text.Format
-derived classes. Unlike the
+ * Format
-derived classes. Unlike the
* methods that add characters, add categories, and control the logic
* of the set, the method applyPattern()
sets all
* attributes of a UnicodeSet
at once, based on a
@@ -122,9 +132,8 @@
* '[:Lu:]'. The complement of a category is specified by inserting
* '^' after the opening '[:'. The following category names are
* recognized. Actual determination of category data uses
- * Character.getType()
, so it reflects the underlying
- * implmementation used by Character
. As of Java 2 and
- * JDK 1.1.8, this is Unicode 2.x.x - fill in version here.
+ * Unicode::getType()
, so it reflects the underlying
+ * data used by Unicode
.
*
*
* Normative @@ -157,8 +166,8 @@ * Pd = Punctuation, Dash * Ps = Punctuation, Open * Pe = Punctuation, Close - * *Pi = Punctuation, Initial quote - * *Pf = Punctuation, Final quote + * Pi = Punctuation, Initial quote + * Pf = Punctuation, Final quote * Po = Punctuation, Other * * Sm = Symbol, Math @@ -166,15 +175,13 @@ * Sk = Symbol, Modifier * So = Symbol, Other *- * *Unsupported by Java (and hence unsupported by UnicodeSet). * * @author Alan Liu - * @version $RCSfile: uniset.h,v $ $Revision: 1.1 $ $Date: 1999/10/20 22:06:52 $ */ class U_I18N_API UnicodeSet { /** - * The internal representation is a StringBuffer of even length. + * The internal representation is a UnicodeString of even length. * Each pair of characters represents a range that is included in * the set. A single character c is represented as cc. Thus, the * ranges in the set are (a,b), a and b inclusive, where a = @@ -191,8 +198,8 @@ class U_I18N_API UnicodeSet { /** * A cache mapping character category integers, as returned by - * Character.getType(), to pairs strings. Entries are initially - * null and are created on demand. + * Unicode::getType(), to pairs strings. Entries are initially + * zero length and are filled in on demand. */ static UnicodeString* CATEGORY_PAIRS_CACHE; @@ -210,7 +217,7 @@ public: const UnicodeString& getPairs() const; //---------------------------------------------------------------- - // Public API + // Constructors &c //---------------------------------------------------------------- public: @@ -254,6 +261,51 @@ public: */ UnicodeSet(int8_t category, UErrorCode& status); + /** + * Constructs a set that is identical to the given UnicodeSet. + */ + UnicodeSet(const UnicodeSet& o); + + /** + * Destructs the set. + */ + virtual ~UnicodeSet(); + + /** + * Assigns this object to be a copy of another. + */ + UnicodeSet& operator=(const UnicodeSet& o); + + /** + * Compares the specified object with this set for equality. 
Returns + * true if the two sets + * have the same size, and every member of the specified set is + * contained in this set (or equivalently, every member of this set is + * contained in the specified set). + * + * @param o set to be compared for equality with this set. + * @return true if the specified set is equal to this set. + */ + virtual bool_t operator==(const UnicodeSet& o) const; + + /** + * Compares the specified object with this set for equality. Returns + * true if the specified set is not equal to this set. + */ + bool_t operator!=(const UnicodeSet& o) const; + + /** + * Returns the hash code value for this set. + * + * @return the hash code value for this set. + * @see Object#hashCode() + */ + virtual int32_t hashCode() const; + + //---------------------------------------------------------------- + // Public API + //---------------------------------------------------------------- + /** * Modifies this set to represent the set specified by the given * pattern, optionally ignoring white space. See the class @@ -291,9 +343,8 @@ public: virtual UnicodeString& toPattern(UnicodeString& result) const; /** - * Returns the number of elements in this set (its cardinality). If this - * set contains more than Integer.MAX_VALUE elements, returns - * Integer.MAX_VALUE. + * Returns the number of elements in this set (its cardinality), + * n, where
0 <=
n <= 65536
.
*
* @return the number of elements in this set (its cardinality).
*/
@@ -419,28 +470,6 @@ public:
*/
virtual void clear();
- /**
- * Compares the specified object with this set for equality. Returns
- * true if the specified object is also a set, the two sets
- * have the same size, and every member of the specified set is
- * contained in this set (or equivalently, every member of this set is
- * contained in the specified set).
- *
- * @param o Object to be compared for equality with this set.
- * @return true if the specified Object is equal to this set.
- */
- virtual bool_t operator==(const UnicodeSet& o) const;
-
- bool_t operator!=(const UnicodeSet& o) const;
-
- /**
- * Returns the hash code value for this set.
- *
- * @return the hash code value for this set.
- * @see Object#hashCode()
- */
- virtual int32_t hashCode() const;
-
//----------------------------------------------------------------
// Implementation: Pattern parsing
//----------------------------------------------------------------