ICU-3280 rewrite applyPattern() to use RuleCharacterIterator; add test cases

X-SVN-Rev: 13238
2025-04-07 22:44:49 +00:00 · 2003-09-29 23:00:39 +00:00 · 2003-09-29 23:00:39 +00:00 · 17eaec5cb0
commit 17eaec5cb0
parent 8f1a781f68
4 changed files with 570 additions and 425 deletions
--- a/icu4c/source/common/unicode/uniset.h
+++ b/icu4c/source/common/unicode/uniset.h
@ -22,7 +22,7 @@ class ParsePosition;
 class SymbolTable;
 class UVector;
 class CaseEquivClass;
-
+class RuleCharacterIterator;
    
 /**
 * A mutable set of Unicode characters and multicharacter strings.  Objects of this class
@ -1113,17 +1113,11 @@ private:

    const UnicodeString* getString(int32_t index) const;

-private:
-
    //----------------------------------------------------------------
    // RuleBasedTransliterator support
    //----------------------------------------------------------------

-    friend class TransliteratorParser;
-    friend class TransliteratorIDParser;
-
-    friend class RBBIRuleScanner;
-    friend class RegexCompile;
+public:

    /**
     * Constructs a set from the given pattern.  See the class description
@ -1142,6 +1136,7 @@ private:
     * varNameToChar is also non-null.
     * @exception <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
     * contains a syntax error.
+     * @draft ICU 2.8
     */
    UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
               const SymbolTable& symbols,
@ -1151,10 +1146,13 @@ private:
     * Constructs a set from the given pattern.  Identical to the
     * 4-parameter ParsePosition contstructor, but does not take a
     * SymbolTable, and does not recognize embedded variables.
+     * @draft ICU 2.8
     */
    UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
               uint32_t options, UErrorCode& status);

+private:
+
    /**
     * Returns <tt>true</tt> if this set contains any character whose low byte
     * is the given value.  This is used by <tt>RuleBasedTransliterator</tt> for
@ -1198,6 +1196,12 @@ private:
                      const SymbolTable* symbols,
                      UErrorCode& status);

+    void applyPattern(RuleCharacterIterator& chars,
+                      const SymbolTable* symbols,
+                      UnicodeString& rebuiltPat,
+                      int32_t options,
+                      UErrorCode& ec);
+
    //----------------------------------------------------------------
    // Implementation: Utility methods
    //----------------------------------------------------------------
@ -1210,13 +1214,6 @@ private:

    UBool allocateStrings();

-    void _applyPattern(const UnicodeString& pattern,
-                       ParsePosition& pos,
-                       uint32_t options,
-                       const SymbolTable* symbols,
-                       UnicodeString& rebuiltPat,
-                       UErrorCode& status);
-
    UnicodeString& _toPattern(UnicodeString& result,
                              UBool escapeUnprintable) const;

@ -1245,6 +1242,9 @@ private:
    static UBool resemblesPropertyPattern(const UnicodeString& pattern,
                                          int32_t pos);

+    static UBool resemblesPropertyPattern(RuleCharacterIterator& chars,
+                                          int32_t iterOpts);
+
    /**
     * Parse the given property pattern at the given parse position
     * and set this UnicodeSet to the result.
@ -1287,6 +1287,10 @@ private:
                                     ParsePosition& ppos,
                                     UErrorCode &ec);

+    void applyPropertyPattern(RuleCharacterIterator& chars,
+                              UnicodeString& rebuiltPat,
+                              UErrorCode& ec);
+
    /**
     * A filter that returns TRUE if the given code point should be
     * included in the UnicodeSet being constructed.
--- a/icu4c/source/common/uniset.cpp
+++ b/icu4c/source/common/uniset.cpp
@ -12,7 +12,8 @@
 #include "unicode/parsepos.h"
 #include "unicode/uchar.h"
 #include "unicode/uscript.h"
-#include "symtable.h"
+#include "symtable.h" // TODO => unicode/symtable.h
+#include "ruleiter.h"
 #include "cmemory.h"
 #include "uhash.h"
 #include "util.h"
@ -60,6 +61,7 @@ static const UChar POSIX_CLOSE[] = { 58,93,0 };  // ":]"
 static const UChar PERL_OPEN[]   = { 92,112,0 }; // "\\p"
 static const UChar PERL_CLOSE[]  = { 125,0 };    // "}"
 static const UChar NAME_OPEN[]   = { 92,78,0 };  // "\\N"
+static const UChar HYPHEN_RIGHT_BRACE[] = {0x2D,0x5D,0}; /*-]*/

 // Special property set IDs
 static const char ANY[]   = "ANY";   // [\u0000-\U0010FFFF]
@ -1878,474 +1880,417 @@ void UnicodeSet::applyPattern(const UnicodeString& pattern,
    if (U_FAILURE(status)) {
        return;
    }
-
    // Need to build the pattern in a temporary string because
    // _applyPattern calls add() etc., which set pat to empty.
    UnicodeString rebuiltPat;
-    _applyPattern(pattern, pos, options, symbols, rebuiltPat, status);
+    RuleCharacterIterator chars(pattern, symbols, pos);
+    applyPattern(chars, symbols, rebuiltPat, options, status);
+    if (U_FAILURE(status)) return;
+    if (chars.inVariable()) {
+        // syntaxError(chars, "Extra chars in variable value");
+        status = U_MALFORMED_SET;
+        return;
+    }
    pat = rebuiltPat;
 }

-void UnicodeSet::_applyPattern(const UnicodeString& pattern,
-                               ParsePosition& pos,
-                               uint32_t options,
-                               const SymbolTable* symbols,
-                               UnicodeString& rebuiltPat,
-                               UErrorCode& status) {
+/**
+ * Parse the pattern from the given RuleCharacterIterator.  The
+ * iterator is advanced over the parsed pattern.
+ * @param chars iterator over the pattern characters.  Upon return
+ * it will be advanced to the first character after the parsed
+ * pattern, or the end of the iteration if all characters are
+ * parsed.
+ * @param symbols symbol table to use to parse and dereference
+ * variables, or null if none.
+ * @param rebuiltPat the pattern that was parsed, rebuilt or
+ * copied from the input pattern, as appropriate.
+ * @param options a bit mask of zero or more of the USET_* option
+ * flags for IGNORE_SPACE and CASE (e.g. USET_IGNORE_SPACE).
+ */
+void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
+                              const SymbolTable* symbols,
+                              UnicodeString& rebuiltPat,
+                              int32_t options,
+                              UErrorCode& ec) {
+    if (U_FAILURE(ec)) return;

-    if (U_FAILURE(status)) {
-        return;
+    // Syntax characters: [ ] ^ - & { }
+
+    // Recognized special forms for chars, sets: c-c s-s s&s
+
+    int32_t opts = RuleCharacterIterator::PARSE_VARIABLES |
+                   RuleCharacterIterator::PARSE_ESCAPES;
+    if ((options & USET_IGNORE_SPACE) != 0) {
+        opts |= RuleCharacterIterator::SKIP_WHITESPACE;
    }

-    // If the pattern contains any of the following, we save a
-    // rebuilt (variable-substituted) copy of the source pattern:
-    // - a category
-    // - an intersection or subtraction operator
-    // - an anchor (trailing '$', indicating RBT ether)
-    UBool rebuildPattern = FALSE;
-    UnicodeString newPat(SET_OPEN);
-    int32_t nestedPatStart = - 1; // see below for usage
-    UBool nestedPatDone = FALSE; // see below for usage
-    UnicodeString multiCharBuffer;
+    UnicodeString pat, buf;
+    UBool usePat = FALSE;
+    UnicodeSet* scratch = 0;
+    RuleCharacterIterator::Pos backup;
+
+    // mode: 0=before [, 1=between [...], 2=after ]
+    // lastItem: 0=none, 1=char, 2=set
+    int8_t lastItem = 0, mode = 0;
+    UChar32 lastChar = 0;
+    UChar op = 0;

    UBool invert = FALSE;
+
    clear();

-    const UChar32 NONE = (UChar32) -1;
-    UChar32 lastChar = NONE; // This is either a char (0..10FFFF) or NONE
-    UBool isLastLiteral = FALSE; // TRUE if lastChar was a literal
-    UChar lastOp = 0;
+    while (mode != 2 && !chars.atEnd()) {
+        U_ASSERT((lastItem == 0 && op == 0) ||
+                 (lastItem == 1 && (op == 0 || op == 0x2D /*'-'*/)) ||
+                 (lastItem == 2 && (op == 0 || op == 0x2D /*'-'*/ ||
+                                    op == 0x26 /*'&'*/)));

-    /* This loop iterates over the characters in the pattern.  We start at
-     * the position specified by pos.  We exit the loop when either a
-     * matching closing ']' is seen, or we read all characters of the
-     * pattern.  In the latter case an error will be thrown.
-     */
+        UChar32 c = 0;
+        UBool literal = FALSE;
+        UnicodeSet* nested = 0;

-    /* Pattern syntax:
-     *  pat := '[' '^'? elem* ']'
-     *  elem := a | a '-' a | set | set op set
-     *  set := pat | (a set variable)
-     *  op := '&' | '-'
-     *  a := (a character, possibly defined by a var)
-     */
+        // -------- Check for property pattern

-    // mode 0: No chars parsed yet; next must be '['
-    // mode 1: '[' seen; if next is '^' or ':' then special
-    // mode 15: "[^" seen; if next is '-' then literal
-    // mode 2: '[' '^'? '-'? seen; parse pattern and close with ']'
-    // mode 3: '[:' seen; parse category and close with ':]'
-    // mode 4: ']' seen; parse complete
-    // mode 5: Top-level property pattern seen
-    int8_t mode = 0;
-    int32_t i = pos.getIndex();
-    int32_t limit = pattern.length();
-    UnicodeSet nestedAux;
-    const UnicodeSet* nestedSet; // never owned
-    UnicodeString scratch;
-    /* In the case of an embedded SymbolTable variable, we look it up and
-     * then take characters from the resultant char[] array.  These chars
-     * are subjected to an extra level of lookup in the SymbolTable in case
-     * they are stand-ins for a nested UnicodeSet.  */
-    const UnicodeString* varValueBuffer = NULL;
-    int32_t ivarValueBuffer = 0;
-    int32_t anchor = 0;
-    UChar32 c;
-    while (i<limit) {
-        /* If the next element is a single character, c will be set to it,
-         * and nestedSet will be null.  In this case isLiteral indicates
-         * whether the character should assume special meaning if it has
-         * one.  If the next element is a nested set, either via a variable
-         * reference, or via an embedded "[..]"  or "[:..:]" pattern, then
-         * nestedSet will be set to the pairs list for the nested set, and
-         * c's value should be ignored.
-         */
-        nestedSet = NULL;
-        UBool isLiteral = FALSE;
-        if (varValueBuffer != NULL) {
-            if (ivarValueBuffer < varValueBuffer->length()) {
-                c = varValueBuffer->char32At(ivarValueBuffer);
-                ivarValueBuffer += UTF_CHAR_LENGTH(c);
-                const UnicodeFunctor *m = symbols->lookupMatcher(c); // may be NULL
-                if (m != NULL && m->getDynamicClassID() != UnicodeSet::getStaticClassID()) {
-                    status = U_ILLEGAL_ARGUMENT_ERROR;
-                    return;
-                }
-                nestedSet = (UnicodeSet*) m;
-                nestedPatDone = FALSE;
-            } else {
-                varValueBuffer = NULL;
-                c = pattern.char32At(i);
-                i += UTF_CHAR_LENGTH(c);
-            }
-        } else {
-            c = pattern.char32At(i);
-            i += UTF_CHAR_LENGTH(c);
+        // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
+        int8_t setMode = 0;
+        if (resemblesPropertyPattern(chars, opts)) {
+            setMode = 2;
        }

-        if ((options & USET_IGNORE_SPACE) && uprv_isRuleWhiteSpace(c)) {
+        // -------- Parse '[' of opening delimiter OR nested set.
+        // If there is a nested set, use `setMode' to define how
+        // the set should be parsed.  If the '[' is part of the
+        // opening delimiter for this pattern, parse special
+        // strings "[", "[^", "[-", and "[^-".  Check for stand-in
+        // characters representing a nested set in the symbol
+        // table.
+
+        else {
+            // Prepare to backup if necessary
+            chars.getPos(backup);
+            c = chars.next(opts, literal, ec);
+            if (U_FAILURE(ec)) return;
+
+            if (c == 0x5B /*'['*/ && !literal) {
+                if (mode == 1) {
+                    chars.setPos(backup); // backup
+                    setMode = 1;
+                } else {
+                    // Handle opening '[' delimiter
+                    mode = 1;
+                    pat.append((UChar) 0x5B /*'['*/);
+                    chars.getPos(backup); // prepare to backup
+                    c = chars.next(opts, literal, ec); 
+                    if (U_FAILURE(ec)) return;
+                    if (c == 0x5E /*'^'*/ && !literal) {
+                        invert = TRUE;
+                        pat.append((UChar) 0x5E /*'^'*/);
+                        chars.getPos(backup); // prepare to backup
+                        c = chars.next(opts, literal, ec);
+                        if (U_FAILURE(ec)) return;
+                    }
+                    // Fall through to handle special leading '-';
+                    // otherwise restart loop for nested [], \p{}, etc.
+                    if (c == 0x2D /*'-'*/) {
+                        literal = TRUE;
+                        // Fall through to handle literal '-' below
+                    } else {
+                        chars.setPos(backup); // backup
+                        continue;
+                    }
+                }
+            } else if (symbols != 0) {
+                const UnicodeFunctor *m = symbols->lookupMatcher(c);
+                if (m != 0) {
+                    if (m->getDynamicClassID() != UnicodeSet::getStaticClassID()) {
+                        ec = U_MALFORMED_SET;
+                        return;
+                    }
+                    // casting away const, but `nested' won't be modified
+                    // (important not to modify stored set)
+                    nested = (UnicodeSet*) m;
+                    setMode = 3;
+                }
+            }
+        }
+
+        // -------- Handle a nested set.  This either is inline in
+        // the pattern or represented by a stand-in that has
+        // previously been parsed and was looked up in the symbol
+        // table.
+
+        if (setMode != 0) {
+            if (lastItem == 1) {
+                if (op != 0) {
+                    // syntaxError(chars, "Char expected after operator");
+                    ec = U_MALFORMED_SET;
+                    return;
+                }
+                add(lastChar, lastChar);
+                _appendToPat(pat, lastChar, FALSE);
+                lastItem = 0;
+                op = 0;
+            }
+
+            if (op == 0x2D /*'-'*/ || op == 0x26 /*'&'*/) {
+                pat.append(op);
+            }
+
+            if (nested == 0) {
+                if (scratch == 0) { // lazy allocation
+                    scratch = new UnicodeSet();
+                    if (scratch == 0) {
+                        ec = U_MEMORY_ALLOCATION_ERROR;
+                        return;
+                    }
+                }
+                nested = scratch;
+            }
+            switch (setMode) {
+            case 1:
+                nested->applyPattern(chars, symbols, pat, options, ec);
+                break;
+            case 2:
+                chars.skipIgnored(opts);
+                nested->applyPropertyPattern(chars, pat, ec);
+                if (U_FAILURE(ec)) return;
+                break;
+            case 3: // `nested' already parsed
+                nested->_toPattern(pat, FALSE);
+                break;
+            }
+
+            usePat = TRUE;
+
+            if (mode == 0) {
+                // Entire pattern is a category; leave parse loop
+                *this = *nested;
+                mode = 2;
+                break;
+            }
+
+            switch (op) {
+            case '-':
+                removeAll(*nested);
+                break;
+            case '&':
+                retainAll(*nested);
+                break;
+            case 0:
+                addAll(*nested);
+                break;
+            }
+
+            op = 0;
+            lastItem = 2;
+
            continue;
        }

-        // Keep track of the count of characters after an alleged anchor
-        if (anchor > 0) {
-            ++anchor;
+        if (mode == 0) {
+            // syntaxError(chars, "Missing '['");
+            ec = U_MALFORMED_SET;
+            return;
        }

-        // Parse the opening '[' and optional following '^'
-        switch (mode) {
-        case 0:
-            if (resemblesPropertyPattern(pattern, i-1)) {
-                mode = 3;
-                break; // Fall through
-            } else if (c == SET_OPEN) {
-                mode = 1; // Next look for '^' or ':'
-                continue;
-            } else {
-                // throw new IllegalArgumentException("Missing opening '['");
-                status = U_ILLEGAL_ARGUMENT_ERROR;
-                return;
-            }
-        case 1:
-            mode = 2;
+        // -------- Parse special (syntax) characters.  If the
+        // current character is not special, or if it is escaped,
+        // then fall through and handle it below.
+
+        if (!literal) {
            switch (c) {
-            case COMPLEMENT:
-                invert = TRUE;
-                newPat.append(c);
-                mode = 15;
-                continue; // Back to top to fetch next character
-            case HYPHEN:
-                isLiteral = TRUE; // Treat leading '-' as a literal
-                break; // Fall through
-            }
-            break;
-        case 15:
-            mode = 2;
-            if (c == HYPHEN) {
-                isLiteral = TRUE; // [^-...] starts with literal '-'
-            }
-            break;
-            // else fall through and parse this character normally
-        }
-
-        // After opening matter is parsed ("[", "[^", or "[:"), the mode
-        // will be 2 if we want a closing ']', or 3 if we should parse a
-        // category and close with ":]".
-
-        // Only process escapes, variable references, and nested sets
-        // if we are _not_ retrieving characters from the variable
-        // buffer.  Characters in the variable buffer have already
-        // benn through escape and variable reference processing.
-        if (varValueBuffer == NULL) {
-            /**
-             * Handle property set patterns.
-             */
-            if (resemblesPropertyPattern(pattern, i-1)) {
-                ParsePosition pp(i-1);
-                nestedAux.applyPropertyPattern(pattern, pp, status);
-                if (U_FAILURE(status)) {
-                    U_ASSERT(pp.getIndex() == i-1);
-                    //throw new IllegalArgumentException("Invalid property pattern " +
-                    //                                   pattern.substring(i-1));
+            case 0x5D /*']'*/:
+                if (lastItem == 1) {
+                    add(lastChar, lastChar);
+                    _appendToPat(pat, lastChar, FALSE);
+                }
+                // Treat final trailing '-' as a literal
+                if (op == 0x2D /*'-'*/) {
+                    add(op, op);
+                    pat.append(op);
+                } else if (op == 0x26 /*'&'*/) {
+                    // syntaxError(chars, "Trailing '&'");
+                    ec = U_MALFORMED_SET;
                    return;
                }
-                nestedSet = &nestedAux;
-                nestedPatStart = newPat.length();
-                nestedPatDone = TRUE; // we're going to do it just below
-                
-                switch (lastOp) {
-                case HYPHEN:
-                case INTERSECTION:
-                    newPat.append(lastOp);
-                    break;
-                }
-
-                // If we have a top-level property pattern, then trim
-                // off the opening '[' and use the property pattern
-                // as the entire pattern.
-                if (mode == 3) {
-                    newPat.truncate(0);
-                }
-                UnicodeString str;
-                pattern.extractBetween(i-1, pp.getIndex(), str);
-                newPat.append(str);
-                rebuildPattern = TRUE;
-                
-                i = pp.getIndex(); // advance past property pattern
-                
-                if (mode == 3) {
-                    // Entire pattern is a category; leave parse
-                    // loop.  This is one of 2 ways we leave this
-                    // loop if the pattern is well-formed.
-                    *this = nestedAux;
-                    mode = 5;
-                    break;
-                }
-            }
-            
-            /* Handle escapes.  If a character is escaped, then it assumes its
-             * literal value.  This is true for all characters, both special
-             * characters and characters with no special meaning.  We also
-             * interpret '\\uxxxx' Unicode escapes here (as literals).
-             */
-            else if (c == BACKSLASH) {
-                UChar32 escaped = pattern.unescapeAt(i);
-                if (escaped == (UChar32) -1) {
-                    status = U_ILLEGAL_ARGUMENT_ERROR;
-                    return;
-                }
-                isLiteral = TRUE;
-                c = escaped;
-            }
-
-            /* Parse variable references.  These are treated as literals.  If a
-             * variable refers to a UnicodeSet, its stand in character is
-             * returned in the UChar[] buffer.
-             * Variable names are only parsed if varNameToChar is not null.
-             * Set variables are only looked up if varCharToSet is not null.
-             */
-            else if (symbols != NULL && !isLiteral && c == SymbolTable::SYMBOL_REF) {
-                pos.setIndex(i);
-                UnicodeString name = symbols->parseReference(pattern, pos, limit);
-                if (name.length() != 0) {
-                    varValueBuffer = symbols->lookup(name);
-                    if (varValueBuffer == NULL) {
-                        //throw new IllegalArgumentException("Undefined variable: "
-                        //                                   + name);
-                        status = U_ILLEGAL_ARGUMENT_ERROR;
-                        return;
-                    }
-                    ivarValueBuffer = 0;
-                    i = pos.getIndex(); // Make i point PAST last char of var name
-                } else {
-                    // Got a null; this means we have an isolated $.
-                    // Tentatively assume this is an anchor.
-                    anchor = 1;
-                }
-                continue; // Back to the top to get varValueBuffer[0]
-            }
-
-            /* An opening bracket indicates the first bracket of a nested
-             * subpattern.
-             */
-            else if (!isLiteral && c == SET_OPEN) {
-                // Record position before nested pattern
-                nestedPatStart = newPat.length();
-
-                // Recurse to get the pairs for this nested set.
-                // Backup i to '['.
-                pos.setIndex(--i);
-                switch (lastOp) {
-                case HYPHEN:
-                case INTERSECTION:
-                    newPat.append(lastOp);
-                    break;
-                }
-                nestedAux._applyPattern(pattern, pos, options, symbols, newPat, status);
-                nestedSet = &nestedAux;
-                nestedPatDone =  TRUE;
-                if (U_FAILURE(status)) {
-                    return;
-                }
-                i = pos.getIndex();
-            }
-
-            else if (!isLiteral && c == OPEN_BRACE) {
-                // start of a string. find the rest.
-                int32_t length = 0;
-                int32_t st = i;
-                multiCharBuffer.truncate(0);
-                while (i < pattern.length()) {
-                    UChar32 ch = pattern.char32At(i);
-                    i += UTF_CHAR_LENGTH(ch); 
-                    if (ch == CLOSE_BRACE) {
-                        length = -length; // signal that we saw '}'
-                        break;
-                    } else if (ch == BACKSLASH) {
-                        ch = pattern.unescapeAt(i);
-                        if (ch == (UChar32) -1) {
-                            status = U_ILLEGAL_ARGUMENT_ERROR;
-                            return;
+                pat.append((UChar) 0x5D /*']'*/);
+                mode = 2;
+                continue;
+            case 0x2D /*'-'*/:
+                if (op == 0) {
+                    if (lastItem != 0) {
+                        op = (UChar) c;
+                        continue;
+                    } else {
+                        // Treat final trailing '-' as a literal
+                        add(c, c);
+                        c = chars.next(opts, literal, ec);
+                        if (U_FAILURE(ec)) return;
+                        if (c == 0x5D /*']'*/ && !literal) {
+                            pat.append(HYPHEN_RIGHT_BRACE);
+                            mode = 2;
+                            continue;
                        }
                    }
-                    --length; // sic; see above
-                    multiCharBuffer.append(ch);
                }
-                if (length < 1) {
-                    status = U_ILLEGAL_ARGUMENT_ERROR;
+                // syntaxError(chars, "'-' not after char or set");
+                ec = U_MALFORMED_SET;
+                return;
+            case 0x26 /*'&'*/:
+                if (lastItem == 2 && op == 0) {
+                    op = (UChar) c;
+                    continue;
+                }
+                // syntaxError(chars, "'&' not after set");
+                ec = U_MALFORMED_SET;
+                return;
+            case 0x5E /*'^'*/:
+                // syntaxError(chars, "'^' not after '['");
+                ec = U_MALFORMED_SET;
+                return;
+            case 0x7B /*'{'*/:
+                if (op != 0) {
+                    // syntaxError(chars, "Missing operand after operator");
+                    ec = U_MALFORMED_SET;
                    return;
                }
+                if (lastItem == 1) {
+                    add(lastChar, lastChar);
+                    _appendToPat(pat, lastChar, FALSE);
+                }
+                lastItem = 0;
+                buf.truncate(0);
+                {
+                    UBool ok = FALSE;
+                    while (!chars.atEnd()) {
+                        c = chars.next(opts, literal, ec);
+                        if (U_FAILURE(ec)) return;
+                        if (c == 0x7D /*'}'*/ && !literal) {
+                            ok = TRUE;
+                            break;
+                        }
+                        buf.append(c);
+                    }
+                    if (buf.length() < 1 || !ok) {
+                        // syntaxError(chars, "Invalid multicharacter string");
+                        ec = U_MALFORMED_SET;
+                        return;
+                    }
+                }
                // We have new string. Add it to set and continue;
                // we don't need to drop through to the further
                // processing
-                add(multiCharBuffer);
-                pattern.extractBetween(st, i, multiCharBuffer);
-                newPat.append(OPEN_BRACE).append(multiCharBuffer);
-                rebuildPattern = TRUE;
+                add(buf);
+                pat.append((UChar) 0x7B /*'{'*/);
+                _appendToPat(pat, buf, FALSE);
+                pat.append((UChar) 0x7D /*'}'*/);
                continue;
-            }
-        }
-
-        /* At this point we have either a character c, or a nested set.  If
-         * we have encountered a nested set, either embedded in the pattern,
-         * or as a variable, we have a non-null nestedSet, and c should be
-         * ignored.  Otherwise c is the current character, and isLiteral
-         * indicates whether it is an escaped literal (or variable) or a
-         * normal unescaped character.  Unescaped characters '-', '&', and
-         * ']' have special meanings.
-         */
-        if (nestedSet != NULL) {
-            if (lastChar != NONE) {
-                if (lastOp != 0) {
-                    // throw new IllegalArgumentException("Illegal rhs for " + lastChar + lastOp);
-                    status = U_ILLEGAL_ARGUMENT_ERROR;
+            case SymbolTable::SYMBOL_REF:
+                //         symbols  nosymbols
+                // [a-$]   error    error (ambiguous)
+                // [a$]    anchor   anchor
+                // [a-$x]  var "x"* literal '$'
+                // [a-$.]  error    literal '$'
+                // *We won't get here in the case of var "x"
+                {
+                    chars.getPos(backup);
+                    c = chars.next(opts, literal, ec);
+                    if (U_FAILURE(ec)) return;
+                    UBool anchor = (c == 0x5D /*']'*/ && !literal);
+                    if (symbols == 0 && !anchor) {
+                        c = SymbolTable::SYMBOL_REF;
+                        chars.setPos(backup);
+                        break; // literal '$'
+                    }
+                    if (anchor && op == 0) {
+                        if (lastItem == 1) {
+                            add(lastChar, lastChar);
+                            _appendToPat(pat, lastChar, FALSE);
+                        }
+                        add(U_ETHER);
+                        usePat = TRUE;
+                        pat.append((UChar) SymbolTable::SYMBOL_REF);
+                        pat.append((UChar) 0x5D /*']'*/);
+                        mode = 2;
+                        continue;
+                    }
+                    // syntaxError(chars, "Unquoted '$'");
+                    ec = U_MALFORMED_SET;
                    return;
                }
-                add(lastChar, lastChar);
-                if (nestedPatDone) {
-                    // If there was a character before the nested set,
-                    // then we need to insert it in newPat before the
-                    // pattern for the nested set.  This position was
-                    // recorded in nestedPatStart.
-                    UnicodeString s;
-                    _appendToPat(s, lastChar, FALSE);
-                    newPat.insert(nestedPatStart, s);
-                } else {
-                    _appendToPat(newPat, lastChar, FALSE);
-                }
-                lastChar = NONE;
-            }
-            switch (lastOp) {
-            case HYPHEN:
-                removeAll(*nestedSet);
-                break;
-            case INTERSECTION:
-                retainAll(*nestedSet);
-                break;
-            case 0:
-                addAll(*nestedSet);
+            default:
                break;
            }
+        }

-            // Get the pattern for the nested set, if we haven't done so
-            // already.
-            if (!nestedPatDone) {
-                if (lastOp != 0) {
-                    newPat.append(lastOp);
-                }
-                nestedSet->_toPattern(newPat, FALSE);
-            }
-            rebuildPattern = TRUE;
+        // -------- Parse literal characters.  This includes both
+        // escaped chars ("\u4E01") and non-syntax characters
+        // ("a").

-            lastOp = 0;
-
-        } else if (!isLiteral && c == SET_CLOSE) {
-            // Final closing delimiter.  This is one of 2 ways we
-            // leave this loop if the pattern is well-formed.
-            if (anchor > 2 || anchor == 1) {
-                //throw new IllegalArgumentException("Syntax error near $" + pattern);
-                status = U_ILLEGAL_ARGUMENT_ERROR;
-                return;
-            }
-            if (anchor == 2) {
-                rebuildPattern = TRUE;
-                newPat.append((UChar)SymbolTable::SYMBOL_REF);
-                add(U_ETHER);
-            }
-            mode = 4;
+        switch (lastItem) {
+        case 0:
+            lastItem = 1;
+            lastChar = c;
            break;
-        } else if (lastOp == 0 && !isLiteral && (c == HYPHEN || c == INTERSECTION)) {
-            // assert(c <= 0xFFFF);
-            lastOp = (UChar) c;
-        } else if (lastOp == HYPHEN) {
-            if (lastChar >= c || lastChar == NONE) {
-                // Don't allow redundant (a-a) or empty (b-a) ranges;
-                // these are most likely typos.
-                //throw new IllegalArgumentException("Invalid range " + lastChar +
-                //                                       '-' + c);
-                status = U_ILLEGAL_ARGUMENT_ERROR;
-                return;
-            }
-            add(lastChar, c);
-            _appendToPat(newPat, lastChar, FALSE);
-            newPat.append(HYPHEN);
-            _appendToPat(newPat, c, FALSE);
-            lastOp = 0;
-            lastChar = NONE;
-        } else if (lastOp != 0) {
-            // We have <set>&<char> or <char>&<char>
-            // throw new IllegalArgumentException("Unquoted " + lastOp);
-            status = U_ILLEGAL_ARGUMENT_ERROR;
-            return;
-        } else {
-            if (lastChar != NONE) {
-                // We have <char><char>
+        case 1:
+            if (op == 0x2D /*'-'*/) {
+                if (lastChar >= c) {
+                    // Don't allow redundant (a-a) or empty (b-a) ranges;
+                    // these are most likely typos.
+                    // syntaxError(chars, "Invalid range");
+                    ec = U_MALFORMED_SET;
+                    return;
+                }
+                add(lastChar, c);
+                _appendToPat(pat, lastChar, FALSE);
+                pat.append(op);
+                _appendToPat(pat, c, FALSE);
+                lastItem = 0;
+                op = 0;
+            } else {
                add(lastChar, lastChar);
-                _appendToPat(newPat, lastChar, FALSE);
+                _appendToPat(pat, lastChar, FALSE);
+                lastChar = c;
+            }
+            break;
+        case 2:
+            if (op != 0) {
+                // syntaxError(chars, "Set expected after operator");
+                ec = U_MALFORMED_SET;
+                return;
            }
            lastChar = c;
-            isLastLiteral = isLiteral;
+            lastItem = 1;
+            break;
        }
    }

-    if (mode < 4) {
-        // throw new IllegalArgumentException("Missing ']'");
-        status = U_ILLEGAL_ARGUMENT_ERROR;
+    if (mode != 2) {
+        // syntaxError(chars, "Missing ']'");
+        ec = U_MALFORMED_SET;
        return;
    }

-    // Treat a trailing '$' as indicating U_ETHER.  This code is only
-    // executed if symbols == NULL; otherwise other code parses the
-    // anchor.
-    if (lastChar == (UChar)SymbolTable::SYMBOL_REF && !isLastLiteral) {
-        rebuildPattern = TRUE;
-        newPat.append(lastChar);
-        add(U_ETHER);
-    }
-
-    else if (lastChar != NONE) {
-        add(lastChar, lastChar);
-        _appendToPat(newPat, lastChar, FALSE);
-    }
-
-    // Handle unprocessed stuff preceding the closing ']'
-    if (lastOp == HYPHEN) {
-        // Trailing '-' is treated as literal
-        add(lastOp, lastOp);
-        newPat.append(HYPHEN);
-    } else if (lastOp == INTERSECTION) {
-        // throw new IllegalArgumentException("Unquoted trailing " + lastOp);
-        status = U_ILLEGAL_ARGUMENT_ERROR;
-        return;
-    }
-
-    if (mode == 4) {
-        newPat.append(SET_CLOSE);
-    }
+    chars.skipIgnored(opts);

    /**
-     * If this pattern should be compiled case-insensitive, then
-     * we need to close over case BEFORE complementing.  This
-     * makes patterns like /[^abc]/i work.
+     * Handle global flags (invert, case insensitivity).  If this
+     * pattern should be compiled case-insensitive, then we need
+     * to close over case BEFORE COMPLEMENTING.  This makes
+     * patterns like /[^abc]/i work.
     */
    if ((options & USET_CASE_INSENSITIVE) != 0) {
        closeOver(USET_CASE);
    }
-
-    /**
-     * If we saw a '^' after the initial '[' of this pattern, then perform
-     * the complement.  (Inversion after '[:' is handled elsewhere.)
-     */
    if (invert) {
        complement();
    }

-    pos.setIndex(i);
-
-    // Use the rebuilt pattern (newPat) only if necessary.  Prefer the
+    // Use the rebuilt pattern (pat) only if necessary.  Prefer the
    // generated pattern.
-    if (rebuildPattern) {
-        rebuiltPat.append(newPat);
+    if (usePat) {
+        rebuiltPat.append(pat);
    } else {
        _generatePattern(rebuiltPat, FALSE);
    }
@ -2970,6 +2915,33 @@ UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern,
    return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos);
 }

+/**
+ * Peek at the next one or two characters of the iterator and report
+ * whether they look like the start of a property pattern, i.e. "[:",
+ * "\N", "\p" or "\P".  The iterator position is saved before peeking
+ * and restored afterwards, so the caller sees it unchanged whatever
+ * the result.
+ * @param chars iterator over the pattern characters.  Upon return
+ * it will be unchanged.
+ * @param iterOpts RuleCharacterIterator options; PARSE_ESCAPES is
+ * always masked off, and SKIP_WHITESPACE is additionally masked off
+ * when reading the second character.
+ * @return TRUE if the text resembles a property pattern and no error
+ * was reported while reading it.
+ */
+UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars,
+                                           int32_t iterOpts) {
+    // Escapes are never parsed here, so the "is literal" flag handed
+    // back by next() is always FALSE and is simply ignored.
+    const int32_t firstOpts = iterOpts & ~RuleCharacterIterator::PARSE_ESCAPES;
+    const int32_t secondOpts = firstOpts & ~RuleCharacterIterator::SKIP_WHITESPACE;
+
+    UErrorCode status = U_ZERO_ERROR;
+    UBool isLiteral;
+    UBool looksLikeProperty = FALSE;
+
+    // Remember where we started so the peek can be undone.
+    RuleCharacterIterator::Pos saved;
+    chars.getPos(saved);
+
+    const UChar32 first = chars.next(firstOpts, isLiteral, status);
+    if (first == 0x5B /*'['*/) {
+        // "[:" opens a POSIX-style property pattern.
+        const UChar32 second = chars.next(secondOpts, isLiteral, status);
+        looksLikeProperty = (second == 0x3A /*':'*/);
+    } else if (first == 0x5C /*'\\'*/) {
+        // "\N", "\p" and "\P" open the backslash-style forms.
+        switch (chars.next(secondOpts, isLiteral, status)) {
+        case 0x4E /*'N'*/:
+        case 0x70 /*'p'*/:
+        case 0x50 /*'P'*/:
+            looksLikeProperty = TRUE;
+            break;
+        default:
+            break;
+        }
+    }
+
+    chars.setPos(saved);
+    return looksLikeProperty && U_SUCCESS(status);
+}
+
 /**
 * Parse the given property pattern at the given parse position.
 */
@ -3063,6 +3035,33 @@ UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern,
    return *this;
 }

+/**
+ * Parse a property pattern from a RuleCharacterIterator.  The text
+ * obtained from chars.lookahead() is handed to the
+ * UnicodeString/ParsePosition overload of applyPropertyPattern(); on
+ * success the iterator is moved forward by the number of code units
+ * that overload consumed.
+ * @param chars iterator over the pattern characters.  Upon return
+ * it will be advanced to the first character after the parsed
+ * pattern, or the end of the iteration if all characters are
+ * parsed.  It is not advanced when an error is reported.
+ * @param rebuiltPat receives (by appending) the portion of the input
+ * text that was parsed.
+ * @param ec error code.  Nothing is done if it already indicates a
+ * failure on entry; it is set to U_MALFORMED_SET if no characters
+ * could be parsed.
+ */
+void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars,
+                                      UnicodeString& rebuiltPat,
+                                      UErrorCode& ec) {
+    if (U_FAILURE(ec)) {
+        return;
+    }
+
+    UnicodeString remaining;
+    chars.lookahead(remaining);
+
+    ParsePosition parsePos(0);
+    applyPropertyPattern(remaining, parsePos, ec);
+    if (U_FAILURE(ec)) {
+        return;
+    }
+
+    const int32_t consumed = parsePos.getIndex();
+    if (consumed != 0) {
+        // Keep the iterator in step with what the string parser used.
+        chars.jumpahead(consumed);
+        rebuiltPat.append(remaining, 0, consumed);
+    } else {
+        // The parse position did not move: nothing was recognized.
+        // syntaxError(chars, "Invalid property pattern");
+        ec = U_MALFORMED_SET;
+    }
+}
+
 //----------------------------------------------------------------
 // Inclusions list
 //----------------------------------------------------------------
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@ -15,6 +15,9 @@
 #include "unicode/uchar.h"
 #include "unicode/usetiter.h"
 #include "unicode/ustring.h"
+#include "unicode/parsepos.h"
+#include "symtable.h" // TODO move this to unicode/symtable.h
+#include "hash.h"

 UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
    UnicodeString pat;
@ -54,6 +57,7 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
        CASE(15,TestCloseOver);
        CASE(16,TestEscapePattern);
        CASE(17,TestInvalidCodePoint);
+        CASE(18,TestSymbolTable);
        default: name = ""; break;
    }
 }
@ -661,7 +665,7 @@ void UnicodeSetTest::TestStringPatterns() {
        s->applyPattern("[a-z {\\{l} {r\\}}]", ec);
        if (U_FAILURE(ec)) break;
        const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};
-        expectToPattern(*s, "[a-z{\\{l}{r\\}}]", exp3);
+        expectToPattern(*s, "[a-z{r\\}}{\\{l}]", exp3);

        s->add("[]");
        const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};
@ -670,7 +674,7 @@ void UnicodeSetTest::TestStringPatterns() {
        s->applyPattern("[a-z {\\u4E01\\u4E02}{\\n\\r}]", ec);
        if (U_FAILURE(ec)) break;
        const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};
-        expectToPattern(*s, "[a-z{\\u4E01\\u4E02}{\\n\\r}]", exp5);
+        expectToPattern(*s, "[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]", exp5);

        // j2189
        s->clear();
@ -810,7 +814,19 @@ void UnicodeSetTest::TestPropertySet() {

        "[^b-]", // trailing '-' is literal
        "ac",
-        "-b"
+        "-b",
+
+        "[a-b-]", // trailing '-' is literal
+        "ab-",
+        "c=",
+        
+        "[[a-q]&[p-z]-]", // trailing '-' is literal
+        "pq-",
+        "or=",
+
+        "[\\s|\\)|:|$|\\>]", // from regex tests
+        "s|):$>",
+        "abc"
    };

    static const int32_t DATA_LEN = sizeof(DATA)/sizeof(DATA[0]);
@ -1120,6 +1136,130 @@ void UnicodeSetTest::TestInvalidCodePoint() {
    }
 }

+// Minimal SymbolTable implementation used by TestSymbolTable.  It maps
+// variable names to replacement text through a Hashtable and never
+// supplies matchers.
+class TokenSymbolTable : public SymbolTable {
+public:
+    // Variable name -> heap-allocated UnicodeString value.  The value
+    // deleter installed by the constructor is uhash_deleteUnicodeString.
+    Hashtable contents;
+
+    TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) {
+        contents.setValueDeleter(uhash_deleteUnicodeString);
+    }
+
+    ~TokenSymbolTable() {}
+
+    /**
+     * (Non-SymbolTable API) Store a copy of value under the name var.
+     * var must NOT include the leading '$'.  Nothing is stored if ec
+     * already indicates a failure.
+     */
+    void add(const UnicodeString& var, const UnicodeString& value,
+             UErrorCode& ec) {
+        if (U_FAILURE(ec)) {
+            return;
+        }
+        contents.put(var, new UnicodeString(value), ec);
+    }
+
+    /**
+     * SymbolTable API: return whatever the table holds for the
+     * variable name s.
+     */
+    virtual const UnicodeString* lookup(const UnicodeString& s) const {
+        return (const UnicodeString*) contents.get(s);
+    }
+
+    /**
+     * SymbolTable API: this table has no matchers, so always NULL.
+     */
+    virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const {
+        return NULL;
+    }
+
+    /**
+     * SymbolTable API: scan an identifier starting at pos.  The first
+     * code unit must satisfy both u_isIDStart and u_isIDPart; each
+     * following one must satisfy u_isIDPart.  On success pos is moved
+     * past the identifier and the identifier is returned; otherwise
+     * pos is left alone and an empty string is returned.
+     */
+    virtual UnicodeString parseReference(const UnicodeString& text,
+                                         ParsePosition& pos, int32_t limit) const {
+        const int32_t start = pos.getIndex();
+        int32_t end = start;
+        for (; end < limit; ++end) {
+            const UChar unit = text.charAt(end);
+            UBool accepted;
+            if (end == start) {
+                accepted = u_isIDStart(unit) && u_isIDPart(unit);
+            } else {
+                accepted = u_isIDPart(unit);
+            }
+            if (!accepted) {
+                break;
+            }
+        }
+
+        UnicodeString name; // stays empty to signal failure
+        if (end != start) {
+            pos.setIndex(end);
+            text.extractBetween(start, end, name);
+        }
+        return name;
+    }
+};
+
+/**
+ * Check that UnicodeSet patterns containing variable references are
+ * parsed correctly when a SymbolTable is supplied.  For each test case
+ * the variables are loaded into a TokenSymbolTable, the input pattern
+ * is parsed with it, and the resulting set is compared with the set
+ * built from the expected pattern.
+ */
+void UnicodeSetTest::TestSymbolTable() {
+    // Multiple test cases can be set up here.  Each test case
+    // is terminated by null:
+    // var, value, var, value,..., input pat., exp. output pat., null
+    const char* DATA[] = {
+        "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL,
+        "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL,
+        "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL,
+        NULL
+    };
+
+    for (int32_t i=0; DATA[i]!=NULL; ++i) {
+        // Locate the NULL that terminates this test case.  Every path
+        // out of this iteration leaves i on that NULL, so the loop's
+        // ++i always lands on the first entry of the next case, even
+        // when the case is abandoned early.
+        int32_t end = i;
+        while (DATA[end] != NULL) {
+            ++end;
+        }
+
+        UErrorCode ec = U_ZERO_ERROR;
+        TokenSymbolTable sym(ec);
+        if (U_FAILURE(ec)) {
+            errln("FAIL: couldn't construct TokenSymbolTable");
+            i = end;
+            continue;
+        }
+
+        // Set up variables: everything before the final two entries
+        // of the case is a (name, value) pair.  Stop at the first
+        // failure; retrying with a failed ec could never succeed and
+        // would loop forever.
+        while (i + 2 < end) {
+            sym.add(DATA[i], DATA[i+1], ec);
+            if (U_FAILURE(ec)) {
+                break;
+            }
+            i += 2;
+        }
+        if (U_FAILURE(ec)) {
+            errln("FAIL: couldn't add to TokenSymbolTable");
+            i = end;
+            continue;
+        }
+
+        // Input pattern and expected output pattern
+        UnicodeString inpat = DATA[i], exppat = DATA[i+1];
+        i = end;
+
+        ParsePosition pos(0);
+        UnicodeSet us(inpat, pos, sym, ec);
+        if (U_FAILURE(ec)) {
+            errln("FAIL: couldn't construct UnicodeSet");
+            continue;
+        }
+
+        // results
+        if (pos.getIndex() != inpat.length()) {
+            errln((UnicodeString)"Failed to read to end of string \""
+                  + inpat + "\": read to "
+                  + pos.getIndex() + ", length is "
+                  + inpat.length());
+        }
+
+        UnicodeSet us2(exppat, ec);
+        if (U_FAILURE(ec)) {
+            errln("FAIL: couldn't construct expected UnicodeSet");
+            continue;
+        }
+
+        UnicodeString a, b;
+        if (us != us2) {
+            errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) +
+                  ", expected " + us2.toPattern(b, TRUE));
+        } else {
+            logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE));
+        }
+    }
+}
+
 void UnicodeSetTest::TestExhaustive() {
    // exhaustive tests. Simulate UnicodeSets with integers.
    // That gives us very solid tests (except for large memory tests).
--- a/icu4c/source/test/intltest/usettest.h
+++ b/icu4c/source/test/intltest/usettest.h
@ -70,6 +70,8 @@ private:

    void TestInvalidCodePoint(void);

+    void TestSymbolTable(void);
+
 private:

    UBool toPatternAux(UChar32 start, UChar32 end);