ICU-86 initial implementation of perl-ish character property syntax for UnicodeSet

X-SVN-Rev: 6280
2025-04-13 08:53:20 +00:00 · 2001-10-17 19:19:00 +00:00 · 2001-10-17 19:19:00 +00:00 · a4a66fdc7f
commit a4a66fdc7f
parent e33659c6ef
12 changed files with 1590 additions and 654 deletions
--- a/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java
+++ b/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java,v $
- * $Date: 2001/10/10 20:23:27 $
- * $Revision: 1.52 $
+ * $Date: 2001/10/17 19:19:00 $
+ * $Revision: 1.53 $
 *
 *****************************************************************************************
 */
@ -1318,7 +1318,7 @@ public class TransliteratorTest extends TestFmwk {
        public TestFact(String theID) {
            id = theID;
        }
-        public Transliterator getInstance() {
+        public Transliterator getInstance(String ignoredID) {
            return new NameableNullTrans(id);
        }
    };
@ -1533,6 +1533,15 @@ public class TransliteratorTest extends TestFmwk {
        }
    }

+    /**
+     * Test new property set syntax: Perl-like \p{...} property sets
+     * used directly in transliterator rules, and the '.' (dot) set.
+     */
+    public void TestPropertySet() {
+        // 'a' becomes 'A'; the upper case letters D, E, F each become
+        // 'x'; the expected output implies \p{ALL} matches the
+        // remaining characters 'b' and 'c', which each become 'y'.
+        expect("a>A; \\p{Lu}>x; \\p{ALL}>y;", "abcDEF", "Ayyxxx");
+        // Per the expected output, ".+" does not match across \n or
+        // \r, so each line of the input is bracketed separately.
+        expect("(.+)>'[' $1 ']';", " a stitch \n in time \r saves 9",
+               "[ a stitch ]\n[ in time ]\r[ saves 9]");
+    }
+
    //======================================================================
    // icu4j ONLY
    // These tests are not mirrored (yet) in icu4c at
@ -1551,6 +1560,10 @@ public class TransliteratorTest extends TestFmwk {
        }
    }

+    /**
+     * Checks a rule whose key is a quoted '-' followed by 'h' and an
+     * escaped U+0323: the input "-h" + U+0323 is expected to be
+     * replaced by "a".
+     */
+    public void TestDebugIndic() {
+        expect("'-'h\\u0323>a;", "-h\u0323", "a");
+    }
+
    //======================================================================
    // Ram's tests
    //======================================================================
--- a/icu4j/src/com/ibm/icu/dev/test/translit/UnicodeSetTest.java
+++ b/icu4j/src/com/ibm/icu/dev/test/translit/UnicodeSetTest.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/UnicodeSetTest.java,v $ 
- * $Date: 2001/10/10 21:35:33 $ 
- * $Revision: 1.13 $
+ * $Date: 2001/10/17 19:17:59 $ 
+ * $Revision: 1.14 $
 *
 *****************************************************************************************
 */
@ -52,7 +52,7 @@ public class UnicodeSetTest extends TestFmwk {
        // not used int TOP = 0x200; // Don't need to go over the whole range:
        set = new UnicodeSet("[:L:]");
        for (int i=0; i<0x200; ++i) {
-            boolean l = Character.isLetter((char)i);
+            boolean l = UCharacter.isLetter(i);
            if (l != set.contains((char)i)) {
                errln("FAIL: L contains " + (char)i + " = " + 
                      set.contains((char)i));
@ -62,7 +62,7 @@ public class UnicodeSetTest extends TestFmwk {

        set = new UnicodeSet("[:Lu:]");
        for (int i=0; i<0x200; ++i) {
-            boolean lu = (Character.getType((char)i) == Character.UPPERCASE_LETTER);
+            boolean lu = (UCharacter.getType(i) == UCharacterCategory.UPPERCASE_LETTER);
            if (lu != set.contains((char)i)) {
                errln("FAIL: Lu contains " + (char)i + " = " + 
                      set.contains((char)i));
@ -249,11 +249,13 @@ public class UnicodeSetTest extends TestFmwk {
    /**
     * Test the [:Latin:] syntax.
     */
-    public void TestScriptSet() {
+    public void TestPropertySet() {
        UnicodeSet set = new UnicodeSet("[:Latin:]");
        expectContainment(set, "aA", "\u0391\u03B1");
-        set = new UnicodeSet("[:Greek:]");
+        set = new UnicodeSet("[\\p{Greek}]");
        expectContainment(set, "\u0391\u03B1", "aA");
+        set = new UnicodeSet("\\P{ GENERAL Category = upper case letter }");
+        expectContainment(set, "abc", "ABC");
    }

    /**
@ -453,7 +455,7 @@ public class UnicodeSetTest extends TestFmwk {
                }
            }
            if (bad.length() > 0) {
-                logln(Utility.escape("Fail: set " + set + " does not contain " + bad +
+                errln(Utility.escape("FAIL: set " + set + " does not contain " + bad +
                      ", expected containment of " + charsIn));
            } else {
                logln(Utility.escape("Ok: set " + set + " contains " + charsIn));
@ -468,7 +470,7 @@ public class UnicodeSetTest extends TestFmwk {
                }
            }
            if (bad.length() > 0) {
-                logln(Utility.escape("Fail: set " + set + " contains " + bad +
+                errln(Utility.escape("FAIL: set " + set + " contains " + bad +
                      ", expected non-containment of " + charsOut));
            } else {
                logln(Utility.escape("Ok: set " + set + " does not contain " + charsOut));
--- a/icu4j/src/com/ibm/icu/text/Quantifier.java
+++ b/icu4j/src/com/ibm/icu/text/Quantifier.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/Quantifier.java,v $ 
- * $Date: 2001/10/04 18:24:15 $ 
- * $Revision: 1.1 $
+ * $Date: 2001/10/17 19:17:06 $ 
+ * $Revision: 1.2 $
 *
 *****************************************************************************************
 */
@ -45,9 +45,15 @@ class Quantifier implements UnicodeMatcher {
        int start = offset[0];
        int count = 0;
        while (count < maxCount) {
+            int pos = offset[0];
            int m = matcher.matches(text, offset, limit, incremental);
            if (m == U_MATCH) {
                ++count;
+                if (pos == offset[0]) {
+                    // If offset has not moved we have a zero-width match.
+                    // Don't keep matching it infinitely.
+                    break;
+                }
            } else if (incremental && m == U_PARTIAL_MATCH) {
                return U_PARTIAL_MATCH;
            } else {
--- a/icu4j/src/com/ibm/icu/text/TransliteratorParser.java
+++ b/icu4j/src/com/ibm/icu/text/TransliteratorParser.java
@ -1,3 +1,13 @@
+/*
+**********************************************************************
+*   Copyright (c) 2001, International Business Machines
+*   Corporation and others.  All Rights Reserved.
+**********************************************************************
+* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/TransliteratorParser.java,v $
+* $Date: 2001/10/17 19:17:06 $
+* $Revision: 1.4 $
+**********************************************************************
+*/
 package com.ibm.text;

 import com.ibm.text.resources.ResourceReader;
@ -85,6 +95,13 @@ class TransliteratorParser {
     */
    private String undefinedVariableName;

+    /**
+     * The stand-in character for the 'dot' set, represented by '.' in
+     * patterns.  This is allocated the first time it is needed, and
+     * reused thereafter.
+     */
+    private int dotStandIn = -1;
+
    //----------------------------------------------------------------------
    // Constants
    //----------------------------------------------------------------------
@ -109,8 +126,6 @@ class TransliteratorParser {

    private static final char CONTEXT_ANTE        = '{'; // ante{key
    private static final char CONTEXT_POST        = '}'; // key}post
-    private static final char SET_OPEN            = '[';
-    private static final char SET_CLOSE           = ']';
    private static final char CURSOR_POS          = '|';
    private static final char CURSOR_OFFSET       = '@';
    private static final char ANCHOR_START        = '^';
@ -119,6 +134,9 @@ class TransliteratorParser {
    private static final char ONE_OR_MORE         = '+';
    private static final char ZERO_OR_ONE         = '?';

+    private static final char DOT                 = '.';
+    private static final String DOT_SET           = "[^[:Zp:][:Zl:]\r\n$]";
+
    // By definition, the ANCHOR_END special character is a
    // trailing SymbolTable.SYMBOL_REF character.
    // private static final char ANCHOR_END       = '$';
@ -541,6 +559,15 @@ class TransliteratorParser {
                    // Text after a presumed end anchor is a syntax err
                    syntaxError("Malformed variable reference", rule, start);
                }
+                if (UnicodeSet.resemblesPattern(rule, pos-1)) {
+                    if (pp == null) {
+                        pp = new ParsePosition(0);
+                    }
+                    pp.setIndex(pos-1); // Backup to opening '['
+                    buf.append(parser.parseSet(rule, pp));
+                    pos = pp.getIndex();                    
+                    continue;
+                }
                // Handle escapes
                if (c == ESCAPE) {
                    if (pos == limit) {
@ -682,14 +709,6 @@ class TransliteratorParser {
                    }
                    post = buf.length();
                    break;
-                case SET_OPEN:
-                    if (pp == null) {
-                        pp = new ParsePosition(0);
-                    }
-                    pp.setIndex(pos-1); // Backup to opening '['
-                    buf.append(parser.parseSet(rule, pp));
-                    pos = pp.getIndex();
-                    break;
                case CURSOR_POS:
                    if (cursor >= 0) {
                        syntaxError("Multiple cursors", rule, start);
@ -718,6 +737,9 @@ class TransliteratorParser {
                        }
                    }
                    break;
+                case DOT:
+                    buf.append(parser.getDotStandIn());
+                    break;
                case KLEENE_STAR:
                case ONE_OR_MORE:
                case ZERO_OR_ONE:
@ -783,7 +805,6 @@ class TransliteratorParser {
                        buf.append(parser.generateStandInFor(m));
                    }
                    break;
-                // case SET_CLOSE:
                default:
                    // Disallow unquoted characters other than [0-9A-Za-z]
                    // in the printable ASCII range.  These characters are
@ -1357,6 +1378,17 @@ class TransliteratorParser {
        return variableNext++;
    }

+    /**
+     * Return the stand-in character that represents the 'dot' set
+     * (DOT_SET).  The stand-in is generated on the first call and the
+     * same character is handed back on every later call.
+     */
+    char getDotStandIn() {
+        if (dotStandIn != -1) {
+            // Already allocated; reuse it
+            return (char) dotStandIn;
+        }
+        dotStandIn = generateStandInFor(new UnicodeSet(DOT_SET));
+        return (char) dotStandIn;
+    }
+    
    /**
     * Append the value of the given variable name to the given
     * StringBuffer.
--- a/icu4j/src/com/ibm/icu/text/UnicodePropertySet.java
+++ b/icu4j/src/com/ibm/icu/text/UnicodePropertySet.java
@ -0,0 +1,590 @@
+/*
+**********************************************************************
+*   Copyright (c) 2001, International Business Machines
+*   Corporation and others.  All Rights Reserved.
+**********************************************************************
+* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/Attic/UnicodePropertySet.java,v $
+* $Date: 2001/10/17 19:17:06 $
+* $Revision: 1.1 $
+**********************************************************************
+*/
+package com.ibm.text;
+
+import java.text.*;
+import java.util.*;
+import com.ibm.util.Utility;
+
+/**
+ * INTERNAL CLASS implementing the UnicodeSet properties as outlined
+ * at:
+ *
+ * http://oss.software.ibm.com/cvs/icu/~checkout~/icuhtml/design/unicodeset_properties.html
+ *
+ * Recognized syntax:
+ *
+ * [:foo:] [:^foo:] - white space not allowed within "[:" or ":]"
+ * \p{foo} \P{foo}  - white space not allowed within "\p" or "\P"
+ *
+ * Other than the above restrictions, white space is ignored.  Case
+ * is ignored except in "\p" and "\P".
+ *
+ * This class cannot be instantiated.  It has a public static method,
+ * createFromPattern(), which takes a pattern to be parsed and returns
+ * a new UnicodeSet.  Another public static method,
+ * resemblesPattern(), returns true if a given pattern string appears
+ * to be a property set pattern, and therefore should be passed in to
+ * createFromPattern().
+ *
+ * NOTE: Current implementation is incomplete.  The following list
+ * indicates which properties are supported.
+ *
+ *    + GeneralCategory
+ *      CombiningClass
+ *      BidiClass
+ *      DecompositionType
+ *    + NumericValue
+ *      NumericType
+ *      EastAsianWidth
+ *      LineBreak
+ *      JoiningType
+ *    + Script
+ *
+ * '+' indicates a supported property.
+ *
+ * @author Alan Liu
+ * @version $RCSfile: UnicodePropertySet.java,v $ $Revision: 1.1 $ $Date: 2001/10/17 19:17:06 $
+ */
+class UnicodePropertySet {
+
+    private static final Hashtable NAME_MAP = new Hashtable();
+
+    private static final Hashtable CATEGORY_MAP = new Hashtable();
+
+    /**
+     * A cache mapping character category integers, as returned by
+     * UCharacter.getType(), to sets.  Entries are initially
+     * null and are created on demand.
+     */
+    private static final UnicodeSet[] CATEGORY_CACHE =
+        new UnicodeSet[UCharacterCategory.CHAR_CATEGORY_COUNT];
+
+    /**
+     * A cache mapping script integers, as defined by
+     * UScript, to sets.  Entries are initially
+     * null and are created on demand.
+     */
+    private static final UnicodeSet[] SCRIPT_CACHE =
+        new UnicodeSet[UScript.CODE_LIMIT];
+
+    // Special value codes
+    private static final int ANY = -1; // general category: all code points
+
+    //----------------------------------------------------------------
+    // Public API
+    //----------------------------------------------------------------
+
+    /**
+     * Return true if the given position, in the given pattern, appears
+     * to be the start of a property set pattern [:foo:], \p{foo}, or
+     * \P{foo}.
+     */
+    public static boolean resemblesPattern(String pattern, int pos) {
+        // Patterns are at least 5 characters long
+        if ((pos+5) > pattern.length()) {
+            return false;
+        }
+
+        // Look for an opening [:, [:^, \p, or \P
+        return pattern.regionMatches(pos, "[:", 0, 2) ||
+            pattern.regionMatches(true, pos, "\\p", 0, 2);
+    }
+
+    /**
+     * Create a UnicodeSet by parsing the given pattern at the given
+     * parse position.
+     *
+     * @param pattern the pattern string
+     * @param ppos on entry, the position at which to begin parsing.
+     * This should be one of the locations marked '^':
+     *
+     *   [:blah:]     \p{blah}     \P{blah}
+     *   ^       %    ^       %    ^       %
+     *
+     * On return, the position after the last character parsed, that is,
+     * the locations marked '%'.  If the parse fails, ppos is returned
+     * unchanged.
+     * @return a newly-constructed UnicodeSet object, or null upon
+     * failure.  Failure covers a missing open or close delimiter, an
+     * unrecognized type name, and an unrecognized value name.
+     */
+    public static UnicodeSet createFromPattern(String pattern, ParsePosition ppos) {
+
+        UnicodeSet set = null;
+
+        int pos = ppos.getIndex();
+
+        // Minimum length is 5 characters, e.g. \p{L}
+        if ((pos+5) > pattern.length()) {
+            return null;
+        }
+
+        boolean posix = false; // true for [:pat:], false for \p{pat} \P{pat}
+        boolean invert = false;
+
+        // Look for an opening [:, [:^, \p, or \P
+        if (pattern.regionMatches(pos, "[:", 0, 2)) {
+            posix = true;
+            pos = skipWhitespace(pattern, pos+2);
+            if (pos < pattern.length() && pattern.charAt(pos) == '^') {
+                ++pos;
+                invert = true;
+            }
+        } else if (pattern.regionMatches(true, pos, "\\p", 0, 2)) {
+            // The match above ignores case; an upper case 'P' inverts
+            invert = (pattern.charAt(pos+1) == 'P');
+            pos = skipWhitespace(pattern, pos+2);
+            if (pos == pattern.length() || pattern.charAt(pos++) != '{') {
+                // Syntax error; "\p" or "\P" not followed by "{"
+                return null;
+            }
+        } else {
+            // Open delimiter not seen
+            return null;
+        }
+
+        // Look for the matching close delimiter, either :] or }
+        int close = pattern.indexOf(posix ? ":]" : "}", pos);
+        if (close < 0) {
+            // Syntax error; close delimiter missing
+            return null;
+        }
+
+        // Look for an '=' sign.  If this is present, we will parse a
+        // medium \p{gc=Cf} or long \p{GeneralCategory=Format}
+        // pattern.
+        int equals = pattern.indexOf('=', pos);
+        if (equals >= 0 && equals < close) {
+            // Equals seen; parse medium/long pattern
+            String typeName = munge(pattern, pos, equals);
+            String valueName = munge(pattern, equals+1, close);
+            SetFactory factory;
+            factory = (SetFactory) NAME_MAP.get(typeName);
+            if (factory == null) {
+                // Syntax error; type name not recognized
+                return null;
+            }
+            set = factory.create(valueName);
+        } else {
+            // No equals seen; parse short format \p{Cf}
+            String shortName = munge(pattern, pos, close);
+
+            // First try general category
+            set = createCategorySet(shortName);
+
+            // If this fails, try script
+            if (set == null) {
+                set = createScriptSet(shortName);
+            }
+        }
+
+        // The value name was not recognized.  Fail here, before the
+        // set is dereferenced and before ppos is touched, so that the
+        // caller sees null with ppos unchanged as documented.
+        if (set == null) {
+            return null;
+        }
+
+        if (invert) {
+            set.complement();
+        }
+
+        // Move to the limit position after the close delimiter
+        ppos.setIndex(close + (posix ? 2 : 1));
+
+        return set;
+    }
+
+    //----------------------------------------------------------------
+    // Property set factory classes
+    // NOTE: This will change/go away when we implement UCharacter
+    // based property retrieval.
+    //----------------------------------------------------------------
+
+    /**
+     * Builds the set for one value of a property type.  One factory
+     * is registered in NAME_MAP per supported type name and is used
+     * by createFromPattern() for patterns of the form type=value.
+     */
+    static interface SetFactory {
+
+        /**
+         * Create the set for the given value.
+         * @param valueName the value name as produced by munge(),
+         * that is, upper case with white space removed
+         * @return the set; the category and script factories return
+         * null when the value name is not recognized
+         */
+        UnicodeSet create(String valueName);
+    }
+
+    static class NumericValueFactory implements SetFactory {
+        NumericValueFactory() {}
+        public UnicodeSet create(String valueName) {
+            double value = Double.parseDouble(valueName);
+            final int ivalue = (int) value;
+            if (ivalue != value || ivalue < 0) {
+                // UCharacter doesn't support negative or non-integral
+                // values, so just return an empty set
+                return new UnicodeSet();
+            }
+            return createSetFromFilter(new Filter() {
+                public boolean contains(int cp) {
+                    return UCharacter.getUnicodeNumericValue(cp) == ivalue;
+                }
+            });
+        }
+    }
+
+    //----------------------------------------------------------------
+    // Property set factory static methods
+    // NOTE: This will change/go away when we implement UCharacter
+    // based property retrieval.
+    //----------------------------------------------------------------
+
+    /**
+     * Build the set for a general category value name.  The name is
+     * looked up in CATEGORY_MAP, which yields either the special code
+     * ANY or a bit mask with bit (1 << category) set for each
+     * category that belongs to the value.
+     * @param valueName a pre-munged general category value name
+     * @return a new set, or null if the name is not in CATEGORY_MAP
+     */
+    private static UnicodeSet createCategorySet(String valueName) {
+        Object entry = CATEGORY_MAP.get(valueName);
+        if (entry == null) {
+            return null;
+        }
+        int mask = ((Integer) entry).intValue();
+
+        UnicodeSet result = new UnicodeSet();
+        if (mask == ANY) {
+            // Complement of the empty set
+            result.complement();
+            return result;
+        }
+        for (int cat=0; cat<UCharacterCategory.CHAR_CATEGORY_COUNT; ++cat) {
+            int bit = 1 << cat;
+            if ((mask & bit) == 0) {
+                continue;
+            }
+            // The cached set is only read here, never modified
+            result.addAll(UnicodePropertySet.getCategorySet(cat));
+        }
+        return result;
+    }
+
+    /**
+     * Given a script value name, create a corresponding set and
+     * return it, or return null if the name is invalid.
+     * @param valueName a pre-munged script value name
+     */
+    private static UnicodeSet createScriptSet(String valueName) {
+        int script = UScript.getCode(valueName);
+        if (script == UScript.INVALID_CODE) {
+            // Syntax error; unknown short name
+            return null;
+        }
+        return new UnicodeSet(getScriptSet(script));
+    }
+
+    //----------------------------------------------------------------
+    // Utility methods
+    //----------------------------------------------------------------
+
+    /**
+     * Returns the set of code points whose UCharacter.getType() is
+     * the given category.  The set is built on first request, stored
+     * in CATEGORY_CACHE, and the same object is returned afterwards.
+     *
+     * Callers MUST NOT MODIFY the returned set.
+     */
+    private static UnicodeSet getCategorySet(final int cat) {
+        UnicodeSet cached = CATEGORY_CACHE[cat];
+        if (cached == null) {
+            Filter inCategory = new Filter() {
+                public boolean contains(int cp) {
+                    return UCharacter.getType(cp) == cat;
+                }
+            };
+            cached = createSetFromFilter(inCategory);
+            CATEGORY_CACHE[cat] = cached;
+        }
+        return cached;
+    }
+
+    /**
+     * Returns the set of code points whose UScript.getScript() is the
+     * given script.  The set is built on first request, stored in
+     * SCRIPT_CACHE, and the same object is returned afterwards.
+     *
+     * Callers MUST NOT MODIFY the returned set.
+     */
+    private static UnicodeSet getScriptSet(final int script) {
+        UnicodeSet cached = SCRIPT_CACHE[script];
+        if (cached == null) {
+            Filter inScript = new Filter() {
+                public boolean contains(int cp) {
+                    return UScript.getScript(cp) == script;
+                }
+            };
+            cached = createSetFromFilter(inScript);
+            SCRIPT_CACHE[script] = cached;
+        }
+        return cached;
+    }
+
+    /**
+     * Normalize the substring str[start, limit) for use as a lookup
+     * key: white space is dropped and every other code point is
+     * converted to upper case.  So "General Category " becomes
+     * "GENERALCATEGORY".  All type and value strings are munged before
+     * lookup, and all type and value keys are stored pre-munged.
+     */
+    private static String munge(String str, int start, int limit) {
+        StringBuffer key = new StringBuffer();
+        int i = start;
+        while (i < limit) {
+            int cp = UTF16.charAt(str, i);
+            // Step by code point, not by char
+            i += UTF16.getCharCount(cp);
+            if (UCharacter.isWhitespace(cp)) {
+                continue;
+            }
+            UTF16.append(key, UCharacter.toUpperCase(cp));
+        }
+        return key.toString();
+    }
+
+    /**
+     * Advance past any run of white space that starts at pos.
+     * @return the index of the first non-white-space character at or
+     * after pos, or str.length() if there is none
+     */
+    private static int skipWhitespace(String str, int pos) {
+        int i = pos;
+        while (i < str.length()) {
+            int cp = UTF16.charAt(str, i);
+            if (UCharacter.isWhitespace(cp)) {
+                i += UTF16.getCharCount(cp);
+            } else {
+                return i;
+            }
+        }
+        return i;
+    }
+
+    //----------------------------------------------------------------
+    // Generic filter-based scanning code
+    //
+    // NOTE: In general, we don't want to do this!  This is a temporary
+    // implementation until we have time for something that examines
+    // the underlying UCharacter data structures in an intelligent
+    // way.  Iterating over all code points is dumb.  What we want to
+    // do, for instance, is iterate over internally-stored ranges
+    // of characters that have a given property.
+    //----------------------------------------------------------------
+
+    /**
+     * A predicate over code points, used by createSetFromFilter() to
+     * decide which code points belong in the set being built.
+     */
+    static interface Filter {
+        /**
+         * @return true if the given code point belongs in the set
+         */
+        boolean contains(int codePoint);
+    }
+
+    /**
+     * Build a set containing every code point from
+     * UnicodeSet.MIN_VALUE through 0xFFFF that the filter accepts.
+     * Consecutive accepted code points are added as a single range.
+     */
+    static UnicodeSet createSetFromFilter(Filter filter) {
+        // Walk through the code points, tracking the start of the
+        // current run for which filter.contains(c) is true.  When a
+        // run ends, add it to the set as one range.
+        UnicodeSet result = new UnicodeSet();
+        int runStart = -1; // -1 means no run is open
+
+        // TODO Extend this up to UnicodeSet.MAX_VALUE when we have
+        // better performance; i.e., when this code can get moved into
+        // the UCharacter class and not have to iterate over code
+        // points.  Right now it's way too slow to iterate to 10FFFF.
+
+        for (int cp=UnicodeSet.MIN_VALUE; cp<=0xFFFF; ++cp) {
+            if (filter.contains(cp)) {
+                if (runStart < 0) {
+                    runStart = cp;
+                }
+            } else if (runStart >= 0) {
+                result.add(runStart, cp-1);
+                runStart = -1;
+            }
+        }
+        if (runStart >= 0) {
+            // The last run extends to the end of the scanned range
+            result.add(runStart, 0xFFFF);
+        }
+        return result;
+    }
+
+    //----------------------------------------------------------------
+    // Type and value name maps
+    //----------------------------------------------------------------
+
+    /**
+     * Register a property type in NAME_MAP under both its short and
+     * its long name.  Both names map to the same factory.
+     * @throws InternalError if either name is already registered
+     */
+    private static void addType(String shortName, String longName,
+                                SetFactory factory) {
+        String[] names = { shortName, longName };
+
+        // DEBUGGING CODE: DISABLE FOR PRODUCTION BUILD
+        if (true) {
+            for (int i=0; i<names.length; ++i) {
+                if (NAME_MAP.containsKey(names[i])) {
+                    throw new InternalError("Duplicate name " + names[i]);
+                }
+            }
+        }
+
+        for (int i=0; i<names.length; ++i) {
+            NAME_MAP.put(names[i], factory);
+        }
+    }
+
+    /**
+     * Register a value in the given map under its short name and,
+     * when one is supplied, its long name.  Both names map to the
+     * same Integer.
+     * @param longName the long name, or null if there is none
+     * @throws InternalError if either name is already in the map
+     */
+    private static void addValue(Hashtable map,
+                                 String shortName, String longName,
+                                 int value) {
+        boolean hasLongName = (longName != null);
+
+        // DEBUGGING CODE: DISABLE FOR PRODUCTION BUILD
+        if (true) {
+            if (map.containsKey(shortName)) {
+                throw new InternalError("Duplicate name " + shortName);
+            }
+            if (hasLongName && map.containsKey(longName)) {
+                throw new InternalError("Duplicate name " + longName);
+            }
+        }
+
+        Integer boxed = new Integer(value);
+        map.put(shortName, boxed);
+        if (hasLongName) {
+            map.put(longName, boxed);
+        }
+    }
+
+    // Populates NAME_MAP (property type name -> SetFactory) and
+    // CATEGORY_MAP (general category value name -> Integer code).
+    // Each category code is a bit mask with bit (1 << category) set
+    // for every UCharacterCategory constant that belongs to the value;
+    // createCategorySet() unions the per-category sets for the bits
+    // that are set.  The one exception is "ANY", which is stored as
+    // the special code ANY.
+    static {
+        // NOTE:  We munge all search keys to have no whitespace
+        // and upper case.  As such, all stored keys should have
+        // this format.
+
+        // Load the map with type data
+
+        addType("GC", "GENERALCATEGORY", new SetFactory() {
+            public UnicodeSet create(String valueName) {
+                return createCategorySet(valueName);
+            }
+        });
+
+        //addType("CC", "COMBININGCLASS", COMBINING_CLASS);
+        //addType("BC", "BIDICLASS", BIDI_CLASS);
+        //addType("DT", "DECOMPOSITIONTYPE", DECOMPOSITION_TYPE);
+
+        addType("NV", "NUMERICVALUE", new NumericValueFactory());
+
+        //addType("NT", "NUMERICTYPE", NUMERIC_TYPE);
+        //addType("EA", "EASTASIANWIDTH", EAST_ASIAN_WIDTH);
+        //addType("LB", "LINEBREAK", LINE_BREAK);
+        //addType("JT", "JOININGTYPE", JOINING_TYPE);
+
+        // "SC" here is a type name in NAME_MAP; the "SC" registered
+        // further down is a category value in CATEGORY_MAP, a
+        // different table, so the two do not collide.
+        addType("SC", "SCRIPT", new SetFactory() {
+            public UnicodeSet create(String valueName) {
+                return createScriptSet(valueName);
+            }
+        });
+
+        // Load the map with value data
+
+        // General Category
+
+        addValue(CATEGORY_MAP, "ANY", null, ANY); // special case
+
+        // Single-letter names are the union of the two-letter
+        // categories that follow them.
+        addValue(CATEGORY_MAP, "C", "OTHER",
+                 (1 << UCharacterCategory.CONTROL) |
+                 (1 << UCharacterCategory.FORMAT) |
+                 (1 << UCharacterCategory.GENERAL_OTHER_TYPES) |
+                 (1 << UCharacterCategory.PRIVATE_USE) |
+                 (1 << UCharacterCategory.SURROGATE));
+
+        addValue(CATEGORY_MAP, "CC", "CONTROL",
+                 1 << UCharacterCategory.CONTROL);
+        addValue(CATEGORY_MAP, "CF", "FORMAT",
+                 1 << UCharacterCategory.FORMAT);
+        addValue(CATEGORY_MAP, "CN", "UNASSIGNED",
+                 1 << UCharacterCategory.GENERAL_OTHER_TYPES);
+        addValue(CATEGORY_MAP, "CO", "PRIVATEUSE",
+                 1 << UCharacterCategory.PRIVATE_USE);
+        addValue(CATEGORY_MAP, "CS", "SURROGATE",
+                 1 << UCharacterCategory.SURROGATE);
+
+        addValue(CATEGORY_MAP, "L", "LETTER",
+                 (1 << UCharacterCategory.LOWERCASE_LETTER) |
+                 (1 << UCharacterCategory.MODIFIER_LETTER) |
+                 (1 << UCharacterCategory.OTHER_LETTER) |
+                 (1 << UCharacterCategory.TITLECASE_LETTER) |
+                 (1 << UCharacterCategory.UPPERCASE_LETTER));
+
+        addValue(CATEGORY_MAP, "LL", "LOWERCASELETTER",
+                 1 << UCharacterCategory.LOWERCASE_LETTER);
+        addValue(CATEGORY_MAP, "LM", "MODIFIERLETTER",
+                 1 << UCharacterCategory.MODIFIER_LETTER);
+        addValue(CATEGORY_MAP, "LO", "OTHERLETTER",
+                 1 << UCharacterCategory.OTHER_LETTER);
+        addValue(CATEGORY_MAP, "LT", "TITLECASELETTER",
+                 1 << UCharacterCategory.TITLECASE_LETTER);
+        addValue(CATEGORY_MAP, "LU", "UPPERCASELETTER",
+                 1 << UCharacterCategory.UPPERCASE_LETTER);
+
+        addValue(CATEGORY_MAP, "M", "MARK",
+                 (1 << UCharacterCategory.NON_SPACING_MARK) |
+                 (1 << UCharacterCategory.COMBINING_SPACING_MARK) |
+                 (1 << UCharacterCategory.ENCLOSING_MARK));
+
+        addValue(CATEGORY_MAP, "MN", "NONSPACINGMARK",
+                 1 << UCharacterCategory.NON_SPACING_MARK);
+        addValue(CATEGORY_MAP, "MC", "SPACINGMARK",
+                 1 << UCharacterCategory.COMBINING_SPACING_MARK);
+        addValue(CATEGORY_MAP, "ME", "ENCLOSINGMARK",
+                 1 << UCharacterCategory.ENCLOSING_MARK);
+
+        addValue(CATEGORY_MAP, "N", "NUMBER",
+                 (1 << UCharacterCategory.DECIMAL_DIGIT_NUMBER) |
+                 (1 << UCharacterCategory.LETTER_NUMBER) |
+                 (1 << UCharacterCategory.OTHER_NUMBER));
+
+        addValue(CATEGORY_MAP, "ND", "DECIMALNUMBER",
+                 1 << UCharacterCategory.DECIMAL_DIGIT_NUMBER);
+        addValue(CATEGORY_MAP, "NL", "LETTERNUMBER",
+                 1 << UCharacterCategory.LETTER_NUMBER);
+        addValue(CATEGORY_MAP, "NO", "OTHERNUMBER",
+                 1 << UCharacterCategory.OTHER_NUMBER);
+
+        addValue(CATEGORY_MAP, "P", "PUNCTUATION",
+                 (1 << UCharacterCategory.CONNECTOR_PUNCTUATION) |
+                 (1 << UCharacterCategory.DASH_PUNCTUATION) |
+                 (1 << UCharacterCategory.END_PUNCTUATION) |
+                 (1 << UCharacterCategory.FINAL_PUNCTUATION) |
+                 (1 << UCharacterCategory.INITIAL_PUNCTUATION) |
+                 (1 << UCharacterCategory.OTHER_PUNCTUATION) |
+                 (1 << UCharacterCategory.START_PUNCTUATION));
+
+        addValue(CATEGORY_MAP, "PC", "CONNECTORPUNCTUATION",
+                 1 << UCharacterCategory.CONNECTOR_PUNCTUATION);
+        addValue(CATEGORY_MAP, "PD", "DASHPUNCTUATION",
+                 1 << UCharacterCategory.DASH_PUNCTUATION);
+        addValue(CATEGORY_MAP, "PE", "ENDPUNCTUATION",
+                 1 << UCharacterCategory.END_PUNCTUATION);
+        addValue(CATEGORY_MAP, "PF", "FINALPUNCTUATION",
+                 1 << UCharacterCategory.FINAL_PUNCTUATION);
+        addValue(CATEGORY_MAP, "PI", "INITIALPUNCTUATION",
+                 1 << UCharacterCategory.INITIAL_PUNCTUATION);
+        addValue(CATEGORY_MAP, "PO", "OTHERPUNCTUATION",
+                 1 << UCharacterCategory.OTHER_PUNCTUATION);
+        addValue(CATEGORY_MAP, "PS", "STARTPUNCTUATION",
+                 1 << UCharacterCategory.START_PUNCTUATION);
+
+        addValue(CATEGORY_MAP, "S", "SYMBOL",
+                 (1 << UCharacterCategory.CURRENCY_SYMBOL) |
+                 (1 << UCharacterCategory.MODIFIER_SYMBOL) |
+                 (1 << UCharacterCategory.MATH_SYMBOL) |
+                 (1 << UCharacterCategory.OTHER_SYMBOL));
+
+        addValue(CATEGORY_MAP, "SC", "CURRENCYSYMBOL",
+                 1 << UCharacterCategory.CURRENCY_SYMBOL);
+        addValue(CATEGORY_MAP, "SK", "MODIFIERSYMBOL",
+                 1 << UCharacterCategory.MODIFIER_SYMBOL);
+        addValue(CATEGORY_MAP, "SM", "MATHSYMBOL",
+                 1 << UCharacterCategory.MATH_SYMBOL);
+        addValue(CATEGORY_MAP, "SO", "OTHERSYMBOL",
+                 1 << UCharacterCategory.OTHER_SYMBOL);
+
+        addValue(CATEGORY_MAP, "Z", "SEPARATOR",
+                 (1 << UCharacterCategory.LINE_SEPARATOR) |
+                 (1 << UCharacterCategory.PARAGRAPH_SEPARATOR) |
+                 (1 << UCharacterCategory.SPACE_SEPARATOR));
+
+        addValue(CATEGORY_MAP, "ZL", "LINESEPARATOR",
+                 1 << UCharacterCategory.LINE_SEPARATOR);
+        addValue(CATEGORY_MAP, "ZP", "PARAGRAPHSEPARATOR",
+                 1 << UCharacterCategory.PARAGRAPH_SEPARATOR);
+        addValue(CATEGORY_MAP, "ZS", "SPACESEPARATOR",
+                 1 << UCharacterCategory.SPACE_SEPARATOR);
+    }
+}
--- a/icu4j/src/com/ibm/icu/text/UnicodeSet.java
+++ b/icu4j/src/com/ibm/icu/text/UnicodeSet.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/UnicodeSet.java,v $
- * $Date: 2001/10/10 21:35:05 $
- * $Revision: 1.39 $
+ * $Date: 2001/10/17 19:17:06 $
+ * $Revision: 1.40 $
 *
 *****************************************************************************************
 */
@ -202,60 +202,26 @@ import com.ibm.util.Utility;
 * starting wih 'L', that is, <code>[[:Lu:][:Ll:][:Lt:][:Lm:][:Lo:]]</code>.
 * </table>
 *
- * <p><b>Character categories.</b>
+ * <p><b>Character properties.</b>
 *
- * Character categories are specified using the POSIX-like syntax
- * '[:Lu:]'.  The complement of a category is specified by inserting
- * '^' after the opening '[:'.  The following category names are
- * recognized.  Actual determination of category data uses
- * <code>Character.getType()</code>, so it reflects the underlying
- * implmementation used by <code>Character</code>.  As of Java 2 and
- * JDK 1.1.8, this is Unicode 2.1.2.
+ * <p>Character properties are specified using the POSIX-like syntax
+ * "[:Lu:]" or the Perl-like syntax "\p{Lu}".  The complement of a
+ * category is specified as "[:^Lu:]" or "\P{Lu}".  Actual
+ * determination of category data is accomplished by UCharacter using
+ * the underlying Unicode database.
 *
- * <pre>
- * Normative
- *     Mn = Mark, Non-Spacing
- *     Mc = Mark, Spacing Combining
- *     Me = Mark, Enclosing
+ * <p>For details of the property syntax please see this
+ * <a href="http://oss.software.ibm.com/cvs/icu/~checkout~/icuhtml/design/unicodeset_properties.html">
+ * draft document</a>.
 *
- *     Nd = Number, Decimal Digit
- *     Nl = Number, Letter
- *     No = Number, Other
- *
- *     Zs = Separator, Space
- *     Zl = Separator, Line
- *     Zp = Separator, Paragraph
- *
- *     Cc = Other, Control
- *     Cf = Other, Format
- *     Cs = Other, Surrogate
- *     Co = Other, Private Use
- *     Cn = Other, Not Assigned
- *
- * Informative
- *     Lu = Letter, Uppercase
- *     Ll = Letter, Lowercase
- *     Lt = Letter, Titlecase
- *     Lm = Letter, Modifier
- *     Lo = Letter, Other
- *
- *     Pc = Punctuation, Connector
- *     Pd = Punctuation, Dash
- *     Ps = Punctuation, Open
- *     Pe = Punctuation, Close
- *    *Pi = Punctuation, Initial quote
- *    *Pf = Punctuation, Final quote
- *     Po = Punctuation, Other
- *
- *     Sm = Symbol, Math
- *     Sc = Symbol, Currency
- *     Sk = Symbol, Modifier
- *     So = Symbol, Other
- * </pre>
- * *Unsupported by Java (and hence unsupported by UnicodeSet).
+ * <p><em>Note:</em> Not all properties are currently supported.
+ * Currently, only the general category, script, and numeric value
+ * properties are supported.  Support for other properties will be
+ * added in the future.
 *
 * @author Alan Liu
- * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.39 $ $Date: 2001/10/10 21:35:05 $ */
+ * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.40 $ $Date: 2001/10/17 19:17:06 $
+ */
 public class UnicodeSet extends UnicodeFilter {

    /* Implementation Notes.
@ -313,29 +279,12 @@ public class UnicodeSet extends UnicodeFilter {
     * modified using the non-pattern API, this string will be null,
     * indicating that toPattern() must generate a pattern
     * representation from the inversion list.
-     */ 
+     */
    private String pat = null;

    private static final int START_EXTRA = 16;         // initial storage. Must be >= 0
    private static final int GROW_EXTRA = START_EXTRA; // extra amount for growth. Must be >= 0

-    private static final String CATEGORY_NAMES =
-        //                    1 1 1 1 1 1 1   1 1 2 2 2 2 2 2 2 2 2
-        //0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6   8 9 0 1 2 3 4 5 6 7 8
-        "CnLuLlLtLmLoMnMeMcNdNlNoZsZlZpCcCf--CoCsPdPsPePcPoSmScSkSo";
-
-    private static final int UNSUPPORTED_CATEGORY = 17;
-
-    private static final int CATEGORY_COUNT = 29;
-
-    /**
-     * A cache mapping character category integers, as returned by
-     * Character.getType(), to inversion lists.  Entries are initially
-     * null and are created on demand.
-     */
-    private static final UnicodeSet[] CATEGORY_CACHE =
-        new UnicodeSet[CATEGORY_COUNT];
-
    //----------------------------------------------------------------
    // Public API
    //----------------------------------------------------------------
@ -408,19 +357,27 @@ public class UnicodeSet extends UnicodeFilter {
        applyPattern(pattern, pos, symbols, true);
    }

+    private static final String CATEGORY_NAMES =
+        //                    1 1 1 1 1 1 1   1 1 2 2 2 2 2 2 2 2 2
+        //0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6   8 9 0 1 2 3 4 5 6 7 8
+        "CnLuLlLtLmLoMnMeMcNdNlNoZsZlZpCcCf--CoCsPdPsPePcPoSmScSkSo";
    /**
-     * Constructs a set from the given Unicode character category.
+     * DEPRECATED - Constructs a set from the given Unicode character
+     * category.
     * @param category an integer indicating the character category as
-     * returned by <code>Character.getType()</code>.
+     * returned by <code>java.lang.Character.getType()</code>.  Note
+     * that this is <em>different</em> from the UCharacterCategory
+     * codes.
     * @exception java.lang.IllegalArgumentException if the given
     * category is invalid.
+     * @deprecated this will be removed Dec-31-2001
     */
    public UnicodeSet(int category) {
-        if (category < 0 || category >= CATEGORY_COUNT ||
-            category == UNSUPPORTED_CATEGORY) {
+        // Accept only java.lang.Character.getType() codes.  Code 17 is
+        // the unused slot that CATEGORY_NAMES marks with "--".
+        if (category < 0 || category > java.lang.Character.OTHER_SYMBOL ||
+            category == 17) {
            throw new IllegalArgumentException("Invalid category");
        }
-        set(getCategorySet(category));
+        // The pattern parser requires either a leading '[' or a property
+        // pattern, so the two-letter name must be wrapped as "[:Xx:]".
+        // Passing the bare name (e.g. "Lu") fails with
+        // "Missing opening '['".
+        String name = CATEGORY_NAMES.substring(2*category, 2*category+2);
+        applyPattern("[:" + name + ":]", false);
    }

    /**
@ -489,6 +446,16 @@ public class UnicodeSet extends UnicodeFilter {
        }
    }

+    /**
+     * Return true if the given position, in the given pattern, appears
+     * to be the start of a UnicodeSet pattern.  This is the case when
+     * the character at <code>pos</code> is '[' and at least one more
+     * character follows it, or when
+     * <code>UnicodePropertySet.resemblesPattern()</code> accepts the
+     * same position.
+     * @param pattern the string to examine
+     * @param pos the offset in <code>pattern</code> to test
+     * @return true if a set pattern appears to start at <code>pos</code>
+     */
+    public static boolean resemblesPattern(String pattern, int pos) {
+        return ((pos+1) < pattern.length() &&
+                pattern.charAt(pos) == '[') ||
+            UnicodePropertySet.resemblesPattern(pattern, pos);
+    }
+
    /**
     * Append the <code>toPattern()</code> representation of a
     * character to the given <code>StringBuffer</code>.
@ -509,6 +476,8 @@ public class UnicodeSet extends UnicodeFilter {
        case '^': // COMPLEMENT:
        case '&': // INTERSECTION:
        case '\\': //BACKSLASH:
+        case '{':
+        case '}':
            buf.append('\\');
            break;
        default:
@ -607,28 +576,28 @@ public class UnicodeSet extends UnicodeFilter {
            }
            return result;
        }
-        
+
        return _generatePattern(result, escapeUnprintable);
    }

    /**
     * Generate and append a string representation of this set to result.
     * This does not use this.pat, the cleaned up copy of the string
-     * passed to applyPattern(). 
+     * passed to applyPattern().
     */
    public StringBuffer _generatePattern(StringBuffer result,
                                         boolean escapeUnprintable) {
        result.append('[');

-        // Check against the predefined categories.  We implicitly build
-        // up ALL category sets the first time toPattern() is called.
-        for (int cat=0; cat<CATEGORY_COUNT; ++cat) {
-            if (this.equals(getCategorySet(cat))) {
-                result.append(':');
-                result.append(CATEGORY_NAMES.substring(cat*2, cat*2+2));
-                return result.append(":]");
-            }
-        }
+//      // Check against the predefined categories.  We implicitly build
+//      // up ALL category sets the first time toPattern() is called.
+//      for (int cat=0; cat<CATEGORY_COUNT; ++cat) {
+//          if (this.equals(getCategorySet(cat))) {
+//              result.append(':');
+//              result.append(CATEGORY_NAMES.substring(cat*2, cat*2+2));
+//              return result.append(":]");
+//          }
+//      }

        int count = getRangeCount();

@ -1205,7 +1174,7 @@ public class UnicodeSet extends UnicodeFilter {
        StringBuffer newPat = new StringBuffer("[");
        int nestedPatStart = -1; // see below for usage
        boolean nestedPatDone = false; // see below for usage
-        
+
        boolean invert = false;
        clear();

@ -1231,8 +1200,9 @@ public class UnicodeSet extends UnicodeFilter {
        // mode 1: '[' seen; if next is '^' or ':' then special
        // mode 2: '[' '^'? seen; parse pattern and close with ']'
        // mode 3: '[:' seen; parse category and close with ':]'
+        // mode 4: ']' seen; parse complete
+        // mode 5: Top-level property pattern seen
        int mode = 0;
-        int colonPos = 0; // Expected pos of ':' in '[:'
        int start = pos.getIndex();
        int i = start;
        int limit = pattern.length();
@ -1285,9 +1255,11 @@ public class UnicodeSet extends UnicodeFilter {
            // Parse the opening '[' and optional following '^'
            switch (mode) {
            case 0:
-                if (c == '[') {
+                if (UnicodePropertySet.resemblesPattern(pattern, i-1)) {
+                    mode = 3;
+                    break; // Fall through
+                } else if (c == '[') {
                    mode = 1; // Next look for '^'
-                    colonPos = i; // Expect ':' at next offset
                    continue;
                } else {
                    throw new IllegalArgumentException("Missing opening '['");
@ -1299,17 +1271,6 @@ public class UnicodeSet extends UnicodeFilter {
                    invert = true;
                    newPat.append((char) c);
                    continue; // Back to top to fetch next character
-                case ':':
-                    if (i-1 == colonPos) {
-                        // '[:' cannot have whitespace in it
-                        --i; // Backup to the '['
-                        c = '[';
-                        mode = 3;
-                        // Fall through and parse category using the same
-                        // code used to parse a nested category.  The mode
-                        // will indicate that this is actually top level.
-                    }
-                    break; // Fall through
                case '-':
                    isLiteral = true; // Treat leading '-' as a literal
                    break; // Fall through
@ -1326,12 +1287,47 @@ public class UnicodeSet extends UnicodeFilter {
            // buffer.  Characters in the variable buffer have already
            // benn through escape and variable reference processing.
            if (varValueBuffer == null) {
+                /**
+                 * Handle property set patterns.
+                 */
+                if (UnicodePropertySet.resemblesPattern(pattern, i-1)) {
+                    ParsePosition pp = new ParsePosition(i-1);
+                    nestedSet = UnicodePropertySet.createFromPattern(pattern, pp);
+                    if (nestedSet == null) {
+                        // assert(pp.getIndex() == i-1);
+                        throw new IllegalArgumentException("Invalid property pattern " +
+                                                           pattern.substring(i-1));
+                    }
+                    nestedPatStart = newPat.length();
+                    nestedPatDone = true; // we're going to do it just below
+
+                    // If we have a top-level property pattern, then trim
+                    // off the opening '[' and use the property pattern
+                    // as the entire pattern.
+                    if (mode == 3) {
+                        newPat.deleteCharAt(0);
+                    }
+                    newPat.append(pattern.substring(i-1, pp.getIndex()));
+                    rebuildPattern = true;
+
+                    i = pp.getIndex(); // advance past property pattern
+                    
+                    if (mode == 3) {
+                        // Entire pattern is a category; leave parse
+                        // loop.  This is one of 2 ways we leave this
+                        // loop if the pattern is well-formed.
+                        set(nestedSet);
+                        mode = 5;
+                        break;
+                    }
+                }
+
                /* Handle escapes.  If a character is escaped, then it assumes its
                 * literal value.  This is true for all characters, both special
                 * characters and characters with no special meaning.  We also
                 * interpret '\\uxxxx' Unicode escapes here (as literals).
                 */
-                if (c == '\\') {
+                else if (c == '\\') {
                    int[] offset = new int[] { i };
                    int escaped = Utility.unescapeAt(pattern, offset);
                    if (escaped == -1) {
@ -1373,61 +1369,25 @@ public class UnicodeSet extends UnicodeFilter {
                }

                /* An opening bracket indicates the first bracket of a nested
-                 * subpattern, either a normal pattern or a category pattern.  We
-                 * recognize these here and set nestedSet accordingly.
+                 * subpattern.
                 */
                else if (!isLiteral && c == '[') {
                    // Record position before nested pattern
                    nestedPatStart = newPat.length();

-                    // Handle "[:...:]", representing a character category
-                    if (i < pattern.length() && pattern.charAt(i) == ':') {
-                        ++i;
-                        int j = pattern.indexOf(":]", i);
-                        if (j < 0) {
-                            throw new IllegalArgumentException("Missing \":]\"");
-                        }
-                        String scratch = pattern.substring(i, j);
-                        nestedSet = new UnicodeSet();
-                        nestedSet.applyCategory(scratch);
-                        nestedPatDone = true; // We're going to do it just below
-                        i = j+2; // Advance i past ":]"
-
-                        // Use a rebuilt pattern.  If we are top level,
-                        // then there is already a SET_OPEN in newPat, and
-                        // SET_CLOSE will be appended elsewhere.
-                        if (mode != 3) {
-                            newPat.append('[');
-                        }
-                        newPat.append(':').append(scratch).append(':');
-                        if (mode != 3) {
-                            newPat.append(']');
-                        }
-                        rebuildPattern = true;
-
-                        if (mode == 3) {
-                            // Entire pattern is a category; leave parse
-                            // loop.  This is one of 2 ways we leave this
-                            // loop if the pattern is well-formed.
-                            set(nestedSet);
-                            mode = 4;
-                            break;
-                        }
-                    } else {
-                        // Recurse to get the pairs for this nested set.
-                        // Backup i to '['.
-                        pos.setIndex(--i);
-                        switch (lastOp) {
-                        case '-':
-                        case '&':
-                            newPat.append(lastOp);
-                            break;
-                        }
-                        nestedSet = new UnicodeSet();
-                        nestedSet._applyPattern(pattern, pos, symbols, newPat, ignoreWhitespace);
-                        nestedPatDone = true;
-                        i = pos.getIndex();
+                    // Recurse to get the pairs for this nested set.
+                    // Backup i to '['.
+                    pos.setIndex(--i);
+                    switch (lastOp) {
+                    case '-':
+                    case '&':
+                        newPat.append(lastOp);
+                        break;
                    }
+                    nestedSet = new UnicodeSet();
+                    nestedSet._applyPattern(pattern, pos, symbols, newPat, ignoreWhitespace);
+                    nestedPatDone = true;
+                    i = pos.getIndex();
                }
            }

@ -1487,7 +1447,7 @@ public class UnicodeSet extends UnicodeFilter {
                // loop if the pattern is well-formed.
                if (anchor > 2 || anchor == 1) {
                    throw new IllegalArgumentException("Syntax error near $" + pattern);
-                    
+
                }
                if (anchor == 2) {
                    rebuildPattern = true;
@ -1524,16 +1484,24 @@ public class UnicodeSet extends UnicodeFilter {
            }
        }

-        if (lastChar != NONE) {
+        if (mode < 4) {
+            throw new IllegalArgumentException("Missing ']'");
+        }
+
+        // Treat a trailing '$' as indicating ETHER.  This code is only
+        // executed if symbols == null; otherwise other code parses the
+        // anchor.
+        if (lastChar == SymbolTable.SYMBOL_REF) {
+            rebuildPattern = true;
+            newPat.append(lastChar);
+            add(TransliterationRule.ETHER);
+        }
+        
+        else if (lastChar != NONE) {
            add(lastChar, lastChar);
            _appendToPat(newPat, lastChar, false);
        }

-//      if (mode == 0) {
-//          throw new IllegalArgumentException("Missing '[' in \"" +
-//                                             pattern.substring(start) + '"');
-//      }
-
        // Handle unprocessed stuff preceding the closing ']'
        if (lastOp == '-') {
            // Trailing '-' is treated as literal
@ -1543,7 +1511,9 @@ public class UnicodeSet extends UnicodeFilter {
            throw new IllegalArgumentException("Unquoted trailing " + lastOp);
        }

-        newPat.append(']');
+        if (mode == 4) {
+            newPat.append(']');
+        }

        /**
         * If we saw a '^' after the initial '[' of this pattern, then perform
@ -1553,21 +1523,6 @@ public class UnicodeSet extends UnicodeFilter {
            complement();
        }

-        if (mode != 4) {
-            throw new IllegalArgumentException("Missing ']'");
-        }
-
-//      /**
-//       * i indexes the last character we parsed or is pattern.length().  In
-//       * the latter case, we have run off the end without finding a closing
-//       * ']'.  Otherwise, we know i < pattern.length(), and we set the
-//       * ParsePosition to the next character to be parsed.
-//       */
-//      if (i == limit) {
-//          throw new IllegalArgumentException("Missing ']' in \"" +
-//                                             pattern.substring(start) + '"');
-//      }
-  
        pos.setIndex(i);

        // Use the rebuilt pattern (newPat) only if necessary.  Prefer the
@ -1586,136 +1541,6 @@ public class UnicodeSet extends UnicodeFilter {
        }
    }

-    //----------------------------------------------------------------
-    // Implementation: Generation of Unicode categories
-    //----------------------------------------------------------------
-
-    /**
-     * Sets this object to the given category, given its name.
-     * The category name must be either a two-letter name, such as
-     * "Lu", or a one letter name, such as "L".  One-letter names
-     * indicate the logical union of all two-letter names that start
-     * with that letter.  Case is significant.  If the name starts
-     * with the character '^' then the complement of the given
-     * character set is returned.
-     *
-     * Although individual categories such as "Lu" are cached, we do
-     * not currently cache single-letter categories such as "L" or
-     * complements such as "^Lu" or "^L".  It would be easy to cache
-     * these as well in a hashtable should the need arise.
-     *
-     * NEW: The category name can now be a script name, as defined
-     * by UScript.
-     */
-    private void applyCategory(String catName) {
-        boolean invert = (catName.length() > 1 &&
-                          catName.charAt(0) == '^');
-        if (invert) {
-            catName = catName.substring(1);
-        }
-
-        boolean match = false;
-
-        // BE CAREFUL not to modify the return value from
-        // getCategorySet(int).
-
-        // if we have two characters, search the category map for that
-        // code and either construct and return a UnicodeSet from the
-        // data in the category map or throw an exception
-        if (catName.length() == 2) {
-            int i = CATEGORY_NAMES.indexOf(catName);
-            if (i>=0 && i%2==0) {
-                i /= 2;
-                if (i != UNSUPPORTED_CATEGORY) {
-                    set(getCategorySet(i));
-                    match = true;
-                }
-            }
-        } else if (catName.length() == 1) {
-            // if we have one character, search the category map for
-            // codes beginning with that letter, and union together
-            // all of the matching sets that we find (or throw an
-            // exception if there are no matches)
-            clear();
-            for (int i=0; i<CATEGORY_COUNT; ++i) {
-                if (i != UNSUPPORTED_CATEGORY &&
-                    CATEGORY_NAMES.charAt(2*i) == catName.charAt(0)) {
-                    addAll(getCategorySet(i));
-                    match = true;
-                }
-            }
-        }
-
-        if (!match) {
-            // TODO: Add caching of these, if desired
-            int script = UScript.getCode(catName);
-            if (script != UScript.INVALID_CODE) {
-                match = true;
-                clear();
-                int start = -1;
-                int end = -2;
-                for (int i=MIN_VALUE; i<=MAX_VALUE; ++i) {
-                    if (UScript.getScript(i) == script) {
-                        if ((end+1) == i) {
-                            end = i;
-                        } else {
-                            if (start >= 0) {
-                                add(start, end);
-                            }
-                            start = end = i;
-                        }
-                    }
-                }
-                if (start >= 0) {
-                    add(start, end);
-                }
-            }
-        }
-
-        if (!match) {
-            throw new IllegalArgumentException("Illegal category [:" + catName + ":]");
-        }
-
-        if (invert) {
-            complement();
-        }
-    }
-
-    /**
-     * Returns an inversion list for the given category.  This list is
-     * cached and returned again if this method is called again with
-     * the same parameter.
-     *
-     * Callers MUST NOT MODIFY the returned set.
-     */
-    private static UnicodeSet getCategorySet(int cat) {
-        if (CATEGORY_CACHE[cat] == null) {
-            // Walk through all Unicode characters, noting the start
-            // and end of each range for which Character.getType(c)
-            // returns the given category integer.
-            UnicodeSet set = new UnicodeSet();
-            int start = -1;
-            int end = -2;
-            for (int i=MIN_VALUE; i<=MAX_VALUE; ++i) {
-                if (Character.getType((char)i) == cat) {
-                    if ((end+1) == i) {
-                        end = i;
-                    } else {
-                        if (start >= 0) {
-                            set.add(start, end);
-                        }
-                        start = end = i;
-                    }
-                }
-            }
-            if (start >= 0) {
-                set.add(start, end);
-            }
-            CATEGORY_CACHE[cat] = set;
-        }
-        return CATEGORY_CACHE[cat];
-    }
-
    //----------------------------------------------------------------
    // Implementation: Utility methods
    //----------------------------------------------------------------
--- a/icu4j/src/com/ibm/test/translit/TransliteratorTest.java
+++ b/icu4j/src/com/ibm/test/translit/TransliteratorTest.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/translit/Attic/TransliteratorTest.java,v $
- * $Date: 2001/10/10 20:23:27 $
- * $Revision: 1.52 $
+ * $Date: 2001/10/17 19:19:00 $
+ * $Revision: 1.53 $
 *
 *****************************************************************************************
 */
@ -1318,7 +1318,7 @@ public class TransliteratorTest extends TestFmwk {
        public TestFact(String theID) {
            id = theID;
        }
-        public Transliterator getInstance() {
+        public Transliterator getInstance(String ignoredID) {
            return new NameableNullTrans(id);
        }
    };
@ -1533,6 +1533,15 @@ public class TransliteratorTest extends TestFmwk {
        }
    }

+    /**
+     * Test new property set syntax.  The first case uses the Perl-like
+     * "\p{...}" form directly in transliterator rules.  The second case
+     * uses the '.' (dot) operator in a quantified segment; its expected
+     * output leaves the "\n" and "\r" characters outside the bracketed
+     * matches.
+     */
+    public void TestPropertySet() {
+        expect("a>A; \\p{Lu}>x; \\p{ALL}>y;", "abcDEF", "Ayyxxx");
+        expect("(.+)>'[' $1 ']';", " a stitch \n in time \r saves 9",
+               "[ a stitch ]\n[ in time ]\r[ saves 9]");
+    }
+
    //======================================================================
    // icu4j ONLY
    // These tests are not mirrored (yet) in icu4c at
@ -1551,6 +1560,10 @@ public class TransliteratorTest extends TestFmwk {
        }
    }

+    public void TestDebugIndic() { // rule: quoted '-', then 'h' and U+0323 written as an escape; expected result "a"
+        expect("'-'h\\u0323>a;", "-h\u0323", "a");
+    }
+
    //======================================================================
    // Ram's tests
    //======================================================================
--- a/icu4j/src/com/ibm/test/translit/UnicodeSetTest.java
+++ b/icu4j/src/com/ibm/test/translit/UnicodeSetTest.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/translit/Attic/UnicodeSetTest.java,v $ 
- * $Date: 2001/10/10 21:35:33 $ 
- * $Revision: 1.13 $
+ * $Date: 2001/10/17 19:17:59 $ 
+ * $Revision: 1.14 $
 *
 *****************************************************************************************
 */
@ -52,7 +52,7 @@ public class UnicodeSetTest extends TestFmwk {
        // not used int TOP = 0x200; // Don't need to go over the whole range:
        set = new UnicodeSet("[:L:]");
        for (int i=0; i<0x200; ++i) {
-            boolean l = Character.isLetter((char)i);
+            boolean l = UCharacter.isLetter(i);
            if (l != set.contains((char)i)) {
                errln("FAIL: L contains " + (char)i + " = " + 
                      set.contains((char)i));
@ -62,7 +62,7 @@ public class UnicodeSetTest extends TestFmwk {

        set = new UnicodeSet("[:Lu:]");
        for (int i=0; i<0x200; ++i) {
-            boolean lu = (Character.getType((char)i) == Character.UPPERCASE_LETTER);
+            boolean lu = (UCharacter.getType(i) == UCharacterCategory.UPPERCASE_LETTER);
            if (lu != set.contains((char)i)) {
                errln("FAIL: Lu contains " + (char)i + " = " + 
                      set.contains((char)i));
@ -249,11 +249,13 @@ public class UnicodeSetTest extends TestFmwk {
    /**
-     * Test the [:Latin:] syntax.
+     * Test the property set syntaxes: POSIX-like "[:Latin:]", Perl-like
+     * "\p{Greek}" nested inside brackets, and a complemented long-form
+     * "\P{...}" pattern used as the entire set.
     */
-    public void TestScriptSet() {
+    public void TestPropertySet() {
        UnicodeSet set = new UnicodeSet("[:Latin:]");
        expectContainment(set, "aA", "\u0391\u03B1");
-        set = new UnicodeSet("[:Greek:]");
+        set = new UnicodeSet("[\\p{Greek}]");
        expectContainment(set, "\u0391\u03B1", "aA");
+        set = new UnicodeSet("\\P{ GENERAL Category = upper case letter }");
+        expectContainment(set, "abc", "ABC");
    }

    /**
@ -453,7 +455,7 @@ public class UnicodeSetTest extends TestFmwk {
                }
            }
            if (bad.length() > 0) {
-                logln(Utility.escape("Fail: set " + set + " does not contain " + bad +
+                errln(Utility.escape("FAIL: set " + set + " does not contain " + bad +
                      ", expected containment of " + charsIn));
            } else {
                logln(Utility.escape("Ok: set " + set + " contains " + charsIn));
@ -468,7 +470,7 @@ public class UnicodeSetTest extends TestFmwk {
                }
            }
            if (bad.length() > 0) {
-                logln(Utility.escape("Fail: set " + set + " contains " + bad +
+                errln(Utility.escape("FAIL: set " + set + " contains " + bad +
                      ", expected non-containment of " + charsOut));
            } else {
                logln(Utility.escape("Ok: set " + set + " does not contain " + charsOut));
--- a/icu4j/src/com/ibm/text/Quantifier.java
+++ b/icu4j/src/com/ibm/text/Quantifier.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/Quantifier.java,v $ 
- * $Date: 2001/10/04 18:24:15 $ 
- * $Revision: 1.1 $
+ * $Date: 2001/10/17 19:17:06 $ 
+ * $Revision: 1.2 $
 *
 *****************************************************************************************
 */
@ -45,9 +45,15 @@ class Quantifier implements UnicodeMatcher {
        int start = offset[0];
        int count = 0;
        while (count < maxCount) {
+            int pos = offset[0];
            int m = matcher.matches(text, offset, limit, incremental);
            if (m == U_MATCH) {
                ++count;
+                if (pos == offset[0]) {
+                    // If offset has not moved we have a zero-width match.
+                    // Don't keep matching it infinitely.
+                    break;
+                }
            } else if (incremental && m == U_PARTIAL_MATCH) {
                return U_PARTIAL_MATCH;
            } else {
--- a/icu4j/src/com/ibm/text/TransliteratorParser.java
+++ b/icu4j/src/com/ibm/text/TransliteratorParser.java
@ -1,3 +1,13 @@
+/*
+**********************************************************************
+*   Copyright (c) 2001, International Business Machines
+*   Corporation and others.  All Rights Reserved.
+**********************************************************************
+* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/TransliteratorParser.java,v $
+* $Date: 2001/10/17 19:17:06 $
+* $Revision: 1.4 $
+**********************************************************************
+*/
 package com.ibm.text;

 import com.ibm.text.resources.ResourceReader;
@ -85,6 +95,13 @@ class TransliteratorParser {
     */
    private String undefinedVariableName;

+    /**
+     * The stand-in character for the 'dot' set, represented by '.' in
+     * patterns.  This is allocated the first time it is needed, and
+     * reused thereafter.
+     */
+    private int dotStandIn = -1;
+
    //----------------------------------------------------------------------
    // Constants
    //----------------------------------------------------------------------
@ -109,8 +126,6 @@ class TransliteratorParser {

    private static final char CONTEXT_ANTE        = '{'; // ante{key
    private static final char CONTEXT_POST        = '}'; // key}post
-    private static final char SET_OPEN            = '[';
-    private static final char SET_CLOSE           = ']';
    private static final char CURSOR_POS          = '|';
    private static final char CURSOR_OFFSET       = '@';
    private static final char ANCHOR_START        = '^';
@ -119,6 +134,9 @@ class TransliteratorParser {
    private static final char ONE_OR_MORE         = '+';
    private static final char ZERO_OR_ONE         = '?';

+    // The 'dot' special character in rule patterns.  The rule parser
+    // replaces it with the stand-in returned by getDotStandIn().
+    private static final char DOT                 = '.';
+    // The set pattern from which the dot stand-in's UnicodeSet is
+    // built.  It is a negated set listing [:Zp:], [:Zl:], CR, LF and
+    // '$'.  NOTE(review): the trailing '$' presumably stands for the
+    // end-of-text anchor inside a set -- confirm against UnicodeSet.
+    private static final String DOT_SET           = "[^[:Zp:][:Zl:]\r\n$]";
+
    // By definition, the ANCHOR_END special character is a
    // trailing SymbolTable.SYMBOL_REF character.
    // private static final char ANCHOR_END       = '$';
@ -541,6 +559,15 @@ class TransliteratorParser {
                    // Text after a presumed end anchor is a syntax err
                    syntaxError("Malformed variable reference", rule, start);
                }
+                if (UnicodeSet.resemblesPattern(rule, pos-1)) {
+                    if (pp == null) {
+                        pp = new ParsePosition(0);
+                    }
+                    pp.setIndex(pos-1); // Backup to opening '['
+                    buf.append(parser.parseSet(rule, pp));
+                    pos = pp.getIndex();                    
+                    continue;
+                }
                // Handle escapes
                if (c == ESCAPE) {
                    if (pos == limit) {
@ -682,14 +709,6 @@ class TransliteratorParser {
                    }
                    post = buf.length();
                    break;
-                case SET_OPEN:
-                    if (pp == null) {
-                        pp = new ParsePosition(0);
-                    }
-                    pp.setIndex(pos-1); // Backup to opening '['
-                    buf.append(parser.parseSet(rule, pp));
-                    pos = pp.getIndex();
-                    break;
                case CURSOR_POS:
                    if (cursor >= 0) {
                        syntaxError("Multiple cursors", rule, start);
@ -718,6 +737,9 @@ class TransliteratorParser {
                        }
                    }
                    break;
+                case DOT:
+                    buf.append(parser.getDotStandIn());
+                    break;
                case KLEENE_STAR:
                case ONE_OR_MORE:
                case ZERO_OR_ONE:
@ -783,7 +805,6 @@ class TransliteratorParser {
                        buf.append(parser.generateStandInFor(m));
                    }
                    break;
-                // case SET_CLOSE:
                default:
                    // Disallow unquoted characters other than [0-9A-Za-z]
                    // in the printable ASCII range.  These characters are
@ -1357,6 +1378,17 @@ class TransliteratorParser {
        return variableNext++;
    }

+    /**
+     * Fetch the stand-in character that represents the '.' (dot) set.
+     * The stand-in is generated lazily on the first request, and the
+     * very same value is handed back on every later call.
+     */
+    char getDotStandIn() {
+        if (dotStandIn != -1) {
+            // Already allocated by an earlier call
+            return (char) dotStandIn;
+        }
+        UnicodeSet dotSet = new UnicodeSet(DOT_SET);
+        dotStandIn = generateStandInFor(dotSet);
+        return (char) dotStandIn;
+    }
+    
    /**
     * Append the value of the given variable name to the given
     * StringBuffer.
--- a/icu4j/src/com/ibm/text/UnicodePropertySet.java
+++ b/icu4j/src/com/ibm/text/UnicodePropertySet.java
@ -0,0 +1,590 @@
+/*
+**********************************************************************
+*   Copyright (c) 2001, International Business Machines
+*   Corporation and others.  All Rights Reserved.
+**********************************************************************
+* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/UnicodePropertySet.java,v $
+* $Date: 2001/10/17 19:17:06 $
+* $Revision: 1.1 $
+**********************************************************************
+*/
+package com.ibm.text;
+
+import java.text.*;
+import java.util.*;
+import com.ibm.util.Utility;
+
+/**
+ * INTERNAL CLASS implementing the UnicodeSet properties as outlined
+ * at:
+ *
+ * http://oss.software.ibm.com/cvs/icu/~checkout~/icuhtml/design/unicodeset_properties.html
+ *
+ * Recognized syntax:
+ *
+ * [:foo:] [:^foo:] - white space not allowed within "[:" or ":]"
+ * \p{foo} \P{foo}  - white space not allowed within "\p" or "\P"
+ *
+ * Other than the above restrictions, white space is ignored.  Case
+ * is ignored except in "\p" and "\P".
+ *
+ * This class cannot be instantiated.  It has a public static method,
+ * createFromPattern(), which takes a pattern to be parsed and returns
+ * a new UnicodeSet.  Another public static method,
+ * resemblesPattern(), returns true if a given pattern string appears
+ * to be a property set pattern, and therefore should be passed in to
+ * createFromPattern().
+ *
+ * NOTE: Current implementation is incomplete.  The following list
+ * indicates which properties are supported.
+ *
+ *    + GeneralCategory
+ *      CombiningClass
+ *      BidiClass
+ *      DecompositionType
+ *    + NumericValue
+ *      NumericType
+ *      EastAsianWidth
+ *      LineBreak
+ *      JoiningType
+ *    + Script
+ *
+ * '+' indicates a supported property.
+ *
+ * @author Alan Liu
+ * @version $RCSfile: UnicodePropertySet.java,v $ $Revision: 1.1 $ $Date: 2001/10/17 19:17:06 $
+ */
+class UnicodePropertySet {
+
+    /**
+     * Maps munged property type names, in both short and long form
+     * (e.g. "GC" and "GENERALCATEGORY"), to the SetFactory that
+     * builds sets for that property.  Filled in by addType() from
+     * the static initializer.
+     */
+    private static final Hashtable NAME_MAP = new Hashtable();
+
+    /**
+     * Maps munged general category value names, in both short and
+     * long form, to Integer values.  Each value is either a bit mask
+     * in which bit (1 << cat) is set for every included
+     * UCharacterCategory code, or the special value ANY.  Filled in
+     * by addValue() from the static initializer.
+     */
+    private static final Hashtable CATEGORY_MAP = new Hashtable();
+
+    /**
+     * A cache mapping character category integers, as returned by
+     * UCharacter.getType(), to sets.  Entries are initially
+     * null and are created on demand.
+     */
+    private static final UnicodeSet[] CATEGORY_CACHE =
+        new UnicodeSet[UCharacterCategory.CHAR_CATEGORY_COUNT];
+
+    /**
+     * A cache mapping script integers, as defined by
+     * UScript, to sets.  Entries are initially
+     * null and are created on demand.
+     */
+    private static final UnicodeSet[] SCRIPT_CACHE =
+        new UnicodeSet[UScript.CODE_LIMIT];
+
+    // Special value codes
+    private static final int ANY = -1; // general category: all code points
+
+    //----------------------------------------------------------------
+    // Public API
+    //----------------------------------------------------------------
+
+    /**
+     * Return true if the given position, in the given pattern, appears
+     * to be the start of a property set pattern [:foo:], \p{foo}, or
+     * \P{foo}.
+     */
+    public static boolean resemblesPattern(String pattern, int pos) {
+        // Patterns are at least 5 characters long
+        if ((pos+5) > pattern.length()) {
+            return false;
+        }
+
+        // Look for an opening [:, [:^, \p, or \P
+        return pattern.regionMatches(pos, "[:", 0, 2) ||
+            pattern.regionMatches(true, pos, "\\p", 0, 2);
+    }
+
+    /**
+     * Create a UnicodeSet by parsing the given pattern at the given
+     * parse position.
+     *
+     * @param pattern the pattern string
+     * @param ppos on entry, the position at which to begin parsing.
+     * This shold be one of the locations marked '^':
+     *
+     *   [:blah:]     \p{blah}     \P{blah}
+     *   ^       %    ^       %    ^       %
+     *
+     * On return, the position after the last character parsed, that is,
+     * the locations marked '%'.  If the parse fails, ppos is returned
+     * unchanged.
+     * @return a newly-constructed UnicodeSet object, or null upon
+     * failure.
+     */
+    public static UnicodeSet createFromPattern(String pattern, ParsePosition ppos) {
+
+        UnicodeSet set = null;
+
+        int pos = ppos.getIndex();
+
+        // On entry, ppos should point to one of the following locations:
+
+        // Minimum length is 5 characters, e.g. \p{L}
+        if ((pos+5) > pattern.length()) {
+            return null;
+        }
+
+        boolean posix = false; // true for [:pat:], false for \p{pat} \P{pat}
+        boolean invert = false;
+
+        // Look for an opening [:, [:^, \p, or \P
+        if (pattern.regionMatches(pos, "[:", 0, 2)) {
+            posix = true;
+            pos = skipWhitespace(pattern, pos+2);
+            if (pos < pattern.length() && pattern.charAt(pos) == '^') {
+                ++pos;
+                invert = true;
+            }
+        } else if (pattern.regionMatches(true, pos, "\\p", 0, 2)) {
+            invert = (pattern.charAt(pos+1) == 'P');
+            pos = skipWhitespace(pattern, pos+2);
+            if (pos == pattern.length() || pattern.charAt(pos++) != '{') {
+                // Syntax error; "\p" or "\P" not followed by "{"
+                return null;
+            }
+        } else {
+            // Open delimiter not seen
+            return null;
+        }
+
+        // Look for the matching close delimiter, either :] or }
+        int close = pattern.indexOf(posix ? ":]" : "}", pos);
+        if (close < 0) {
+            // Syntax error; close delimiter missing
+            return null;
+        }
+
+        // Look for an '=' sign.  If this is present, we will parse a
+        // medium \p{gc=Cf} or long \p{GeneralCategory=Format}
+        // pattern.
+        int equals = pattern.indexOf('=', pos);
+        if (equals >= 0 && equals < close) {
+            // Equals seen; parse medium/long pattern
+            String typeName = munge(pattern, pos, equals);
+            String valueName = munge(pattern, equals+1, close);
+            SetFactory factory;
+            factory = (SetFactory) NAME_MAP.get(typeName);
+            if (factory == null) {
+                // Syntax error; type name not recognized
+                return null;
+            }
+            set = factory.create(valueName);
+        } else {
+            // No equals seen; parse short format \p{Cf}
+            String shortName = munge(pattern, pos, close);
+
+            // First try general category
+            set = createCategorySet(shortName);
+
+            // If this fails, try script
+            if (set == null) {
+                set = createScriptSet(shortName);
+            }
+        }
+
+        if (invert) {
+            set.complement();
+        }
+
+        // Move to the limit position after the close delimiter
+        ppos.setIndex(close + (posix ? 2 : 1));
+
+        return set;
+    }
+
+    //----------------------------------------------------------------
+    // Property set factory classes
+    // NOTE: This will change/go away when we implement UCharacter
+    // based property retrieval.
+    //----------------------------------------------------------------
+
+    /**
+     * Builds the UnicodeSet for one property type from a value name.
+     * Factories are registered in NAME_MAP by addType() under the
+     * munged short and long names of their property type.
+     */
+    static interface SetFactory {
+
+        /**
+         * @param valueName the pre-munged value name, that is, the
+         * text following the '=' in a pattern such as \p{gc=Cf}
+         * @return the set of code points having the given value.  The
+         * general category and script factories return null when the
+         * value name is not recognized.
+         */
+        UnicodeSet create(String valueName);
+    }
+
+    static class NumericValueFactory implements SetFactory {
+        NumericValueFactory() {}
+        public UnicodeSet create(String valueName) {
+            double value = Double.parseDouble(valueName);
+            final int ivalue = (int) value;
+            if (ivalue != value || ivalue < 0) {
+                // UCharacter doesn't support negative or non-integral
+                // values, so just return an empty set
+                return new UnicodeSet();
+            }
+            return createSetFromFilter(new Filter() {
+                public boolean contains(int cp) {
+                    return UCharacter.getUnicodeNumericValue(cp) == ivalue;
+                }
+            });
+        }
+    }
+
+    //----------------------------------------------------------------
+    // Property set factory static methods
+    // NOTE: This will change/go away when we implement UCharacter
+    // based property retrieval.
+    //----------------------------------------------------------------
+
+    /**
+     * Given a general category value name, create a corresponding
+     * set and return it, or return null if the name is invalid.
+     * @param valueName a pre-munged general category value name
+     */
+    private static UnicodeSet createCategorySet(String valueName) {
+        Integer valueObj;
+        valueObj = (Integer) CATEGORY_MAP.get(valueName);
+        if (valueObj == null) {
+            return null;
+        }
+        int valueCode = valueObj.intValue();
+
+        UnicodeSet set = new UnicodeSet();
+        if (valueCode == ANY) {
+            set.complement();
+            return set;
+        }
+        for (int cat=0; cat<UCharacterCategory.CHAR_CATEGORY_COUNT; ++cat) {
+            if ((valueCode & (1 << cat)) != 0) {
+                set.addAll(UnicodePropertySet.getCategorySet(cat));
+            }
+        }
+        return set;
+    }
+
+    /**
+     * Given a script value name, create a corresponding set and
+     * return it, or return null if the name is invalid.
+     * @param valueName a pre-munged script value name
+     */
+    private static UnicodeSet createScriptSet(String valueName) {
+        int script = UScript.getCode(valueName);
+        if (script == UScript.INVALID_CODE) {
+            // Syntax error; unknown short name
+            return null;
+        }
+        return new UnicodeSet(getScriptSet(script));
+    }
+
+    //----------------------------------------------------------------
+    // Utility methods
+    //----------------------------------------------------------------
+
+    /**
+     * Look up the UnicodeSet for the given category, building it and
+     * storing it in CATEGORY_CACHE the first time it is requested.
+     * Later calls with the same parameter get the same object back.
+     *
+     * Callers MUST NOT MODIFY the returned set.
+     */
+    private static UnicodeSet getCategorySet(final int cat) {
+        UnicodeSet cached = CATEGORY_CACHE[cat];
+        if (cached != null) {
+            return cached;
+        }
+        Filter sameCategory = new Filter() {
+            public boolean contains(int cp) {
+                return UCharacter.getType(cp) == cat;
+            }
+        };
+        cached = createSetFromFilter(sameCategory);
+        CATEGORY_CACHE[cat] = cached;
+        return cached;
+    }
+
+    /**
+     * Look up the UnicodeSet for the given script, building it and
+     * storing it in SCRIPT_CACHE the first time it is requested.
+     * Later calls with the same parameter get the same object back.
+     *
+     * Callers MUST NOT MODIFY the returned set.
+     */
+    private static UnicodeSet getScriptSet(final int script) {
+        UnicodeSet cached = SCRIPT_CACHE[script];
+        if (cached != null) {
+            return cached;
+        }
+        Filter sameScript = new Filter() {
+            public boolean contains(int cp) {
+                return UScript.getScript(cp) == script;
+            }
+        };
+        cached = createSetFromFilter(sameScript);
+        SCRIPT_CACHE[script] = cached;
+        return cached;
+    }
+
+    /**
+     * Normalize the substring [start, limit) of str for use as a
+     * lookup key: white space is dropped and every other code point
+     * is converted to upper case.  So "General Category " becomes
+     * "GENERALCATEGORY".  All type and value strings are munged
+     * before lookup, and all type and value keys are stored
+     * pre-munged.
+     */
+    private static String munge(String str, int start, int limit) {
+        StringBuffer out = new StringBuffer();
+        int index = start;
+        while (index < limit) {
+            int cp = UTF16.charAt(str, index);
+            // Step by code point, not by char
+            index += UTF16.getCharCount(cp);
+            if (UCharacter.isWhitespace(cp)) {
+                continue;
+            }
+            UTF16.append(out, UCharacter.toUpperCase(cp));
+        }
+        return out.toString();
+    }
+
+    /**
+     * Skip over a sequence of zero or more white space characters
+     * at pos.  Return the index of the first non-white-space character
+     * at or after pos, or str.length(), if there is none.
+     */
+    private static int skipWhitespace(String str, int pos) {
+        while (pos < str.length()) {
+            int c = UTF16.charAt(str, pos);
+            if (!UCharacter.isWhitespace(c)) {
+                break;
+            }
+            pos += UTF16.getCharCount(c);
+        }
+        return pos;
+    }
+
+    //----------------------------------------------------------------
+    // Generic filter-based scanning code
+    //
+    // NOTE: In general, we don't want to do this!  This is a temporary
+    // implementation until we have time for something that examines
+    // the underlying UCharacter data structures in an intelligent
+    // way.  Iterating over all code points is dumb.  What we want to
+    // do, for instance, is iterate over internally-stored ranges
+    // of characters that have a given property.
+    //----------------------------------------------------------------
+
+    /**
+     * A predicate over code points.  createSetFromFilter() builds
+     * the set of all scanned code points for which contains()
+     * returns true.
+     */
+    static interface Filter {
+        /**
+         * @param codePoint the code point to test
+         * @return true if the code point belongs in the set
+         */
+        boolean contains(int codePoint);
+    }
+
+    /**
+     * Build a set containing every code point from
+     * UnicodeSet.MIN_VALUE through U+FFFF that the filter accepts.
+     * @param filter the predicate deciding membership
+     * @return a new set
+     */
+    static UnicodeSet createSetFromFilter(Filter filter) {
+        // Walk through the code points, tracking the start of the
+        // current run for which filter.contains(cp) is true.  Each
+        // maximal run is added to the set as a single range.
+        UnicodeSet result = new UnicodeSet();
+        int runStart = -1; // -1 means no run is currently open
+
+        // TODO Extend this up to UnicodeSet.MAX_VALUE when we have
+        // better performance; i.e., when this code can get moved into
+        // the UCharacter class and not have to iterate over code
+        // points.  Right now it's way too slow to iterate to 10FFFF.
+
+        for (int cp=UnicodeSet.MIN_VALUE; cp<=0xFFFF; ++cp) {
+            if (filter.contains(cp)) {
+                if (runStart < 0) {
+                    runStart = cp;
+                }
+            } else if (runStart >= 0) {
+                // The run ended at the previous code point
+                result.add(runStart, cp-1);
+                runStart = -1;
+            }
+        }
+        if (runStart >= 0) {
+            // The final run extends to the end of the scanned range
+            result.add(runStart, 0xFFFF);
+        }
+        return result;
+    }
+
+    //----------------------------------------------------------------
+    // Type and value name maps
+    //----------------------------------------------------------------
+
+    /**
+     * Register a property type in NAME_MAP under both its short and
+     * its long name.  Both names must already be munged.
+     * @param shortName the munged short type name, e.g. "GC"
+     * @param longName the munged long type name, e.g. "GENERALCATEGORY"
+     * @param factory the factory that creates sets for this type
+     */
+    private static void addType(String shortName, String longName,
+                                SetFactory factory) {
+        // DEBUGGING CODE: DISABLE FOR PRODUCTION BUILD
+        if (true) {
+            String[] names = { shortName, longName };
+            for (int i=0; i<names.length; ++i) {
+                if (NAME_MAP.containsKey(names[i])) {
+                    throw new InternalError("Duplicate name " + names[i]);
+                }
+            }
+        }
+
+        NAME_MAP.put(shortName, factory);
+        NAME_MAP.put(longName, factory);
+    }
+
+    /**
+     * Add a value mapping to the name map.
+     */
+    private static void addValue(Hashtable map,
+                                 String shortName, String longName,
+                                 int value) {
+        // DEBUGGING CODE: DISABLE FOR PRODUCTION BUILD
+        if (true) {
+            if (map.get(shortName) != null) {
+                throw new InternalError("Duplicate name " + shortName);
+            }
+            if (longName != null && map.get(longName) != null) {
+                throw new InternalError("Duplicate name " + longName);
+            }
+        }
+
+        Integer valueObj = new Integer(value);
+        map.put(shortName, valueObj);
+        if (longName != null) {
+            map.put(longName, valueObj);
+        }
+    }
+
+    // Static initializer: fills NAME_MAP (property types) and
+    // CATEGORY_MAP (general category values) when the class is loaded.
+    static {
+        // NOTE:  We munge all search keys to have no whitespace
+        // and upper case.  As such, all stored keys should have
+        // this format.
+
+        // Load the map with type data
+
+        addType("GC", "GENERALCATEGORY", new SetFactory() {
+            public UnicodeSet create(String valueName) {
+                return createCategorySet(valueName);
+            }
+        });
+
+        // Commented-out types below are not yet supported
+        //addType("CC", "COMBININGCLASS", COMBINING_CLASS);
+        //addType("BC", "BIDICLASS", BIDI_CLASS);
+        //addType("DT", "DECOMPOSITIONTYPE", DECOMPOSITION_TYPE);
+
+        addType("NV", "NUMERICVALUE", new NumericValueFactory());
+
+        //addType("NT", "NUMERICTYPE", NUMERIC_TYPE);
+        //addType("EA", "EASTASIANWIDTH", EAST_ASIAN_WIDTH);
+        //addType("LB", "LINEBREAK", LINE_BREAK);
+        //addType("JT", "JOININGTYPE", JOINING_TYPE);
+
+        addType("SC", "SCRIPT", new SetFactory() {
+            public UnicodeSet create(String valueName) {
+                return createScriptSet(valueName);
+            }
+        });
+
+        // Load the map with value data
+
+        // General Category
+        //
+        // Each two-letter name maps to the single bit of its
+        // UCharacterCategory code.  Each one-letter name maps to the
+        // bitwise OR of the two-letter categories it groups.  Note
+        // that "SC" here (currency symbol) lives in CATEGORY_MAP and
+        // so does not collide with the "SC" (script) type name, which
+        // lives in NAME_MAP.
+
+        addValue(CATEGORY_MAP, "ANY", null, ANY); // special case
+
+        addValue(CATEGORY_MAP, "C", "OTHER",
+                 (1 << UCharacterCategory.CONTROL) |
+                 (1 << UCharacterCategory.FORMAT) |
+                 (1 << UCharacterCategory.GENERAL_OTHER_TYPES) |
+                 (1 << UCharacterCategory.PRIVATE_USE) |
+                 (1 << UCharacterCategory.SURROGATE));
+
+        addValue(CATEGORY_MAP, "CC", "CONTROL",
+                 1 << UCharacterCategory.CONTROL);
+        addValue(CATEGORY_MAP, "CF", "FORMAT",
+                 1 << UCharacterCategory.FORMAT);
+        addValue(CATEGORY_MAP, "CN", "UNASSIGNED",
+                 1 << UCharacterCategory.GENERAL_OTHER_TYPES);
+        addValue(CATEGORY_MAP, "CO", "PRIVATEUSE",
+                 1 << UCharacterCategory.PRIVATE_USE);
+        addValue(CATEGORY_MAP, "CS", "SURROGATE",
+                 1 << UCharacterCategory.SURROGATE);
+
+        addValue(CATEGORY_MAP, "L", "LETTER",
+                 (1 << UCharacterCategory.LOWERCASE_LETTER) |
+                 (1 << UCharacterCategory.MODIFIER_LETTER) |
+                 (1 << UCharacterCategory.OTHER_LETTER) |
+                 (1 << UCharacterCategory.TITLECASE_LETTER) |
+                 (1 << UCharacterCategory.UPPERCASE_LETTER));
+
+        addValue(CATEGORY_MAP, "LL", "LOWERCASELETTER",
+                 1 << UCharacterCategory.LOWERCASE_LETTER);
+        addValue(CATEGORY_MAP, "LM", "MODIFIERLETTER",
+                 1 << UCharacterCategory.MODIFIER_LETTER);
+        addValue(CATEGORY_MAP, "LO", "OTHERLETTER",
+                 1 << UCharacterCategory.OTHER_LETTER);
+        addValue(CATEGORY_MAP, "LT", "TITLECASELETTER",
+                 1 << UCharacterCategory.TITLECASE_LETTER);
+        addValue(CATEGORY_MAP, "LU", "UPPERCASELETTER",
+                 1 << UCharacterCategory.UPPERCASE_LETTER);
+
+        addValue(CATEGORY_MAP, "M", "MARK",
+                 (1 << UCharacterCategory.NON_SPACING_MARK) |
+                 (1 << UCharacterCategory.COMBINING_SPACING_MARK) |
+                 (1 << UCharacterCategory.ENCLOSING_MARK));
+
+        addValue(CATEGORY_MAP, "MN", "NONSPACINGMARK",
+                 1 << UCharacterCategory.NON_SPACING_MARK);
+        addValue(CATEGORY_MAP, "MC", "SPACINGMARK",
+                 1 << UCharacterCategory.COMBINING_SPACING_MARK);
+        addValue(CATEGORY_MAP, "ME", "ENCLOSINGMARK",
+                 1 << UCharacterCategory.ENCLOSING_MARK);
+
+        addValue(CATEGORY_MAP, "N", "NUMBER",
+                 (1 << UCharacterCategory.DECIMAL_DIGIT_NUMBER) |
+                 (1 << UCharacterCategory.LETTER_NUMBER) |
+                 (1 << UCharacterCategory.OTHER_NUMBER));
+
+        addValue(CATEGORY_MAP, "ND", "DECIMALNUMBER",
+                 1 << UCharacterCategory.DECIMAL_DIGIT_NUMBER);
+        addValue(CATEGORY_MAP, "NL", "LETTERNUMBER",
+                 1 << UCharacterCategory.LETTER_NUMBER);
+        addValue(CATEGORY_MAP, "NO", "OTHERNUMBER",
+                 1 << UCharacterCategory.OTHER_NUMBER);
+
+        addValue(CATEGORY_MAP, "P", "PUNCTUATION",
+                 (1 << UCharacterCategory.CONNECTOR_PUNCTUATION) |
+                 (1 << UCharacterCategory.DASH_PUNCTUATION) |
+                 (1 << UCharacterCategory.END_PUNCTUATION) |
+                 (1 << UCharacterCategory.FINAL_PUNCTUATION) |
+                 (1 << UCharacterCategory.INITIAL_PUNCTUATION) |
+                 (1 << UCharacterCategory.OTHER_PUNCTUATION) |
+                 (1 << UCharacterCategory.START_PUNCTUATION));
+
+        addValue(CATEGORY_MAP, "PC", "CONNECTORPUNCTUATION",
+                 1 << UCharacterCategory.CONNECTOR_PUNCTUATION);
+        addValue(CATEGORY_MAP, "PD", "DASHPUNCTUATION",
+                 1 << UCharacterCategory.DASH_PUNCTUATION);
+        addValue(CATEGORY_MAP, "PE", "ENDPUNCTUATION",
+                 1 << UCharacterCategory.END_PUNCTUATION);
+        addValue(CATEGORY_MAP, "PF", "FINALPUNCTUATION",
+                 1 << UCharacterCategory.FINAL_PUNCTUATION);
+        addValue(CATEGORY_MAP, "PI", "INITIALPUNCTUATION",
+                 1 << UCharacterCategory.INITIAL_PUNCTUATION);
+        addValue(CATEGORY_MAP, "PO", "OTHERPUNCTUATION",
+                 1 << UCharacterCategory.OTHER_PUNCTUATION);
+        addValue(CATEGORY_MAP, "PS", "STARTPUNCTUATION",
+                 1 << UCharacterCategory.START_PUNCTUATION);
+
+        addValue(CATEGORY_MAP, "S", "SYMBOL",
+                 (1 << UCharacterCategory.CURRENCY_SYMBOL) |
+                 (1 << UCharacterCategory.MODIFIER_SYMBOL) |
+                 (1 << UCharacterCategory.MATH_SYMBOL) |
+                 (1 << UCharacterCategory.OTHER_SYMBOL));
+
+        addValue(CATEGORY_MAP, "SC", "CURRENCYSYMBOL",
+                 1 << UCharacterCategory.CURRENCY_SYMBOL);
+        addValue(CATEGORY_MAP, "SK", "MODIFIERSYMBOL",
+                 1 << UCharacterCategory.MODIFIER_SYMBOL);
+        addValue(CATEGORY_MAP, "SM", "MATHSYMBOL",
+                 1 << UCharacterCategory.MATH_SYMBOL);
+        addValue(CATEGORY_MAP, "SO", "OTHERSYMBOL",
+                 1 << UCharacterCategory.OTHER_SYMBOL);
+
+        addValue(CATEGORY_MAP, "Z", "SEPARATOR",
+                 (1 << UCharacterCategory.LINE_SEPARATOR) |
+                 (1 << UCharacterCategory.PARAGRAPH_SEPARATOR) |
+                 (1 << UCharacterCategory.SPACE_SEPARATOR));
+
+        addValue(CATEGORY_MAP, "ZL", "LINESEPARATOR",
+                 1 << UCharacterCategory.LINE_SEPARATOR);
+        addValue(CATEGORY_MAP, "ZP", "PARAGRAPHSEPARATOR",
+                 1 << UCharacterCategory.PARAGRAPH_SEPARATOR);
+        addValue(CATEGORY_MAP, "ZS", "SPACESEPARATOR",
+                 1 << UCharacterCategory.SPACE_SEPARATOR);
+    }
+}
--- a/icu4j/src/com/ibm/text/UnicodeSet.java
+++ b/icu4j/src/com/ibm/text/UnicodeSet.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/UnicodeSet.java,v $
- * $Date: 2001/10/10 21:35:05 $
- * $Revision: 1.39 $
+ * $Date: 2001/10/17 19:17:06 $
+ * $Revision: 1.40 $
 *
 *****************************************************************************************
 */
@ -202,60 +202,26 @@ import com.ibm.util.Utility;
 * starting wih 'L', that is, <code>[[:Lu:][:Ll:][:Lt:][:Lm:][:Lo:]]</code>.
 * </table>
 *
- * <p><b>Character categories.</b>
+ * <p><b>Character properties.</b>
 *
- * Character categories are specified using the POSIX-like syntax
- * '[:Lu:]'.  The complement of a category is specified by inserting
- * '^' after the opening '[:'.  The following category names are
- * recognized.  Actual determination of category data uses
- * <code>Character.getType()</code>, so it reflects the underlying
- * implmementation used by <code>Character</code>.  As of Java 2 and
- * JDK 1.1.8, this is Unicode 2.1.2.
+ * <p>Character properties are specified using the POSIX-like syntax
+ * "[:Lu:]" or the Perl-like syntax "\p{Lu}".  The complement of a
+ * category is specified as "[:^Lu:]" or "\P{Lu}".  Actual
+ * determination of category data is accomplished by UCharacter using
+ * the underlying Unicode database.
 *
- * <pre>
- * Normative
- *     Mn = Mark, Non-Spacing
- *     Mc = Mark, Spacing Combining
- *     Me = Mark, Enclosing
+ * <p>For details of the property syntax please see this
+ * <a href="http://oss.software.ibm.com/cvs/icu/~checkout~/icuhtml/design/unicodeset_properties.html">
+ * draft document</a>.
 *
- *     Nd = Number, Decimal Digit
- *     Nl = Number, Letter
- *     No = Number, Other
- *
- *     Zs = Separator, Space
- *     Zl = Separator, Line
- *     Zp = Separator, Paragraph
- *
- *     Cc = Other, Control
- *     Cf = Other, Format
- *     Cs = Other, Surrogate
- *     Co = Other, Private Use
- *     Cn = Other, Not Assigned
- *
- * Informative
- *     Lu = Letter, Uppercase
- *     Ll = Letter, Lowercase
- *     Lt = Letter, Titlecase
- *     Lm = Letter, Modifier
- *     Lo = Letter, Other
- *
- *     Pc = Punctuation, Connector
- *     Pd = Punctuation, Dash
- *     Ps = Punctuation, Open
- *     Pe = Punctuation, Close
- *    *Pi = Punctuation, Initial quote
- *    *Pf = Punctuation, Final quote
- *     Po = Punctuation, Other
- *
- *     Sm = Symbol, Math
- *     Sc = Symbol, Currency
- *     Sk = Symbol, Modifier
- *     So = Symbol, Other
- * </pre>
- * *Unsupported by Java (and hence unsupported by UnicodeSet).
+ * <p><em>Note:</em> Not all properties are currently supported.
+ * Currently, only the general category, script, and numeric value
+ * properties are supported.  Support for other properties will be
+ * added in the future.
 *
 * @author Alan Liu
- * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.39 $ $Date: 2001/10/10 21:35:05 $ */
+ * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.40 $ $Date: 2001/10/17 19:17:06 $
+ */
 public class UnicodeSet extends UnicodeFilter {

    /* Implementation Notes.
@ -313,29 +279,12 @@ public class UnicodeSet extends UnicodeFilter {
     * modified using the non-pattern API, this string will be null,
     * indicating that toPattern() must generate a pattern
     * representation from the inversion list.
-     */ 
+     */
    private String pat = null;

    private static final int START_EXTRA = 16;         // initial storage. Must be >= 0
    private static final int GROW_EXTRA = START_EXTRA; // extra amount for growth. Must be >= 0

-    private static final String CATEGORY_NAMES =
-        //                    1 1 1 1 1 1 1   1 1 2 2 2 2 2 2 2 2 2
-        //0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6   8 9 0 1 2 3 4 5 6 7 8
-        "CnLuLlLtLmLoMnMeMcNdNlNoZsZlZpCcCf--CoCsPdPsPePcPoSmScSkSo";
-
-    private static final int UNSUPPORTED_CATEGORY = 17;
-
-    private static final int CATEGORY_COUNT = 29;
-
-    /**
-     * A cache mapping character category integers, as returned by
-     * Character.getType(), to inversion lists.  Entries are initially
-     * null and are created on demand.
-     */
-    private static final UnicodeSet[] CATEGORY_CACHE =
-        new UnicodeSet[CATEGORY_COUNT];
-
    //----------------------------------------------------------------
    // Public API
    //----------------------------------------------------------------
@ -408,19 +357,27 @@ public class UnicodeSet extends UnicodeFilter {
        applyPattern(pattern, pos, symbols, true);
    }

+    private static final String CATEGORY_NAMES =
+        //                    1 1 1 1 1 1 1   1 1 2 2 2 2 2 2 2 2 2
+        //0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6   8 9 0 1 2 3 4 5 6 7 8
+        "CnLuLlLtLmLoMnMeMcNdNlNoZsZlZpCcCf--CoCsPdPsPePcPoSmScSkSo";
    /**
-     * Constructs a set from the given Unicode character category.
+     * DEPRECATED - Constructs a set from the given Unicode character
+     * category.
     * @param category an integer indicating the character category as
-     * returned by <code>Character.getType()</code>.
+     * returned by <code>java.lang.Character.getType()</code>.  Note
+     * that this is <em>different</em> from the UCharacterCategory
+     * codes.
     * @exception java.lang.IllegalArgumentException if the given
     * category is invalid.
+     * @deprecated this will be removed Dec-31-2001
     */
    public UnicodeSet(int category) {
-        if (category < 0 || category >= CATEGORY_COUNT ||
-            category == UNSUPPORTED_CATEGORY) {
+        if (category < 0 || category > java.lang.Character.OTHER_SYMBOL ||
+            category == 17) {
            throw new IllegalArgumentException("Invalid category");
        }
-        set(getCategorySet(category));
+        // CATEGORY_NAMES holds only the bare two-letter name (e.g.
+        // "Lu"); wrap it in "[:" and ":]" to form a property pattern.
+        applyPattern("[:" + CATEGORY_NAMES.substring(2*category, 2*category+2) + ":]", false);
    }

    /**
@ -489,6 +446,16 @@ public class UnicodeSet extends UnicodeFilter {
        }
    }

+    /**
+     * Heuristic: does a UnicodeSet pattern appear to begin at pattern[pos]?
+     */
+    public static boolean resemblesPattern(String pattern, int pos) {
+        if ((pos+1) < pattern.length() && pattern.charAt(pos) == '[') {
+            return true; // '[' with at least one more character after it
+        }
+        return UnicodePropertySet.resemblesPattern(pattern, pos);
+    }
+
    /**
     * Append the <code>toPattern()</code> representation of a
     * character to the given <code>StringBuffer</code>.
@ -509,6 +476,8 @@ public class UnicodeSet extends UnicodeFilter {
        case '^': // COMPLEMENT:
        case '&': // INTERSECTION:
        case '\\': //BACKSLASH:
+        case '{':
+        case '}':
            buf.append('\\');
            break;
        default:
@ -607,28 +576,28 @@ public class UnicodeSet extends UnicodeFilter {
            }
            return result;
        }
-        
+
        return _generatePattern(result, escapeUnprintable);
    }

    /**
     * Generate and append a string representation of this set to result.
     * This does not use this.pat, the cleaned up copy of the string
-     * passed to applyPattern(). 
+     * passed to applyPattern().
     */
    public StringBuffer _generatePattern(StringBuffer result,
                                         boolean escapeUnprintable) {
        result.append('[');

-        // Check against the predefined categories.  We implicitly build
-        // up ALL category sets the first time toPattern() is called.
-        for (int cat=0; cat<CATEGORY_COUNT; ++cat) {
-            if (this.equals(getCategorySet(cat))) {
-                result.append(':');
-                result.append(CATEGORY_NAMES.substring(cat*2, cat*2+2));
-                return result.append(":]");
-            }
-        }
+//      // Check against the predefined categories.  We implicitly build
+//      // up ALL category sets the first time toPattern() is called.
+//      for (int cat=0; cat<CATEGORY_COUNT; ++cat) {
+//          if (this.equals(getCategorySet(cat))) {
+//              result.append(':');
+//              result.append(CATEGORY_NAMES.substring(cat*2, cat*2+2));
+//              return result.append(":]");
+//          }
+//      }

        int count = getRangeCount();

@ -1205,7 +1174,7 @@ public class UnicodeSet extends UnicodeFilter {
        StringBuffer newPat = new StringBuffer("[");
        int nestedPatStart = -1; // see below for usage
        boolean nestedPatDone = false; // see below for usage
-        
+
        boolean invert = false;
        clear();

@ -1231,8 +1200,9 @@ public class UnicodeSet extends UnicodeFilter {
        // mode 1: '[' seen; if next is '^' or ':' then special
        // mode 2: '[' '^'? seen; parse pattern and close with ']'
        // mode 3: '[:' seen; parse category and close with ':]'
+        // mode 4: ']' seen; parse complete
+        // mode 5: Top-level property pattern seen
        int mode = 0;
-        int colonPos = 0; // Expected pos of ':' in '[:'
        int start = pos.getIndex();
        int i = start;
        int limit = pattern.length();
@ -1285,9 +1255,11 @@ public class UnicodeSet extends UnicodeFilter {
            // Parse the opening '[' and optional following '^'
            switch (mode) {
            case 0:
-                if (c == '[') {
+                if (UnicodePropertySet.resemblesPattern(pattern, i-1)) {
+                    mode = 3;
+                    break; // Fall through
+                } else if (c == '[') {
                    mode = 1; // Next look for '^'
-                    colonPos = i; // Expect ':' at next offset
                    continue;
                } else {
                    throw new IllegalArgumentException("Missing opening '['");
@ -1299,17 +1271,6 @@ public class UnicodeSet extends UnicodeFilter {
                    invert = true;
                    newPat.append((char) c);
                    continue; // Back to top to fetch next character
-                case ':':
-                    if (i-1 == colonPos) {
-                        // '[:' cannot have whitespace in it
-                        --i; // Backup to the '['
-                        c = '[';
-                        mode = 3;
-                        // Fall through and parse category using the same
-                        // code used to parse a nested category.  The mode
-                        // will indicate that this is actually top level.
-                    }
-                    break; // Fall through
                case '-':
                    isLiteral = true; // Treat leading '-' as a literal
                    break; // Fall through
@ -1326,12 +1287,47 @@ public class UnicodeSet extends UnicodeFilter {
            // buffer.  Characters in the variable buffer have already
            // benn through escape and variable reference processing.
            if (varValueBuffer == null) {
+                /**
+                 * Handle property set patterns.
+                 */
+                if (UnicodePropertySet.resemblesPattern(pattern, i-1)) {
+                    ParsePosition pp = new ParsePosition(i-1);
+                    nestedSet = UnicodePropertySet.createFromPattern(pattern, pp);
+                    if (nestedSet == null) {
+                        // assert(pp.getIndex() == i-1);
+                        throw new IllegalArgumentException("Invalid property pattern " +
+                                                           pattern.substring(i-1));
+                    }
+                    nestedPatStart = newPat.length();
+                    nestedPatDone = true; // we're going to do it just below
+
+                    // If we have a top-level property pattern, then trim
+                    // off the opening '[' and use the property pattern
+                    // as the entire pattern.
+                    if (mode == 3) {
+                        newPat.deleteCharAt(0);
+                    }
+                    newPat.append(pattern.substring(i-1, pp.getIndex()));
+                    rebuildPattern = true;
+
+                    i = pp.getIndex(); // advance past property pattern
+                    
+                    if (mode == 3) {
+                        // Entire pattern is a category; leave parse
+                        // loop.  This is one of 2 ways we leave this
+                        // loop if the pattern is well-formed.
+                        set(nestedSet);
+                        mode = 5;
+                        break;
+                    }
+                }
+
                /* Handle escapes.  If a character is escaped, then it assumes its
                 * literal value.  This is true for all characters, both special
                 * characters and characters with no special meaning.  We also
                 * interpret '\\uxxxx' Unicode escapes here (as literals).
                 */
-                if (c == '\\') {
+                else if (c == '\\') {
                    int[] offset = new int[] { i };
                    int escaped = Utility.unescapeAt(pattern, offset);
                    if (escaped == -1) {
@ -1373,61 +1369,25 @@ public class UnicodeSet extends UnicodeFilter {
                }

                /* An opening bracket indicates the first bracket of a nested
-                 * subpattern, either a normal pattern or a category pattern.  We
-                 * recognize these here and set nestedSet accordingly.
+                 * subpattern.
                 */
                else if (!isLiteral && c == '[') {
                    // Record position before nested pattern
                    nestedPatStart = newPat.length();

-                    // Handle "[:...:]", representing a character category
-                    if (i < pattern.length() && pattern.charAt(i) == ':') {
-                        ++i;
-                        int j = pattern.indexOf(":]", i);
-                        if (j < 0) {
-                            throw new IllegalArgumentException("Missing \":]\"");
-                        }
-                        String scratch = pattern.substring(i, j);
-                        nestedSet = new UnicodeSet();
-                        nestedSet.applyCategory(scratch);
-                        nestedPatDone = true; // We're going to do it just below
-                        i = j+2; // Advance i past ":]"
-
-                        // Use a rebuilt pattern.  If we are top level,
-                        // then there is already a SET_OPEN in newPat, and
-                        // SET_CLOSE will be appended elsewhere.
-                        if (mode != 3) {
-                            newPat.append('[');
-                        }
-                        newPat.append(':').append(scratch).append(':');
-                        if (mode != 3) {
-                            newPat.append(']');
-                        }
-                        rebuildPattern = true;
-
-                        if (mode == 3) {
-                            // Entire pattern is a category; leave parse
-                            // loop.  This is one of 2 ways we leave this
-                            // loop if the pattern is well-formed.
-                            set(nestedSet);
-                            mode = 4;
-                            break;
-                        }
-                    } else {
-                        // Recurse to get the pairs for this nested set.
-                        // Backup i to '['.
-                        pos.setIndex(--i);
-                        switch (lastOp) {
-                        case '-':
-                        case '&':
-                            newPat.append(lastOp);
-                            break;
-                        }
-                        nestedSet = new UnicodeSet();
-                        nestedSet._applyPattern(pattern, pos, symbols, newPat, ignoreWhitespace);
-                        nestedPatDone = true;
-                        i = pos.getIndex();
+                    // Recurse to get the pairs for this nested set.
+                    // Backup i to '['.
+                    pos.setIndex(--i);
+                    switch (lastOp) {
+                    case '-':
+                    case '&':
+                        newPat.append(lastOp);
+                        break;
                    }
+                    nestedSet = new UnicodeSet();
+                    nestedSet._applyPattern(pattern, pos, symbols, newPat, ignoreWhitespace);
+                    nestedPatDone = true;
+                    i = pos.getIndex();
                }
            }

@ -1487,7 +1447,7 @@ public class UnicodeSet extends UnicodeFilter {
                // loop if the pattern is well-formed.
                if (anchor > 2 || anchor == 1) {
                    throw new IllegalArgumentException("Syntax error near $" + pattern);
-                    
+
                }
                if (anchor == 2) {
                    rebuildPattern = true;
@ -1524,16 +1484,24 @@ public class UnicodeSet extends UnicodeFilter {
            }
        }

-        if (lastChar != NONE) {
+        if (mode < 4) {
+            throw new IllegalArgumentException("Missing ']'");
+        }
+
+        // Treat a trailing '$' as indicating ETHER.  This code is only
+        // executed if symbols == NULL; otherwise other code parses the
+        // anchor.
+        if (lastChar == SymbolTable.SYMBOL_REF) {
+            rebuildPattern = true;
+            newPat.append(lastChar);
+            add(TransliterationRule.ETHER);
+        }
+        
+        else if (lastChar != NONE) {
            add(lastChar, lastChar);
            _appendToPat(newPat, lastChar, false);
        }

-//      if (mode == 0) {
-//          throw new IllegalArgumentException("Missing '[' in \"" +
-//                                             pattern.substring(start) + '"');
-//      }
-
        // Handle unprocessed stuff preceding the closing ']'
        if (lastOp == '-') {
            // Trailing '-' is treated as literal
@ -1543,7 +1511,9 @@ public class UnicodeSet extends UnicodeFilter {
            throw new IllegalArgumentException("Unquoted trailing " + lastOp);
        }

-        newPat.append(']');
+        if (mode == 4) {
+            newPat.append(']');
+        }

        /**
         * If we saw a '^' after the initial '[' of this pattern, then perform
@ -1553,21 +1523,6 @@ public class UnicodeSet extends UnicodeFilter {
            complement();
        }

-        if (mode != 4) {
-            throw new IllegalArgumentException("Missing ']'");
-        }
-
-//      /**
-//       * i indexes the last character we parsed or is pattern.length().  In
-//       * the latter case, we have run off the end without finding a closing
-//       * ']'.  Otherwise, we know i < pattern.length(), and we set the
-//       * ParsePosition to the next character to be parsed.
-//       */
-//      if (i == limit) {
-//          throw new IllegalArgumentException("Missing ']' in \"" +
-//                                             pattern.substring(start) + '"');
-//      }
-  
        pos.setIndex(i);

        // Use the rebuilt pattern (newPat) only if necessary.  Prefer the
@ -1586,136 +1541,6 @@ public class UnicodeSet extends UnicodeFilter {
        }
    }

-    //----------------------------------------------------------------
-    // Implementation: Generation of Unicode categories
-    //----------------------------------------------------------------
-
-    /**
-     * Sets this object to the given category, given its name.
-     * The category name must be either a two-letter name, such as
-     * "Lu", or a one letter name, such as "L".  One-letter names
-     * indicate the logical union of all two-letter names that start
-     * with that letter.  Case is significant.  If the name starts
-     * with the character '^' then the complement of the given
-     * character set is returned.
-     *
-     * Although individual categories such as "Lu" are cached, we do
-     * not currently cache single-letter categories such as "L" or
-     * complements such as "^Lu" or "^L".  It would be easy to cache
-     * these as well in a hashtable should the need arise.
-     *
-     * NEW: The category name can now be a script name, as defined
-     * by UScript.
-     */
-    private void applyCategory(String catName) {
-        boolean invert = (catName.length() > 1 &&
-                          catName.charAt(0) == '^');
-        if (invert) {
-            catName = catName.substring(1);
-        }
-
-        boolean match = false;
-
-        // BE CAREFUL not to modify the return value from
-        // getCategorySet(int).
-
-        // if we have two characters, search the category map for that
-        // code and either construct and return a UnicodeSet from the
-        // data in the category map or throw an exception
-        if (catName.length() == 2) {
-            int i = CATEGORY_NAMES.indexOf(catName);
-            if (i>=0 && i%2==0) {
-                i /= 2;
-                if (i != UNSUPPORTED_CATEGORY) {
-                    set(getCategorySet(i));
-                    match = true;
-                }
-            }
-        } else if (catName.length() == 1) {
-            // if we have one character, search the category map for
-            // codes beginning with that letter, and union together
-            // all of the matching sets that we find (or throw an
-            // exception if there are no matches)
-            clear();
-            for (int i=0; i<CATEGORY_COUNT; ++i) {
-                if (i != UNSUPPORTED_CATEGORY &&
-                    CATEGORY_NAMES.charAt(2*i) == catName.charAt(0)) {
-                    addAll(getCategorySet(i));
-                    match = true;
-                }
-            }
-        }
-
-        if (!match) {
-            // TODO: Add caching of these, if desired
-            int script = UScript.getCode(catName);
-            if (script != UScript.INVALID_CODE) {
-                match = true;
-                clear();
-                int start = -1;
-                int end = -2;
-                for (int i=MIN_VALUE; i<=MAX_VALUE; ++i) {
-                    if (UScript.getScript(i) == script) {
-                        if ((end+1) == i) {
-                            end = i;
-                        } else {
-                            if (start >= 0) {
-                                add(start, end);
-                            }
-                            start = end = i;
-                        }
-                    }
-                }
-                if (start >= 0) {
-                    add(start, end);
-                }
-            }
-        }
-
-        if (!match) {
-            throw new IllegalArgumentException("Illegal category [:" + catName + ":]");
-        }
-
-        if (invert) {
-            complement();
-        }
-    }
-
-    /**
-     * Returns an inversion list for the given category.  This list is
-     * cached and returned again if this method is called again with
-     * the same parameter.
-     *
-     * Callers MUST NOT MODIFY the returned set.
-     */
-    private static UnicodeSet getCategorySet(int cat) {
-        if (CATEGORY_CACHE[cat] == null) {
-            // Walk through all Unicode characters, noting the start
-            // and end of each range for which Character.getType(c)
-            // returns the given category integer.
-            UnicodeSet set = new UnicodeSet();
-            int start = -1;
-            int end = -2;
-            for (int i=MIN_VALUE; i<=MAX_VALUE; ++i) {
-                if (Character.getType((char)i) == cat) {
-                    if ((end+1) == i) {
-                        end = i;
-                    } else {
-                        if (start >= 0) {
-                            set.add(start, end);
-                        }
-                        start = end = i;
-                    }
-                }
-            }
-            if (start >= 0) {
-                set.add(start, end);
-            }
-            CATEGORY_CACHE[cat] = set;
-        }
-        return CATEGORY_CACHE[cat];
-    }
-
    //----------------------------------------------------------------
    // Implementation: Utility methods
    //----------------------------------------------------------------