Allow arbitrary length variable values. Clean up Data API. Update javadocs.

X-SVN-Rev: 1242
2025-04-13 08:53:20 +00:00 · 2000-04-25 01:42:58 +00:00 · 2000-04-25 01:42:58 +00:00 · 7a49adef39
commit 7a49adef39
parent cd1cfb9094
8 changed files with 470 additions and 402 deletions
--- a/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java
+++ b/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java,v $ 
- * $Date: 2000/04/22 01:25:10 $ 
- * $Revision: 1.26 $
+ * $Date: 2000/04/25 01:42:58 $ 
+ * $Revision: 1.27 $
 *
 *****************************************************************************************
 */
@ -20,17 +20,17 @@ import com.ibm.util.Utility;
 /**
 * <strong>RuleBasedTransliterator</strong> is a transliterator
 * that reads a set of rules in order to determine how to perform
- * translations. Rules are stored in resource bundles indexed by
- * name. Rules are separated by semicolons (';'). To include a
- * literal semicolon, prefix it with a backslash ('\'). Whitespace,
- * as defined by <code>Character.isWhitespace()</code>, is ignored.
- * If the first non-blank character on a line is '#', the entire
- * line is ignored as a comment. </p>
+ * translations. Rule sets are stored in resource bundles indexed by
+ * name. Rules within a rule set are separated by semicolons (';').
+ * To include a literal semicolon, prefix it with a backslash ('\').
+ * Whitespace, as defined by <code>Character.isWhitespace()</code>,
+ * is ignored. If the first non-blank character on a line is '#',
+ * the entire line is ignored as a comment. </p>
 * 
 * <p>Each set of rules consists of two groups, one forward, and one
 * reverse. This is a convention that is not enforced; rules for one
 * direction may be omitted, with the result that translations in
- * that direction will not modify the source text. Alternatively,
+ * that direction will not modify the source text. In addition,
 * bidirectional forward-reverse rules may be specified for
 * symmetrical transformations.</p>
 * 
@ -39,69 +39,27 @@ import com.ibm.util.Utility;
 * <p>Rule statements take one of the following forms: </p>
 * 
 * <dl>
- *     <dt><code>$alefmadda=\u0622</code></dt>
+ *     <dt><code>$alefmadda=\u0622;</code></dt>
 *     <dd><strong>Variable definition.</strong> The name on the
- *         left is assigned the character or expression on the
- *         right. Names must begin with a letter and consist only of
- *         letters, digits, and underscores. Case is significant.
- *         Duplicate names (including duplicates of simple variables
- *         or category names) cause an exception to be thrown. If
- *         the right hand side consists of one character, then the
- *         variable stands for that character. In this example,
+ *         left is assigned the text on the right. In this example,
 *         after this statement, instances of the left hand name,
 *         &quot;<code>$alefmadda</code>&quot;, will be replaced by
- *         the Unicode character U+0622. The right hand side must be
- *         exactly one character long (current limitation).</dd>
- *     <dt>&nbsp;</dt>
- *     <dt><code>$softvowel=[eiyEIY]</code></dt>
- *     <dd><strong>Category definition.</strong> The name on the
- *         left is assigned to stand for a set of characters. The
- *         same rules for names of simple variables apply. After
- *         this statement, the left hand variable will be
- *         interpreted as indicating a set of characters in
- *         appropriate contexts. The pattern syntax defining sets of
- *         characters is defined by {@link UnicodeSet}. Examples of
- *         valid patterns are:<table border="0">
- *             <tr>
- *                 <td valign="top" nowrap><code>[abc]</code></td>
- *                 <td valign="top">The set containing the
- *                 characters 'a', 'b', and 'c'.</td>
- *             </tr>
- *             <tr>
- *                 <td valign="top" nowrap><code>[^abc]</code></td>
- *                 <td valign="top">The set of all characters <em>except</em>
- *                 'a', 'b', and 'c'.</td>
- *             </tr>
- *             <tr>
- *                 <td valign="top" nowrap><code>[A-Z]</code></td>
- *                 <td valign="top">The set of all characters from
- *                 'A' to 'Z' in Unicode order.</td>
- *             </tr>
- *             <tr>
- *                 <td valign="top" nowrap><code>[:Lu:]</code></td>
- *                 <td valign="top">The set of Unicode uppercase
- *                 letters. See <a href="http://www.unicode.org">www.unicode.org</a>
- *                 for a complete list of categories and their
- *                 two-letter codes.</td>
- *             </tr>
- *             <tr>
- *                 <td valign="top" nowrap><code>[^a-z[:Lu:][:Ll:]]</code></td>
- *                 <td valign="top">The set of all characters <em>except</em>
- *                 'a' through 'z' and uppercase or lowercase
- *                 letters.</td>
- *             </tr>
- *         </table>
- *         <p>Patterns may contain variable references, such as
- *         &quot;<code>$a=[a-z];$not_a=[^$a]</code>&quot;. See
- *         {@link UnicodeSet} for more documentation and examples. </p>
- *     </dd>
- *     <dt><code>ai&gt;$alefmadda</code></dt>
+ *         the Unicode character U+0622. Variable names must begin
+ *         with a letter and consist only of letters, digits, and
+ *         underscores. Case is significant. Duplicate names cause
+ *         an exception to be thrown, that is, variables cannot be
+ *         redefined. The right hand side may contain well-formed
+ *         text of any length, including no text at all (&quot;<code>$empty=;</code>&quot;).
+ *         The right hand side may contain embedded <code>UnicodeSet</code>
+ *         patterns, for example, &quot;<code>$softvowel=[eiyEIY]</code>&quot;.</dd>
+ *     <dd>&nbsp;</dd>
+ *     <dt><code>ai&gt;$alefmadda;</code></dt>
 *     <dd><strong>Forward translation rule.</strong> This rule
 *         states that the string on the left will be changed to the
 *         string on the right when performing forward
 *         transliteration.</dd>
 *     <dt>&nbsp;</dt>
- *     <dt><code>ai&lt;$alefmadda</code></dt>
+ *     <dt><code>ai&lt;$alefmadda;</code></dt>
 *     <dd><strong>Reverse translation rule.</strong> This rule
 *         states that the string on the right will be changed to
 *         the string on the left when performing reverse
@ -109,7 +67,7 @@ import com.ibm.util.Utility;
 * </dl>
 * 
 * <dl>
- *     <dt><code>ai&lt;&gt;$alefmadda</code></dt>
+ *     <dt><code>ai&lt;&gt;$alefmadda;</code></dt>
 *     <dd><strong>Bidirectional translation rule.</strong> This
 *         rule states that the string on the right will be changed
 *         to the string on the left when performing forward
@ -151,9 +109,16 @@ import com.ibm.util.Utility;
 *     y and z</code></p>
 * </blockquote>
 * 
- * <p>In addition to being defined in variables, <code>UnicodeSet</code>
- * patterns may be embedded directly into rule strings. Thus, the
- * following two rules are equivalent:</p>
+ * <p><b>UnicodeSet</b></p>
+ * 
+ * <p><code>UnicodeSet</code> patterns may appear anywhere that
+ * makes sense. They may appear in variable definitions.
+ * Contrariwise, <code>UnicodeSet</code> patterns may themselves
+ * contain variable references, such as &quot;<code>$a=[a-z];$not_a=[^$a]</code>&quot;,
+ * or &quot;<code>$range=a-z;$ll=[$range]</code>&quot;.</p>
+ * 
+ * <p><code>UnicodeSet</code> patterns may also be embedded directly
+ * into rule strings. Thus, the following two rules are equivalent:</p>
 * 
 * <blockquote>
 *     <p><code>$vowel=[aeiou]; $vowel&gt;'*'; # One way to do this<br>
@ -162,6 +127,8 @@ import com.ibm.util.Utility;
 *     Another way</code></p>
 * </blockquote>
 * 
+ * <p>See {@link UnicodeSet} for more documentation and examples.</p>
+ * 
 * <p><b>Segments</b></p>
 * 
 * <p>Segments of the input string can be matched and copied to the
@ -169,7 +136,8 @@ import com.ibm.util.Utility;
 * general, and makes reordering possible. For example:</p>
 * 
 * <blockquote>
- *     <p><code>([a-z]) &gt; $1 $1; &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;#
+ *     <p><code>([a-z]) &gt; $1 $1;
+ *     &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;#
 *     double lowercase letters<br>
 *     ([:Lu:]) ([:Ll:]) &gt; $2 $1; # reverse order of Lu-Ll pairs</code></p>
 * </blockquote>
@ -284,7 +252,7 @@ import com.ibm.util.Utility;
 * <p>Copyright (c) IBM Corporation 1999-2000. All rights reserved.</p>
 * 
 * @author Alan Liu
- * @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.26 $ $Date: 2000/04/22 01:25:10 $
+ * @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.27 $ $Date: 2000/04/25 01:42:58 $
 */
 public class RuleBasedTransliterator extends Transliterator {

@ -455,15 +423,15 @@ public class RuleBasedTransliterator extends Transliterator {
        public TransliterationRuleSet ruleSet;

        /**
-         * Map variable name (String) to variable (Character).  A variable
-         * name may correspond to a single literal character, in which
-         * case the character is stored in this hash.  It may also
-         * correspond to a UnicodeSet, in which case a character is
-         * again stored in this hash, but the character is a stand-in: it
-         * is an index for a secondary lookup in data.setVariables.  The stand-in
-         * also represents the UnicodeSet in the stored rules.
+         * Map variable name (String) to variable (char[]).  A variable name
+         * corresponds to zero or more characters, stored in a char[] array in
+         * this hash.  One or more of these chars may also correspond to a
+         * UnicodeSet, in which case the character in the char[] in this hash is
+         * a stand-in: it is an index for a secondary lookup in
+         * data.setVariables.  The stand-in also represents the UnicodeSet in
+         * the stored rules.
         */
-        public Hashtable variableNames;
+        private Hashtable variableNames;

        /**
         * Map category variable (Character) to set (UnicodeSet).
@ -474,30 +442,30 @@ public class RuleBasedTransliterator extends Transliterator {
         * stored in the rule text to represent the set of characters.
         * setVariables[i] represents character (setVariablesBase + i).
         */
-        public UnicodeSet[] setVariables;
+        private UnicodeSet[] setVariables;

        /**
         * The character that represents setVariables[0].  Characters
         * setVariablesBase through setVariablesBase +
         * setVariables.length - 1 represent UnicodeSet objects.
         */
-        public char setVariablesBase;
-
-        /**
-         * Return the UnicodeSet represented by the given character, or
-         * null if none.
-         */
-        public UnicodeSet lookup(char c) {
-            int i = c - setVariablesBase;
-            return (i >= 0 && i < setVariables.length)
-                ? setVariables[i] : null;
-        }
+        private char setVariablesBase;

        /**
         * The character that represents segment 1.  Characters segmentBase
         * through segmentBase + 8 represent segments 1 through 9.
         */
-        public char segmentBase;
+        private char segmentBase;
+
+        /**
+         * Return the UnicodeSet represented by the given character, or
+         * null if none.
+         */
+        public UnicodeSet lookupSet(char c) {
+            int i = c - setVariablesBase;
+            return (i >= 0 && i < setVariables.length)
+                ? setVariables[i] : null;
+        }

        /**
         * Return the zero-based index of the segment represented by the given
@ -531,18 +499,23 @@ public class RuleBasedTransliterator extends Transliterator {
        private class ParseData implements SymbolTable {
            
            /**
-             * Implement SymbolTable API.  Lookup a variable, returning
-             * either a Character, a UnicodeSet, or null.
+             * Implement SymbolTable API.
             */
-            public Object lookup(String name) {
-                Character ch = (Character) data.variableNames.get(name);
-                if (ch != null) {
-                    int i = ch.charValue() - data.setVariablesBase;
-                    if (i >= 0 && i < setVariablesVector.size()) {
-                        return setVariablesVector.elementAt(i);
-                    }
+            public char[] lookup(String name) {
+                return (char[]) data.variableNames.get(name);
+            }
+
+            /**
+             * Implement SymbolTable API.
+             */
+            public UnicodeSet lookupSet(char ch) {
+                // Note that we cannot use data.lookupSet() because the
+                // set array has not been constructed yet.
+                int i = ch - data.setVariablesBase;
+                if (i >= 0 && i < setVariablesVector.size()) {
+                    return (UnicodeSet) setVariablesVector.elementAt(i);
                }
-                return ch;
+                return null;
            }

            /**
@ -869,10 +842,13 @@ public class RuleBasedTransliterator extends Transliterator {
                                String name = parser.parseData.
                                                parseReference(rule, pp, limit);
                                pos = pp.getIndex();
-                                // If this is a variable definition statement, then the LHS
-                                // variable will be undefined.  In that case getVariableName()
-                                // will return the special placeholder variableLimit-1.
-                                buf.append(parser.getVariableDef(name));
+                                // If this is a variable definition statement,
+                                // then the LHS variable will be undefined.  In
+                                // that case appendVariableDef() will append the
+                                // special placeholder char variableLimit-1.
+
+                                //buf.append(parser.getVariableDef(name));
+                                parser.appendVariableDef(name, buf);
                            }
                        }
                        break;
@ -1035,11 +1011,12 @@ public class RuleBasedTransliterator extends Transliterator {
                if (left.text.length() != 1 || left.text.charAt(0) != variableLimit) {
                    syntaxError("Malformed LHS", rule, start);
                }
-                if (right.text.length() != 1) {
-                    syntaxError("Malformed RHS", rule, start);
-                }
-                data.variableNames.put(undefinedVariableName,
-                                       new Character(right.text.charAt(0)));
+                // We allow anything on the right, including an empty string.
+                int n = right.text.length();
+                char[] value = new char[n];
+                right.text.getChars(0, n, value, 0);
+                data.variableNames.put(undefinedVariableName, value);
+
                ++variableLimit;
                return pos;
            }
@ -1157,12 +1134,12 @@ public class RuleBasedTransliterator extends Transliterator {
        }

        /**
-         * Returns the single character value of the given variable name.  Defined
-         * names are recognized.
+         * Append the value of the given variable name to the given
+         * StringBuffer.
         * @exception IllegalArgumentException if the name is unknown.
         */
-        private char getVariableDef(String name) {
-            Character ch = (Character) data.variableNames.get(name);
+        private void appendVariableDef(String name, StringBuffer buf) {
+            char[] ch = (char[]) data.variableNames.get(name);
            if (ch == null) {
                // We allow one undefined variable so that variable definition
                // statements work.  For the first undefined variable we return
@ -1173,12 +1150,14 @@ public class RuleBasedTransliterator extends Transliterator {
                    if (variableNext >= variableLimit) {
                        throw new RuntimeException("Private use variables exhausted");
                    }
-                    return --variableLimit;
+                    buf.append((char) --variableLimit);
+                } else {
+                    throw new IllegalArgumentException("Undefined variable $"
+                                                       + name);
                }
-                throw new IllegalArgumentException("Undefined variable $"
-                                                   + name);
+            } else {
+                buf.append(ch);
            }
-            return ch.charValue();
        }

        /**
@ -1346,6 +1325,9 @@ public class RuleBasedTransliterator extends Transliterator {

 /**
 * $Log: RuleBasedTransliterator.java,v $
+ * Revision 1.27  2000/04/25 01:42:58  alan
+ * Allow arbitrary length variable values. Clean up Data API. Update javadocs.
+ *
 * Revision 1.26  2000/04/22 01:25:10  alan
 * Add support for cursor positioner '@'; update javadoc
 *
--- a/icu4j/src/com/ibm/icu/text/SymbolTable.java
+++ b/icu4j/src/com/ibm/icu/text/SymbolTable.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/SymbolTable.java,v $ 
- * $Date: 2000/04/21 22:16:29 $ 
- * $Revision: 1.3 $
+ * $Date: 2000/04/25 01:42:58 $ 
+ * $Revision: 1.4 $
 *
 *****************************************************************************************
 */
@ -32,10 +32,17 @@ public interface SymbolTable {
    final char SYMBOL_REF = '$';

    /**
-     * Lookup the object associated with this string and return it.
-     * Return <tt>null</tt> if no such name exists.
+     * Look up the characters associated with this string and return them.
+     * Return <tt>null</tt> if no such name exists.  The resultant
+     * array may have length zero.
     */
-    Object lookup(String s);
+    char[] lookup(String s);
+
+    /**
+     * Lookup the UnicodeSet associated with the given character, and
+     * return it.  Return <tt>null</tt> if not found.
+     */
+    UnicodeSet lookupSet(char ch);

    /**
     * Parse a symbol reference name from the given string, starting
--- a/icu4j/src/com/ibm/icu/text/TransliterationRule.java
+++ b/icu4j/src/com/ibm/icu/text/TransliterationRule.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/TransliterationRule.java,v $ 
- * $Date: 2000/04/22 01:25:10 $ 
- * $Revision: 1.18 $
+ * $Date: 2000/04/25 01:42:58 $ 
+ * $Revision: 1.19 $
 *
 *****************************************************************************************
 */
@ -44,7 +44,7 @@ import com.ibm.util.Utility;
 * <p>Copyright &copy; IBM Corporation 1999.  All rights reserved.
 *
 * @author Alan Liu
- * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.18 $ $Date: 2000/04/22 01:25:10 $
+ * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.19 $ $Date: 2000/04/25 01:42:58 $
 */
 class TransliterationRule {
    /**
@ -240,7 +240,7 @@ class TransliterationRule {
            return -1;
        }
        char c = pattern.charAt(anteContextLength);
-        return variables.lookup(c) == null ? (c & 0xFF) : -1;
+        return variables.lookupSet(c) == null ? (c & 0xFF) : -1;
    }

    /**
@ -300,7 +300,7 @@ class TransliterationRule {
            return true;
        }
        char c = pattern.charAt(anteContextLength);
-        UnicodeSet set = variables.lookup(c);
+        UnicodeSet set = variables.lookupSet(c);
        return set == null ? (c & 0xFF) == v : set.containsIndexValue(v);
    }

@ -486,13 +486,16 @@ class TransliterationRule {
                                               UnicodeFilter filter) {
        UnicodeSet set = null;
        return (filter == null || filter.contains(textChar)) &&
-            (((set = variables.lookup(keyChar)) == null) ?
+            (((set = variables.lookupSet(keyChar)) == null) ?
             keyChar == textChar : set.contains(textChar));
    }
 }

 /**
 * $Log: TransliterationRule.java,v $
+ * Revision 1.19  2000/04/25 01:42:58  alan
+ * Allow arbitrary length variable values. Clean up Data API. Update javadocs.
+ *
 * Revision 1.18  2000/04/22 01:25:10  alan
 * Add support for cursor positioner '@'; update javadoc
 *
--- a/icu4j/src/com/ibm/icu/text/UnicodeSet.java
+++ b/icu4j/src/com/ibm/icu/text/UnicodeSet.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/UnicodeSet.java,v $ 
- * $Date: 2000/04/21 22:16:29 $ 
- * $Revision: 1.18 $
+ * $Date: 2000/04/25 01:42:58 $ 
+ * $Revision: 1.19 $
 *
 *****************************************************************************************
 */
@ -241,7 +241,7 @@ import java.text.*;
 * *Unsupported by Java (and hence unsupported by UnicodeSet).
 *
 * @author Alan Liu
- * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.18 $ $Date: 2000/04/21 22:16:29 $
+ * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.19 $ $Date: 2000/04/25 01:42:58 $
 */
 public class UnicodeSet implements UnicodeFilter {
    /**
@ -774,7 +774,13 @@ public class UnicodeSet implements UnicodeFilter {
        int start = pos.getIndex();
        int i = start;
        int limit = pattern.length();
-        for (; i<limit; ++i) {
+        /* In the case of an embedded SymbolTable variable, we look it up and
+         * then take characters from the resultant char[] array.  These chars
+         * are subjected to an extra level of lookup in the SymbolTable in case
+         * they are stand-ins for a nested UnicodeSet.  */
+        char[] varValueBuffer = null;
+        int ivarValueBuffer = 0;
+        for (; i<limit; i+=((varValueBuffer==null)?1:0)) {
            /* If the next element is a single character, c will be set to it,
             * and nestedPairs will be null.  In this case isLiteral indicates
             * whether the character should assume special meaning if it has
@ -783,9 +789,23 @@ public class UnicodeSet implements UnicodeFilter {
             * nestedPairs will be set to the pairs list for the nested set, and
             * c's value should be ignored.
             */
-            char c = pattern.charAt(i);
            String nestedPairs = null;
            boolean isLiteral = false;
+            char c;
+            if (varValueBuffer != null) {
+                if (ivarValueBuffer < varValueBuffer.length) {
+                    c = varValueBuffer[ivarValueBuffer++];
+                    UnicodeSet set = symbols.lookupSet(c);
+                    if (set != null) {
+                        nestedPairs = set.pairs.toString();
+                    }
+                } else {
+                    varValueBuffer = null;
+                    c = pattern.charAt(i);
+                }
+            } else {
+                c = pattern.charAt(i);
+            }

            // Ignore whitespace.  This is not Unicode whitespace, but Java
            // whitespace, a subset of Unicode whitespace.
@ -829,82 +849,97 @@ public class UnicodeSet implements UnicodeFilter {
            // will be 2 if we want a closing ']', or 3 if we should parse a
            // category and close with ":]".

-            /* Handle escapes.  If a character is escaped, then it assumes its
-             * literal value.  This is true for all characters, both special
-             * characters and characters with no special meaning.  We also
-             * interpret '\\uxxxx' Unicode escapes here (as literals).
-             */
-            if (c == '\\') {
-                ++i;
-                if (i < limit) {
-                    c = pattern.charAt(i);
-                    isLiteral = true;
-                    if (c == 'u') {
-                        if ((i+4) >= limit) {
-                            throw new IllegalArgumentException("Invalid \\u escape");
-                        }
-                        c = '\u0000';
-                        for (int j=(++i)+4; i<j; ++i) { // [sic]
-                            int digit = Character.digit(pattern.charAt(i), 16);
-                            if (digit<0) {
+            // Only process escapes, variable references, and nested sets
+            // if we are _not_ retrieving characters from the variable
+            // buffer.  Characters in the variable buffer have already
+            // been through escape and variable reference processing.
+            if (varValueBuffer == null) {
+                /* Handle escapes.  If a character is escaped, then it assumes its
+                 * literal value.  This is true for all characters, both special
+                 * characters and characters with no special meaning.  We also
+                 * interpret '\\uxxxx' Unicode escapes here (as literals).
+                 */
+                if (c == '\\') {
+                    ++i;
+                    if (i < limit) {
+                        c = pattern.charAt(i);
+                        isLiteral = true;
+                        if (c == 'u') {
+                            if ((i+4) >= limit) {
                                throw new IllegalArgumentException("Invalid \\u escape");
                            }
-                            c = (char) ((c << 4) | digit);
+                            c = '\u0000';
+                            for (int j=(++i)+4; i<j; ++i) { // [sic]
+                                int digit = Character.digit(pattern.charAt(i), 16);
+                                if (digit<0) {
+                                    throw new IllegalArgumentException("Invalid \\u escape");
+                                }
+                                c = (char) ((c << 4) | digit);
+                            }
+                            --i; // Move i back to last parsed character
                        }
-                        --i; // Move i back to last parsed character
+                    } else {
+                        throw new IllegalArgumentException("Trailing '\\'");
                    }
-                } else {
-                    throw new IllegalArgumentException("Trailing '\\'");
                }
-            }

-            /* Parse variable references.  These are treated as literals.  If a
-             * variable refers to a UnicodeSet, nestedPairs is assigned here.
-             * Variable names are only parsed if varNameToChar is not null.
-             * Set variables are only looked up if varCharToSet is not null.
-             */
-            else if (symbols != null && !isLiteral && c == SymbolTable.SYMBOL_REF) {
-                pos.setIndex(++i);
-                String name = symbols.parseReference(pattern, pos, limit);
-                Object obj = symbols.lookup(name);
-                if (obj == null) {
-                    throw new IllegalArgumentException("Undefined variable: "
-                                                       + name);
+                /* Parse variable references.  The variable's char[] value is
+                 * stored in varValueBuffer and consumed at the top of the loop,
+                 * where a stand-in for a UnicodeSet assigns nestedPairs.
+                 * Variable references are only parsed if symbols is not null.
+                 */
+                else if (symbols != null && !isLiteral && c == SymbolTable.SYMBOL_REF) {
+                    pos.setIndex(++i);
+                    String name = symbols.parseReference(pattern, pos, limit);
+                    /*
+                    Object obj = symbols.lookup(name);
+                    if (obj == null) {
+                        throw new IllegalArgumentException("Undefined variable: "
+                                                           + name);
+                    }
+                    isLiteral = true;
+                    if (obj instanceof Character) {
+                        c = ((Character) obj).charValue();
+                    } else {
+                        nestedPairs = ((UnicodeSet) obj).pairs.toString();
+                    }
+                    */
+                    varValueBuffer = symbols.lookup(name);
+                    if (varValueBuffer == null) {
+                        throw new IllegalArgumentException("Undefined variable: "
+                                                           + name);
+                    }
+                    ivarValueBuffer = 0;
+                    i = pos.getIndex()-1; // Make i point at last char of var name
+                    continue; // Back to the top to get varValueBuffer[0]
                }
-                isLiteral = true;
-                if (obj instanceof Character) {
-                    c = ((Character) obj).charValue();
-                } else {
-                    nestedPairs = ((UnicodeSet) obj).pairs.toString();
-                }
-                i = pos.getIndex()-1; // Make i point at last char of var name
-            }

-            /* An opening bracket indicates the first bracket of a nested
-             * subpattern, either a normal pattern or a category pattern.  We
-             * recognize these here and set nestedPairs accordingly.
-             */
-            else if (!isLiteral && c == '[') {
-                // Handle "[:...:]", representing a character category
-                char d = charAfter(pattern, i);
-                if (d == ':') {
-                    i += 2;
-                    int j = pattern.indexOf(":]", i);
-                    if (j < 0) {
-                        throw new IllegalArgumentException("Missing \":]\"");
+                /* An opening bracket indicates the first bracket of a nested
+                 * subpattern, either a normal pattern or a category pattern.  We
+                 * recognize these here and set nestedPairs accordingly.
+                 */
+                else if (!isLiteral && c == '[') {
+                    // Handle "[:...:]", representing a character category
+                    char d = charAfter(pattern, i);
+                    if (d == ':') {
+                        i += 2;
+                        int j = pattern.indexOf(":]", i);
+                        if (j < 0) {
+                            throw new IllegalArgumentException("Missing \":]\"");
+                        }
+                        nestedPairs = getCategoryPairs(pattern.substring(i, j));
+                        i = j+1; // Make i point to ']' in ":]"
+                        if (mode == 3) {
+                            // Entire pattern is a category; leave parse loop
+                            pairsBuf.append(nestedPairs);
+                            break;
+                        }
+                    } else {
+                        // Recurse to get the pairs for this nested set.
+                        pos.setIndex(i); // Add 2 to point AFTER op
+                        nestedPairs = parse(pattern, pos, symbols, ignoreWhitespace).toString();
+                        i = pos.getIndex() - 1; // - 1 to point at ']'
                    }
-                    nestedPairs = getCategoryPairs(pattern.substring(i, j));
-                    i = j+1; // Make i point to ']' in ":]"
-                    if (mode == 3) {
-                        // Entire pattern is a category; leave parse loop
-                        pairsBuf.append(nestedPairs);
-                        break;
-                    }
-                } else {
-                    // Recurse to get the pairs for this nested set.
-                    pos.setIndex(i); // Add 2 to point AFTER op
-                    nestedPairs = parse(pattern, pos, symbols, ignoreWhitespace).toString();
-                    i = pos.getIndex() - 1; // - 1 to point at ']'
                }
            }

@ -994,6 +1029,13 @@ public class UnicodeSet implements UnicodeFilter {
        }
        pos.setIndex(i+1);

+        if (false) {
+            // Debug parser
+            System.out.println("UnicodeSet(" + 
+                               pattern.substring(start, i+1) + ") -> " +
+                               pairsBuf.toString());
+        }
+
        return pairsBuf;
    }

--- a/icu4j/src/com/ibm/text/RuleBasedTransliterator.java
+++ b/icu4j/src/com/ibm/text/RuleBasedTransliterator.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/RuleBasedTransliterator.java,v $ 
- * $Date: 2000/04/22 01:25:10 $ 
- * $Revision: 1.26 $
+ * $Date: 2000/04/25 01:42:58 $ 
+ * $Revision: 1.27 $
 *
 *****************************************************************************************
 */
@ -20,17 +20,17 @@ import com.ibm.util.Utility;
 /**
 * <strong>RuleBasedTransliterator</strong> is a transliterator
 * that reads a set of rules in order to determine how to perform
- * translations. Rules are stored in resource bundles indexed by
- * name. Rules are separated by semicolons (';'). To include a
- * literal semicolon, prefix it with a backslash ('\'). Whitespace,
- * as defined by <code>Character.isWhitespace()</code>, is ignored.
- * If the first non-blank character on a line is '#', the entire
- * line is ignored as a comment. </p>
+ * translations. Rule sets are stored in resource bundles indexed by
+ * name. Rules within a rule set are separated by semicolons (';').
+ * To include a literal semicolon, prefix it with a backslash ('\').
+ * Whitespace, as defined by <code>Character.isWhitespace()</code>,
+ * is ignored. If the first non-blank character on a line is '#',
+ * the entire line is ignored as a comment. </p>
 * 
 * <p>Each set of rules consists of two groups, one forward, and one
 * reverse. This is a convention that is not enforced; rules for one
 * direction may be omitted, with the result that translations in
- * that direction will not modify the source text. Alternatively,
+ * that direction will not modify the source text. In addition,
 * bidirectional forward-reverse rules may be specified for
 * symmetrical transformations.</p>
 * 
@ -39,69 +39,27 @@ import com.ibm.util.Utility;
 * <p>Rule statements take one of the following forms: </p>
 * 
 * <dl>
- *     <dt><code>$alefmadda=\u0622</code></dt>
+ *     <dt><code>$alefmadda=\u0622;</code></dt>
 *     <dd><strong>Variable definition.</strong> The name on the
- *         left is assigned the character or expression on the
- *         right. Names must begin with a letter and consist only of
- *         letters, digits, and underscores. Case is significant.
- *         Duplicate names (including duplicates of simple variables
- *         or category names) cause an exception to be thrown. If
- *         the right hand side consists of one character, then the
- *         variable stands for that character. In this example,
+ *         left is assigned the text on the right. In this example,
 *         after this statement, instances of the left hand name,
 *         &quot;<code>$alefmadda</code>&quot;, will be replaced by
- *         the Unicode character U+0622. The right hand side must be
- *         exactly one character long (current limitation).</dd>
- *     <dt>&nbsp;</dt>
- *     <dt><code>$softvowel=[eiyEIY]</code></dt>
- *     <dd><strong>Category definition.</strong> The name on the
- *         left is assigned to stand for a set of characters. The
- *         same rules for names of simple variables apply. After
- *         this statement, the left hand variable will be
- *         interpreted as indicating a set of characters in
- *         appropriate contexts. The pattern syntax defining sets of
- *         characters is defined by {@link UnicodeSet}. Examples of
- *         valid patterns are:<table border="0">
- *             <tr>
- *                 <td valign="top" nowrap><code>[abc]</code></td>
- *                 <td valign="top">The set containing the
- *                 characters 'a', 'b', and 'c'.</td>
- *             </tr>
- *             <tr>
- *                 <td valign="top" nowrap><code>[^abc]</code></td>
- *                 <td valign="top">The set of all characters <em>except</em>
- *                 'a', 'b', and 'c'.</td>
- *             </tr>
- *             <tr>
- *                 <td valign="top" nowrap><code>[A-Z]</code></td>
- *                 <td valign="top">The set of all characters from
- *                 'A' to 'Z' in Unicode order.</td>
- *             </tr>
- *             <tr>
- *                 <td valign="top" nowrap><code>[:Lu:]</code></td>
- *                 <td valign="top">The set of Unicode uppercase
- *                 letters. See <a href="http://www.unicode.org">www.unicode.org</a>
- *                 for a complete list of categories and their
- *                 two-letter codes.</td>
- *             </tr>
- *             <tr>
- *                 <td valign="top" nowrap><code>[^a-z[:Lu:][:Ll:]]</code></td>
- *                 <td valign="top">The set of all characters <em>except</em>
- *                 'a' through 'z' and uppercase or lowercase
- *                 letters.</td>
- *             </tr>
- *         </table>
- *         <p>Patterns may contain variable references, such as
- *         &quot;<code>$a=[a-z];$not_a=[^$a]</code>&quot;. See
- *         {@link UnicodeSet} for more documentation and examples. </p>
- *     </dd>
- *     <dt><code>ai&gt;$alefmadda</code></dt>
+ *         the Unicode character U+0622. Variable names must begin
+ *         with a letter and consist only of letters, digits, and
+ *         underscores. Case is significant. Duplicate names cause
+ *         an exception to be thrown, that is, variables cannot be
+ *         redefined. The right hand side may contain well-formed
+ *         text of any length, including no text at all (&quot;<code>$empty=;</code>&quot;).
+ *         The right hand side may contain embedded <code>UnicodeSet</code>
+ *         patterns, for example, &quot;<code>$softvowel=[eiyEIY]</code>&quot;.</dd>
+ *     <dd>&nbsp;</dd>
+ *     <dt><code>ai&gt;$alefmadda;</code></dt>
 *     <dd><strong>Forward translation rule.</strong> This rule
 *         states that the string on the left will be changed to the
 *         string on the right when performing forward
 *         transliteration.</dd>
 *     <dt>&nbsp;</dt>
- *     <dt><code>ai&lt;$alefmadda</code></dt>
+ *     <dt><code>ai&lt;$alefmadda;</code></dt>
 *     <dd><strong>Reverse translation rule.</strong> This rule
 *         states that the string on the right will be changed to
 *         the string on the left when performing reverse
@ -109,7 +67,7 @@ import com.ibm.util.Utility;
 * </dl>
 * 
 * <dl>
- *     <dt><code>ai&lt;&gt;$alefmadda</code></dt>
+ *     <dt><code>ai&lt;&gt;$alefmadda;</code></dt>
 *     <dd><strong>Bidirectional translation rule.</strong> This
 *         rule states that the string on the right will be changed
 *         to the string on the left when performing forward
@ -151,9 +109,16 @@ import com.ibm.util.Utility;
 *     y and z</code></p>
 * </blockquote>
 * 
- * <p>In addition to being defined in variables, <code>UnicodeSet</code>
- * patterns may be embedded directly into rule strings. Thus, the
- * following two rules are equivalent:</p>
+ * <p><b>UnicodeSet</b></p>
+ * 
+ * <p><code>UnicodeSet</code> patterns may appear anywhere that
+ * makes sense. They may appear in variable definitions.
+ * Contrariwise, <code>UnicodeSet</code> patterns may themselves
+ * contain variable references, such as &quot;<code>$a=[a-z];$not_a=[^$a]</code>&quot;,
+ * or &quot;<code>$range=a-z;$ll=[$range]</code>&quot;.</p>
+ * 
+ * <p><code>UnicodeSet</code> patterns may also be embedded directly
+ * into rule strings. Thus, the following two rules are equivalent:</p>
 * 
 * <blockquote>
 *     <p><code>$vowel=[aeiou]; $vowel&gt;'*'; # One way to do this<br>
@ -162,6 +127,8 @@ import com.ibm.util.Utility;
 *     Another way</code></p>
 * </blockquote>
 * 
+ * <p>See {@link UnicodeSet} for more documentation and examples.</p>
+ * 
 * <p><b>Segments</b></p>
 * 
 * <p>Segments of the input string can be matched and copied to the
@ -169,7 +136,8 @@ import com.ibm.util.Utility;
 * general, and makes reordering possible. For example:</p>
 * 
 * <blockquote>
- *     <p><code>([a-z]) &gt; $1 $1; &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;#
+ *     <p><code>([a-z]) &gt; $1 $1;
+ *     &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;#
 *     double lowercase letters<br>
 *     ([:Lu:]) ([:Ll:]) &gt; $2 $1; # reverse order of Lu-Ll pairs</code></p>
 * </blockquote>
@ -284,7 +252,7 @@ import com.ibm.util.Utility;
 * <p>Copyright (c) IBM Corporation 1999-2000. All rights reserved.</p>
 * 
 * @author Alan Liu
- * @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.26 $ $Date: 2000/04/22 01:25:10 $
+ * @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.27 $ $Date: 2000/04/25 01:42:58 $
 */
 public class RuleBasedTransliterator extends Transliterator {

@ -455,15 +423,15 @@ public class RuleBasedTransliterator extends Transliterator {
        public TransliterationRuleSet ruleSet;

        /**
-         * Map variable name (String) to variable (Character).  A variable
-         * name may correspond to a single literal character, in which
-         * case the character is stored in this hash.  It may also
-         * correspond to a UnicodeSet, in which case a character is
-         * again stored in this hash, but the character is a stand-in: it
-         * is an index for a secondary lookup in data.setVariables.  The stand-in
-         * also represents the UnicodeSet in the stored rules.
+         * Map variable name (String) to variable (char[]).  A variable name
+         * corresponds to zero or more characters, stored in a char[] array in
+         * this hash.  One or more of these chars may also correspond to a
+         * UnicodeSet, in which case the character in the char[] in this hash is
+         * a stand-in: it is an index for a secondary lookup in
+         * data.setVariables.  The stand-in also represents the UnicodeSet in
+         * the stored rules.
         */
-        public Hashtable variableNames;
+        private Hashtable variableNames;

        /**
         * Map category variable (Character) to set (UnicodeSet).
@ -474,30 +442,30 @@ public class RuleBasedTransliterator extends Transliterator {
         * stored in the rule text to represent the set of characters.
         * setVariables[i] represents character (setVariablesBase + i).
         */
-        public UnicodeSet[] setVariables;
+        private UnicodeSet[] setVariables;

        /**
         * The character that represents setVariables[0].  Characters
         * setVariablesBase through setVariablesBase +
         * setVariables.length - 1 represent UnicodeSet objects.
         */
-        public char setVariablesBase;
-
-        /**
-         * Return the UnicodeSet represented by the given character, or
-         * null if none.
-         */
-        public UnicodeSet lookup(char c) {
-            int i = c - setVariablesBase;
-            return (i >= 0 && i < setVariables.length)
-                ? setVariables[i] : null;
-        }
+        private char setVariablesBase;

        /**
         * The character that represents segment 1.  Characters segmentBase
         * through segmentBase + 8 represent segments 1 through 9.
         */
-        public char segmentBase;
+        private char segmentBase;
+
+        /**
+         * Return the UnicodeSet represented by the given character, or
+         * null if none.
+         */
+        public UnicodeSet lookupSet(char c) {
+            int i = c - setVariablesBase;
+            return (i >= 0 && i < setVariables.length)
+                ? setVariables[i] : null;
+        }

        /**
         * Return the zero-based index of the segment represented by the given
@ -531,18 +499,23 @@ public class RuleBasedTransliterator extends Transliterator {
        private class ParseData implements SymbolTable {
            
            /**
-             * Implement SymbolTable API.  Lookup a variable, returning
-             * either a Character, a UnicodeSet, or null.
+             * Implement SymbolTable API.
             */
-            public Object lookup(String name) {
-                Character ch = (Character) data.variableNames.get(name);
-                if (ch != null) {
-                    int i = ch.charValue() - data.setVariablesBase;
-                    if (i >= 0 && i < setVariablesVector.size()) {
-                        return setVariablesVector.elementAt(i);
-                    }
+            public char[] lookup(String name) {
+                return (char[]) data.variableNames.get(name);
+            }
+
+            /**
+             * Implement SymbolTable API.
+             */
+            public UnicodeSet lookupSet(char ch) {
+                // Note that we cannot use data.lookupSet() because the
+                // set array has not been constructed yet.
+                int i = ch - data.setVariablesBase;
+                if (i >= 0 && i < setVariablesVector.size()) {
+                    return (UnicodeSet) setVariablesVector.elementAt(i);
                }
-                return ch;
+                return null;
            }

            /**
@ -869,10 +842,13 @@ public class RuleBasedTransliterator extends Transliterator {
                                String name = parser.parseData.
                                                parseReference(rule, pp, limit);
                                pos = pp.getIndex();
-                                // If this is a variable definition statement, then the LHS
-                                // variable will be undefined.  In that case getVariableName()
-                                // will return the special placeholder variableLimit-1.
-                                buf.append(parser.getVariableDef(name));
+                                // If this is a variable definition statement,
+                                // then the LHS variable will be undefined.  In
+                                // that case appendVariableDef() will append the
+                                // special placeholder char variableLimit-1.
+
+                                //buf.append(parser.getVariableDef(name));
+                                parser.appendVariableDef(name, buf);
                            }
                        }
                        break;
@ -1035,11 +1011,12 @@ public class RuleBasedTransliterator extends Transliterator {
                if (left.text.length() != 1 || left.text.charAt(0) != variableLimit) {
                    syntaxError("Malformed LHS", rule, start);
                }
-                if (right.text.length() != 1) {
-                    syntaxError("Malformed RHS", rule, start);
-                }
-                data.variableNames.put(undefinedVariableName,
-                                       new Character(right.text.charAt(0)));
+                // We allow anything on the right, including an empty string.
+                int n = right.text.length();
+                char[] value = new char[n];
+                right.text.getChars(0, n, value, 0);
+                data.variableNames.put(undefinedVariableName, value);
+
                ++variableLimit;
                return pos;
            }
@ -1157,12 +1134,12 @@ public class RuleBasedTransliterator extends Transliterator {
        }

        /**
-         * Returns the single character value of the given variable name.  Defined
-         * names are recognized.
+         * Append the value of the given variable name to the given
+         * StringBuffer.
         * @exception IllegalArgumentException if the name is unknown.
         */
-        private char getVariableDef(String name) {
-            Character ch = (Character) data.variableNames.get(name);
+        private void appendVariableDef(String name, StringBuffer buf) {
+            char[] ch = (char[]) data.variableNames.get(name);
            if (ch == null) {
                // We allow one undefined variable so that variable definition
                // statements work.  For the first undefined variable we return
@ -1173,12 +1150,14 @@ public class RuleBasedTransliterator extends Transliterator {
                    if (variableNext >= variableLimit) {
                        throw new RuntimeException("Private use variables exhausted");
                    }
-                    return --variableLimit;
+                    buf.append((char) --variableLimit);
+                } else {
+                    throw new IllegalArgumentException("Undefined variable $"
+                                                       + name);
                }
-                throw new IllegalArgumentException("Undefined variable $"
-                                                   + name);
+            } else {
+                buf.append(ch);
            }
-            return ch.charValue();
        }

        /**
@ -1346,6 +1325,9 @@ public class RuleBasedTransliterator extends Transliterator {

 /**
 * $Log: RuleBasedTransliterator.java,v $
+ * Revision 1.27  2000/04/25 01:42:58  alan
+ * Allow arbitrary length variable values. Clean up Data API. Update javadocs.
+ *
 * Revision 1.26  2000/04/22 01:25:10  alan
 * Add support for cursor positioner '@'; update javadoc
 *
--- a/icu4j/src/com/ibm/text/SymbolTable.java
+++ b/icu4j/src/com/ibm/text/SymbolTable.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/SymbolTable.java,v $ 
- * $Date: 2000/04/21 22:16:29 $ 
- * $Revision: 1.3 $
+ * $Date: 2000/04/25 01:42:58 $ 
+ * $Revision: 1.4 $
 *
 *****************************************************************************************
 */
@ -32,10 +32,17 @@ public interface SymbolTable {
    final char SYMBOL_REF = '$';

    /**
-     * Lookup the object associated with this string and return it.
-     * Return <tt>null</tt> if no such name exists.
+     * Look up the characters associated with this string and return them.
+     * Return <tt>null</tt> if no such name exists.  The resultant
+     * array may have length zero.
     */
-    Object lookup(String s);
+    char[] lookup(String s);
+
+    /**
+     * Look up the UnicodeSet associated with the given character, and
+     * return it.  Return <tt>null</tt> if not found.
+     */
+    UnicodeSet lookupSet(char ch);

    /**
     * Parse a symbol reference name from the given string, starting
--- a/icu4j/src/com/ibm/text/TransliterationRule.java
+++ b/icu4j/src/com/ibm/text/TransliterationRule.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/TransliterationRule.java,v $ 
- * $Date: 2000/04/22 01:25:10 $ 
- * $Revision: 1.18 $
+ * $Date: 2000/04/25 01:42:58 $ 
+ * $Revision: 1.19 $
 *
 *****************************************************************************************
 */
@ -44,7 +44,7 @@ import com.ibm.util.Utility;
 * <p>Copyright &copy; IBM Corporation 1999.  All rights reserved.
 *
 * @author Alan Liu
- * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.18 $ $Date: 2000/04/22 01:25:10 $
+ * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.19 $ $Date: 2000/04/25 01:42:58 $
 */
 class TransliterationRule {
    /**
@ -240,7 +240,7 @@ class TransliterationRule {
            return -1;
        }
        char c = pattern.charAt(anteContextLength);
-        return variables.lookup(c) == null ? (c & 0xFF) : -1;
+        return variables.lookupSet(c) == null ? (c & 0xFF) : -1;
    }

    /**
@ -300,7 +300,7 @@ class TransliterationRule {
            return true;
        }
        char c = pattern.charAt(anteContextLength);
-        UnicodeSet set = variables.lookup(c);
+        UnicodeSet set = variables.lookupSet(c);
        return set == null ? (c & 0xFF) == v : set.containsIndexValue(v);
    }

@ -486,13 +486,16 @@ class TransliterationRule {
                                               UnicodeFilter filter) {
        UnicodeSet set = null;
        return (filter == null || filter.contains(textChar)) &&
-            (((set = variables.lookup(keyChar)) == null) ?
+            (((set = variables.lookupSet(keyChar)) == null) ?
             keyChar == textChar : set.contains(textChar));
    }
 }

 /**
 * $Log: TransliterationRule.java,v $
+ * Revision 1.19  2000/04/25 01:42:58  alan
+ * Allow arbitrary length variable values. Clean up Data API. Update javadocs.
+ *
 * Revision 1.18  2000/04/22 01:25:10  alan
 * Add support for cursor positioner '@'; update javadoc
 *
--- a/icu4j/src/com/ibm/text/UnicodeSet.java
+++ b/icu4j/src/com/ibm/text/UnicodeSet.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/UnicodeSet.java,v $ 
- * $Date: 2000/04/21 22:16:29 $ 
- * $Revision: 1.18 $
+ * $Date: 2000/04/25 01:42:58 $ 
+ * $Revision: 1.19 $
 *
 *****************************************************************************************
 */
@ -241,7 +241,7 @@ import java.text.*;
 * *Unsupported by Java (and hence unsupported by UnicodeSet).
 *
 * @author Alan Liu
- * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.18 $ $Date: 2000/04/21 22:16:29 $
+ * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.19 $ $Date: 2000/04/25 01:42:58 $
 */
 public class UnicodeSet implements UnicodeFilter {
    /**
@ -774,7 +774,13 @@ public class UnicodeSet implements UnicodeFilter {
        int start = pos.getIndex();
        int i = start;
        int limit = pattern.length();
-        for (; i<limit; ++i) {
+        /* In the case of an embedded SymbolTable variable, we look it up and
+         * then take characters from the resultant char[] array.  These chars
+         * are subjected to an extra level of lookup in the SymbolTable in case
+         * they are stand-ins for a nested UnicodeSet.  */
+        char[] varValueBuffer = null;
+        int ivarValueBuffer = 0;
+        for (; i<limit; i+=((varValueBuffer==null)?1:0)) {
            /* If the next element is a single character, c will be set to it,
             * and nestedPairs will be null.  In this case isLiteral indicates
             * whether the character should assume special meaning if it has
@ -783,9 +789,23 @@ public class UnicodeSet implements UnicodeFilter {
             * nestedPairs will be set to the pairs list for the nested set, and
             * c's value should be ignored.
             */
-            char c = pattern.charAt(i);
            String nestedPairs = null;
            boolean isLiteral = false;
+            char c;
+            if (varValueBuffer != null) {
+                if (ivarValueBuffer < varValueBuffer.length) {
+                    c = varValueBuffer[ivarValueBuffer++];
+                    UnicodeSet set = symbols.lookupSet(c);
+                    if (set != null) {
+                        nestedPairs = set.pairs.toString();
+                    }
+                } else {
+                    varValueBuffer = null;
+                    c = pattern.charAt(i);
+                }
+            } else {
+                c = pattern.charAt(i);
+            }

            // Ignore whitespace.  This is not Unicode whitespace, but Java
            // whitespace, a subset of Unicode whitespace.
@ -829,82 +849,97 @@ public class UnicodeSet implements UnicodeFilter {
            // will be 2 if we want a closing ']', or 3 if we should parse a
            // category and close with ":]".

-            /* Handle escapes.  If a character is escaped, then it assumes its
-             * literal value.  This is true for all characters, both special
-             * characters and characters with no special meaning.  We also
-             * interpret '\\uxxxx' Unicode escapes here (as literals).
-             */
-            if (c == '\\') {
-                ++i;
-                if (i < limit) {
-                    c = pattern.charAt(i);
-                    isLiteral = true;
-                    if (c == 'u') {
-                        if ((i+4) >= limit) {
-                            throw new IllegalArgumentException("Invalid \\u escape");
-                        }
-                        c = '\u0000';
-                        for (int j=(++i)+4; i<j; ++i) { // [sic]
-                            int digit = Character.digit(pattern.charAt(i), 16);
-                            if (digit<0) {
+            // Only process escapes, variable references, and nested sets
+            // if we are _not_ retrieving characters from the variable
+            // buffer.  Characters in the variable buffer have already
+            // been through escape and variable reference processing.
+            if (varValueBuffer == null) {
+                /* Handle escapes.  If a character is escaped, then it assumes its
+                 * literal value.  This is true for all characters, both special
+                 * characters and characters with no special meaning.  We also
+                 * interpret '\\uxxxx' Unicode escapes here (as literals).
+                 */
+                if (c == '\\') {
+                    ++i;
+                    if (i < limit) {
+                        c = pattern.charAt(i);
+                        isLiteral = true;
+                        if (c == 'u') {
+                            if ((i+4) >= limit) {
                                throw new IllegalArgumentException("Invalid \\u escape");
                            }
-                            c = (char) ((c << 4) | digit);
+                            c = '\u0000';
+                            for (int j=(++i)+4; i<j; ++i) { // [sic]
+                                int digit = Character.digit(pattern.charAt(i), 16);
+                                if (digit<0) {
+                                    throw new IllegalArgumentException("Invalid \\u escape");
+                                }
+                                c = (char) ((c << 4) | digit);
+                            }
+                            --i; // Move i back to last parsed character
                        }
-                        --i; // Move i back to last parsed character
+                    } else {
+                        throw new IllegalArgumentException("Trailing '\\'");
                    }
-                } else {
-                    throw new IllegalArgumentException("Trailing '\\'");
                }
-            }

-            /* Parse variable references.  These are treated as literals.  If a
-             * variable refers to a UnicodeSet, nestedPairs is assigned here.
-             * Variable names are only parsed if varNameToChar is not null.
-             * Set variables are only looked up if varCharToSet is not null.
-             */
-            else if (symbols != null && !isLiteral && c == SymbolTable.SYMBOL_REF) {
-                pos.setIndex(++i);
-                String name = symbols.parseReference(pattern, pos, limit);
-                Object obj = symbols.lookup(name);
-                if (obj == null) {
-                    throw new IllegalArgumentException("Undefined variable: "
-                                                       + name);
+                /* Parse variable references.  These are treated as literals.  If a
+                 * variable refers to a UnicodeSet, nestedPairs is assigned here.
+                 * Variable names are only parsed if varNameToChar is not null.
+                 * Set variables are only looked up if varCharToSet is not null.
+                 */
+                else if (symbols != null && !isLiteral && c == SymbolTable.SYMBOL_REF) {
+                    pos.setIndex(++i);
+                    String name = symbols.parseReference(pattern, pos, limit);
+                    /*
+                    Object obj = symbols.lookup(name);
+                    if (obj == null) {
+                        throw new IllegalArgumentException("Undefined variable: "
+                                                           + name);
+                    }
+                    isLiteral = true;
+                    if (obj instanceof Character) {
+                        c = ((Character) obj).charValue();
+                    } else {
+                        nestedPairs = ((UnicodeSet) obj).pairs.toString();
+                    }
+                    */
+                    varValueBuffer = symbols.lookup(name);
+                    if (varValueBuffer == null) {
+                        throw new IllegalArgumentException("Undefined variable: "
+                                                           + name);
+                    }
+                    ivarValueBuffer = 0;
+                    i = pos.getIndex()-1; // Make i point at last char of var name
+                    continue; // Back to the top to get varValueBuffer[0]
                }
-                isLiteral = true;
-                if (obj instanceof Character) {
-                    c = ((Character) obj).charValue();
-                } else {
-                    nestedPairs = ((UnicodeSet) obj).pairs.toString();
-                }
-                i = pos.getIndex()-1; // Make i point at last char of var name
-            }

-            /* An opening bracket indicates the first bracket of a nested
-             * subpattern, either a normal pattern or a category pattern.  We
-             * recognize these here and set nestedPairs accordingly.
-             */
-            else if (!isLiteral && c == '[') {
-                // Handle "[:...:]", representing a character category
-                char d = charAfter(pattern, i);
-                if (d == ':') {
-                    i += 2;
-                    int j = pattern.indexOf(":]", i);
-                    if (j < 0) {
-                        throw new IllegalArgumentException("Missing \":]\"");
+                /* An opening bracket indicates the first bracket of a nested
+                 * subpattern, either a normal pattern or a category pattern.  We
+                 * recognize these here and set nestedPairs accordingly.
+                 */
+                else if (!isLiteral && c == '[') {
+                    // Handle "[:...:]", representing a character category
+                    char d = charAfter(pattern, i);
+                    if (d == ':') {
+                        i += 2;
+                        int j = pattern.indexOf(":]", i);
+                        if (j < 0) {
+                            throw new IllegalArgumentException("Missing \":]\"");
+                        }
+                        nestedPairs = getCategoryPairs(pattern.substring(i, j));
+                        i = j+1; // Make i point to ']' in ":]"
+                        if (mode == 3) {
+                            // Entire pattern is a category; leave parse loop
+                            pairsBuf.append(nestedPairs);
+                            break;
+                        }
+                    } else {
+                        // Recurse to get the pairs for this nested set.
+                        pos.setIndex(i); // Add 2 to point AFTER op
+                        nestedPairs = parse(pattern, pos, symbols, ignoreWhitespace).toString();
+                        i = pos.getIndex() - 1; // - 1 to point at ']'
                    }
-                    nestedPairs = getCategoryPairs(pattern.substring(i, j));
-                    i = j+1; // Make i point to ']' in ":]"
-                    if (mode == 3) {
-                        // Entire pattern is a category; leave parse loop
-                        pairsBuf.append(nestedPairs);
-                        break;
-                    }
-                } else {
-                    // Recurse to get the pairs for this nested set.
-                    pos.setIndex(i); // Add 2 to point AFTER op
-                    nestedPairs = parse(pattern, pos, symbols, ignoreWhitespace).toString();
-                    i = pos.getIndex() - 1; // - 1 to point at ']'
                }
            }

@ -994,6 +1029,13 @@ public class UnicodeSet implements UnicodeFilter {
        }
        pos.setIndex(i+1);

+        if (false) {
+            // Debug parser
+            System.out.println("UnicodeSet(" + 
+                               pattern.substring(start, i+1) + ") -> " +
+                               pairsBuf.toString());
+        }
+
        return pairsBuf;
    }