From 7a49adef39a9ac65d16d46b3c03cd59b3921b1f0 Mon Sep 17 00:00:00 2001
From: Alan Liu <alansliu@gmail.com>
Date: Tue, 25 Apr 2000 01:42:58 +0000
Subject: [PATCH] Allow arbitrary length variable values. Clean up Data API.
 Update javadocs.

X-SVN-Rev: 1242
---
 .../ibm/icu/text/RuleBasedTransliterator.java | 218 ++++++++----------
 icu4j/src/com/ibm/icu/text/SymbolTable.java   |  17 +-
 .../com/ibm/icu/text/TransliterationRule.java |  15 +-
 icu4j/src/com/ibm/icu/text/UnicodeSet.java    | 186 +++++++++------
 .../com/ibm/text/RuleBasedTransliterator.java | 218 ++++++++----------
 icu4j/src/com/ibm/text/SymbolTable.java       |  17 +-
 .../src/com/ibm/text/TransliterationRule.java |  15 +-
 icu4j/src/com/ibm/text/UnicodeSet.java        | 186 +++++++++------
 8 files changed, 470 insertions(+), 402 deletions(-)
diff --git a/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java b/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java
index fd98190750b..19a9649a407 100755
--- a/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java
+++ b/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java
@@ -5,8 +5,8 @@
  *******************************************************************************
  *
  * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java,v $ 
- * $Date: 2000/04/22 01:25:10 $ 
- * $Revision: 1.26 $
+ * $Date: 2000/04/25 01:42:58 $ 
+ * $Revision: 1.27 $
  *
  *****************************************************************************************
  */
@@ -20,17 +20,17 @@ import com.ibm.util.Utility;
 /**
  * <strong>RuleBasedTransliterator</strong> is a transliterator
  * that reads a set of rules in order to determine how to perform
- * translations. Rules are stored in resource bundles indexed by
- * name. Rules are separated by semicolons (';'). To include a
- * literal semicolon, prefix it with a backslash ('\'). Whitespace,
- * as defined by <code>Character.isWhitespace()</code>, is ignored.
- * If the first non-blank character on a line is '#', the entire
- * line is ignored as a comment. </p>
+ * translations. Rule sets are stored in resource bundles indexed by
+ * name. Rules within a rule set are separated by semicolons (';').
+ * To include a literal semicolon, prefix it with a backslash ('\').
+ * Whitespace, as defined by <code>Character.isWhitespace()</code>,
+ * is ignored. If the first non-blank character on a line is '#',
+ * the entire line is ignored as a comment. </p>
  * 
  * <p>Each set of rules consists of two groups, one forward, and one
  * reverse. This is a convention that is not enforced; rules for one
  * direction may be omitted, with the result that translations in
- * that direction will not modify the source text. Alternatively,
+ * that direction will not modify the source text. In addition,
  * bidirectional forward-reverse rules may be specified for
  * symmetrical transformations.</p>
  * 
@@ -39,69 +39,27 @@ import com.ibm.util.Utility;
  * <p>Rule statements take one of the following forms: </p>
  * 
  * <dl>
- *     <dt><code>$alefmadda=\u0622</code></dt>
+ *     <dt><code>$alefmadda=\u0622;</code></dt>
  *     <dd><strong>Variable definition.</strong> The name on the
- *         left is assigned the character or expression on the
- *         right. Names must begin with a letter and consist only of
- *         letters, digits, and underscores. Case is significant.
- *         Duplicate names (including duplicates of simple variables
- *         or category names) cause an exception to be thrown. If
- *         the right hand side consists of one character, then the
- *         variable stands for that character. In this example,
+ *         left is assigned the text on the right. In this example,
  *         after this statement, instances of the left hand name,
  *         &quot;<code>$alefmadda</code>&quot;, will be replaced by
- *         the Unicode character U+0622. The right hand side must be
- *         exactly one character long (current limitation).</dd>
- *     <dt>&nbsp;</dt>
- *     <dt><code>$softvowel=[eiyEIY]</code></dt>
- *     <dd><strong>Category definition.</strong> The name on the
- *         left is assigned to stand for a set of characters. The
- *         same rules for names of simple variables apply. After
- *         this statement, the left hand variable will be
- *         interpreted as indicating a set of characters in
- *         appropriate contexts. The pattern syntax defining sets of
- *         characters is defined by {@link UnicodeSet}. Examples of
- *         valid patterns are:<table border="0">
- *             <tr>
- *                 <td valign="top" nowrap><code>[abc]</code></td>
- *                 <td valign="top">The set containing the
- *                 characters 'a', 'b', and 'c'.</td>
- *             </tr>
- *             <tr>
- *                 <td valign="top" nowrap><code>[^abc]</code></td>
- *                 <td valign="top">The set of all characters <em>except</em>
- *                 'a', 'b', and 'c'.</td>
- *             </tr>
- *             <tr>
- *                 <td valign="top" nowrap><code>[A-Z]</code></td>
- *                 <td valign="top">The set of all characters from
- *                 'A' to 'Z' in Unicode order.</td>
- *             </tr>
- *             <tr>
- *                 <td valign="top" nowrap><code>[:Lu:]</code></td>
- *                 <td valign="top">The set of Unicode uppercase
- *                 letters. See <a href="http://www.unicode.org">www.unicode.org</a>
- *                 for a complete list of categories and their
- *                 two-letter codes.</td>
- *             </tr>
- *             <tr>
- *                 <td valign="top" nowrap><code>[^a-z[:Lu:][:Ll:]]</code></td>
- *                 <td valign="top">The set of all characters <em>except</em>
- *                 'a' through 'z' and uppercase or lowercase
- *                 letters.</td>
- *             </tr>
- *         </table>
- *         <p>Patterns may contain variable references, such as
- *         &quot;<code>$a=[a-z];$not_a=[^$a]</code>&quot;. See
- *         {@link UnicodeSet} for more documentation and examples. </p>
- *     </dd>
- *     <dt><code>ai&gt;$alefmadda</code></dt>
+ *         the Unicode character U+0622. Variable names must begin
+ *         with a letter and consist only of letters, digits, and
+ *         underscores. Case is significant. Duplicate names cause
+ *         an exception to be thrown, that is, variables cannot be
+ *         redefined. The right hand side may contain well-formed
+ *         text of any length, including no text at all (&quot;<code>$empty=;</code>&quot;).
+ *         The right hand side may contain embedded <code>UnicodeSet</code>
+ *         patterns, for example, &quot;<code>$softvowel=[eiyEIY]</code>&quot;.</dd>
+ *     <dd>&nbsp;</dd>
+ *     <dt><code>ai&gt;$alefmadda;</code></dt>
  *     <dd><strong>Forward translation rule.</strong> This rule
  *         states that the string on the left will be changed to the
  *         string on the right when performing forward
  *         transliteration.</dd>
  *     <dt>&nbsp;</dt>
- *     <dt><code>ai&lt;$alefmadda</code></dt>
+ *     <dt><code>ai&lt;$alefmadda;</code></dt>
  *     <dd><strong>Reverse translation rule.</strong> This rule
  *         states that the string on the right will be changed to
  *         the string on the left when performing reverse
@@ -109,7 +67,7 @@ import com.ibm.util.Utility;
  * </dl>
  * 
  * <dl>
- *     <dt><code>ai&lt;&gt;$alefmadda</code></dt>
+ *     <dt><code>ai&lt;&gt;$alefmadda;</code></dt>
  *     <dd><strong>Bidirectional translation rule.</strong> This
  *         rule states that the string on the right will be changed
  *         to the string on the left when performing forward
@@ -151,9 +109,16 @@ import com.ibm.util.Utility;
  *     y and z</code></p>
  * </blockquote>
  * 
- * <p>In addition to being defined in variables, <code>UnicodeSet</code>
- * patterns may be embedded directly into rule strings. Thus, the
- * following two rules are equivalent:</p>
+ * <p><b>UnicodeSet</b></p>
+ * 
+ * <p><code>UnicodeSet</code> patterns may appear anywhere that
+ * makes sense. They may appear in variable definitions.
+ * Contrariwise, <code>UnicodeSet</code> patterns may themselves
+ * contain variable references, such as &quot;<code>$a=[a-z];$not_a=[^$a]</code>&quot;,
+ * or &quot;<code>$range=a-z;$ll=[$range]</code>&quot;.</p>
+ * 
+ * <p><code>UnicodeSet</code> patterns may also be embedded directly
+ * into rule strings. Thus, the following two rules are equivalent:</p>
  * 
  * <blockquote>
  *     <p><code>$vowel=[aeiou]; $vowel&gt;'*'; # One way to do this<br>
@@ -162,6 +127,8 @@ import com.ibm.util.Utility;
  *     Another way</code></p>
  * </blockquote>
  * 
+ * <p>See {@link UnicodeSet} for more documentation and examples.</p>
+ * 
  * <p><b>Segments</b></p>
  * 
  * <p>Segments of the input string can be matched and copied to the
@@ -169,7 +136,8 @@ import com.ibm.util.Utility;
  * general, and makes reordering possible. For example:</p>
  * 
  * <blockquote>
- *     <p><code>([a-z]) &gt; $1 $1; &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;#
+ *     <p><code>([a-z]) &gt; $1 $1;
+ *     &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;#
  *     double lowercase letters<br>
  *     ([:Lu:]) ([:Ll:]) &gt; $2 $1; # reverse order of Lu-Ll pairs</code></p>
  * </blockquote>
@@ -284,7 +252,7 @@ import com.ibm.util.Utility;
  * <p>Copyright (c) IBM Corporation 1999-2000. All rights reserved.</p>
  * 
  * @author Alan Liu
- * @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.26 $ $Date: 2000/04/22 01:25:10 $
+ * @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.27 $ $Date: 2000/04/25 01:42:58 $
  */
 public class RuleBasedTransliterator extends Transliterator {
 
@@ -455,15 +423,15 @@ public class RuleBasedTransliterator extends Transliterator {
         public TransliterationRuleSet ruleSet;
 
         /**
-         * Map variable name (String) to variable (Character).  A variable
-         * name may correspond to a single literal character, in which
-         * case the character is stored in this hash.  It may also
-         * correspond to a UnicodeSet, in which case a character is
-         * again stored in this hash, but the character is a stand-in: it
-         * is an index for a secondary lookup in data.setVariables.  The stand-in
-         * also represents the UnicodeSet in the stored rules.
+         * Map variable name (String) to variable (char[]).  A variable name
+         * corresponds to zero or more characters, stored in a char[] array in
+         * this hash.  One or more of these chars may also correspond to a
+         * UnicodeSet, in which case the character in the char[] in this hash is
+         * a stand-in: it is an index for a secondary lookup in
+         * data.setVariables.  The stand-in also represents the UnicodeSet in
+         * the stored rules.
          */
-        public Hashtable variableNames;
+        private Hashtable variableNames;
 
         /**
          * Map category variable (Character) to set (UnicodeSet).
@@ -474,30 +442,30 @@ public class RuleBasedTransliterator extends Transliterator {
          * stored in the rule text to represent the set of characters.
          * setVariables[i] represents character (setVariablesBase + i).
          */
-        public UnicodeSet[] setVariables;
+        private UnicodeSet[] setVariables;
 
         /**
          * The character that represents setVariables[0].  Characters
          * setVariablesBase through setVariablesBase +
          * setVariables.length - 1 represent UnicodeSet objects.
          */
-        public char setVariablesBase;
-
-        /**
-         * Return the UnicodeSet represented by the given character, or
-         * null if none.
-         */
-        public UnicodeSet lookup(char c) {
-            int i = c - setVariablesBase;
-            return (i >= 0 && i < setVariables.length)
-                ? setVariables[i] : null;
-        }
+        private char setVariablesBase;
 
         /**
          * The character that represents segment 1.  Characters segmentBase
          * through segmentBase + 8 represent segments 1 through 9.
          */
-        public char segmentBase;
+        private char segmentBase;
+
+        /**
+         * Return the UnicodeSet represented by the given character, or
+         * null if none.
+         */
+        public UnicodeSet lookupSet(char c) {
+            int i = c - setVariablesBase;
+            return (i >= 0 && i < setVariables.length)
+                ? setVariables[i] : null;
+        }
 
         /**
          * Return the zero-based index of the segment represented by the given
@@ -531,18 +499,23 @@ public class RuleBasedTransliterator extends Transliterator {
         private class ParseData implements SymbolTable {
             
             /**
-             * Implement SymbolTable API.  Lookup a variable, returning
-             * either a Character, a UnicodeSet, or null.
+             * Implement SymbolTable API.
              */
-            public Object lookup(String name) {
-                Character ch = (Character) data.variableNames.get(name);
-                if (ch != null) {
-                    int i = ch.charValue() - data.setVariablesBase;
-                    if (i >= 0 && i < setVariablesVector.size()) {
-                        return setVariablesVector.elementAt(i);
-                    }
+            public char[] lookup(String name) {
+                return (char[]) data.variableNames.get(name);
+            }
+
+            /**
+             * Implement SymbolTable API.
+             */
+            public UnicodeSet lookupSet(char ch) {
+                // Note that we cannot use data.lookupSet() because the
+                // set array has not been constructed yet.
+                int i = ch - data.setVariablesBase;
+                if (i >= 0 && i < setVariablesVector.size()) {
+                    return (UnicodeSet) setVariablesVector.elementAt(i);
                 }
-                return ch;
+                return null;
             }
 
             /**
@@ -869,10 +842,13 @@ public class RuleBasedTransliterator extends Transliterator {
                                 String name = parser.parseData.
                                                 parseReference(rule, pp, limit);
                                 pos = pp.getIndex();
-                                // If this is a variable definition statement, then the LHS
-                                // variable will be undefined.  In that case getVariableName()
-                                // will return the special placeholder variableLimit-1.
-                                buf.append(parser.getVariableDef(name));
+                                // If this is a variable definition statement,
+                                // then the LHS variable will be undefined.  In
+                                // that case appendVariableDef() will append the
+                                // special placeholder char variableLimit-1.
+
+                                //buf.append(parser.getVariableDef(name));
+                                parser.appendVariableDef(name, buf);
                             }
                         }
                         break;
@@ -1035,11 +1011,12 @@ public class RuleBasedTransliterator extends Transliterator {
                 if (left.text.length() != 1 || left.text.charAt(0) != variableLimit) {
                     syntaxError("Malformed LHS", rule, start);
                 }
-                if (right.text.length() != 1) {
-                    syntaxError("Malformed RHS", rule, start);
-                }
-                data.variableNames.put(undefinedVariableName,
-                                       new Character(right.text.charAt(0)));
+                // We allow anything on the right, including an empty string.
+                int n = right.text.length();
+                char[] value = new char[n];
+                right.text.getChars(0, n, value, 0);
+                data.variableNames.put(undefinedVariableName, value);
+
                 ++variableLimit;
                 return pos;
             }
@@ -1157,12 +1134,12 @@ public class RuleBasedTransliterator extends Transliterator {
         }
 
         /**
-         * Returns the single character value of the given variable name.  Defined
-         * names are recognized.
+         * Append the value of the given variable name to the given
+         * StringBuffer.
          * @exception IllegalArgumentException if the name is unknown.
          */
-        private char getVariableDef(String name) {
-            Character ch = (Character) data.variableNames.get(name);
+        private void appendVariableDef(String name, StringBuffer buf) {
+            char[] ch = (char[]) data.variableNames.get(name);
             if (ch == null) {
                 // We allow one undefined variable so that variable definition
                 // statements work.  For the first undefined variable we return
@@ -1173,12 +1150,14 @@ public class RuleBasedTransliterator extends Transliterator {
                     if (variableNext >= variableLimit) {
                         throw new RuntimeException("Private use variables exhausted");
                     }
-                    return --variableLimit;
+                    buf.append((char) --variableLimit);
+                } else {
+                    throw new IllegalArgumentException("Undefined variable $"
+                                                       + name);
                 }
-                throw new IllegalArgumentException("Undefined variable $"
-                                                   + name);
+            } else {
+                buf.append(ch);
             }
-            return ch.charValue();
         }
 
         /**
@@ -1346,6 +1325,9 @@ public class RuleBasedTransliterator extends Transliterator {
 
 /**
  * $Log: RuleBasedTransliterator.java,v $
+ * Revision 1.27  2000/04/25 01:42:58  alan
+ * Allow arbitrary length variable values. Clean up Data API. Update javadocs.
+ *
  * Revision 1.26  2000/04/22 01:25:10  alan
  * Add support for cursor positioner '@'; update javadoc
  *
diff --git a/icu4j/src/com/ibm/icu/text/SymbolTable.java b/icu4j/src/com/ibm/icu/text/SymbolTable.java
index c3f9a36f410..cf75c2334b9 100755
--- a/icu4j/src/com/ibm/icu/text/SymbolTable.java
+++ b/icu4j/src/com/ibm/icu/text/SymbolTable.java
@@ -5,8 +5,8 @@
  *******************************************************************************
  *
  * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/SymbolTable.java,v $ 
- * $Date: 2000/04/21 22:16:29 $ 
- * $Revision: 1.3 $
+ * $Date: 2000/04/25 01:42:58 $ 
+ * $Revision: 1.4 $
  *
  *****************************************************************************************
  */
@@ -32,10 +32,17 @@ public interface SymbolTable {
     final char SYMBOL_REF = '$';
 
     /**
-     * Lookup the object associated with this string and return it.
-     * Return <tt>null</tt> if no such name exists.
+     * Look up the characters associated with this string and return them.
+     * Return <tt>null</tt> if no such name exists.  The resultant
+     * array may have length zero.
      */
-    Object lookup(String s);
+    char[] lookup(String s);
+
+    /**
+     * Lookup the UnicodeSet associated with the given character, and
+     * return it.  Return <tt>null</tt> if not found.
+     */
+    UnicodeSet lookupSet(char ch);
 
     /**
      * Parse a symbol reference name from the given string, starting
diff --git a/icu4j/src/com/ibm/icu/text/TransliterationRule.java b/icu4j/src/com/ibm/icu/text/TransliterationRule.java
index f476594416f..3815b33dabf 100755
--- a/icu4j/src/com/ibm/icu/text/TransliterationRule.java
+++ b/icu4j/src/com/ibm/icu/text/TransliterationRule.java
@@ -5,8 +5,8 @@
  *******************************************************************************
  *
  * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/TransliterationRule.java,v $ 
- * $Date: 2000/04/22 01:25:10 $ 
- * $Revision: 1.18 $
+ * $Date: 2000/04/25 01:42:58 $ 
+ * $Revision: 1.19 $
  *
  *****************************************************************************************
  */
@@ -44,7 +44,7 @@ import com.ibm.util.Utility;
  * <p>Copyright &copy; IBM Corporation 1999.  All rights reserved.
  *
  * @author Alan Liu
- * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.18 $ $Date: 2000/04/22 01:25:10 $
+ * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.19 $ $Date: 2000/04/25 01:42:58 $
  */
 class TransliterationRule {
     /**
@@ -240,7 +240,7 @@ class TransliterationRule {
             return -1;
         }
         char c = pattern.charAt(anteContextLength);
-        return variables.lookup(c) == null ? (c & 0xFF) : -1;
+        return variables.lookupSet(c) == null ? (c & 0xFF) : -1;
     }
 
     /**
@@ -300,7 +300,7 @@ class TransliterationRule {
             return true;
         }
         char c = pattern.charAt(anteContextLength);
-        UnicodeSet set = variables.lookup(c);
+        UnicodeSet set = variables.lookupSet(c);
         return set == null ? (c & 0xFF) == v : set.containsIndexValue(v);
     }
 
@@ -486,13 +486,16 @@ class TransliterationRule {
                                                UnicodeFilter filter) {
         UnicodeSet set = null;
         return (filter == null || filter.contains(textChar)) &&
-            (((set = variables.lookup(keyChar)) == null) ?
+            (((set = variables.lookupSet(keyChar)) == null) ?
              keyChar == textChar : set.contains(textChar));
     }
 }
 
 /**
  * $Log: TransliterationRule.java,v $
+ * Revision 1.19  2000/04/25 01:42:58  alan
+ * Allow arbitrary length variable values. Clean up Data API. Update javadocs.
+ *
  * Revision 1.18  2000/04/22 01:25:10  alan
  * Add support for cursor positioner '@'; update javadoc
  *
diff --git a/icu4j/src/com/ibm/icu/text/UnicodeSet.java b/icu4j/src/com/ibm/icu/text/UnicodeSet.java
index b5316a4d2ae..8e67ca27ba2 100755
--- a/icu4j/src/com/ibm/icu/text/UnicodeSet.java
+++ b/icu4j/src/com/ibm/icu/text/UnicodeSet.java
@@ -5,8 +5,8 @@
  *******************************************************************************
  *
  * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/UnicodeSet.java,v $ 
- * $Date: 2000/04/21 22:16:29 $ 
- * $Revision: 1.18 $
+ * $Date: 2000/04/25 01:42:58 $ 
+ * $Revision: 1.19 $
  *
  *****************************************************************************************
  */
@@ -241,7 +241,7 @@ import java.text.*;
  * *Unsupported by Java (and hence unsupported by UnicodeSet).
  *
  * @author Alan Liu
- * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.18 $ $Date: 2000/04/21 22:16:29 $
+ * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.19 $ $Date: 2000/04/25 01:42:58 $
  */
 public class UnicodeSet implements UnicodeFilter {
     /**
@@ -774,7 +774,13 @@ public class UnicodeSet implements UnicodeFilter {
         int start = pos.getIndex();
         int i = start;
         int limit = pattern.length();
-        for (; i<limit; ++i) {
+        /* In the case of an embedded SymbolTable variable, we look it up and
+         * then take characters from the resultant char[] array.  These chars
+         * are subjected to an extra level of lookup in the SymbolTable in case
+         * they are stand-ins for a nested UnicodeSet.  */
+        char[] varValueBuffer = null;
+        int ivarValueBuffer = 0;
+        for (; i<limit; i+=((varValueBuffer==null)?1:0)) {
             /* If the next element is a single character, c will be set to it,
              * and nestedPairs will be null.  In this case isLiteral indicates
              * whether the character should assume special meaning if it has
@@ -783,9 +789,23 @@ public class UnicodeSet implements UnicodeFilter {
              * nestedPairs will be set to the pairs list for the nested set, and
              * c's value should be ignored.
              */
-            char c = pattern.charAt(i);
             String nestedPairs = null;
             boolean isLiteral = false;
+            char c;
+            if (varValueBuffer != null) {
+                if (ivarValueBuffer < varValueBuffer.length) {
+                    c = varValueBuffer[ivarValueBuffer++];
+                    UnicodeSet set = symbols.lookupSet(c);
+                    if (set != null) {
+                        nestedPairs = set.pairs.toString();
+                    }
+                } else {
+                    varValueBuffer = null;
+                    c = pattern.charAt(i);
+                }
+            } else {
+                c = pattern.charAt(i);
+            }
 
             // Ignore whitespace.  This is not Unicode whitespace, but Java
             // whitespace, a subset of Unicode whitespace.
@@ -829,82 +849,97 @@ public class UnicodeSet implements UnicodeFilter {
             // will be 2 if we want a closing ']', or 3 if we should parse a
             // category and close with ":]".
 
-            /* Handle escapes.  If a character is escaped, then it assumes its
-             * literal value.  This is true for all characters, both special
-             * characters and characters with no special meaning.  We also
-             * interpret '\\uxxxx' Unicode escapes here (as literals).
-             */
-            if (c == '\\') {
-                ++i;
-                if (i < limit) {
-                    c = pattern.charAt(i);
-                    isLiteral = true;
-                    if (c == 'u') {
-                        if ((i+4) >= limit) {
-                            throw new IllegalArgumentException("Invalid \\u escape");
-                        }
-                        c = '\u0000';
-                        for (int j=(++i)+4; i<j; ++i) { // [sic]
-                            int digit = Character.digit(pattern.charAt(i), 16);
-                            if (digit<0) {
+            // Only process escapes, variable references, and nested sets
+            // if we are _not_ retrieving characters from the variable
+            // buffer.  Characters in the variable buffer have already
+            // been through escape and variable reference processing.
+            if (varValueBuffer == null) {
+                /* Handle escapes.  If a character is escaped, then it assumes its
+                 * literal value.  This is true for all characters, both special
+                 * characters and characters with no special meaning.  We also
+                 * interpret '\\uxxxx' Unicode escapes here (as literals).
+                 */
+                if (c == '\\') {
+                    ++i;
+                    if (i < limit) {
+                        c = pattern.charAt(i);
+                        isLiteral = true;
+                        if (c == 'u') {
+                            if ((i+4) >= limit) {
                                 throw new IllegalArgumentException("Invalid \\u escape");
                             }
-                            c = (char) ((c << 4) | digit);
+                            c = '\u0000';
+                            for (int j=(++i)+4; i<j; ++i) { // [sic]
+                                int digit = Character.digit(pattern.charAt(i), 16);
+                                if (digit<0) {
+                                    throw new IllegalArgumentException("Invalid \\u escape");
+                                }
+                                c = (char) ((c << 4) | digit);
+                            }
+                            --i; // Move i back to last parsed character
                         }
-                        --i; // Move i back to last parsed character
+                    } else {
+                        throw new IllegalArgumentException("Trailing '\\'");
                     }
-                } else {
-                    throw new IllegalArgumentException("Trailing '\\'");
                 }
-            }
 
-            /* Parse variable references.  These are treated as literals.  If a
-             * variable refers to a UnicodeSet, nestedPairs is assigned here.
-             * Variable names are only parsed if varNameToChar is not null.
-             * Set variables are only looked up if varCharToSet is not null.
-             */
-            else if (symbols != null && !isLiteral && c == SymbolTable.SYMBOL_REF) {
-                pos.setIndex(++i);
-                String name = symbols.parseReference(pattern, pos, limit);
-                Object obj = symbols.lookup(name);
-                if (obj == null) {
-                    throw new IllegalArgumentException("Undefined variable: "
-                                                       + name);
+                /* Parse variable references.  The variable's value (a char[])
+                 * is fetched from the symbol table and then consumed one char
+                 * at a time at the top of the loop, where stand-ins for sets
+                 * are resolved.  References are only parsed if symbols != null.
+                 */
+                else if (symbols != null && !isLiteral && c == SymbolTable.SYMBOL_REF) {
+                    pos.setIndex(++i);
+                    String name = symbols.parseReference(pattern, pos, limit);
+                    /*
+                    Object obj = symbols.lookup(name);
+                    if (obj == null) {
+                        throw new IllegalArgumentException("Undefined variable: "
+                                                           + name);
+                    }
+                    isLiteral = true;
+                    if (obj instanceof Character) {
+                        c = ((Character) obj).charValue();
+                    } else {
+                        nestedPairs = ((UnicodeSet) obj).pairs.toString();
+                    }
+                    */
+                    varValueBuffer = symbols.lookup(name);
+                    if (varValueBuffer == null) {
+                        throw new IllegalArgumentException("Undefined variable: "
+                                                           + name);
+                    }
+                    ivarValueBuffer = 0;
+                    i = pos.getIndex()-1; // Make i point at last char of var name
+                    continue; // Back to the top to get varValueBuffer[0]
                 }
-                isLiteral = true;
-                if (obj instanceof Character) {
-                    c = ((Character) obj).charValue();
-                } else {
-                    nestedPairs = ((UnicodeSet) obj).pairs.toString();
-                }
-                i = pos.getIndex()-1; // Make i point at last char of var name
-            }
 
-            /* An opening bracket indicates the first bracket of a nested
-             * subpattern, either a normal pattern or a category pattern.  We
-             * recognize these here and set nestedPairs accordingly.
-             */
-            else if (!isLiteral && c == '[') {
-                // Handle "[:...:]", representing a character category
-                char d = charAfter(pattern, i);
-                if (d == ':') {
-                    i += 2;
-                    int j = pattern.indexOf(":]", i);
-                    if (j < 0) {
-                        throw new IllegalArgumentException("Missing \":]\"");
+                /* An opening bracket indicates the first bracket of a nested
+                 * subpattern, either a normal pattern or a category pattern.  We
+                 * recognize these here and set nestedPairs accordingly.
+                 */
+                else if (!isLiteral && c == '[') {
+                    // Handle "[:...:]", representing a character category
+                    char d = charAfter(pattern, i);
+                    if (d == ':') {
+                        i += 2;
+                        int j = pattern.indexOf(":]", i);
+                        if (j < 0) {
+                            throw new IllegalArgumentException("Missing \":]\"");
+                        }
+                        nestedPairs = getCategoryPairs(pattern.substring(i, j));
+                        i = j+1; // Make i point to ']' in ":]"
+                        if (mode == 3) {
+                            // Entire pattern is a category; leave parse loop
+                            pairsBuf.append(nestedPairs);
+                            break;
+                        }
+                    } else {
+                        // Recurse to get the pairs for this nested set.
+                        pos.setIndex(i); // Add 2 to point AFTER op
+                        nestedPairs = parse(pattern, pos, symbols, ignoreWhitespace).toString();
+                        i = pos.getIndex() - 1; // - 1 to point at ']'
                     }
-                    nestedPairs = getCategoryPairs(pattern.substring(i, j));
-                    i = j+1; // Make i point to ']' in ":]"
-                    if (mode == 3) {
-                        // Entire pattern is a category; leave parse loop
-                        pairsBuf.append(nestedPairs);
-                        break;
-                    }
-                } else {
-                    // Recurse to get the pairs for this nested set.
-                    pos.setIndex(i); // Add 2 to point AFTER op
-                    nestedPairs = parse(pattern, pos, symbols, ignoreWhitespace).toString();
-                    i = pos.getIndex() - 1; // - 1 to point at ']'
                 }
             }
 
@@ -994,6 +1029,13 @@ public class UnicodeSet implements UnicodeFilter {
         }
         pos.setIndex(i+1);
 
+        if (false) {
+            // Debug parser
+            System.out.println("UnicodeSet(" + 
+                               pattern.substring(start, i+1) + ") -> " +
+                               pairsBuf.toString());
+        }
+
         return pairsBuf;
     }
 
diff --git a/icu4j/src/com/ibm/text/RuleBasedTransliterator.java b/icu4j/src/com/ibm/text/RuleBasedTransliterator.java
index dcbe7b52dcf..7c438304385 100755
--- a/icu4j/src/com/ibm/text/RuleBasedTransliterator.java
+++ b/icu4j/src/com/ibm/text/RuleBasedTransliterator.java
@@ -5,8 +5,8 @@
  *******************************************************************************
  *
  * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/RuleBasedTransliterator.java,v $ 
- * $Date: 2000/04/22 01:25:10 $ 
- * $Revision: 1.26 $
+ * $Date: 2000/04/25 01:42:58 $ 
+ * $Revision: 1.27 $
  *
  *****************************************************************************************
  */
@@ -20,17 +20,17 @@ import com.ibm.util.Utility;
 /**
  * <strong>RuleBasedTransliterator</strong> is a transliterator
  * that reads a set of rules in order to determine how to perform
- * translations. Rules are stored in resource bundles indexed by
- * name. Rules are separated by semicolons (';'). To include a
- * literal semicolon, prefix it with a backslash ('\'). Whitespace,
- * as defined by <code>Character.isWhitespace()</code>, is ignored.
- * If the first non-blank character on a line is '#', the entire
- * line is ignored as a comment. </p>
+ * translations. Rule sets are stored in resource bundles indexed by
+ * name. Rules within a rule set are separated by semicolons (';').
+ * To include a literal semicolon, prefix it with a backslash ('\').
+ * Whitespace, as defined by <code>Character.isWhitespace()</code>,
+ * is ignored. If the first non-blank character on a line is '#',
+ * the entire line is ignored as a comment. </p>
  * 
  * <p>Each set of rules consists of two groups, one forward, and one
  * reverse. This is a convention that is not enforced; rules for one
  * direction may be omitted, with the result that translations in
- * that direction will not modify the source text. Alternatively,
+ * that direction will not modify the source text. In addition,
  * bidirectional forward-reverse rules may be specified for
  * symmetrical transformations.</p>
  * 
@@ -39,69 +39,27 @@ import com.ibm.util.Utility;
  * <p>Rule statements take one of the following forms: </p>
  * 
  * <dl>
- *     <dt><code>$alefmadda=\u0622</code></dt>
+ *     <dt><code>$alefmadda=\u0622;</code></dt>
  *     <dd><strong>Variable definition.</strong> The name on the
- *         left is assigned the character or expression on the
- *         right. Names must begin with a letter and consist only of
- *         letters, digits, and underscores. Case is significant.
- *         Duplicate names (including duplicates of simple variables
- *         or category names) cause an exception to be thrown. If
- *         the right hand side consists of one character, then the
- *         variable stands for that character. In this example,
+ *         left is assigned the text on the right. In this example,
  *         after this statement, instances of the left hand name,
  *         &quot;<code>$alefmadda</code>&quot;, will be replaced by
- *         the Unicode character U+0622. The right hand side must be
- *         exactly one character long (current limitation).</dd>
- *     <dt>&nbsp;</dt>
- *     <dt><code>$softvowel=[eiyEIY]</code></dt>
- *     <dd><strong>Category definition.</strong> The name on the
- *         left is assigned to stand for a set of characters. The
- *         same rules for names of simple variables apply. After
- *         this statement, the left hand variable will be
- *         interpreted as indicating a set of characters in
- *         appropriate contexts. The pattern syntax defining sets of
- *         characters is defined by {@link UnicodeSet}. Examples of
- *         valid patterns are:<table border="0">
- *             <tr>
- *                 <td valign="top" nowrap><code>[abc]</code></td>
- *                 <td valign="top">The set containing the
- *                 characters 'a', 'b', and 'c'.</td>
- *             </tr>
- *             <tr>
- *                 <td valign="top" nowrap><code>[^abc]</code></td>
- *                 <td valign="top">The set of all characters <em>except</em>
- *                 'a', 'b', and 'c'.</td>
- *             </tr>
- *             <tr>
- *                 <td valign="top" nowrap><code>[A-Z]</code></td>
- *                 <td valign="top">The set of all characters from
- *                 'A' to 'Z' in Unicode order.</td>
- *             </tr>
- *             <tr>
- *                 <td valign="top" nowrap><code>[:Lu:]</code></td>
- *                 <td valign="top">The set of Unicode uppercase
- *                 letters. See <a href="http://www.unicode.org">www.unicode.org</a>
- *                 for a complete list of categories and their
- *                 two-letter codes.</td>
- *             </tr>
- *             <tr>
- *                 <td valign="top" nowrap><code>[^a-z[:Lu:][:Ll:]]</code></td>
- *                 <td valign="top">The set of all characters <em>except</em>
- *                 'a' through 'z' and uppercase or lowercase
- *                 letters.</td>
- *             </tr>
- *         </table>
- *         <p>Patterns may contain variable references, such as
- *         &quot;<code>$a=[a-z];$not_a=[^$a]</code>&quot;. See
- *         {@link UnicodeSet} for more documentation and examples. </p>
- *     </dd>
- *     <dt><code>ai&gt;$alefmadda</code></dt>
+ *         the Unicode character U+0622. Variable names must begin
+ *         with a letter and consist only of letters, digits, and
+ *         underscores. Case is significant. Duplicate names cause
+ *         an exception to be thrown, that is, variables cannot be
+ *         redefined. The right hand side may contain well-formed
+ *         text of any length, including no text at all (&quot;<code>$empty=;</code>&quot;).
+ *         The right hand side may contain embedded <code>UnicodeSet</code>
+ *         patterns, for example, &quot;<code>$softvowel=[eiyEIY]</code>&quot;.</dd>
+ *     <dd>&nbsp;</dd>
+ *     <dt><code>ai&gt;$alefmadda;</code></dt>
  *     <dd><strong>Forward translation rule.</strong> This rule
  *         states that the string on the left will be changed to the
  *         string on the right when performing forward
  *         transliteration.</dd>
  *     <dt>&nbsp;</dt>
- *     <dt><code>ai&lt;$alefmadda</code></dt>
+ *     <dt><code>ai&lt;$alefmadda;</code></dt>
  *     <dd><strong>Reverse translation rule.</strong> This rule
  *         states that the string on the right will be changed to
  *         the string on the left when performing reverse
@@ -109,7 +67,7 @@ import com.ibm.util.Utility;
  * </dl>
  * 
  * <dl>
- *     <dt><code>ai&lt;&gt;$alefmadda</code></dt>
+ *     <dt><code>ai&lt;&gt;$alefmadda;</code></dt>
  *     <dd><strong>Bidirectional translation rule.</strong> This
  *         rule states that the string on the right will be changed
  *         to the string on the left when performing forward
@@ -151,9 +109,16 @@ import com.ibm.util.Utility;
  *     y and z</code></p>
  * </blockquote>
  * 
- * <p>In addition to being defined in variables, <code>UnicodeSet</code>
- * patterns may be embedded directly into rule strings. Thus, the
- * following two rules are equivalent:</p>
+ * <p><b>UnicodeSet</b></p>
+ * 
+ * <p><code>UnicodeSet</code> patterns may appear anywhere that
+ * makes sense. They may appear in variable definitions.
+ * Contrariwise, <code>UnicodeSet</code> patterns may themselves
+ * contain variable references, such as &quot;<code>$a=[a-z];$not_a=[^$a]</code>&quot;,
+ * or &quot;<code>$range=a-z;$ll=[$range]</code>&quot;.</p>
+ * 
+ * <p><code>UnicodeSet</code> patterns may also be embedded directly
+ * into rule strings. Thus, the following two rules are equivalent:</p>
  * 
  * <blockquote>
  *     <p><code>$vowel=[aeiou]; $vowel&gt;'*'; # One way to do this<br>
@@ -162,6 +127,8 @@ import com.ibm.util.Utility;
  *     Another way</code></p>
  * </blockquote>
  * 
+ * <p>See {@link UnicodeSet} for more documentation and examples.</p>
+ * 
  * <p><b>Segments</b></p>
  * 
  * <p>Segments of the input string can be matched and copied to the
@@ -169,7 +136,8 @@ import com.ibm.util.Utility;
  * general, and makes reordering possible. For example:</p>
  * 
  * <blockquote>
- *     <p><code>([a-z]) &gt; $1 $1; &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;#
+ *     <p><code>([a-z]) &gt; $1 $1;
+ *     &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;#
  *     double lowercase letters<br>
  *     ([:Lu:]) ([:Ll:]) &gt; $2 $1; # reverse order of Lu-Ll pairs</code></p>
  * </blockquote>
@@ -284,7 +252,7 @@ import com.ibm.util.Utility;
  * <p>Copyright (c) IBM Corporation 1999-2000. All rights reserved.</p>
  * 
  * @author Alan Liu
- * @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.26 $ $Date: 2000/04/22 01:25:10 $
+ * @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.27 $ $Date: 2000/04/25 01:42:58 $
  */
 public class RuleBasedTransliterator extends Transliterator {
 
@@ -455,15 +423,15 @@ public class RuleBasedTransliterator extends Transliterator {
         public TransliterationRuleSet ruleSet;
 
         /**
-         * Map variable name (String) to variable (Character).  A variable
-         * name may correspond to a single literal character, in which
-         * case the character is stored in this hash.  It may also
-         * correspond to a UnicodeSet, in which case a character is
-         * again stored in this hash, but the character is a stand-in: it
-         * is an index for a secondary lookup in data.setVariables.  The stand-in
-         * also represents the UnicodeSet in the stored rules.
+         * Map variable name (String) to variable (char[]).  A variable name
+         * corresponds to zero or more characters, stored in a char[] array in
+         * this hash.  One or more of these chars may also correspond to a
+         * UnicodeSet, in which case the character in the char[] in this hash is
+         * a stand-in: it is an index for a secondary lookup in
+         * data.setVariables.  The stand-in also represents the UnicodeSet in
+         * the stored rules.
          */
-        public Hashtable variableNames;
+        private Hashtable variableNames;
 
         /**
          * Map category variable (Character) to set (UnicodeSet).
@@ -474,30 +442,30 @@ public class RuleBasedTransliterator extends Transliterator {
          * stored in the rule text to represent the set of characters.
          * setVariables[i] represents character (setVariablesBase + i).
          */
-        public UnicodeSet[] setVariables;
+        private UnicodeSet[] setVariables;
 
         /**
          * The character that represents setVariables[0].  Characters
          * setVariablesBase through setVariablesBase +
          * setVariables.length - 1 represent UnicodeSet objects.
          */
-        public char setVariablesBase;
-
-        /**
-         * Return the UnicodeSet represented by the given character, or
-         * null if none.
-         */
-        public UnicodeSet lookup(char c) {
-            int i = c - setVariablesBase;
-            return (i >= 0 && i < setVariables.length)
-                ? setVariables[i] : null;
-        }
+        private char setVariablesBase;
 
         /**
          * The character that represents segment 1.  Characters segmentBase
          * through segmentBase + 8 represent segments 1 through 9.
          */
-        public char segmentBase;
+        private char segmentBase;
+
+        /**
+         * Return the UnicodeSet represented by the given character, or
+         * null if none.
+         */
+        public UnicodeSet lookupSet(char c) {
+            int i = c - setVariablesBase;
+            return (i >= 0 && i < setVariables.length)
+                ? setVariables[i] : null;
+        }
 
         /**
          * Return the zero-based index of the segment represented by the given
@@ -531,18 +499,23 @@ public class RuleBasedTransliterator extends Transliterator {
         private class ParseData implements SymbolTable {
             
             /**
-             * Implement SymbolTable API.  Lookup a variable, returning
-             * either a Character, a UnicodeSet, or null.
+             * Implement SymbolTable API.
              */
-            public Object lookup(String name) {
-                Character ch = (Character) data.variableNames.get(name);
-                if (ch != null) {
-                    int i = ch.charValue() - data.setVariablesBase;
-                    if (i >= 0 && i < setVariablesVector.size()) {
-                        return setVariablesVector.elementAt(i);
-                    }
+            public char[] lookup(String name) {
+                return (char[]) data.variableNames.get(name);
+            }
+
+            /**
+             * Implement SymbolTable API.
+             */
+            public UnicodeSet lookupSet(char ch) {
+                // Note that we cannot use data.lookupSet() because the
+                // set array has not been constructed yet.
+                int i = ch - data.setVariablesBase;
+                if (i >= 0 && i < setVariablesVector.size()) {
+                    return (UnicodeSet) setVariablesVector.elementAt(i);
                 }
-                return ch;
+                return null;
             }
 
             /**
@@ -869,10 +842,13 @@ public class RuleBasedTransliterator extends Transliterator {
                                 String name = parser.parseData.
                                                 parseReference(rule, pp, limit);
                                 pos = pp.getIndex();
-                                // If this is a variable definition statement, then the LHS
-                                // variable will be undefined.  In that case getVariableName()
-                                // will return the special placeholder variableLimit-1.
-                                buf.append(parser.getVariableDef(name));
+                                // If this is a variable definition statement,
+                                // then the LHS variable will be undefined.  In
+                                // that case appendVariableDef() will append the
+                                // special placeholder char variableLimit-1.
+
+                                //buf.append(parser.getVariableDef(name));
+                                parser.appendVariableDef(name, buf);
                             }
                         }
                         break;
@@ -1035,11 +1011,12 @@ public class RuleBasedTransliterator extends Transliterator {
                 if (left.text.length() != 1 || left.text.charAt(0) != variableLimit) {
                     syntaxError("Malformed LHS", rule, start);
                 }
-                if (right.text.length() != 1) {
-                    syntaxError("Malformed RHS", rule, start);
-                }
-                data.variableNames.put(undefinedVariableName,
-                                       new Character(right.text.charAt(0)));
+                // We allow anything on the right, including an empty string.
+                int n = right.text.length();
+                char[] value = new char[n];
+                right.text.getChars(0, n, value, 0);
+                data.variableNames.put(undefinedVariableName, value);
+
                 ++variableLimit;
                 return pos;
             }
@@ -1157,12 +1134,12 @@ public class RuleBasedTransliterator extends Transliterator {
         }
 
         /**
-         * Returns the single character value of the given variable name.  Defined
-         * names are recognized.
+         * Append the value of the given variable name to the given
+         * StringBuffer.
          * @exception IllegalArgumentException if the name is unknown.
          */
-        private char getVariableDef(String name) {
-            Character ch = (Character) data.variableNames.get(name);
+        private void appendVariableDef(String name, StringBuffer buf) {
+            char[] ch = (char[]) data.variableNames.get(name);
             if (ch == null) {
                 // We allow one undefined variable so that variable definition
                 // statements work.  For the first undefined variable we return
@@ -1173,12 +1150,14 @@ public class RuleBasedTransliterator extends Transliterator {
                     if (variableNext >= variableLimit) {
                         throw new RuntimeException("Private use variables exhausted");
                     }
-                    return --variableLimit;
+                    buf.append((char) --variableLimit);
+                } else {
+                    throw new IllegalArgumentException("Undefined variable $"
+                                                       + name);
                 }
-                throw new IllegalArgumentException("Undefined variable $"
-                                                   + name);
+            } else {
+                buf.append(ch);
             }
-            return ch.charValue();
         }
 
         /**
@@ -1346,6 +1325,9 @@ public class RuleBasedTransliterator extends Transliterator {
 
 /**
  * $Log: RuleBasedTransliterator.java,v $
+ * Revision 1.27  2000/04/25 01:42:58  alan
+ * Allow arbitrary length variable values. Clean up Data API. Update javadocs.
+ *
  * Revision 1.26  2000/04/22 01:25:10  alan
  * Add support for cursor positioner '@'; update javadoc
  *
diff --git a/icu4j/src/com/ibm/text/SymbolTable.java b/icu4j/src/com/ibm/text/SymbolTable.java
index 714bae4b66d..60487048569 100755
--- a/icu4j/src/com/ibm/text/SymbolTable.java
+++ b/icu4j/src/com/ibm/text/SymbolTable.java
@@ -5,8 +5,8 @@
  *******************************************************************************
  *
  * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/SymbolTable.java,v $ 
- * $Date: 2000/04/21 22:16:29 $ 
- * $Revision: 1.3 $
+ * $Date: 2000/04/25 01:42:58 $ 
+ * $Revision: 1.4 $
  *
  *****************************************************************************************
  */
@@ -32,10 +32,17 @@ public interface SymbolTable {
     final char SYMBOL_REF = '$';
 
     /**
-     * Lookup the object associated with this string and return it.
-     * Return <tt>null</tt> if no such name exists.
+     * Look up the characters associated with this string and return them.
+     * Return <tt>null</tt> if no such name exists.  The resultant
+     * array may have length zero.
      */
-    Object lookup(String s);
+    char[] lookup(String s);
+
+    /**
+     * Lookup the UnicodeSet associated with the given character, and
+     * return it.  Return <tt>null</tt> if not found.
+     */
+    UnicodeSet lookupSet(char ch);
 
     /**
      * Parse a symbol reference name from the given string, starting
diff --git a/icu4j/src/com/ibm/text/TransliterationRule.java b/icu4j/src/com/ibm/text/TransliterationRule.java
index 817aa334ccb..b78cfed5e86 100755
--- a/icu4j/src/com/ibm/text/TransliterationRule.java
+++ b/icu4j/src/com/ibm/text/TransliterationRule.java
@@ -5,8 +5,8 @@
  *******************************************************************************
  *
  * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/TransliterationRule.java,v $ 
- * $Date: 2000/04/22 01:25:10 $ 
- * $Revision: 1.18 $
+ * $Date: 2000/04/25 01:42:58 $ 
+ * $Revision: 1.19 $
  *
  *****************************************************************************************
  */
@@ -44,7 +44,7 @@ import com.ibm.util.Utility;
  * <p>Copyright &copy; IBM Corporation 1999.  All rights reserved.
  *
  * @author Alan Liu
- * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.18 $ $Date: 2000/04/22 01:25:10 $
+ * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.19 $ $Date: 2000/04/25 01:42:58 $
  */
 class TransliterationRule {
     /**
@@ -240,7 +240,7 @@ class TransliterationRule {
             return -1;
         }
         char c = pattern.charAt(anteContextLength);
-        return variables.lookup(c) == null ? (c & 0xFF) : -1;
+        return variables.lookupSet(c) == null ? (c & 0xFF) : -1;
     }
 
     /**
@@ -300,7 +300,7 @@ class TransliterationRule {
             return true;
         }
         char c = pattern.charAt(anteContextLength);
-        UnicodeSet set = variables.lookup(c);
+        UnicodeSet set = variables.lookupSet(c);
         return set == null ? (c & 0xFF) == v : set.containsIndexValue(v);
     }
 
@@ -486,13 +486,16 @@ class TransliterationRule {
                                                UnicodeFilter filter) {
         UnicodeSet set = null;
         return (filter == null || filter.contains(textChar)) &&
-            (((set = variables.lookup(keyChar)) == null) ?
+            (((set = variables.lookupSet(keyChar)) == null) ?
              keyChar == textChar : set.contains(textChar));
     }
 }
 
 /**
  * $Log: TransliterationRule.java,v $
+ * Revision 1.19  2000/04/25 01:42:58  alan
+ * Allow arbitrary length variable values. Clean up Data API. Update javadocs.
+ *
  * Revision 1.18  2000/04/22 01:25:10  alan
  * Add support for cursor positioner '@'; update javadoc
  *
diff --git a/icu4j/src/com/ibm/text/UnicodeSet.java b/icu4j/src/com/ibm/text/UnicodeSet.java
index 82ed7bbbe92..d0072e29702 100755
--- a/icu4j/src/com/ibm/text/UnicodeSet.java
+++ b/icu4j/src/com/ibm/text/UnicodeSet.java
@@ -5,8 +5,8 @@
  *******************************************************************************
  *
  * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/UnicodeSet.java,v $ 
- * $Date: 2000/04/21 22:16:29 $ 
- * $Revision: 1.18 $
+ * $Date: 2000/04/25 01:42:58 $ 
+ * $Revision: 1.19 $
  *
  *****************************************************************************************
  */
@@ -241,7 +241,7 @@ import java.text.*;
  * *Unsupported by Java (and hence unsupported by UnicodeSet).
  *
  * @author Alan Liu
- * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.18 $ $Date: 2000/04/21 22:16:29 $
+ * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.19 $ $Date: 2000/04/25 01:42:58 $
  */
 public class UnicodeSet implements UnicodeFilter {
     /**
@@ -774,7 +774,13 @@ public class UnicodeSet implements UnicodeFilter {
         int start = pos.getIndex();
         int i = start;
         int limit = pattern.length();
-        for (; i<limit; ++i) {
+        /* In the case of an embedded SymbolTable variable, we look it up and
+         * then take characters from the resultant char[] array.  These chars
+         * are subjected to an extra level of lookup in the SymbolTable in case
+         * they are stand-ins for a nested UnicodeSet.  */
+        char[] varValueBuffer = null;
+        int ivarValueBuffer = 0;
+        for (; i<limit; i+=((varValueBuffer==null)?1:0)) {
             /* If the next element is a single character, c will be set to it,
              * and nestedPairs will be null.  In this case isLiteral indicates
              * whether the character should assume special meaning if it has
@@ -783,9 +789,23 @@ public class UnicodeSet implements UnicodeFilter {
              * nestedPairs will be set to the pairs list for the nested set, and
              * c's value should be ignored.
              */
-            char c = pattern.charAt(i);
             String nestedPairs = null;
             boolean isLiteral = false;
+            char c;
+            if (varValueBuffer != null) {
+                if (ivarValueBuffer < varValueBuffer.length) {
+                    c = varValueBuffer[ivarValueBuffer++];
+                    UnicodeSet set = symbols.lookupSet(c);
+                    if (set != null) {
+                        nestedPairs = set.pairs.toString();
+                    }
+                } else {
+                    varValueBuffer = null;
+                    c = pattern.charAt(i);
+                }
+            } else {
+                c = pattern.charAt(i);
+            }
 
             // Ignore whitespace.  This is not Unicode whitespace, but Java
             // whitespace, a subset of Unicode whitespace.
@@ -829,82 +849,97 @@ public class UnicodeSet implements UnicodeFilter {
             // will be 2 if we want a closing ']', or 3 if we should parse a
             // category and close with ":]".
 
-            /* Handle escapes.  If a character is escaped, then it assumes its
-             * literal value.  This is true for all characters, both special
-             * characters and characters with no special meaning.  We also
-             * interpret '\\uxxxx' Unicode escapes here (as literals).
-             */
-            if (c == '\\') {
-                ++i;
-                if (i < limit) {
-                    c = pattern.charAt(i);
-                    isLiteral = true;
-                    if (c == 'u') {
-                        if ((i+4) >= limit) {
-                            throw new IllegalArgumentException("Invalid \\u escape");
-                        }
-                        c = '\u0000';
-                        for (int j=(++i)+4; i<j; ++i) { // [sic]
-                            int digit = Character.digit(pattern.charAt(i), 16);
-                            if (digit<0) {
+            // Only process escapes, variable references, and nested sets
+            // if we are _not_ retrieving characters from the variable
+            // buffer.  Characters in the variable buffer have already
+            // been through escape and variable reference processing.
+            if (varValueBuffer == null) {
+                /* Handle escapes.  If a character is escaped, then it assumes its
+                 * literal value.  This is true for all characters, both special
+                 * characters and characters with no special meaning.  We also
+                 * interpret '\\uxxxx' Unicode escapes here (as literals).
+                 */
+                if (c == '\\') {
+                    ++i;
+                    if (i < limit) {
+                        c = pattern.charAt(i);
+                        isLiteral = true;
+                        if (c == 'u') {
+                            if ((i+4) >= limit) {
                                 throw new IllegalArgumentException("Invalid \\u escape");
                             }
-                            c = (char) ((c << 4) | digit);
+                            c = '\u0000';
+                            for (int j=(++i)+4; i<j; ++i) { // [sic]
+                                int digit = Character.digit(pattern.charAt(i), 16);
+                                if (digit<0) {
+                                    throw new IllegalArgumentException("Invalid \\u escape");
+                                }
+                                c = (char) ((c << 4) | digit);
+                            }
+                            --i; // Move i back to last parsed character
                         }
-                        --i; // Move i back to last parsed character
+                    } else {
+                        throw new IllegalArgumentException("Trailing '\\'");
                     }
-                } else {
-                    throw new IllegalArgumentException("Trailing '\\'");
                 }
-            }
 
-            /* Parse variable references.  These are treated as literals.  If a
-             * variable refers to a UnicodeSet, nestedPairs is assigned here.
-             * Variable names are only parsed if varNameToChar is not null.
-             * Set variables are only looked up if varCharToSet is not null.
-             */
-            else if (symbols != null && !isLiteral && c == SymbolTable.SYMBOL_REF) {
-                pos.setIndex(++i);
-                String name = symbols.parseReference(pattern, pos, limit);
-                Object obj = symbols.lookup(name);
-                if (obj == null) {
-                    throw new IllegalArgumentException("Undefined variable: "
-                                                       + name);
+                /* Parse variable references.  The variable's value is fetched
+                 * into varValueBuffer, and its characters are then consumed one
+                 * at a time at the top of the loop.  Variable references are
+                 * only parsed if symbols is not null.
+                 */
+                else if (symbols != null && !isLiteral && c == SymbolTable.SYMBOL_REF) {
+                    pos.setIndex(++i);
+                    String name = symbols.parseReference(pattern, pos, limit);
+                    /*
+                    Object obj = symbols.lookup(name);
+                    if (obj == null) {
+                        throw new IllegalArgumentException("Undefined variable: "
+                                                           + name);
+                    }
+                    isLiteral = true;
+                    if (obj instanceof Character) {
+                        c = ((Character) obj).charValue();
+                    } else {
+                        nestedPairs = ((UnicodeSet) obj).pairs.toString();
+                    }
+                    */
+                    varValueBuffer = symbols.lookup(name);
+                    if (varValueBuffer == null) {
+                        throw new IllegalArgumentException("Undefined variable: "
+                                                           + name);
+                    }
+                    ivarValueBuffer = 0;
+                    i = pos.getIndex()-1; // Make i point at last char of var name
+                    continue; // Back to the top to get varValueBuffer[0]
                 }
-                isLiteral = true;
-                if (obj instanceof Character) {
-                    c = ((Character) obj).charValue();
-                } else {
-                    nestedPairs = ((UnicodeSet) obj).pairs.toString();
-                }
-                i = pos.getIndex()-1; // Make i point at last char of var name
-            }
 
-            /* An opening bracket indicates the first bracket of a nested
-             * subpattern, either a normal pattern or a category pattern.  We
-             * recognize these here and set nestedPairs accordingly.
-             */
-            else if (!isLiteral && c == '[') {
-                // Handle "[:...:]", representing a character category
-                char d = charAfter(pattern, i);
-                if (d == ':') {
-                    i += 2;
-                    int j = pattern.indexOf(":]", i);
-                    if (j < 0) {
-                        throw new IllegalArgumentException("Missing \":]\"");
+                /* An opening bracket indicates the first bracket of a nested
+                 * subpattern, either a normal pattern or a category pattern.  We
+                 * recognize these here and set nestedPairs accordingly.
+                 */
+                else if (!isLiteral && c == '[') {
+                    // Handle "[:...:]", representing a character category
+                    char d = charAfter(pattern, i);
+                    if (d == ':') {
+                        i += 2;
+                        int j = pattern.indexOf(":]", i);
+                        if (j < 0) {
+                            throw new IllegalArgumentException("Missing \":]\"");
+                        }
+                        nestedPairs = getCategoryPairs(pattern.substring(i, j));
+                        i = j+1; // Make i point to ']' in ":]"
+                        if (mode == 3) {
+                            // Entire pattern is a category; leave parse loop
+                            pairsBuf.append(nestedPairs);
+                            break;
+                        }
+                    } else {
+                        // Recurse to get the pairs for this nested set.
+                        pos.setIndex(i); // Add 2 to point AFTER op
+                        nestedPairs = parse(pattern, pos, symbols, ignoreWhitespace).toString();
+                        i = pos.getIndex() - 1; // - 1 to point at ']'
                     }
-                    nestedPairs = getCategoryPairs(pattern.substring(i, j));
-                    i = j+1; // Make i point to ']' in ":]"
-                    if (mode == 3) {
-                        // Entire pattern is a category; leave parse loop
-                        pairsBuf.append(nestedPairs);
-                        break;
-                    }
-                } else {
-                    // Recurse to get the pairs for this nested set.
-                    pos.setIndex(i); // Add 2 to point AFTER op
-                    nestedPairs = parse(pattern, pos, symbols, ignoreWhitespace).toString();
-                    i = pos.getIndex() - 1; // - 1 to point at ']'
                 }
             }
 
@@ -994,6 +1029,13 @@ public class UnicodeSet implements UnicodeFilter {
         }
         pos.setIndex(i+1);
 
+        if (false) {
+            // Debug parser
+            System.out.println("UnicodeSet(" + 
+                               pattern.substring(start, i+1) + ") -> " +
+                               pairsBuf.toString());
+        }
+
         return pairsBuf;
     }
 

`[abc]`	The set containing the characters 'a', 'b', and 'c'.
`[^abc]`	The set of all characters except 'a', 'b', and 'c'.
`[A-Z]`	The set of all characters from 'A' to 'Z' in Unicode order.
`[:Lu:]`	The set of Unicode uppercase letters. See www.unicode.org for a complete list of categories and their two-letter codes.
`[^a-z[:Lu:][:Ll:]]`	The set of all characters except 'a' through 'z' and uppercase or lowercase letters.