Add rule indexing, and move masking check to TransliterationRuleSet.

X-SVN-Rev: 486
2025-04-08 23:10:40 +00:00 · 2000-01-04 21:43:58 +00:00 · 2000-01-04 21:43:58 +00:00 · 8f00680b7b
commit 8f00680b7b
parent 920e2757f9
8 changed files with 426 additions and 122 deletions
--- a/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java
+++ b/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java
@ -181,9 +181,12 @@ import java.util.Vector;
 * <p>Copyright &copy; IBM Corporation 1999.  All rights reserved.
 *
 * @author Alan Liu
- * @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.5 $ $Date: 1999/12/22 01:40:54 $
+ * @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.6 $ $Date: 2000/01/04 21:43:57 $
 *
 * $Log: RuleBasedTransliterator.java,v $
+ * Revision 1.6  2000/01/04 21:43:57  Alan
+ * Add rule indexing, and move masking check to TransliterationRuleSet.
+ *
 * Revision 1.5  1999/12/22 01:40:54  Alan
 * Consolidate rule pattern anteContext, key, and postContext into one string.
 *
@ -632,25 +635,14 @@ public class RuleBasedTransliterator extends Transliterator {
                i = limit + 1;
            }

-            // Check for masking, O(n^2).
-            // Build time, no checking  : 3400 ms
-            // Build time, with checking: 8200 ms
-            if (CHECK_MASKING) {
-                n = data.ruleSet.size();
-                for (i=0; i<n-1; ++i) {
-                    TransliterationRule r1 = data.ruleSet.elementAt(i);
-                    // Earlier rules must not mask later ones
-                    for (int j=i+1; j<n; ++j) {
-                        TransliterationRule r2 = data.ruleSet.elementAt(j);
-                        if (r1.masks(r2)) {
-                            if (errors == null) {
-                                errors = new StringBuffer();
-                            } else {
-                                errors.append("\n");
-                            }
-                            errors.append("Rule " + r1 + " masks " + r2);
-                        }
-                    }
+            // Index the rules
+            try {
+                data.ruleSet.freeze(data.setVariables);
+            } catch (IllegalArgumentException e) {
+                if (errors == null) {
+                    errors = new StringBuffer(e.getMessage());
+                } else {
+                    errors.append("\n").append(e.getMessage());
                }
            }

--- a/icu4j/src/com/ibm/icu/text/TransliterationRule.java
+++ b/icu4j/src/com/ibm/icu/text/TransliterationRule.java
@ -21,9 +21,12 @@ import java.util.Dictionary;
 * <p>Copyright &copy; IBM Corporation 1999.  All rights reserved.
 *
 * @author Alan Liu
- * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.4 $ $Date: 1999/12/22 01:40:54 $
+ * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.5 $ $Date: 2000/01/04 21:43:57 $
 *
 * $Log: TransliterationRule.java,v $
+ * Revision 1.5  2000/01/04 21:43:57  Alan
+ * Add rule indexing, and move masking check to TransliterationRuleSet.
+ *
 * Revision 1.4  1999/12/22 01:40:54  Alan
 * Consolidate rule pattern anteContext, key, and postContext into one string.
 *
@ -164,6 +167,32 @@ class TransliterationRule {
        return anteContextLength;
    }

+    /**
+     * Internal method.  Returns 8-bit index value for this rule.
+     * This is the low byte of the first character of the key,
+     * unless the first character of the key is a set.  If it's a
+     * set, the index value is -1.
+     */
+    final int getIndexValue(Dictionary variables) {
+        char c = pattern.charAt(anteContextLength);
+        return variables.get(new Character(c)) == null ? (c & 0xFF) : -1;
+    }
+
+    /**
+     * Internal method.  Returns true if this rule matches the given
+     * index value.  The index value is an 8-bit integer, 0..255,
+     * representing the low byte of the first character of the key.
+     * It matches this rule if it matches the first character of the
+     * key, or if the first character of the key is a set, and the set
+     * contains any character with a low byte equal to the index
+     * value.
+     */
+    final boolean matchesIndexValue(int v, Dictionary variables) {
+        char c = pattern.charAt(anteContextLength);
+        UnicodeSet set = (UnicodeSet) variables.get(new Character(c));
+        return set == null ? (c & 0xFF) == v : set.containsIndexValue(v);
+    }
+
    /**
     * Return true if this rule masks another rule.  If r1 masks r2 then
     * r1 matches any input string that r2 matches.  If r1 masks r2 and r2 masks
@ -449,8 +478,8 @@ class TransliterationRule {
     * altered by this transliterator.  If <tt>filter</tt> is
     * <tt>null</tt> then no filtering is applied.
     */
-    protected static boolean charMatches(char keyChar, char textChar,
-                                         Dictionary variables, UnicodeFilter filter) {
+    protected static final boolean charMatches(char keyChar, char textChar,
+                                               Dictionary variables, UnicodeFilter filter) {
        UnicodeSet set = null;
        return (filter == null || filter.isIn(textChar)) &&
            ((set = (UnicodeSet) variables.get(new Character(keyChar)))
--- a/icu4j/src/com/ibm/icu/text/TransliterationRuleSet.java
+++ b/icu4j/src/com/ibm/icu/text/TransliterationRuleSet.java
@ -15,9 +15,12 @@ import java.util.*;
 * <p>Copyright &copy; IBM Corporation 1999.  All rights reserved.
 *
 * @author Alan Liu
- * @version $RCSfile: TransliterationRuleSet.java,v $ $Revision: 1.4 $ $Date: 1999/12/22 01:40:54 $
+ * @version $RCSfile: TransliterationRuleSet.java,v $ $Revision: 1.5 $ $Date: 2000/01/04 21:43:57 $
 *
 * $Log: TransliterationRuleSet.java,v $
+ * Revision 1.5  2000/01/04 21:43:57  Alan
+ * Add rule indexing, and move masking check to TransliterationRuleSet.
+ *
 * Revision 1.4  1999/12/22 01:40:54  Alan
 * Consolidate rule pattern anteContext, key, and postContext into one string.
 *
@ -29,30 +32,31 @@ import java.util.*;
 *
 */
 class TransliterationRuleSet {
-    /* Note: There was an old implementation that indexed by first letter of
-     * key.  Problem with this is that key may not have a meaningful first
-     * letter; e.g., {Lu}>*.  One solution is to keep a separate vector of all
-     * rules whose intial key letter is a category variable.  However, the
-     * problem is that they must be kept in order with respect to other rules.
-     * One solution -- add a sequence number to each rule.  Do the usual
-     * first-letter lookup, and also a lookup from the spare bin with rules like
-     * {Lu}>*.  Take the lower sequence number.  This seems complex and not
-     * worth the trouble, but we may revisit this later.  For documentation (or
-     * possible resurrection) the old code is included below, commented out
-     * with the remark "// OLD INDEXED IMPLEMENTATION".  Under the old
-     * implementation, <code>rules</code> is a Hashtable, not a Vector.
-     */
-
    /**
-     * Vector of rules, in the order added.
+     * Vector of rules, in the order added.  This is only used while the rule
+     * set is getting built.  After that, freeze() reorders and indexes the
+     * rules, and this Vector is freed.
     */
-    private Vector rules;
+    private Vector ruleVector;

    /**
     * Length of the longest preceding context
     */
    private int maxContextLength;

+    /**
+     * Sorted and indexed table of rules.  This is created by freeze() from
+     * the rules in ruleVector.
+     */
+    private TransliterationRule[] rules;
+
+    /**
+     * Index table.  For text having a first character c, compute x = c&0xFF.
+     * Now use rules[index[x]..index[x+1]-1].  This index table is created by
+     * freeze().
+     */
+    private int[] index;
+
    private static final String COPYRIGHT =
        "\u00A9 IBM Corporation 1999. All rights reserved.";

@ -60,7 +64,7 @@ class TransliterationRuleSet {
     * Construct a new empty rule set.
     */
    public TransliterationRuleSet() {
-        rules = new Vector();
+        ruleVector = new Vector();
        maxContextLength = 0;
    }

@ -78,19 +82,106 @@ class TransliterationRuleSet {
     * @param rule the rule to add
     */
    public void addRule(TransliterationRule rule) {
-        rules.addElement(rule);
+        if (ruleVector == null) {
+            throw new IllegalArgumentException("Cannot add rules after freezing");
+        }
+        ruleVector.addElement(rule);
        int len;
        if ((len = rule.getAnteContextLength()) > maxContextLength) {
            maxContextLength = len;
        }
    }

-    public final int size() {
-        return rules.size();
-    }
+    /**
+     * Close this rule set to further additions, check it for masked rules,
+     * and index it to optimize performance.  Once this method is called,
+     * addRule() can no longer be called.
+     * @exception IllegalArgumentException if some rules are masked
+     */
+    public void freeze(Dictionary variables) {
+        /* Construct the rule array and index table.  We reorder the
+         * rules by sorting them into 256 bins.  Each bin contains all
+         * rules matching the index value for that bin.  A rule
+         * matches an index value if string whose first key character
+         * has a low byte equal to the index value can match the rule.
+         *
+         * Each bin contains zero or more rules, in the same order
+         * they were found originally.  However, the total rules in
+         * the bins may exceed the number in the original vector,
+         * since rules that have a variable as their first key
+         * character will generally fall into more than one bin.
+         *
+         * That is, each bin contains all rules that either have that
+         * first index value as their first key character, or have
+         * a set containing the index value as their first character.
+         */
+        int n = ruleVector.size();
+        index = new int[257]; // [sic]
+        Vector v = new Vector(2*n); // heuristic; adjust as needed

-    public final TransliterationRule elementAt(int i) {
-        return (TransliterationRule) rules.elementAt(i);
+        /* Precompute the index values.  This saves a LOT of time.
+         */
+        int[] indexValue = new int[n];
+        for (int j=0; j<n; ++j) {
+            TransliterationRule r = (TransliterationRule) ruleVector.elementAt(j);
+            indexValue[j] = r.getIndexValue(variables);
+        }
+        for (int x=0; x<256; ++x) {
+            index[x] = v.size();
+            for (int j=0; j<n; ++j) {
+                if (indexValue[j] >= 0) {
+                    if (indexValue[j] == x) {
+                        v.addElement(ruleVector.elementAt(j));
+                    }
+                } else {
+                    // If the indexValue is < 0, then the first key character is
+                    // a set, and we must use the more time-consuming
+                    // matchesIndexValue check.  In practice this happens
+                    // rarely, so we seldom tread this code path.
+                    TransliterationRule r = (TransliterationRule) ruleVector.elementAt(j);
+                    if (r.matchesIndexValue(x, variables)) {
+                        v.addElement(r);
+                    }
+                }
+            }
+        }
+        index[256] = v.size();
+
+        /* Freeze things into an array.
+         */
+        rules = new TransliterationRule[v.size()];
+        v.copyInto(rules);
+        ruleVector = null;
+
+        StringBuffer errors = null;
+
+        /* Check for masking.  This is MUCH faster than our old check,
+         * which was each rule against each following rule, since we
+         * only have to check for masking within each bin now.  It's
+         * 256*O(n2^2) instead of O(n1^2), where n1 is the total rule
+         * count, and n2 is the per-bin rule count.  But n2<<n1, so
+         * it's a big win.
+         */
+        for (int x=0; x<256; ++x) {
+            for (int j=index[x]; j<index[x+1]-1; ++j) {
+                TransliterationRule r1 = rules[j];
+                for (int k=j+1; k<index[x+1]; ++k) {
+                    TransliterationRule r2 = rules[k];
+                    if (r1.masks(r2)) {
+                        if (errors == null) {
+                            errors = new StringBuffer();
+                        } else {
+                            errors.append("\n");
+                        }
+                        errors.append("Rule " + r1 + " masks " + r2);
+                    }
+                }
+            }
+        }
+
+        if (errors != null) {
+            throw new IllegalArgumentException(errors.toString());
+        }
    }

    /**
@ -118,15 +209,21 @@ class TransliterationRuleSet {
     * altered by this transliterator.  If <tt>filter</tt> is
     * <tt>null</tt> then no filtering is applied.
     * @return the matching rule, or null if none found.
+
     */
    public TransliterationRule findMatch(String text, int start, int limit,
                                         StringBuffer result, int cursor,
                                         Dictionary variables,
                                         UnicodeFilter filter) {
-        for (Enumeration e = rules.elements(); e.hasMoreElements(); ) {
-            TransliterationRule rule = (TransliterationRule) e.nextElement();
-            if (rule.matches(text, start, limit, result, cursor, variables, filter)) {
-                return rule;
+        /* We only need to check our indexed bin of the rule table,
+         * based on the low byte of the first key character.
+         */
+        int rlen = result.length();
+        int x = 0xFF & (cursor < rlen ? result.charAt(cursor)
+                        : text.charAt(cursor - rlen + start));
+        for (int i=index[x]; i<index[x+1]; ++i) {
+            if (rules[i].matches(text, start, limit, result, cursor, variables, filter)) {
+                return rules[i];
            }
        }
        return null;
@ -154,10 +251,13 @@ class TransliterationRuleSet {
                                         int cursor,
                                         Dictionary variables,
                                         UnicodeFilter filter) {
-        for (Enumeration e = rules.elements(); e.hasMoreElements(); ) {
-            TransliterationRule rule = (TransliterationRule) e.nextElement();
-            if (rule.matches(text, start, limit, cursor, variables, filter)) {
-                return rule;
+        /* We only need to check our indexed bin of the rule table,
+         * based on the low byte of the first key character.
+         */
+        int x = text.charAt(cursor) & 0xFF;
+        for (int i=index[x]; i<index[x+1]; ++i) {
+            if (rules[i].matches(text, start, limit, cursor, variables, filter)) {
+                return rules[i];
            }
        }
        return null;
@ -195,14 +295,17 @@ class TransliterationRuleSet {
                                                    Dictionary variables,
                                                    boolean partial[],
                                                    UnicodeFilter filter) {
+        /* We only need to check our indexed bin of the rule table,
+         * based on the low byte of the first key character.
+         */
        partial[0] = false;
-        for (Enumeration e = rules.elements(); e.hasMoreElements(); ) {
-            TransliterationRule rule = (TransliterationRule) e.nextElement();
-            int match = rule.getMatchDegree(text, start, limit, cursor,
-                                            variables, filter);
+        int x = text.charAt(cursor) & 0xFF;
+        for (int i=index[x]; i<index[x+1]; ++i) {
+            int match = rules[i].getMatchDegree(text, start, limit, cursor,
+                                                variables, filter);
            switch (match) {
            case TransliterationRule.FULL_MATCH:
-                return rule;
+                return rules[i];
            case TransliterationRule.PARTIAL_MATCH:
                partial[0] = true;
                return null;
--- a/icu4j/src/com/ibm/icu/text/UnicodeSet.java
+++ b/icu4j/src/com/ibm/icu/text/UnicodeSet.java
@ -225,7 +225,7 @@ import java.text.*;
 * *Unsupported by Java (and hence unsupported by UnicodeSet).
 *
 * @author Alan Liu
- * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $ */
+ * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.2 $ $Date: 2000/01/04 21:43:58 $ */
 public class UnicodeSet {
    /**
     * The internal representation is a StringBuffer of even length.
@ -456,6 +456,34 @@ public class UnicodeSet {
        return contains(c, c);
    }

+    /**
+     * Returns <tt>true</tt> if this set contains any character whose low byte
+     * is the given value.  This is used by <tt>RuleBasedTransliterator</tt> for
+     * indexing.
+     */
+    public boolean containsIndexValue(int v) {
+        /* The index value v, in the range [0,255], is contained in this set if
+         * it is contained in any pair of this set.  Pairs either have the high
+         * bytes equal, or unequal.  If the high bytes are equal, then we have
+         * aaxx..aayy, where aa is the high byte.  Then v is contained if xx <=
+         * v <= yy.  If the high bytes are unequal we have aaxx..bbyy, bb>aa.
+         * Then v is contained if xx <= v || v <= yy.  (This is identical to the
+         * time zone month containment logic.)
+         */
+        for (int i=0; i<pairs.length(); i+=2) {
+            char low = pairs.charAt(i);
+            char high = pairs.charAt(i+1);
+            if ((low & 0xFF00) == (high & 0xFF00)) {
+                if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
+                    return true;
+                }
+            } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
    /**
     * Adds the specified range to this set if it is not already
     * present.  If this set already contains the specified range,
--- a/icu4j/src/com/ibm/text/RuleBasedTransliterator.java
+++ b/icu4j/src/com/ibm/text/RuleBasedTransliterator.java
@ -181,9 +181,12 @@ import java.util.Vector;
 * <p>Copyright &copy; IBM Corporation 1999.  All rights reserved.
 *
 * @author Alan Liu
- * @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.5 $ $Date: 1999/12/22 01:40:54 $
+ * @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.6 $ $Date: 2000/01/04 21:43:57 $
 *
 * $Log: RuleBasedTransliterator.java,v $
+ * Revision 1.6  2000/01/04 21:43:57  Alan
+ * Add rule indexing, and move masking check to TransliterationRuleSet.
+ *
 * Revision 1.5  1999/12/22 01:40:54  Alan
 * Consolidate rule pattern anteContext, key, and postContext into one string.
 *
@ -632,25 +635,14 @@ public class RuleBasedTransliterator extends Transliterator {
                i = limit + 1;
            }

-            // Check for masking, O(n^2).
-            // Build time, no checking  : 3400 ms
-            // Build time, with checking: 8200 ms
-            if (CHECK_MASKING) {
-                n = data.ruleSet.size();
-                for (i=0; i<n-1; ++i) {
-                    TransliterationRule r1 = data.ruleSet.elementAt(i);
-                    // Earlier rules must not mask later ones
-                    for (int j=i+1; j<n; ++j) {
-                        TransliterationRule r2 = data.ruleSet.elementAt(j);
-                        if (r1.masks(r2)) {
-                            if (errors == null) {
-                                errors = new StringBuffer();
-                            } else {
-                                errors.append("\n");
-                            }
-                            errors.append("Rule " + r1 + " masks " + r2);
-                        }
-                    }
+            // Index the rules
+            try {
+                data.ruleSet.freeze(data.setVariables);
+            } catch (IllegalArgumentException e) {
+                if (errors == null) {
+                    errors = new StringBuffer(e.getMessage());
+                } else {
+                    errors.append("\n").append(e.getMessage());
                }
            }

--- a/icu4j/src/com/ibm/text/TransliterationRule.java
+++ b/icu4j/src/com/ibm/text/TransliterationRule.java
@ -21,9 +21,12 @@ import java.util.Dictionary;
 * <p>Copyright &copy; IBM Corporation 1999.  All rights reserved.
 *
 * @author Alan Liu
- * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.4 $ $Date: 1999/12/22 01:40:54 $
+ * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.5 $ $Date: 2000/01/04 21:43:57 $
 *
 * $Log: TransliterationRule.java,v $
+ * Revision 1.5  2000/01/04 21:43:57  Alan
+ * Add rule indexing, and move masking check to TransliterationRuleSet.
+ *
 * Revision 1.4  1999/12/22 01:40:54  Alan
 * Consolidate rule pattern anteContext, key, and postContext into one string.
 *
@ -164,6 +167,32 @@ class TransliterationRule {
        return anteContextLength;
    }

+    /**
+     * Internal method.  Returns 8-bit index value for this rule.
+     * This is the low byte of the first character of the key,
+     * unless the first character of the key is a set.  If it's a
+     * set, the index value is -1.
+     */
+    final int getIndexValue(Dictionary variables) {
+        char c = pattern.charAt(anteContextLength);
+        return variables.get(new Character(c)) == null ? (c & 0xFF) : -1;
+    }
+
+    /**
+     * Internal method.  Returns true if this rule matches the given
+     * index value.  The index value is an 8-bit integer, 0..255,
+     * representing the low byte of the first character of the key.
+     * It matches this rule if it matches the first character of the
+     * key, or if the first character of the key is a set, and the set
+     * contains any character with a low byte equal to the index
+     * value.
+     */
+    final boolean matchesIndexValue(int v, Dictionary variables) {
+        char c = pattern.charAt(anteContextLength);
+        UnicodeSet set = (UnicodeSet) variables.get(new Character(c));
+        return set == null ? (c & 0xFF) == v : set.containsIndexValue(v);
+    }
+
    /**
     * Return true if this rule masks another rule.  If r1 masks r2 then
     * r1 matches any input string that r2 matches.  If r1 masks r2 and r2 masks
@ -449,8 +478,8 @@ class TransliterationRule {
     * altered by this transliterator.  If <tt>filter</tt> is
     * <tt>null</tt> then no filtering is applied.
     */
-    protected static boolean charMatches(char keyChar, char textChar,
-                                         Dictionary variables, UnicodeFilter filter) {
+    protected static final boolean charMatches(char keyChar, char textChar,
+                                               Dictionary variables, UnicodeFilter filter) {
        UnicodeSet set = null;
        return (filter == null || filter.isIn(textChar)) &&
            ((set = (UnicodeSet) variables.get(new Character(keyChar)))
--- a/icu4j/src/com/ibm/text/TransliterationRuleSet.java
+++ b/icu4j/src/com/ibm/text/TransliterationRuleSet.java
@ -15,9 +15,12 @@ import java.util.*;
 * <p>Copyright &copy; IBM Corporation 1999.  All rights reserved.
 *
 * @author Alan Liu
- * @version $RCSfile: TransliterationRuleSet.java,v $ $Revision: 1.4 $ $Date: 1999/12/22 01:40:54 $
+ * @version $RCSfile: TransliterationRuleSet.java,v $ $Revision: 1.5 $ $Date: 2000/01/04 21:43:57 $
 *
 * $Log: TransliterationRuleSet.java,v $
+ * Revision 1.5  2000/01/04 21:43:57  Alan
+ * Add rule indexing, and move masking check to TransliterationRuleSet.
+ *
 * Revision 1.4  1999/12/22 01:40:54  Alan
 * Consolidate rule pattern anteContext, key, and postContext into one string.
 *
@ -29,30 +32,31 @@ import java.util.*;
 *
 */
 class TransliterationRuleSet {
-    /* Note: There was an old implementation that indexed by first letter of
-     * key.  Problem with this is that key may not have a meaningful first
-     * letter; e.g., {Lu}>*.  One solution is to keep a separate vector of all
-     * rules whose intial key letter is a category variable.  However, the
-     * problem is that they must be kept in order with respect to other rules.
-     * One solution -- add a sequence number to each rule.  Do the usual
-     * first-letter lookup, and also a lookup from the spare bin with rules like
-     * {Lu}>*.  Take the lower sequence number.  This seems complex and not
-     * worth the trouble, but we may revisit this later.  For documentation (or
-     * possible resurrection) the old code is included below, commented out
-     * with the remark "// OLD INDEXED IMPLEMENTATION".  Under the old
-     * implementation, <code>rules</code> is a Hashtable, not a Vector.
-     */
-
    /**
-     * Vector of rules, in the order added.
+     * Vector of rules, in the order added.  This is only used while the rule
+     * set is getting built.  After that, freeze() reorders and indexes the
+     * rules, and this Vector is freed.
     */
-    private Vector rules;
+    private Vector ruleVector;

    /**
     * Length of the longest preceding context
     */
    private int maxContextLength;

+    /**
+     * Sorted and indexed table of rules.  This is created by freeze() from
+     * the rules in ruleVector.
+     */
+    private TransliterationRule[] rules;
+
+    /**
+     * Index table.  For text having a first character c, compute x = c&0xFF.
+     * Now use rules[index[x]..index[x+1]-1].  This index table is created by
+     * freeze().
+     */
+    private int[] index;
+
    private static final String COPYRIGHT =
        "\u00A9 IBM Corporation 1999. All rights reserved.";

@ -60,7 +64,7 @@ class TransliterationRuleSet {
     * Construct a new empty rule set.
     */
    public TransliterationRuleSet() {
-        rules = new Vector();
+        ruleVector = new Vector();
        maxContextLength = 0;
    }

@ -78,19 +82,106 @@ class TransliterationRuleSet {
     * @param rule the rule to add
     */
    public void addRule(TransliterationRule rule) {
-        rules.addElement(rule);
+        if (ruleVector == null) {
+            throw new IllegalArgumentException("Cannot add rules after freezing");
+        }
+        ruleVector.addElement(rule);
        int len;
        if ((len = rule.getAnteContextLength()) > maxContextLength) {
            maxContextLength = len;
        }
    }

-    public final int size() {
-        return rules.size();
-    }
+    /**
+     * Close this rule set to further additions, check it for masked rules,
+     * and index it to optimize performance.  Once this method is called,
+     * addRule() can no longer be called.
+     * @exception IllegalArgumentException if some rules are masked
+     */
+    public void freeze(Dictionary variables) {
+        /* Construct the rule array and index table.  We reorder the
+         * rules by sorting them into 256 bins.  Each bin contains all
+         * rules matching the index value for that bin.  A rule
+         * matches an index value if string whose first key character
+         * has a low byte equal to the index value can match the rule.
+         *
+         * Each bin contains zero or more rules, in the same order
+         * they were found originally.  However, the total rules in
+         * the bins may exceed the number in the original vector,
+         * since rules that have a variable as their first key
+         * character will generally fall into more than one bin.
+         *
+         * That is, each bin contains all rules that either have that
+         * first index value as their first key character, or have
+         * a set containing the index value as their first character.
+         */
+        int n = ruleVector.size();
+        index = new int[257]; // [sic]
+        Vector v = new Vector(2*n); // heuristic; adjust as needed

-    public final TransliterationRule elementAt(int i) {
-        return (TransliterationRule) rules.elementAt(i);
+        /* Precompute the index values.  This saves a LOT of time.
+         */
+        int[] indexValue = new int[n];
+        for (int j=0; j<n; ++j) {
+            TransliterationRule r = (TransliterationRule) ruleVector.elementAt(j);
+            indexValue[j] = r.getIndexValue(variables);
+        }
+        for (int x=0; x<256; ++x) {
+            index[x] = v.size();
+            for (int j=0; j<n; ++j) {
+                if (indexValue[j] >= 0) {
+                    if (indexValue[j] == x) {
+                        v.addElement(ruleVector.elementAt(j));
+                    }
+                } else {
+                    // If the indexValue is < 0, then the first key character is
+                    // a set, and we must use the more time-consuming
+                    // matchesIndexValue check.  In practice this happens
+                    // rarely, so we seldom tread this code path.
+                    TransliterationRule r = (TransliterationRule) ruleVector.elementAt(j);
+                    if (r.matchesIndexValue(x, variables)) {
+                        v.addElement(r);
+                    }
+                }
+            }
+        }
+        index[256] = v.size();
+
+        /* Freeze things into an array.
+         */
+        rules = new TransliterationRule[v.size()];
+        v.copyInto(rules);
+        ruleVector = null;
+
+        StringBuffer errors = null;
+
+        /* Check for masking.  This is MUCH faster than our old check,
+         * which was each rule against each following rule, since we
+         * only have to check for masking within each bin now.  It's
+         * 256*O(n2^2) instead of O(n1^2), where n1 is the total rule
+         * count, and n2 is the per-bin rule count.  But n2<<n1, so
+         * it's a big win.
+         */
+        for (int x=0; x<256; ++x) {
+            for (int j=index[x]; j<index[x+1]-1; ++j) {
+                TransliterationRule r1 = rules[j];
+                for (int k=j+1; k<index[x+1]; ++k) {
+                    TransliterationRule r2 = rules[k];
+                    if (r1.masks(r2)) {
+                        if (errors == null) {
+                            errors = new StringBuffer();
+                        } else {
+                            errors.append("\n");
+                        }
+                        errors.append("Rule " + r1 + " masks " + r2);
+                    }
+                }
+            }
+        }
+
+        if (errors != null) {
+            throw new IllegalArgumentException(errors.toString());
+        }
    }

    /**
@ -118,15 +209,21 @@ class TransliterationRuleSet {
     * altered by this transliterator.  If <tt>filter</tt> is
     * <tt>null</tt> then no filtering is applied.
     * @return the matching rule, or null if none found.
+
     */
    public TransliterationRule findMatch(String text, int start, int limit,
                                         StringBuffer result, int cursor,
                                         Dictionary variables,
                                         UnicodeFilter filter) {
-        for (Enumeration e = rules.elements(); e.hasMoreElements(); ) {
-            TransliterationRule rule = (TransliterationRule) e.nextElement();
-            if (rule.matches(text, start, limit, result, cursor, variables, filter)) {
-                return rule;
+        /* We only need to check our indexed bin of the rule table,
+         * based on the low byte of the first key character.
+         */
+        int rlen = result.length();
+        int x = 0xFF & (cursor < rlen ? result.charAt(cursor)
+                        : text.charAt(cursor - rlen + start));
+        for (int i=index[x]; i<index[x+1]; ++i) {
+            if (rules[i].matches(text, start, limit, result, cursor, variables, filter)) {
+                return rules[i];
            }
        }
        return null;
@ -154,10 +251,13 @@ class TransliterationRuleSet {
                                         int cursor,
                                         Dictionary variables,
                                         UnicodeFilter filter) {
-        for (Enumeration e = rules.elements(); e.hasMoreElements(); ) {
-            TransliterationRule rule = (TransliterationRule) e.nextElement();
-            if (rule.matches(text, start, limit, cursor, variables, filter)) {
-                return rule;
+        /* We only need to check our indexed bin of the rule table,
+         * based on the low byte of the first key character.
+         */
+        int x = text.charAt(cursor) & 0xFF;
+        for (int i=index[x]; i<index[x+1]; ++i) {
+            if (rules[i].matches(text, start, limit, cursor, variables, filter)) {
+                return rules[i];
            }
        }
        return null;
@ -195,14 +295,17 @@ class TransliterationRuleSet {
                                                    Dictionary variables,
                                                    boolean partial[],
                                                    UnicodeFilter filter) {
+        /* We only need to check our indexed bin of the rule table,
+         * based on the low byte of the first key character.
+         */
        partial[0] = false;
-        for (Enumeration e = rules.elements(); e.hasMoreElements(); ) {
-            TransliterationRule rule = (TransliterationRule) e.nextElement();
-            int match = rule.getMatchDegree(text, start, limit, cursor,
-                                            variables, filter);
+        int x = text.charAt(cursor) & 0xFF;
+        for (int i=index[x]; i<index[x+1]; ++i) {
+            int match = rules[i].getMatchDegree(text, start, limit, cursor,
+                                                variables, filter);
            switch (match) {
            case TransliterationRule.FULL_MATCH:
-                return rule;
+                return rules[i];
            case TransliterationRule.PARTIAL_MATCH:
                partial[0] = true;
                return null;
--- a/icu4j/src/com/ibm/text/UnicodeSet.java
+++ b/icu4j/src/com/ibm/text/UnicodeSet.java
@ -225,7 +225,7 @@ import java.text.*;
 * *Unsupported by Java (and hence unsupported by UnicodeSet).
 *
 * @author Alan Liu
- * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $ */
+ * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.2 $ $Date: 2000/01/04 21:43:58 $ */
 public class UnicodeSet {
    /**
     * The internal representation is a StringBuffer of even length.
@ -456,6 +456,34 @@ public class UnicodeSet {
        return contains(c, c);
    }

+    /**
+     * Returns <tt>true</tt> if this set contains any character whose low byte
+     * is the given value.  This is used by <tt>RuleBasedTransliterator</tt> for
+     * indexing.
+     */
+    public boolean containsIndexValue(int v) {
+        /* The index value v, in the range [0,255], is contained in this set if
+         * it is contained in any pair of this set.  Pairs either have the high
+         * bytes equal, or unequal.  If the high bytes are equal, then we have
+         * aaxx..aayy, where aa is the high byte.  Then v is contained if xx <=
+         * v <= yy.  If the high bytes are unequal we have aaxx..bbyy, bb>aa.
+         * Then v is contained if xx <= v || v <= yy.  (This is identical to the
+         * time zone month containment logic.)
+         */
+        for (int i=0; i<pairs.length(); i+=2) {
+            char low = pairs.charAt(i);
+            char high = pairs.charAt(i+1);
+            if ((low & 0xFF00) == (high & 0xFF00)) {
+                if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
+                    return true;
+                }
+            } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
    /**
     * Adds the specified range to this set if it is not already
     * present.  If this set already contains the specified range,