mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 23:10:40 +00:00
Add rule indexing, and move masking check to TransliterationRuleSet.
X-SVN-Rev: 486
This commit is contained in:
parent
920e2757f9
commit
8f00680b7b
8 changed files with 426 additions and 122 deletions
|
@ -181,9 +181,12 @@ import java.util.Vector;
|
|||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.5 $ $Date: 1999/12/22 01:40:54 $
|
||||
* @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.6 $ $Date: 2000/01/04 21:43:57 $
|
||||
*
|
||||
* $Log: RuleBasedTransliterator.java,v $
|
||||
* Revision 1.6 2000/01/04 21:43:57 Alan
|
||||
* Add rule indexing, and move masking check to TransliterationRuleSet.
|
||||
*
|
||||
* Revision 1.5 1999/12/22 01:40:54 Alan
|
||||
* Consolidate rule pattern anteContext, key, and postContext into one string.
|
||||
*
|
||||
|
@ -632,25 +635,14 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||
i = limit + 1;
|
||||
}
|
||||
|
||||
// Check for masking, O(n^2).
|
||||
// Build time, no checking : 3400 ms
|
||||
// Build time, with checking: 8200 ms
|
||||
if (CHECK_MASKING) {
|
||||
n = data.ruleSet.size();
|
||||
for (i=0; i<n-1; ++i) {
|
||||
TransliterationRule r1 = data.ruleSet.elementAt(i);
|
||||
// Earlier rules must not mask later ones
|
||||
for (int j=i+1; j<n; ++j) {
|
||||
TransliterationRule r2 = data.ruleSet.elementAt(j);
|
||||
if (r1.masks(r2)) {
|
||||
if (errors == null) {
|
||||
errors = new StringBuffer();
|
||||
} else {
|
||||
errors.append("\n");
|
||||
}
|
||||
errors.append("Rule " + r1 + " masks " + r2);
|
||||
}
|
||||
}
|
||||
// Index the rules
|
||||
try {
|
||||
data.ruleSet.freeze(data.setVariables);
|
||||
} catch (IllegalArgumentException e) {
|
||||
if (errors == null) {
|
||||
errors = new StringBuffer(e.getMessage());
|
||||
} else {
|
||||
errors.append("\n").append(e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -21,9 +21,12 @@ import java.util.Dictionary;
|
|||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.4 $ $Date: 1999/12/22 01:40:54 $
|
||||
* @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.5 $ $Date: 2000/01/04 21:43:57 $
|
||||
*
|
||||
* $Log: TransliterationRule.java,v $
|
||||
* Revision 1.5 2000/01/04 21:43:57 Alan
|
||||
* Add rule indexing, and move masking check to TransliterationRuleSet.
|
||||
*
|
||||
* Revision 1.4 1999/12/22 01:40:54 Alan
|
||||
* Consolidate rule pattern anteContext, key, and postContext into one string.
|
||||
*
|
||||
|
@ -164,6 +167,32 @@ class TransliterationRule {
|
|||
return anteContextLength;
|
||||
}
|
||||
|
||||
/**
|
||||
* Internal method. Returns 8-bit index value for this rule.
|
||||
* This is the low byte of the first character of the key,
|
||||
* unless the first character of the key is a set. If it's a
|
||||
* set, the index value is -1.
|
||||
*/
|
||||
final int getIndexValue(Dictionary variables) {
|
||||
char c = pattern.charAt(anteContextLength);
|
||||
return variables.get(new Character(c)) == null ? (c & 0xFF) : -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Internal method. Returns true if this rule matches the given
|
||||
* index value. The index value is an 8-bit integer, 0..255,
|
||||
* representing the low byte of the first character of the key.
|
||||
* It matches this rule if it matches the first character of the
|
||||
* key, or if the first character of the key is a set, and the set
|
||||
* contains any character with a low byte equal to the index
|
||||
* value.
|
||||
*/
|
||||
final boolean matchesIndexValue(int v, Dictionary variables) {
|
||||
char c = pattern.charAt(anteContextLength);
|
||||
UnicodeSet set = (UnicodeSet) variables.get(new Character(c));
|
||||
return set == null ? (c & 0xFF) == v : set.containsIndexValue(v);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true if this rule masks another rule. If r1 masks r2 then
|
||||
* r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks
|
||||
|
@ -449,8 +478,8 @@ class TransliterationRule {
|
|||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
*/
|
||||
protected static boolean charMatches(char keyChar, char textChar,
|
||||
Dictionary variables, UnicodeFilter filter) {
|
||||
protected static final boolean charMatches(char keyChar, char textChar,
|
||||
Dictionary variables, UnicodeFilter filter) {
|
||||
UnicodeSet set = null;
|
||||
return (filter == null || filter.isIn(textChar)) &&
|
||||
((set = (UnicodeSet) variables.get(new Character(keyChar)))
|
||||
|
|
|
@ -15,9 +15,12 @@ import java.util.*;
|
|||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: TransliterationRuleSet.java,v $ $Revision: 1.4 $ $Date: 1999/12/22 01:40:54 $
|
||||
* @version $RCSfile: TransliterationRuleSet.java,v $ $Revision: 1.5 $ $Date: 2000/01/04 21:43:57 $
|
||||
*
|
||||
* $Log: TransliterationRuleSet.java,v $
|
||||
* Revision 1.5 2000/01/04 21:43:57 Alan
|
||||
* Add rule indexing, and move masking check to TransliterationRuleSet.
|
||||
*
|
||||
* Revision 1.4 1999/12/22 01:40:54 Alan
|
||||
* Consolidate rule pattern anteContext, key, and postContext into one string.
|
||||
*
|
||||
|
@ -29,30 +32,31 @@ import java.util.*;
|
|||
*
|
||||
*/
|
||||
class TransliterationRuleSet {
|
||||
/* Note: There was an old implementation that indexed by first letter of
|
||||
* key. Problem with this is that key may not have a meaningful first
|
||||
* letter; e.g., {Lu}>*. One solution is to keep a separate vector of all
|
||||
* rules whose initial key letter is a category variable. However, the
|
||||
* problem is that they must be kept in order with respect to other rules.
|
||||
* One solution -- add a sequence number to each rule. Do the usual
|
||||
* first-letter lookup, and also a lookup from the spare bin with rules like
|
||||
* {Lu}>*. Take the lower sequence number. This seems complex and not
|
||||
* worth the trouble, but we may revisit this later. For documentation (or
|
||||
* possible resurrection) the old code is included below, commented out
|
||||
* with the remark "// OLD INDEXED IMPLEMENTATION". Under the old
|
||||
* implementation, <code>rules</code> is a Hashtable, not a Vector.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Vector of rules, in the order added.
|
||||
* Vector of rules, in the order added. This is only used while the rule
|
||||
* set is getting built. After that, freeze() reorders and indexes the
|
||||
* rules, and this Vector is freed.
|
||||
*/
|
||||
private Vector rules;
|
||||
private Vector ruleVector;
|
||||
|
||||
/**
|
||||
* Length of the longest preceding context
|
||||
*/
|
||||
private int maxContextLength;
|
||||
|
||||
/**
|
||||
* Sorted and indexed table of rules. This is created by freeze() from
|
||||
* the rules in ruleVector.
|
||||
*/
|
||||
private TransliterationRule[] rules;
|
||||
|
||||
/**
|
||||
* Index table. For text having a first character c, compute x = c&0xFF.
|
||||
* Now use rules[index[x]..index[x+1]-1]. This index table is created by
|
||||
* freeze().
|
||||
*/
|
||||
private int[] index;
|
||||
|
||||
private static final String COPYRIGHT =
|
||||
"\u00A9 IBM Corporation 1999. All rights reserved.";
|
||||
|
||||
|
@ -60,7 +64,7 @@ class TransliterationRuleSet {
|
|||
* Construct a new empty rule set.
|
||||
*/
|
||||
public TransliterationRuleSet() {
|
||||
rules = new Vector();
|
||||
ruleVector = new Vector();
|
||||
maxContextLength = 0;
|
||||
}
|
||||
|
||||
|
@ -78,19 +82,106 @@ class TransliterationRuleSet {
|
|||
* @param rule the rule to add
|
||||
*/
|
||||
public void addRule(TransliterationRule rule) {
|
||||
rules.addElement(rule);
|
||||
if (ruleVector == null) {
|
||||
throw new IllegalArgumentException("Cannot add rules after freezing");
|
||||
}
|
||||
ruleVector.addElement(rule);
|
||||
int len;
|
||||
if ((len = rule.getAnteContextLength()) > maxContextLength) {
|
||||
maxContextLength = len;
|
||||
}
|
||||
}
|
||||
|
||||
public final int size() {
|
||||
return rules.size();
|
||||
}
|
||||
/**
|
||||
* Close this rule set to further additions, check it for masked rules,
|
||||
* and index it to optimize performance. Once this method is called,
|
||||
* addRule() can no longer be called.
|
||||
* @exception IllegalArgumentException if some rules are masked
|
||||
*/
|
||||
public void freeze(Dictionary variables) {
|
||||
/* Construct the rule array and index table. We reorder the
|
||||
* rules by sorting them into 256 bins. Each bin contains all
|
||||
* rules matching the index value for that bin. A rule
|
||||
* matches an index value if a string whose first key character
|
||||
* has a low byte equal to the index value can match the rule.
|
||||
*
|
||||
* Each bin contains zero or more rules, in the same order
|
||||
* they were found originally. However, the total rules in
|
||||
* the bins may exceed the number in the original vector,
|
||||
* since rules that have a variable as their first key
|
||||
* character will generally fall into more than one bin.
|
||||
*
|
||||
* That is, each bin contains all rules that either have that
|
||||
* first index value as their first key character, or have
|
||||
* a set containing the index value as their first character.
|
||||
*/
|
||||
int n = ruleVector.size();
|
||||
index = new int[257]; // [sic]
|
||||
Vector v = new Vector(2*n); // heuristic; adjust as needed
|
||||
|
||||
public final TransliterationRule elementAt(int i) {
|
||||
return (TransliterationRule) rules.elementAt(i);
|
||||
/* Precompute the index values. This saves a LOT of time.
|
||||
*/
|
||||
int[] indexValue = new int[n];
|
||||
for (int j=0; j<n; ++j) {
|
||||
TransliterationRule r = (TransliterationRule) ruleVector.elementAt(j);
|
||||
indexValue[j] = r.getIndexValue(variables);
|
||||
}
|
||||
for (int x=0; x<256; ++x) {
|
||||
index[x] = v.size();
|
||||
for (int j=0; j<n; ++j) {
|
||||
if (indexValue[j] >= 0) {
|
||||
if (indexValue[j] == x) {
|
||||
v.addElement(ruleVector.elementAt(j));
|
||||
}
|
||||
} else {
|
||||
// If the indexValue is < 0, then the first key character is
|
||||
// a set, and we must use the more time-consuming
|
||||
// matchesIndexValue check. In practice this happens
|
||||
// rarely, so we seldom tread this code path.
|
||||
TransliterationRule r = (TransliterationRule) ruleVector.elementAt(j);
|
||||
if (r.matchesIndexValue(x, variables)) {
|
||||
v.addElement(r);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
index[256] = v.size();
|
||||
|
||||
/* Freeze things into an array.
|
||||
*/
|
||||
rules = new TransliterationRule[v.size()];
|
||||
v.copyInto(rules);
|
||||
ruleVector = null;
|
||||
|
||||
StringBuffer errors = null;
|
||||
|
||||
/* Check for masking. This is MUCH faster than our old check,
|
||||
* which was each rule against each following rule, since we
|
||||
* only have to check for masking within each bin now. It's
|
||||
* 256*O(n2^2) instead of O(n1^2), where n1 is the total rule
|
||||
* count, and n2 is the per-bin rule count. But n2<<n1, so
|
||||
* it's a big win.
|
||||
*/
|
||||
for (int x=0; x<256; ++x) {
|
||||
for (int j=index[x]; j<index[x+1]-1; ++j) {
|
||||
TransliterationRule r1 = rules[j];
|
||||
for (int k=j+1; k<index[x+1]; ++k) {
|
||||
TransliterationRule r2 = rules[k];
|
||||
if (r1.masks(r2)) {
|
||||
if (errors == null) {
|
||||
errors = new StringBuffer();
|
||||
} else {
|
||||
errors.append("\n");
|
||||
}
|
||||
errors.append("Rule " + r1 + " masks " + r2);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (errors != null) {
|
||||
throw new IllegalArgumentException(errors.toString());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -118,15 +209,21 @@ class TransliterationRuleSet {
|
|||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
* @return the matching rule, or null if none found.
|
||||
|
||||
*/
|
||||
public TransliterationRule findMatch(String text, int start, int limit,
|
||||
StringBuffer result, int cursor,
|
||||
Dictionary variables,
|
||||
UnicodeFilter filter) {
|
||||
for (Enumeration e = rules.elements(); e.hasMoreElements(); ) {
|
||||
TransliterationRule rule = (TransliterationRule) e.nextElement();
|
||||
if (rule.matches(text, start, limit, result, cursor, variables, filter)) {
|
||||
return rule;
|
||||
/* We only need to check our indexed bin of the rule table,
|
||||
* based on the low byte of the first key character.
|
||||
*/
|
||||
int rlen = result.length();
|
||||
int x = 0xFF & (cursor < rlen ? result.charAt(cursor)
|
||||
: text.charAt(cursor - rlen + start));
|
||||
for (int i=index[x]; i<index[x+1]; ++i) {
|
||||
if (rules[i].matches(text, start, limit, result, cursor, variables, filter)) {
|
||||
return rules[i];
|
||||
}
|
||||
}
|
||||
return null;
|
||||
|
@ -154,10 +251,13 @@ class TransliterationRuleSet {
|
|||
int cursor,
|
||||
Dictionary variables,
|
||||
UnicodeFilter filter) {
|
||||
for (Enumeration e = rules.elements(); e.hasMoreElements(); ) {
|
||||
TransliterationRule rule = (TransliterationRule) e.nextElement();
|
||||
if (rule.matches(text, start, limit, cursor, variables, filter)) {
|
||||
return rule;
|
||||
/* We only need to check our indexed bin of the rule table,
|
||||
* based on the low byte of the first key character.
|
||||
*/
|
||||
int x = text.charAt(cursor) & 0xFF;
|
||||
for (int i=index[x]; i<index[x+1]; ++i) {
|
||||
if (rules[i].matches(text, start, limit, cursor, variables, filter)) {
|
||||
return rules[i];
|
||||
}
|
||||
}
|
||||
return null;
|
||||
|
@ -195,14 +295,17 @@ class TransliterationRuleSet {
|
|||
Dictionary variables,
|
||||
boolean partial[],
|
||||
UnicodeFilter filter) {
|
||||
/* We only need to check our indexed bin of the rule table,
|
||||
* based on the low byte of the first key character.
|
||||
*/
|
||||
partial[0] = false;
|
||||
for (Enumeration e = rules.elements(); e.hasMoreElements(); ) {
|
||||
TransliterationRule rule = (TransliterationRule) e.nextElement();
|
||||
int match = rule.getMatchDegree(text, start, limit, cursor,
|
||||
variables, filter);
|
||||
int x = text.charAt(cursor) & 0xFF;
|
||||
for (int i=index[x]; i<index[x+1]; ++i) {
|
||||
int match = rules[i].getMatchDegree(text, start, limit, cursor,
|
||||
variables, filter);
|
||||
switch (match) {
|
||||
case TransliterationRule.FULL_MATCH:
|
||||
return rule;
|
||||
return rules[i];
|
||||
case TransliterationRule.PARTIAL_MATCH:
|
||||
partial[0] = true;
|
||||
return null;
|
||||
|
|
|
@ -225,7 +225,7 @@ import java.text.*;
|
|||
* *Unsupported by Java (and hence unsupported by UnicodeSet).
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $ */
|
||||
* @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.2 $ $Date: 2000/01/04 21:43:58 $ */
|
||||
public class UnicodeSet {
|
||||
/**
|
||||
* The internal representation is a StringBuffer of even length.
|
||||
|
@ -456,6 +456,34 @@ public class UnicodeSet {
|
|||
return contains(c, c);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns <tt>true</tt> if this set contains any character whose low byte
|
||||
* is the given value. This is used by <tt>RuleBasedTransliterator</tt> for
|
||||
* indexing.
|
||||
*/
|
||||
public boolean containsIndexValue(int v) {
|
||||
/* The index value v, in the range [0,255], is contained in this set if
|
||||
* it is contained in any pair of this set. Pairs either have the high
|
||||
* bytes equal, or unequal. If the high bytes are equal, then we have
|
||||
* aaxx..aayy, where aa is the high byte. Then v is contained if xx <=
|
||||
* v <= yy. If the high bytes are unequal we have aaxx..bbyy, bb>aa.
|
||||
* Then v is contained if xx <= v || v <= yy. (This is identical to the
|
||||
* time zone month containment logic.)
|
||||
*/
|
||||
for (int i=0; i<pairs.length(); i+=2) {
|
||||
char low = pairs.charAt(i);
|
||||
char high = pairs.charAt(i+1);
|
||||
if ((low & 0xFF00) == (high & 0xFF00)) {
|
||||
if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
|
||||
return true;
|
||||
}
|
||||
} else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds the specified range to this set if it is not already
|
||||
* present. If this set already contains the specified range,
|
||||
|
|
|
@ -181,9 +181,12 @@ import java.util.Vector;
|
|||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.5 $ $Date: 1999/12/22 01:40:54 $
|
||||
* @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.6 $ $Date: 2000/01/04 21:43:57 $
|
||||
*
|
||||
* $Log: RuleBasedTransliterator.java,v $
|
||||
* Revision 1.6 2000/01/04 21:43:57 Alan
|
||||
* Add rule indexing, and move masking check to TransliterationRuleSet.
|
||||
*
|
||||
* Revision 1.5 1999/12/22 01:40:54 Alan
|
||||
* Consolidate rule pattern anteContext, key, and postContext into one string.
|
||||
*
|
||||
|
@ -632,25 +635,14 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||
i = limit + 1;
|
||||
}
|
||||
|
||||
// Check for masking, O(n^2).
|
||||
// Build time, no checking : 3400 ms
|
||||
// Build time, with checking: 8200 ms
|
||||
if (CHECK_MASKING) {
|
||||
n = data.ruleSet.size();
|
||||
for (i=0; i<n-1; ++i) {
|
||||
TransliterationRule r1 = data.ruleSet.elementAt(i);
|
||||
// Earlier rules must not mask later ones
|
||||
for (int j=i+1; j<n; ++j) {
|
||||
TransliterationRule r2 = data.ruleSet.elementAt(j);
|
||||
if (r1.masks(r2)) {
|
||||
if (errors == null) {
|
||||
errors = new StringBuffer();
|
||||
} else {
|
||||
errors.append("\n");
|
||||
}
|
||||
errors.append("Rule " + r1 + " masks " + r2);
|
||||
}
|
||||
}
|
||||
// Index the rules
|
||||
try {
|
||||
data.ruleSet.freeze(data.setVariables);
|
||||
} catch (IllegalArgumentException e) {
|
||||
if (errors == null) {
|
||||
errors = new StringBuffer(e.getMessage());
|
||||
} else {
|
||||
errors.append("\n").append(e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -21,9 +21,12 @@ import java.util.Dictionary;
|
|||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.4 $ $Date: 1999/12/22 01:40:54 $
|
||||
* @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.5 $ $Date: 2000/01/04 21:43:57 $
|
||||
*
|
||||
* $Log: TransliterationRule.java,v $
|
||||
* Revision 1.5 2000/01/04 21:43:57 Alan
|
||||
* Add rule indexing, and move masking check to TransliterationRuleSet.
|
||||
*
|
||||
* Revision 1.4 1999/12/22 01:40:54 Alan
|
||||
* Consolidate rule pattern anteContext, key, and postContext into one string.
|
||||
*
|
||||
|
@ -164,6 +167,32 @@ class TransliterationRule {
|
|||
return anteContextLength;
|
||||
}
|
||||
|
||||
/**
|
||||
* Internal method. Returns 8-bit index value for this rule.
|
||||
* This is the low byte of the first character of the key,
|
||||
* unless the first character of the key is a set. If it's a
|
||||
* set, the index value is -1.
|
||||
*/
|
||||
final int getIndexValue(Dictionary variables) {
|
||||
char c = pattern.charAt(anteContextLength);
|
||||
return variables.get(new Character(c)) == null ? (c & 0xFF) : -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Internal method. Returns true if this rule matches the given
|
||||
* index value. The index value is an 8-bit integer, 0..255,
|
||||
* representing the low byte of the first character of the key.
|
||||
* It matches this rule if it matches the first character of the
|
||||
* key, or if the first character of the key is a set, and the set
|
||||
* contains any character with a low byte equal to the index
|
||||
* value.
|
||||
*/
|
||||
final boolean matchesIndexValue(int v, Dictionary variables) {
|
||||
char c = pattern.charAt(anteContextLength);
|
||||
UnicodeSet set = (UnicodeSet) variables.get(new Character(c));
|
||||
return set == null ? (c & 0xFF) == v : set.containsIndexValue(v);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true if this rule masks another rule. If r1 masks r2 then
|
||||
* r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks
|
||||
|
@ -449,8 +478,8 @@ class TransliterationRule {
|
|||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
*/
|
||||
protected static boolean charMatches(char keyChar, char textChar,
|
||||
Dictionary variables, UnicodeFilter filter) {
|
||||
protected static final boolean charMatches(char keyChar, char textChar,
|
||||
Dictionary variables, UnicodeFilter filter) {
|
||||
UnicodeSet set = null;
|
||||
return (filter == null || filter.isIn(textChar)) &&
|
||||
((set = (UnicodeSet) variables.get(new Character(keyChar)))
|
||||
|
|
|
@ -15,9 +15,12 @@ import java.util.*;
|
|||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: TransliterationRuleSet.java,v $ $Revision: 1.4 $ $Date: 1999/12/22 01:40:54 $
|
||||
* @version $RCSfile: TransliterationRuleSet.java,v $ $Revision: 1.5 $ $Date: 2000/01/04 21:43:57 $
|
||||
*
|
||||
* $Log: TransliterationRuleSet.java,v $
|
||||
* Revision 1.5 2000/01/04 21:43:57 Alan
|
||||
* Add rule indexing, and move masking check to TransliterationRuleSet.
|
||||
*
|
||||
* Revision 1.4 1999/12/22 01:40:54 Alan
|
||||
* Consolidate rule pattern anteContext, key, and postContext into one string.
|
||||
*
|
||||
|
@ -29,30 +32,31 @@ import java.util.*;
|
|||
*
|
||||
*/
|
||||
class TransliterationRuleSet {
|
||||
/* Note: There was an old implementation that indexed by first letter of
|
||||
* key. Problem with this is that key may not have a meaningful first
|
||||
* letter; e.g., {Lu}>*. One solution is to keep a separate vector of all
|
||||
* rules whose initial key letter is a category variable. However, the
|
||||
* problem is that they must be kept in order with respect to other rules.
|
||||
* One solution -- add a sequence number to each rule. Do the usual
|
||||
* first-letter lookup, and also a lookup from the spare bin with rules like
|
||||
* {Lu}>*. Take the lower sequence number. This seems complex and not
|
||||
* worth the trouble, but we may revisit this later. For documentation (or
|
||||
* possible resurrection) the old code is included below, commented out
|
||||
* with the remark "// OLD INDEXED IMPLEMENTATION". Under the old
|
||||
* implementation, <code>rules</code> is a Hashtable, not a Vector.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Vector of rules, in the order added.
|
||||
* Vector of rules, in the order added. This is only used while the rule
|
||||
* set is getting built. After that, freeze() reorders and indexes the
|
||||
* rules, and this Vector is freed.
|
||||
*/
|
||||
private Vector rules;
|
||||
private Vector ruleVector;
|
||||
|
||||
/**
|
||||
* Length of the longest preceding context
|
||||
*/
|
||||
private int maxContextLength;
|
||||
|
||||
/**
|
||||
* Sorted and indexed table of rules. This is created by freeze() from
|
||||
* the rules in ruleVector.
|
||||
*/
|
||||
private TransliterationRule[] rules;
|
||||
|
||||
/**
|
||||
* Index table. For text having a first character c, compute x = c&0xFF.
|
||||
* Now use rules[index[x]..index[x+1]-1]. This index table is created by
|
||||
* freeze().
|
||||
*/
|
||||
private int[] index;
|
||||
|
||||
private static final String COPYRIGHT =
|
||||
"\u00A9 IBM Corporation 1999. All rights reserved.";
|
||||
|
||||
|
@ -60,7 +64,7 @@ class TransliterationRuleSet {
|
|||
* Construct a new empty rule set.
|
||||
*/
|
||||
public TransliterationRuleSet() {
|
||||
rules = new Vector();
|
||||
ruleVector = new Vector();
|
||||
maxContextLength = 0;
|
||||
}
|
||||
|
||||
|
@ -78,19 +82,106 @@ class TransliterationRuleSet {
|
|||
* @param rule the rule to add
|
||||
*/
|
||||
public void addRule(TransliterationRule rule) {
|
||||
rules.addElement(rule);
|
||||
if (ruleVector == null) {
|
||||
throw new IllegalArgumentException("Cannot add rules after freezing");
|
||||
}
|
||||
ruleVector.addElement(rule);
|
||||
int len;
|
||||
if ((len = rule.getAnteContextLength()) > maxContextLength) {
|
||||
maxContextLength = len;
|
||||
}
|
||||
}
|
||||
|
||||
public final int size() {
|
||||
return rules.size();
|
||||
}
|
||||
/**
|
||||
* Close this rule set to further additions, check it for masked rules,
|
||||
* and index it to optimize performance. Once this method is called,
|
||||
* addRule() can no longer be called.
|
||||
* @exception IllegalArgumentException if some rules are masked
|
||||
*/
|
||||
public void freeze(Dictionary variables) {
|
||||
/* Construct the rule array and index table. We reorder the
|
||||
* rules by sorting them into 256 bins. Each bin contains all
|
||||
* rules matching the index value for that bin. A rule
|
||||
* matches an index value if a string whose first key character
|
||||
* has a low byte equal to the index value can match the rule.
|
||||
*
|
||||
* Each bin contains zero or more rules, in the same order
|
||||
* they were found originally. However, the total rules in
|
||||
* the bins may exceed the number in the original vector,
|
||||
* since rules that have a variable as their first key
|
||||
* character will generally fall into more than one bin.
|
||||
*
|
||||
* That is, each bin contains all rules that either have that
|
||||
* first index value as their first key character, or have
|
||||
* a set containing the index value as their first character.
|
||||
*/
|
||||
int n = ruleVector.size();
|
||||
index = new int[257]; // [sic]
|
||||
Vector v = new Vector(2*n); // heuristic; adjust as needed
|
||||
|
||||
public final TransliterationRule elementAt(int i) {
|
||||
return (TransliterationRule) rules.elementAt(i);
|
||||
/* Precompute the index values. This saves a LOT of time.
|
||||
*/
|
||||
int[] indexValue = new int[n];
|
||||
for (int j=0; j<n; ++j) {
|
||||
TransliterationRule r = (TransliterationRule) ruleVector.elementAt(j);
|
||||
indexValue[j] = r.getIndexValue(variables);
|
||||
}
|
||||
for (int x=0; x<256; ++x) {
|
||||
index[x] = v.size();
|
||||
for (int j=0; j<n; ++j) {
|
||||
if (indexValue[j] >= 0) {
|
||||
if (indexValue[j] == x) {
|
||||
v.addElement(ruleVector.elementAt(j));
|
||||
}
|
||||
} else {
|
||||
// If the indexValue is < 0, then the first key character is
|
||||
// a set, and we must use the more time-consuming
|
||||
// matchesIndexValue check. In practice this happens
|
||||
// rarely, so we seldom tread this code path.
|
||||
TransliterationRule r = (TransliterationRule) ruleVector.elementAt(j);
|
||||
if (r.matchesIndexValue(x, variables)) {
|
||||
v.addElement(r);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
index[256] = v.size();
|
||||
|
||||
/* Freeze things into an array.
|
||||
*/
|
||||
rules = new TransliterationRule[v.size()];
|
||||
v.copyInto(rules);
|
||||
ruleVector = null;
|
||||
|
||||
StringBuffer errors = null;
|
||||
|
||||
/* Check for masking. This is MUCH faster than our old check,
|
||||
* which was each rule against each following rule, since we
|
||||
* only have to check for masking within each bin now. It's
|
||||
* 256*O(n2^2) instead of O(n1^2), where n1 is the total rule
|
||||
* count, and n2 is the per-bin rule count. But n2<<n1, so
|
||||
* it's a big win.
|
||||
*/
|
||||
for (int x=0; x<256; ++x) {
|
||||
for (int j=index[x]; j<index[x+1]-1; ++j) {
|
||||
TransliterationRule r1 = rules[j];
|
||||
for (int k=j+1; k<index[x+1]; ++k) {
|
||||
TransliterationRule r2 = rules[k];
|
||||
if (r1.masks(r2)) {
|
||||
if (errors == null) {
|
||||
errors = new StringBuffer();
|
||||
} else {
|
||||
errors.append("\n");
|
||||
}
|
||||
errors.append("Rule " + r1 + " masks " + r2);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (errors != null) {
|
||||
throw new IllegalArgumentException(errors.toString());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -118,15 +209,21 @@ class TransliterationRuleSet {
|
|||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
* @return the matching rule, or null if none found.
|
||||
|
||||
*/
|
||||
public TransliterationRule findMatch(String text, int start, int limit,
|
||||
StringBuffer result, int cursor,
|
||||
Dictionary variables,
|
||||
UnicodeFilter filter) {
|
||||
for (Enumeration e = rules.elements(); e.hasMoreElements(); ) {
|
||||
TransliterationRule rule = (TransliterationRule) e.nextElement();
|
||||
if (rule.matches(text, start, limit, result, cursor, variables, filter)) {
|
||||
return rule;
|
||||
/* We only need to check our indexed bin of the rule table,
|
||||
* based on the low byte of the first key character.
|
||||
*/
|
||||
int rlen = result.length();
|
||||
int x = 0xFF & (cursor < rlen ? result.charAt(cursor)
|
||||
: text.charAt(cursor - rlen + start));
|
||||
for (int i=index[x]; i<index[x+1]; ++i) {
|
||||
if (rules[i].matches(text, start, limit, result, cursor, variables, filter)) {
|
||||
return rules[i];
|
||||
}
|
||||
}
|
||||
return null;
|
||||
|
@ -154,10 +251,13 @@ class TransliterationRuleSet {
|
|||
int cursor,
|
||||
Dictionary variables,
|
||||
UnicodeFilter filter) {
|
||||
for (Enumeration e = rules.elements(); e.hasMoreElements(); ) {
|
||||
TransliterationRule rule = (TransliterationRule) e.nextElement();
|
||||
if (rule.matches(text, start, limit, cursor, variables, filter)) {
|
||||
return rule;
|
||||
/* We only need to check our indexed bin of the rule table,
|
||||
* based on the low byte of the first key character.
|
||||
*/
|
||||
int x = text.charAt(cursor) & 0xFF;
|
||||
for (int i=index[x]; i<index[x+1]; ++i) {
|
||||
if (rules[i].matches(text, start, limit, cursor, variables, filter)) {
|
||||
return rules[i];
|
||||
}
|
||||
}
|
||||
return null;
|
||||
|
@ -195,14 +295,17 @@ class TransliterationRuleSet {
|
|||
Dictionary variables,
|
||||
boolean partial[],
|
||||
UnicodeFilter filter) {
|
||||
/* We only need to check our indexed bin of the rule table,
|
||||
* based on the low byte of the first key character.
|
||||
*/
|
||||
partial[0] = false;
|
||||
for (Enumeration e = rules.elements(); e.hasMoreElements(); ) {
|
||||
TransliterationRule rule = (TransliterationRule) e.nextElement();
|
||||
int match = rule.getMatchDegree(text, start, limit, cursor,
|
||||
variables, filter);
|
||||
int x = text.charAt(cursor) & 0xFF;
|
||||
for (int i=index[x]; i<index[x+1]; ++i) {
|
||||
int match = rules[i].getMatchDegree(text, start, limit, cursor,
|
||||
variables, filter);
|
||||
switch (match) {
|
||||
case TransliterationRule.FULL_MATCH:
|
||||
return rule;
|
||||
return rules[i];
|
||||
case TransliterationRule.PARTIAL_MATCH:
|
||||
partial[0] = true;
|
||||
return null;
|
||||
|
|
|
@ -225,7 +225,7 @@ import java.text.*;
|
|||
* *Unsupported by Java (and hence unsupported by UnicodeSet).
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $ */
|
||||
* @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.2 $ $Date: 2000/01/04 21:43:58 $ */
|
||||
public class UnicodeSet {
|
||||
/**
|
||||
* The internal representation is a StringBuffer of even length.
|
||||
|
@ -456,6 +456,34 @@ public class UnicodeSet {
|
|||
return contains(c, c);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns <tt>true</tt> if this set contains any character whose low byte
|
||||
* is the given value. This is used by <tt>RuleBasedTransliterator</tt> for
|
||||
* indexing.
|
||||
*/
|
||||
public boolean containsIndexValue(int v) {
|
||||
/* The index value v, in the range [0,255], is contained in this set if
|
||||
* it is contained in any pair of this set. Pairs either have the high
|
||||
* bytes equal, or unequal. If the high bytes are equal, then we have
|
||||
* aaxx..aayy, where aa is the high byte. Then v is contained if xx <=
|
||||
* v <= yy. If the high bytes are unequal we have aaxx..bbyy, bb>aa.
|
||||
* Then v is contained if xx <= v || v <= yy. (This is identical to the
|
||||
* time zone month containment logic.)
|
||||
*/
|
||||
for (int i=0; i<pairs.length(); i+=2) {
|
||||
char low = pairs.charAt(i);
|
||||
char high = pairs.charAt(i+1);
|
||||
if ((low & 0xFF00) == (high & 0xFF00)) {
|
||||
if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
|
||||
return true;
|
||||
}
|
||||
} else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds the specified range to this set if it is not already
|
||||
* present. If this set already contains the specified range,
|
||||
|
|
Loading…
Add table
Reference in a new issue