mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-13 08:53:20 +00:00
Improve masking checking; turn it off by default, for better performance
X-SVN-Rev: 455
This commit is contained in:
parent
96844ec160
commit
78c974e7ba
6 changed files with 176 additions and 78 deletions
|
@ -181,7 +181,12 @@ import java.util.Vector;
|
|||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.3 $ $Date: 1999/12/20 20:25:00 $
|
||||
* @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.4 $ $Date: 1999/12/22 01:05:54 $
|
||||
*
|
||||
* $Log: RuleBasedTransliterator.java,v $
|
||||
* Revision 1.4 1999/12/22 01:05:54 Alan
|
||||
* Improve masking checking; turn it off by default, for better performance
|
||||
*
|
||||
*/
|
||||
public class RuleBasedTransliterator extends Transliterator {
|
||||
/**
|
||||
|
@ -200,6 +205,8 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||
|
||||
static final boolean DEBUG = false;
|
||||
|
||||
static final boolean CHECK_MASKING = false;
|
||||
|
||||
private static final String COPYRIGHT =
|
||||
"\u00A9 IBM Corporation 1999. All rights reserved.";
|
||||
|
||||
|
@ -593,6 +600,7 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||
private void parseRules() {
|
||||
determineVariableRange();
|
||||
|
||||
StringBuffer errors = null;
|
||||
int n = rules.length();
|
||||
int i = 0;
|
||||
while (i<n) {
|
||||
|
@ -608,11 +616,45 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||
}
|
||||
// Skip over empty lines and line starting with #
|
||||
if (limit > i && rules.charAt(i) != RULE_COMMENT_CHAR) {
|
||||
applyRule(i, limit);
|
||||
try {
|
||||
applyRule(i, limit);
|
||||
} catch (IllegalArgumentException e) {
|
||||
if (errors == null) {
|
||||
errors = new StringBuffer(e.getMessage());
|
||||
} else {
|
||||
errors.append("\n").append(e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
i = limit + 1;
|
||||
}
|
||||
|
||||
// Check for masking, O(n^2).
|
||||
// Build time, no checking : 3400 ms
|
||||
// Build time, with checking: 8200 ms
|
||||
if (CHECK_MASKING) {
|
||||
n = data.ruleSet.size();
|
||||
for (i=0; i<n-1; ++i) {
|
||||
TransliterationRule r1 = data.ruleSet.elementAt(i);
|
||||
// Earlier rules must not mask later ones
|
||||
for (int j=i+1; j<n; ++j) {
|
||||
TransliterationRule r2 = data.ruleSet.elementAt(j);
|
||||
if (r1.masks(r2)) {
|
||||
if (errors == null) {
|
||||
errors = new StringBuffer();
|
||||
} else {
|
||||
errors.append("\n");
|
||||
}
|
||||
errors.append("Rule " + r1 + " masks " + r2);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (errors != null) {
|
||||
throw new IllegalArgumentException(errors.toString());
|
||||
}
|
||||
|
||||
data.ruleSet.freeze();
|
||||
}
|
||||
|
||||
|
|
|
@ -21,9 +21,12 @@ import java.util.Dictionary;
|
|||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.2 $ $Date: 1999/12/21 23:58:44 $
|
||||
* @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.3 $ $Date: 1999/12/22 01:05:54 $
|
||||
*
|
||||
* $Log: TransliterationRule.java,v $
|
||||
* Revision 1.3 1999/12/22 01:05:54 Alan
|
||||
* Improve masking checking; turn it off by default, for better performance
|
||||
*
|
||||
* Revision 1.2 1999/12/21 23:58:44 Alan
|
||||
* Detect a>x masking a>y
|
||||
*
|
||||
|
@ -88,7 +91,8 @@ class TransliterationRule {
|
|||
private int cursorPos;
|
||||
|
||||
/**
|
||||
* A string used to implement masks().
|
||||
* A string used to implement masks(). It is the concatenated anteContext,
|
||||
* key, and postContext. See freeze() method.
|
||||
*/
|
||||
private String maskKey;
|
||||
|
||||
|
@ -134,7 +138,7 @@ class TransliterationRule {
|
|||
* This is what the freeze() method does. After freeze() has been
|
||||
* called, the method masks() must NOT be called.
|
||||
*/
|
||||
maskKey = key;
|
||||
maskKey = anteContext != null ? (anteContext + key) : key;
|
||||
if (postContext != null) {
|
||||
maskKey += postContext;
|
||||
}
|
||||
|
@ -190,17 +194,21 @@ class TransliterationRule {
|
|||
* <p>This method must not be called after freeze() is called.
|
||||
*/
|
||||
public boolean masks(TransliterationRule r2) {
|
||||
/* There are three cases of masking. In each instance, rule1
|
||||
* masks rule2.
|
||||
/* Rule r1 masks rule r2 if the string formed of the
|
||||
* antecontext, key, and postcontext overlaps in the following
|
||||
* way:
|
||||
*
|
||||
* 1. KEY mask: len(key1) <= len(key2), key2 starts with key1.
|
||||
* 1<2 detects a>b masking ab>c; 1=2 detects a>b masking a>c.
|
||||
*
|
||||
* 2. PREFIX mask: key1 == key2, len(prefix1) < len(prefix2),
|
||||
* prefix2 ends with prefix1, suffix2 starts with suffix1.
|
||||
*
|
||||
* 3. SUFFIX mask: key1 == key2, len(suffix1) < len(suffix2),
|
||||
* prefix2 ends with prefix1, suffix2 starts with suffix1.
|
||||
* r1: aakkkpppp
|
||||
* r2: aaakkkkkpppp
|
||||
* ^
|
||||
*
|
||||
* The strings must be aligned at the first character of the
|
||||
* key. The length of r1 to the left of the alignment point
|
||||
* must be <= the length of r2 to the left; ditto for the
|
||||
* right. The characters of r1 must equal (or be a superset
|
||||
* of) the corresponding characters of r2. The superset
|
||||
* operation should be performed to check for UnicodeSet
|
||||
* masking.
|
||||
*/
|
||||
|
||||
/* LIMITATION of the current mask algorithm: Some rule
|
||||
|
@ -210,13 +218,13 @@ class TransliterationRule {
|
|||
* currently do not have. This can be added later.
|
||||
*/
|
||||
|
||||
// maskKey = key + postContext
|
||||
return ((maskKey.length() <= r2.maskKey.length() &&
|
||||
r2.maskKey.startsWith(maskKey)) ||
|
||||
(r2.anteContext != null && maskKey.equals(r2.maskKey) &&
|
||||
((anteContext == null) ||
|
||||
(anteContext.length() < r2.anteContext.length() &&
|
||||
r2.anteContext.endsWith(anteContext)))));
|
||||
// maskKey = anteContext + key + postContext
|
||||
int left = getAnteContextLength();
|
||||
int left2 = r2.getAnteContextLength();
|
||||
int right = maskKey.length() - left;
|
||||
int right2 = r2.maskKey.length() - left2;
|
||||
return left <= left2 && right <= right2 &&
|
||||
r2.maskKey.substring(left2 - left).startsWith(maskKey);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -232,7 +240,7 @@ class TransliterationRule {
|
|||
* @return string representation of this object
|
||||
*/
|
||||
public String toString() {
|
||||
return getClass().getName() + '['
|
||||
return getClass().getName() + '{'
|
||||
+ escape((anteContext != null ? ("[" + anteContext + ']') : "")
|
||||
+ key
|
||||
+ (postContext != null ? ("[" + postContext + ']') : "")
|
||||
|
@ -240,7 +248,7 @@ class TransliterationRule {
|
|||
+ (cursorPos < output.length()
|
||||
? (output.substring(0, cursorPos) + '|' + output.substring(cursorPos))
|
||||
: output))
|
||||
+ ']';
|
||||
+ '}';
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -15,9 +15,12 @@ import java.util.*;
|
|||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: TransliterationRuleSet.java,v $ $Revision: 1.2 $ $Date: 1999/12/22 00:01:36 $
|
||||
* @version $RCSfile: TransliterationRuleSet.java,v $ $Revision: 1.3 $ $Date: 1999/12/22 01:05:54 $
|
||||
*
|
||||
* $Log: TransliterationRuleSet.java,v $
|
||||
* Revision 1.3 1999/12/22 01:05:54 Alan
|
||||
* Improve masking checking; turn it off by default, for better performance
|
||||
*
|
||||
* Revision 1.2 1999/12/22 00:01:36 Alan
|
||||
* Detect a>x masking a>y
|
||||
*
|
||||
|
@ -74,18 +77,6 @@ class TransliterationRuleSet {
|
|||
* @param rule the rule to add
|
||||
*/
|
||||
public void addRule(TransliterationRule rule) {
|
||||
|
||||
// Build time, no checking : 3562 ms
|
||||
// Build time, with checking: 6234 ms
|
||||
|
||||
for (int i=0; i<rules.size(); ++i) {
|
||||
TransliterationRule r = (TransliterationRule) rules.elementAt(i);
|
||||
if (r.masks(rule)) {
|
||||
throw new IllegalArgumentException("Rule " + r +
|
||||
" masks " + rule);
|
||||
}
|
||||
}
|
||||
|
||||
rules.addElement(rule);
|
||||
int len;
|
||||
if ((len = rule.getAnteContextLength()) > maxContextLength) {
|
||||
|
@ -93,9 +84,17 @@ class TransliterationRuleSet {
|
|||
}
|
||||
}
|
||||
|
||||
public final int size() {
|
||||
return rules.size();
|
||||
}
|
||||
|
||||
public final TransliterationRule elementAt(int i) {
|
||||
return (TransliterationRule) rules.elementAt(i);
|
||||
}
|
||||
|
||||
/**
|
||||
* Free up space. Once this method is called, addRule() must NOT
|
||||
* be called again.
|
||||
* Free up space. Once this method is called, the maskKey is
|
||||
* invalid.
|
||||
*/
|
||||
public void freeze() {
|
||||
for (int i=0; i<rules.size(); ++i) {
|
||||
|
|
|
@ -181,7 +181,12 @@ import java.util.Vector;
|
|||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.3 $ $Date: 1999/12/20 20:25:00 $
|
||||
* @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.4 $ $Date: 1999/12/22 01:05:54 $
|
||||
*
|
||||
* $Log: RuleBasedTransliterator.java,v $
|
||||
* Revision 1.4 1999/12/22 01:05:54 Alan
|
||||
* Improve masking checking; turn it off by default, for better performance
|
||||
*
|
||||
*/
|
||||
public class RuleBasedTransliterator extends Transliterator {
|
||||
/**
|
||||
|
@ -200,6 +205,8 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||
|
||||
static final boolean DEBUG = false;
|
||||
|
||||
static final boolean CHECK_MASKING = false;
|
||||
|
||||
private static final String COPYRIGHT =
|
||||
"\u00A9 IBM Corporation 1999. All rights reserved.";
|
||||
|
||||
|
@ -593,6 +600,7 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||
private void parseRules() {
|
||||
determineVariableRange();
|
||||
|
||||
StringBuffer errors = null;
|
||||
int n = rules.length();
|
||||
int i = 0;
|
||||
while (i<n) {
|
||||
|
@ -608,11 +616,45 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||
}
|
||||
// Skip over empty lines and line starting with #
|
||||
if (limit > i && rules.charAt(i) != RULE_COMMENT_CHAR) {
|
||||
applyRule(i, limit);
|
||||
try {
|
||||
applyRule(i, limit);
|
||||
} catch (IllegalArgumentException e) {
|
||||
if (errors == null) {
|
||||
errors = new StringBuffer(e.getMessage());
|
||||
} else {
|
||||
errors.append("\n").append(e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
i = limit + 1;
|
||||
}
|
||||
|
||||
// Check for masking, O(n^2).
|
||||
// Build time, no checking : 3400 ms
|
||||
// Build time, with checking: 8200 ms
|
||||
if (CHECK_MASKING) {
|
||||
n = data.ruleSet.size();
|
||||
for (i=0; i<n-1; ++i) {
|
||||
TransliterationRule r1 = data.ruleSet.elementAt(i);
|
||||
// Earlier rules must not mask later ones
|
||||
for (int j=i+1; j<n; ++j) {
|
||||
TransliterationRule r2 = data.ruleSet.elementAt(j);
|
||||
if (r1.masks(r2)) {
|
||||
if (errors == null) {
|
||||
errors = new StringBuffer();
|
||||
} else {
|
||||
errors.append("\n");
|
||||
}
|
||||
errors.append("Rule " + r1 + " masks " + r2);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (errors != null) {
|
||||
throw new IllegalArgumentException(errors.toString());
|
||||
}
|
||||
|
||||
data.ruleSet.freeze();
|
||||
}
|
||||
|
||||
|
|
|
@ -21,9 +21,12 @@ import java.util.Dictionary;
|
|||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.2 $ $Date: 1999/12/21 23:58:44 $
|
||||
* @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.3 $ $Date: 1999/12/22 01:05:54 $
|
||||
*
|
||||
* $Log: TransliterationRule.java,v $
|
||||
* Revision 1.3 1999/12/22 01:05:54 Alan
|
||||
* Improve masking checking; turn it off by default, for better performance
|
||||
*
|
||||
* Revision 1.2 1999/12/21 23:58:44 Alan
|
||||
* Detect a>x masking a>y
|
||||
*
|
||||
|
@ -88,7 +91,8 @@ class TransliterationRule {
|
|||
private int cursorPos;
|
||||
|
||||
/**
|
||||
* A string used to implement masks().
|
||||
* A string used to implement masks(). It is the concatenated anteContext,
|
||||
* key, and postContext. See freeze() method.
|
||||
*/
|
||||
private String maskKey;
|
||||
|
||||
|
@ -134,7 +138,7 @@ class TransliterationRule {
|
|||
* This is what the freeze() method does. After freeze() has been
|
||||
* called, the method masks() must NOT be called.
|
||||
*/
|
||||
maskKey = key;
|
||||
maskKey = anteContext != null ? (anteContext + key) : key;
|
||||
if (postContext != null) {
|
||||
maskKey += postContext;
|
||||
}
|
||||
|
@ -190,17 +194,21 @@ class TransliterationRule {
|
|||
* <p>This method must not be called after freeze() is called.
|
||||
*/
|
||||
public boolean masks(TransliterationRule r2) {
|
||||
/* There are three cases of masking. In each instance, rule1
|
||||
* masks rule2.
|
||||
/* Rule r1 masks rule r2 if the string formed of the
|
||||
* antecontext, key, and postcontext overlaps in the following
|
||||
* way:
|
||||
*
|
||||
* 1. KEY mask: len(key1) <= len(key2), key2 starts with key1.
|
||||
* 1<2 detects a>b masking ab>c; 1=2 detects a>b masking a>c.
|
||||
*
|
||||
* 2. PREFIX mask: key1 == key2, len(prefix1) < len(prefix2),
|
||||
* prefix2 ends with prefix1, suffix2 starts with suffix1.
|
||||
*
|
||||
* 3. SUFFIX mask: key1 == key2, len(suffix1) < len(suffix2),
|
||||
* prefix2 ends with prefix1, suffix2 starts with suffix1.
|
||||
* r1: aakkkpppp
|
||||
* r2: aaakkkkkpppp
|
||||
* ^
|
||||
*
|
||||
* The strings must be aligned at the first character of the
|
||||
* key. The length of r1 to the left of the alignment point
|
||||
* must be <= the length of r2 to the left; ditto for the
|
||||
* right. The characters of r1 must equal (or be a superset
|
||||
* of) the corresponding characters of r2. The superset
|
||||
* operation should be performed to check for UnicodeSet
|
||||
* masking.
|
||||
*/
|
||||
|
||||
/* LIMITATION of the current mask algorithm: Some rule
|
||||
|
@ -210,13 +218,13 @@ class TransliterationRule {
|
|||
* currently do not have. This can be added later.
|
||||
*/
|
||||
|
||||
// maskKey = key + postContext
|
||||
return ((maskKey.length() <= r2.maskKey.length() &&
|
||||
r2.maskKey.startsWith(maskKey)) ||
|
||||
(r2.anteContext != null && maskKey.equals(r2.maskKey) &&
|
||||
((anteContext == null) ||
|
||||
(anteContext.length() < r2.anteContext.length() &&
|
||||
r2.anteContext.endsWith(anteContext)))));
|
||||
// maskKey = anteContext + key + postContext
|
||||
int left = getAnteContextLength();
|
||||
int left2 = r2.getAnteContextLength();
|
||||
int right = maskKey.length() - left;
|
||||
int right2 = r2.maskKey.length() - left2;
|
||||
return left <= left2 && right <= right2 &&
|
||||
r2.maskKey.substring(left2 - left).startsWith(maskKey);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -232,7 +240,7 @@ class TransliterationRule {
|
|||
* @return string representation of this object
|
||||
*/
|
||||
public String toString() {
|
||||
return getClass().getName() + '['
|
||||
return getClass().getName() + '{'
|
||||
+ escape((anteContext != null ? ("[" + anteContext + ']') : "")
|
||||
+ key
|
||||
+ (postContext != null ? ("[" + postContext + ']') : "")
|
||||
|
@ -240,7 +248,7 @@ class TransliterationRule {
|
|||
+ (cursorPos < output.length()
|
||||
? (output.substring(0, cursorPos) + '|' + output.substring(cursorPos))
|
||||
: output))
|
||||
+ ']';
|
||||
+ '}';
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -15,9 +15,12 @@ import java.util.*;
|
|||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: TransliterationRuleSet.java,v $ $Revision: 1.2 $ $Date: 1999/12/22 00:01:36 $
|
||||
* @version $RCSfile: TransliterationRuleSet.java,v $ $Revision: 1.3 $ $Date: 1999/12/22 01:05:54 $
|
||||
*
|
||||
* $Log: TransliterationRuleSet.java,v $
|
||||
* Revision 1.3 1999/12/22 01:05:54 Alan
|
||||
* Improve masking checking; turn it off by default, for better performance
|
||||
*
|
||||
* Revision 1.2 1999/12/22 00:01:36 Alan
|
||||
* Detect a>x masking a>y
|
||||
*
|
||||
|
@ -74,18 +77,6 @@ class TransliterationRuleSet {
|
|||
* @param rule the rule to add
|
||||
*/
|
||||
public void addRule(TransliterationRule rule) {
|
||||
|
||||
// Build time, no checking : 3562 ms
|
||||
// Build time, with checking: 6234 ms
|
||||
|
||||
for (int i=0; i<rules.size(); ++i) {
|
||||
TransliterationRule r = (TransliterationRule) rules.elementAt(i);
|
||||
if (r.masks(rule)) {
|
||||
throw new IllegalArgumentException("Rule " + r +
|
||||
" masks " + rule);
|
||||
}
|
||||
}
|
||||
|
||||
rules.addElement(rule);
|
||||
int len;
|
||||
if ((len = rule.getAnteContextLength()) > maxContextLength) {
|
||||
|
@ -93,9 +84,17 @@ class TransliterationRuleSet {
|
|||
}
|
||||
}
|
||||
|
||||
public final int size() {
|
||||
return rules.size();
|
||||
}
|
||||
|
||||
public final TransliterationRule elementAt(int i) {
|
||||
return (TransliterationRule) rules.elementAt(i);
|
||||
}
|
||||
|
||||
/**
|
||||
* Free up space. Once this method is called, addRule() must NOT
|
||||
* be called again.
|
||||
* Free up space. Once this method is called, the maskKey is
|
||||
* invalid.
|
||||
*/
|
||||
public void freeze() {
|
||||
for (int i=0; i<rules.size(); ++i) {
|
||||
|
|
Loading…
Add table
Reference in a new issue