From 7a49adef39a9ac65d16d46b3c03cd59b3921b1f0 Mon Sep 17 00:00:00 2001 From: Alan Liu Date: Tue, 25 Apr 2000 01:42:58 +0000 Subject: [PATCH] Allow arbitrary length variable values. Clean up Data API. Update javadocs. X-SVN-Rev: 1242 --- .../ibm/icu/text/RuleBasedTransliterator.java | 218 ++++++++---------- icu4j/src/com/ibm/icu/text/SymbolTable.java | 17 +- .../com/ibm/icu/text/TransliterationRule.java | 15 +- icu4j/src/com/ibm/icu/text/UnicodeSet.java | 186 +++++++++------ .../com/ibm/text/RuleBasedTransliterator.java | 218 ++++++++---------- icu4j/src/com/ibm/text/SymbolTable.java | 17 +- .../src/com/ibm/text/TransliterationRule.java | 15 +- icu4j/src/com/ibm/text/UnicodeSet.java | 186 +++++++++------ 8 files changed, 470 insertions(+), 402 deletions(-) diff --git a/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java b/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java index fd98190750b..19a9649a407 100755 --- a/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java +++ b/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java,v $ - * $Date: 2000/04/22 01:25:10 $ - * $Revision: 1.26 $ + * $Date: 2000/04/25 01:42:58 $ + * $Revision: 1.27 $ * ***************************************************************************************** */ @@ -20,17 +20,17 @@ import com.ibm.util.Utility; /** * RuleBasedTransliterator is a transliterator * that reads a set of rules in order to determine how to perform - * translations. Rules are stored in resource bundles indexed by - * name. Rules are separated by semicolons (';'). To include a - * literal semicolon, prefix it with a backslash ('\'). Whitespace, - * as defined by Character.isWhitespace(), is ignored. - * If the first non-blank character on a line is '#', the entire - * line is ignored as a comment.

+ * translations. Rule sets are stored in resource bundles indexed by + * name. Rules within a rule set are separated by semicolons (';'). + * To include a literal semicolon, prefix it with a backslash ('\'). + * Whitespace, as defined by Character.isWhitespace(), + * is ignored. If the first non-blank character on a line is '#', + * the entire line is ignored as a comment.

* *

Each set of rules consists of two groups, one forward, and one * reverse. This is a convention that is not enforced; rules for one * direction may be omitted, with the result that translations in - * that direction will not modify the source text. Alternatively, + * that direction will not modify the source text. In addition, * bidirectional forward-reverse rules may be specified for * symmetrical transformations.

* @@ -39,69 +39,27 @@ import com.ibm.util.Utility; *

Rule statements take one of the following forms:

* *
- *
$alefmadda=\u0622
+ *
$alefmadda=\u0622;
*
Variable definition. The name on the - * left is assigned the character or expression on the - * right. Names must begin with a letter and consist only of - * letters, digits, and underscores. Case is significant. - * Duplicate names (including duplicates of simple variables - * or category names) cause an exception to be thrown. If - * the right hand side consists of one character, then the - * variable stands for that character. In this example, + * left is assigned the text on the right. In this example, * after this statement, instances of the left hand name, * "$alefmadda", will be replaced by - * the Unicode character U+0622. The right hand side must be - * exactly one character long (current limitation).
- *
 
- *
$softvowel=[eiyEIY]
- *
Category definition. The name on the - * left is assigned to stand for a set of characters. The - * same rules for names of simple variables apply. After - * this statement, the left hand variable will be - * interpreted as indicating a set of characters in - * appropriate contexts. The pattern syntax defining sets of - * characters is defined by {@link UnicodeSet}. Examples of - * valid patterns are: - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - *
[abc]: The set containing the - * characters 'a', 'b', and 'c'.
[^abc]: The set of all characters except - * 'a', 'b', and 'c'.
[A-Z]: The set of all characters from - * 'A' to 'Z' in Unicode order.
[:Lu:]: The set of Unicode uppercase - * letters. See www.unicode.org - * for a complete list of categories and their - * two-letter codes.
[^a-z[:Lu:][:Ll:]]: The set of all characters except - * 'a' through 'z' and uppercase or lowercase - * letters.
- *

Patterns may contain variable references, such as - * "$a=[a-z];$not_a=[^$a]". See - * {@link UnicodeSet} for more documentation and examples.

- *
- *
ai>$alefmadda
+ * the Unicode character U+0622. Variable names must begin + * with a letter and consist only of letters, digits, and + * underscores. Case is significant. Duplicate names cause + * an exception to be thrown, that is, variables cannot be + * redefined. The right hand side may contain well-formed + * text of any length, including no text at all ("$empty=;"). + * The right hand side may contain embedded UnicodeSet + * patterns, for example, "$softvowel=[eiyEIY]". + *
 
+ *
ai>$alefmadda;
*
Forward translation rule. This rule * states that the string on the left will be changed to the * string on the right when performing forward * transliteration.
*
 
- *
ai<$alefmadda
+ *
ai<$alefmadda;
*
Reverse translation rule. This rule * states that the string on the right will be changed to * the string on the left when performing reverse @@ -109,7 +67,7 @@ import com.ibm.util.Utility; *
* *
- *
ai<>$alefmadda
+ *
ai<>$alefmadda;
*
Bidirectional translation rule. This * rule states that the string on the right will be changed * to the string on the left when performing forward @@ -151,9 +109,16 @@ import com.ibm.util.Utility; * y and z

* * - *

In addition to being defined in variables, UnicodeSet - * patterns may be embedded directly into rule strings. Thus, the - * following two rules are equivalent:

+ *

UnicodeSet

+ * + *

UnicodeSet patterns may appear anywhere that + * makes sense. They may appear in variable definitions. + * Contrariwise, UnicodeSet patterns may themselves + * contain variable references, such as "$a=[a-z];$not_a=[^$a]", + * or "$range=a-z;$ll=[$range]".

+ * + *

UnicodeSet patterns may also be embedded directly + * into rule strings. Thus, the following two rules are equivalent:

* *
*

$vowel=[aeiou]; $vowel>'*'; # One way to do this
@@ -162,6 +127,8 @@ import com.ibm.util.Utility; * Another way

*
* + *

See {@link UnicodeSet} for more documentation and examples.

+ * *

Segments

* *

Segments of the input string can be matched and copied to the @@ -169,7 +136,8 @@ import com.ibm.util.Utility; * general, and makes reordering possible. For example:

* *
- *

([a-z]) > $1 $1;           # + *

([a-z]) > $1 $1; + *           # * double lowercase letters
* ([:Lu:]) ([:Ll:]) > $2 $1; # reverse order of Lu-Ll pairs

*
@@ -284,7 +252,7 @@ import com.ibm.util.Utility; *

Copyright (c) IBM Corporation 1999-2000. All rights reserved.

* * @author Alan Liu - * @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.26 $ $Date: 2000/04/22 01:25:10 $ + * @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.27 $ $Date: 2000/04/25 01:42:58 $ */ public class RuleBasedTransliterator extends Transliterator { @@ -455,15 +423,15 @@ public class RuleBasedTransliterator extends Transliterator { public TransliterationRuleSet ruleSet; /** - * Map variable name (String) to variable (Character). A variable - * name may correspond to a single literal character, in which - * case the character is stored in this hash. It may also - * correspond to a UnicodeSet, in which case a character is - * again stored in this hash, but the character is a stand-in: it - * is an index for a secondary lookup in data.setVariables. The stand-in - * also represents the UnicodeSet in the stored rules. + * Map variable name (String) to variable (char[]). A variable name + * corresponds to zero or more characters, stored in a char[] array in + * this hash. One or more of these chars may also correspond to a + * UnicodeSet, in which case the character in the char[] in this hash is + * a stand-in: it is an index for a secondary lookup in + * data.setVariables. The stand-in also represents the UnicodeSet in + * the stored rules. */ - public Hashtable variableNames; + private Hashtable variableNames; /** * Map category variable (Character) to set (UnicodeSet). @@ -474,30 +442,30 @@ public class RuleBasedTransliterator extends Transliterator { * stored in the rule text to represent the set of characters. * setVariables[i] represents character (setVariablesBase + i). */ - public UnicodeSet[] setVariables; + private UnicodeSet[] setVariables; /** * The character that represents setVariables[0]. Characters * setVariablesBase through setVariablesBase + * setVariables.length - 1 represent UnicodeSet objects. 
*/ - public char setVariablesBase; - - /** - * Return the UnicodeSet represented by the given character, or - * null if none. - */ - public UnicodeSet lookup(char c) { - int i = c - setVariablesBase; - return (i >= 0 && i < setVariables.length) - ? setVariables[i] : null; - } + private char setVariablesBase; /** * The character that represents segment 1. Characters segmentBase * through segmentBase + 8 represent segments 1 through 9. */ - public char segmentBase; + private char segmentBase; + + /** + * Return the UnicodeSet represented by the given character, or + * null if none. + */ + public UnicodeSet lookupSet(char c) { + int i = c - setVariablesBase; + return (i >= 0 && i < setVariables.length) + ? setVariables[i] : null; + } /** * Return the zero-based index of the segment represented by the given @@ -531,18 +499,23 @@ public class RuleBasedTransliterator extends Transliterator { private class ParseData implements SymbolTable { /** - * Implement SymbolTable API. Lookup a variable, returning - * either a Character, a UnicodeSet, or null. + * Implement SymbolTable API. */ - public Object lookup(String name) { - Character ch = (Character) data.variableNames.get(name); - if (ch != null) { - int i = ch.charValue() - data.setVariablesBase; - if (i >= 0 && i < setVariablesVector.size()) { - return setVariablesVector.elementAt(i); - } + public char[] lookup(String name) { + return (char[]) data.variableNames.get(name); + } + + /** + * Implement SymbolTable API. + */ + public UnicodeSet lookupSet(char ch) { + // Note that we cannot use data.lookupSet() because the + // set array has not been constructed yet. + int i = ch - data.setVariablesBase; + if (i >= 0 && i < setVariablesVector.size()) { + return (UnicodeSet) setVariablesVector.elementAt(i); } - return ch; + return null; } /** @@ -869,10 +842,13 @@ public class RuleBasedTransliterator extends Transliterator { String name = parser.parseData. 
parseReference(rule, pp, limit); pos = pp.getIndex(); - // If this is a variable definition statement, then the LHS - // variable will be undefined. In that case getVariableName() - // will return the special placeholder variableLimit-1. - buf.append(parser.getVariableDef(name)); + // If this is a variable definition statement, + // then the LHS variable will be undefined. In + // that case appendVariableDef() will append the + // special placeholder char variableLimit-1. + + //buf.append(parser.getVariableDef(name)); + parser.appendVariableDef(name, buf); } } break; @@ -1035,11 +1011,12 @@ public class RuleBasedTransliterator extends Transliterator { if (left.text.length() != 1 || left.text.charAt(0) != variableLimit) { syntaxError("Malformed LHS", rule, start); } - if (right.text.length() != 1) { - syntaxError("Malformed RHS", rule, start); - } - data.variableNames.put(undefinedVariableName, - new Character(right.text.charAt(0))); + // We allow anything on the right, including an empty string. + int n = right.text.length(); + char[] value = new char[n]; + right.text.getChars(0, n, value, 0); + data.variableNames.put(undefinedVariableName, value); + ++variableLimit; return pos; } @@ -1157,12 +1134,12 @@ public class RuleBasedTransliterator extends Transliterator { } /** - * Returns the single character value of the given variable name. Defined - * names are recognized. + * Append the value of the given variable name to the given + * StringBuffer. * @exception IllegalArgumentException if the name is unknown. */ - private char getVariableDef(String name) { - Character ch = (Character) data.variableNames.get(name); + private void appendVariableDef(String name, StringBuffer buf) { + char[] ch = (char[]) data.variableNames.get(name); if (ch == null) { // We allow one undefined variable so that variable definition // statements work. 
For the first undefined variable we return @@ -1173,12 +1150,14 @@ public class RuleBasedTransliterator extends Transliterator { if (variableNext >= variableLimit) { throw new RuntimeException("Private use variables exhausted"); } - return --variableLimit; + buf.append((char) --variableLimit); + } else { + throw new IllegalArgumentException("Undefined variable $" + + name); } - throw new IllegalArgumentException("Undefined variable $" - + name); + } else { + buf.append(ch); } - return ch.charValue(); } /** @@ -1346,6 +1325,9 @@ public class RuleBasedTransliterator extends Transliterator { /** * $Log: RuleBasedTransliterator.java,v $ + * Revision 1.27 2000/04/25 01:42:58 alan + * Allow arbitrary length variable values. Clean up Data API. Update javadocs. + * * Revision 1.26 2000/04/22 01:25:10 alan * Add support for cursor positioner '@'; update javadoc * diff --git a/icu4j/src/com/ibm/icu/text/SymbolTable.java b/icu4j/src/com/ibm/icu/text/SymbolTable.java index c3f9a36f410..cf75c2334b9 100755 --- a/icu4j/src/com/ibm/icu/text/SymbolTable.java +++ b/icu4j/src/com/ibm/icu/text/SymbolTable.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/SymbolTable.java,v $ - * $Date: 2000/04/21 22:16:29 $ - * $Revision: 1.3 $ + * $Date: 2000/04/25 01:42:58 $ + * $Revision: 1.4 $ * ***************************************************************************************** */ @@ -32,10 +32,17 @@ public interface SymbolTable { final char SYMBOL_REF = '$'; /** - * Lookup the object associated with this string and return it. - * Return null if no such name exists. + * Lookup the characters associated with this string and return it. + * Return null if no such name exists. The resultant + * array may have length zero. */ - Object lookup(String s); + char[] lookup(String s); + + /** + * Lookup the UnicodeSet associated with the given character, and + * return it. Return null if not found. 
+ */ + UnicodeSet lookupSet(char ch); /** * Parse a symbol reference name from the given string, starting diff --git a/icu4j/src/com/ibm/icu/text/TransliterationRule.java b/icu4j/src/com/ibm/icu/text/TransliterationRule.java index f476594416f..3815b33dabf 100755 --- a/icu4j/src/com/ibm/icu/text/TransliterationRule.java +++ b/icu4j/src/com/ibm/icu/text/TransliterationRule.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/TransliterationRule.java,v $ - * $Date: 2000/04/22 01:25:10 $ - * $Revision: 1.18 $ + * $Date: 2000/04/25 01:42:58 $ + * $Revision: 1.19 $ * ***************************************************************************************** */ @@ -44,7 +44,7 @@ import com.ibm.util.Utility; *

Copyright © IBM Corporation 1999. All rights reserved. * * @author Alan Liu - * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.18 $ $Date: 2000/04/22 01:25:10 $ + * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.19 $ $Date: 2000/04/25 01:42:58 $ */ class TransliterationRule { /** @@ -240,7 +240,7 @@ class TransliterationRule { return -1; } char c = pattern.charAt(anteContextLength); - return variables.lookup(c) == null ? (c & 0xFF) : -1; + return variables.lookupSet(c) == null ? (c & 0xFF) : -1; } /** @@ -300,7 +300,7 @@ class TransliterationRule { return true; } char c = pattern.charAt(anteContextLength); - UnicodeSet set = variables.lookup(c); + UnicodeSet set = variables.lookupSet(c); return set == null ? (c & 0xFF) == v : set.containsIndexValue(v); } @@ -486,13 +486,16 @@ class TransliterationRule { UnicodeFilter filter) { UnicodeSet set = null; return (filter == null || filter.contains(textChar)) && - (((set = variables.lookup(keyChar)) == null) ? + (((set = variables.lookupSet(keyChar)) == null) ? keyChar == textChar : set.contains(textChar)); } } /** * $Log: TransliterationRule.java,v $ + * Revision 1.19 2000/04/25 01:42:58 alan + * Allow arbitrary length variable values. Clean up Data API. Update javadocs. 
+ * * Revision 1.18 2000/04/22 01:25:10 alan * Add support for cursor positioner '@'; update javadoc * diff --git a/icu4j/src/com/ibm/icu/text/UnicodeSet.java b/icu4j/src/com/ibm/icu/text/UnicodeSet.java index b5316a4d2ae..8e67ca27ba2 100755 --- a/icu4j/src/com/ibm/icu/text/UnicodeSet.java +++ b/icu4j/src/com/ibm/icu/text/UnicodeSet.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/UnicodeSet.java,v $ - * $Date: 2000/04/21 22:16:29 $ - * $Revision: 1.18 $ + * $Date: 2000/04/25 01:42:58 $ + * $Revision: 1.19 $ * ***************************************************************************************** */ @@ -241,7 +241,7 @@ import java.text.*; * *Unsupported by Java (and hence unsupported by UnicodeSet). * * @author Alan Liu - * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.18 $ $Date: 2000/04/21 22:16:29 $ + * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.19 $ $Date: 2000/04/25 01:42:58 $ */ public class UnicodeSet implements UnicodeFilter { /** @@ -774,7 +774,13 @@ public class UnicodeSet implements UnicodeFilter { int start = pos.getIndex(); int i = start; int limit = pattern.length(); - for (; i= limit) { - throw new IllegalArgumentException("Invalid \\u escape"); - } - c = '\u0000'; - for (int j=(++i)+4; i= limit) { throw new IllegalArgumentException("Invalid \\u escape"); } - c = (char) ((c << 4) | digit); + c = '\u0000'; + for (int j=(++i)+4; i " + + pairsBuf.toString()); + } + return pairsBuf; } diff --git a/icu4j/src/com/ibm/text/RuleBasedTransliterator.java b/icu4j/src/com/ibm/text/RuleBasedTransliterator.java index dcbe7b52dcf..7c438304385 100755 --- a/icu4j/src/com/ibm/text/RuleBasedTransliterator.java +++ b/icu4j/src/com/ibm/text/RuleBasedTransliterator.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/RuleBasedTransliterator.java,v 
$ - * $Date: 2000/04/22 01:25:10 $ - * $Revision: 1.26 $ + * $Date: 2000/04/25 01:42:58 $ + * $Revision: 1.27 $ * ***************************************************************************************** */ @@ -20,17 +20,17 @@ import com.ibm.util.Utility; /** * RuleBasedTransliterator is a transliterator * that reads a set of rules in order to determine how to perform - * translations. Rules are stored in resource bundles indexed by - * name. Rules are separated by semicolons (';'). To include a - * literal semicolon, prefix it with a backslash ('\'). Whitespace, - * as defined by Character.isWhitespace(), is ignored. - * If the first non-blank character on a line is '#', the entire - * line is ignored as a comment.

+ * translations. Rule sets are stored in resource bundles indexed by + * name. Rules within a rule set are separated by semicolons (';'). + * To include a literal semicolon, prefix it with a backslash ('\'). + * Whitespace, as defined by Character.isWhitespace(), + * is ignored. If the first non-blank character on a line is '#', + * the entire line is ignored as a comment.

* *

Each set of rules consists of two groups, one forward, and one * reverse. This is a convention that is not enforced; rules for one * direction may be omitted, with the result that translations in - * that direction will not modify the source text. Alternatively, + * that direction will not modify the source text. In addition, * bidirectional forward-reverse rules may be specified for * symmetrical transformations.

* @@ -39,69 +39,27 @@ import com.ibm.util.Utility; *

Rule statements take one of the following forms:

* *
- *
$alefmadda=\u0622
+ *
$alefmadda=\u0622;
*
Variable definition. The name on the - * left is assigned the character or expression on the - * right. Names must begin with a letter and consist only of - * letters, digits, and underscores. Case is significant. - * Duplicate names (including duplicates of simple variables - * or category names) cause an exception to be thrown. If - * the right hand side consists of one character, then the - * variable stands for that character. In this example, + * left is assigned the text on the right. In this example, * after this statement, instances of the left hand name, * "$alefmadda", will be replaced by - * the Unicode character U+0622. The right hand side must be - * exactly one character long (current limitation).
- *
 
- *
$softvowel=[eiyEIY]
- *
Category definition. The name on the - * left is assigned to stand for a set of characters. The - * same rules for names of simple variables apply. After - * this statement, the left hand variable will be - * interpreted as indicating a set of characters in - * appropriate contexts. The pattern syntax defining sets of - * characters is defined by {@link UnicodeSet}. Examples of - * valid patterns are: - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - *
[abc]: The set containing the - * characters 'a', 'b', and 'c'.
[^abc]: The set of all characters except - * 'a', 'b', and 'c'.
[A-Z]: The set of all characters from - * 'A' to 'Z' in Unicode order.
[:Lu:]: The set of Unicode uppercase - * letters. See www.unicode.org - * for a complete list of categories and their - * two-letter codes.
[^a-z[:Lu:][:Ll:]]: The set of all characters except - * 'a' through 'z' and uppercase or lowercase - * letters.
- *

Patterns may contain variable references, such as - * "$a=[a-z];$not_a=[^$a]". See - * {@link UnicodeSet} for more documentation and examples.

- *
- *
ai>$alefmadda
+ * the Unicode character U+0622. Variable names must begin + * with a letter and consist only of letters, digits, and + * underscores. Case is significant. Duplicate names cause + * an exception to be thrown, that is, variables cannot be + * redefined. The right hand side may contain well-formed + * text of any length, including no text at all ("$empty=;"). + * The right hand side may contain embedded UnicodeSet + * patterns, for example, "$softvowel=[eiyEIY]".
+ *
 
+ *
ai>$alefmadda;
*
Forward translation rule. This rule * states that the string on the left will be changed to the * string on the right when performing forward * transliteration.
*
 
- *
ai<$alefmadda
+ *
ai<$alefmadda;
*
Reverse translation rule. This rule * states that the string on the right will be changed to * the string on the left when performing reverse @@ -109,7 +67,7 @@ import com.ibm.util.Utility; *
* *
- *
ai<>$alefmadda
+ *
ai<>$alefmadda;
*
Bidirectional translation rule. This * rule states that the string on the right will be changed * to the string on the left when performing forward @@ -151,9 +109,16 @@ import com.ibm.util.Utility; * y and z

* * - *

In addition to being defined in variables, UnicodeSet - * patterns may be embedded directly into rule strings. Thus, the - * following two rules are equivalent:

+ *

UnicodeSet

+ * + *

UnicodeSet patterns may appear anywhere that + * makes sense. They may appear in variable definitions. + * Contrariwise, UnicodeSet patterns may themselves + * contain variable references, such as "$a=[a-z];$not_a=[^$a]", + * or "$range=a-z;$ll=[$range]".

+ * + *

UnicodeSet patterns may also be embedded directly + * into rule strings. Thus, the following two rules are equivalent:

* *
*

$vowel=[aeiou]; $vowel>'*'; # One way to do this
@@ -162,6 +127,8 @@ import com.ibm.util.Utility; * Another way

*
* + *

See {@link UnicodeSet} for more documentation and examples.

+ * *

Segments

* *

Segments of the input string can be matched and copied to the @@ -169,7 +136,8 @@ import com.ibm.util.Utility; * general, and makes reordering possible. For example:

* *
- *

([a-z]) > $1 $1;           # + *

([a-z]) > $1 $1; + *           # * double lowercase letters
* ([:Lu:]) ([:Ll:]) > $2 $1; # reverse order of Lu-Ll pairs

*
@@ -284,7 +252,7 @@ import com.ibm.util.Utility; *

Copyright (c) IBM Corporation 1999-2000. All rights reserved.

* * @author Alan Liu - * @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.26 $ $Date: 2000/04/22 01:25:10 $ + * @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.27 $ $Date: 2000/04/25 01:42:58 $ */ public class RuleBasedTransliterator extends Transliterator { @@ -455,15 +423,15 @@ public class RuleBasedTransliterator extends Transliterator { public TransliterationRuleSet ruleSet; /** - * Map variable name (String) to variable (Character). A variable - * name may correspond to a single literal character, in which - * case the character is stored in this hash. It may also - * correspond to a UnicodeSet, in which case a character is - * again stored in this hash, but the character is a stand-in: it - * is an index for a secondary lookup in data.setVariables. The stand-in - * also represents the UnicodeSet in the stored rules. + * Map variable name (String) to variable (char[]). A variable name + * corresponds to zero or more characters, stored in a char[] array in + * this hash. One or more of these chars may also correspond to a + * UnicodeSet, in which case the character in the char[] in this hash is + * a stand-in: it is an index for a secondary lookup in + * data.setVariables. The stand-in also represents the UnicodeSet in + * the stored rules. */ - public Hashtable variableNames; + private Hashtable variableNames; /** * Map category variable (Character) to set (UnicodeSet). @@ -474,30 +442,30 @@ public class RuleBasedTransliterator extends Transliterator { * stored in the rule text to represent the set of characters. * setVariables[i] represents character (setVariablesBase + i). */ - public UnicodeSet[] setVariables; + private UnicodeSet[] setVariables; /** * The character that represents setVariables[0]. Characters * setVariablesBase through setVariablesBase + * setVariables.length - 1 represent UnicodeSet objects. 
*/ - public char setVariablesBase; - - /** - * Return the UnicodeSet represented by the given character, or - * null if none. - */ - public UnicodeSet lookup(char c) { - int i = c - setVariablesBase; - return (i >= 0 && i < setVariables.length) - ? setVariables[i] : null; - } + private char setVariablesBase; /** * The character that represents segment 1. Characters segmentBase * through segmentBase + 8 represent segments 1 through 9. */ - public char segmentBase; + private char segmentBase; + + /** + * Return the UnicodeSet represented by the given character, or + * null if none. + */ + public UnicodeSet lookupSet(char c) { + int i = c - setVariablesBase; + return (i >= 0 && i < setVariables.length) + ? setVariables[i] : null; + } /** * Return the zero-based index of the segment represented by the given @@ -531,18 +499,23 @@ public class RuleBasedTransliterator extends Transliterator { private class ParseData implements SymbolTable { /** - * Implement SymbolTable API. Lookup a variable, returning - * either a Character, a UnicodeSet, or null. + * Implement SymbolTable API. */ - public Object lookup(String name) { - Character ch = (Character) data.variableNames.get(name); - if (ch != null) { - int i = ch.charValue() - data.setVariablesBase; - if (i >= 0 && i < setVariablesVector.size()) { - return setVariablesVector.elementAt(i); - } + public char[] lookup(String name) { + return (char[]) data.variableNames.get(name); + } + + /** + * Implement SymbolTable API. + */ + public UnicodeSet lookupSet(char ch) { + // Note that we cannot use data.lookupSet() because the + // set array has not been constructed yet. + int i = ch - data.setVariablesBase; + if (i >= 0 && i < setVariablesVector.size()) { + return (UnicodeSet) setVariablesVector.elementAt(i); } - return ch; + return null; } /** @@ -869,10 +842,13 @@ public class RuleBasedTransliterator extends Transliterator { String name = parser.parseData. 
parseReference(rule, pp, limit); pos = pp.getIndex(); - // If this is a variable definition statement, then the LHS - // variable will be undefined. In that case getVariableName() - // will return the special placeholder variableLimit-1. - buf.append(parser.getVariableDef(name)); + // If this is a variable definition statement, + // then the LHS variable will be undefined. In + // that case appendVariableDef() will append the + // special placeholder char variableLimit-1. + + //buf.append(parser.getVariableDef(name)); + parser.appendVariableDef(name, buf); } } break; @@ -1035,11 +1011,12 @@ public class RuleBasedTransliterator extends Transliterator { if (left.text.length() != 1 || left.text.charAt(0) != variableLimit) { syntaxError("Malformed LHS", rule, start); } - if (right.text.length() != 1) { - syntaxError("Malformed RHS", rule, start); - } - data.variableNames.put(undefinedVariableName, - new Character(right.text.charAt(0))); + // We allow anything on the right, including an empty string. + int n = right.text.length(); + char[] value = new char[n]; + right.text.getChars(0, n, value, 0); + data.variableNames.put(undefinedVariableName, value); + ++variableLimit; return pos; } @@ -1157,12 +1134,12 @@ public class RuleBasedTransliterator extends Transliterator { } /** - * Returns the single character value of the given variable name. Defined - * names are recognized. + * Append the value of the given variable name to the given + * StringBuffer. * @exception IllegalArgumentException if the name is unknown. */ - private char getVariableDef(String name) { - Character ch = (Character) data.variableNames.get(name); + private void appendVariableDef(String name, StringBuffer buf) { + char[] ch = (char[]) data.variableNames.get(name); if (ch == null) { // We allow one undefined variable so that variable definition // statements work. 
For the first undefined variable we return @@ -1173,12 +1150,14 @@ public class RuleBasedTransliterator extends Transliterator { if (variableNext >= variableLimit) { throw new RuntimeException("Private use variables exhausted"); } - return --variableLimit; + buf.append((char) --variableLimit); + } else { + throw new IllegalArgumentException("Undefined variable $" + + name); } - throw new IllegalArgumentException("Undefined variable $" - + name); + } else { + buf.append(ch); } - return ch.charValue(); } /** @@ -1346,6 +1325,9 @@ public class RuleBasedTransliterator extends Transliterator { /** * $Log: RuleBasedTransliterator.java,v $ + * Revision 1.27 2000/04/25 01:42:58 alan + * Allow arbitrary length variable values. Clean up Data API. Update javadocs. + * * Revision 1.26 2000/04/22 01:25:10 alan * Add support for cursor positioner '@'; update javadoc * diff --git a/icu4j/src/com/ibm/text/SymbolTable.java b/icu4j/src/com/ibm/text/SymbolTable.java index 714bae4b66d..60487048569 100755 --- a/icu4j/src/com/ibm/text/SymbolTable.java +++ b/icu4j/src/com/ibm/text/SymbolTable.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/SymbolTable.java,v $ - * $Date: 2000/04/21 22:16:29 $ - * $Revision: 1.3 $ + * $Date: 2000/04/25 01:42:58 $ + * $Revision: 1.4 $ * ***************************************************************************************** */ @@ -32,10 +32,17 @@ public interface SymbolTable { final char SYMBOL_REF = '$'; /** - * Lookup the object associated with this string and return it. - * Return null if no such name exists. + * Lookup the characters associated with this string and return it. + * Return null if no such name exists. The resultant + * array may have length zero. */ - Object lookup(String s); + char[] lookup(String s); + + /** + * Lookup the UnicodeSet associated with the given character, and + * return it. Return null if not found. 
+ */ + UnicodeSet lookupSet(char ch); /** * Parse a symbol reference name from the given string, starting diff --git a/icu4j/src/com/ibm/text/TransliterationRule.java b/icu4j/src/com/ibm/text/TransliterationRule.java index 817aa334ccb..b78cfed5e86 100755 --- a/icu4j/src/com/ibm/text/TransliterationRule.java +++ b/icu4j/src/com/ibm/text/TransliterationRule.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/TransliterationRule.java,v $ - * $Date: 2000/04/22 01:25:10 $ - * $Revision: 1.18 $ + * $Date: 2000/04/25 01:42:58 $ + * $Revision: 1.19 $ * ***************************************************************************************** */ @@ -44,7 +44,7 @@ import com.ibm.util.Utility; *

Copyright © IBM Corporation 1999. All rights reserved. * * @author Alan Liu - * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.18 $ $Date: 2000/04/22 01:25:10 $ + * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.19 $ $Date: 2000/04/25 01:42:58 $ */ class TransliterationRule { /** @@ -240,7 +240,7 @@ class TransliterationRule { return -1; } char c = pattern.charAt(anteContextLength); - return variables.lookup(c) == null ? (c & 0xFF) : -1; + return variables.lookupSet(c) == null ? (c & 0xFF) : -1; } /** @@ -300,7 +300,7 @@ class TransliterationRule { return true; } char c = pattern.charAt(anteContextLength); - UnicodeSet set = variables.lookup(c); + UnicodeSet set = variables.lookupSet(c); return set == null ? (c & 0xFF) == v : set.containsIndexValue(v); } @@ -486,13 +486,16 @@ class TransliterationRule { UnicodeFilter filter) { UnicodeSet set = null; return (filter == null || filter.contains(textChar)) && - (((set = variables.lookup(keyChar)) == null) ? + (((set = variables.lookupSet(keyChar)) == null) ? keyChar == textChar : set.contains(textChar)); } } /** * $Log: TransliterationRule.java,v $ + * Revision 1.19 2000/04/25 01:42:58 alan + * Allow arbitrary length variable values. Clean up Data API. Update javadocs. 
+ * * Revision 1.18 2000/04/22 01:25:10 alan * Add support for cursor positioner '@'; update javadoc * diff --git a/icu4j/src/com/ibm/text/UnicodeSet.java b/icu4j/src/com/ibm/text/UnicodeSet.java index 82ed7bbbe92..d0072e29702 100755 --- a/icu4j/src/com/ibm/text/UnicodeSet.java +++ b/icu4j/src/com/ibm/text/UnicodeSet.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/UnicodeSet.java,v $ - * $Date: 2000/04/21 22:16:29 $ - * $Revision: 1.18 $ + * $Date: 2000/04/25 01:42:58 $ + * $Revision: 1.19 $ * ***************************************************************************************** */ @@ -241,7 +241,7 @@ import java.text.*; * *Unsupported by Java (and hence unsupported by UnicodeSet). * * @author Alan Liu - * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.18 $ $Date: 2000/04/21 22:16:29 $ + * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.19 $ $Date: 2000/04/25 01:42:58 $ */ public class UnicodeSet implements UnicodeFilter { /** @@ -774,7 +774,13 @@ public class UnicodeSet implements UnicodeFilter { int start = pos.getIndex(); int i = start; int limit = pattern.length(); - for (; i= limit) { - throw new IllegalArgumentException("Invalid \\u escape"); - } - c = '\u0000'; - for (int j=(++i)+4; i= limit) { throw new IllegalArgumentException("Invalid \\u escape"); } - c = (char) ((c << 4) | digit); + c = '\u0000'; + for (int j=(++i)+4; i " + + pairsBuf.toString()); + } + return pairsBuf; }