mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-13 08:53:20 +00:00
Allow arbitrary length variable values. Clean up Data API. Update javadocs.
X-SVN-Rev: 1242
This commit is contained in:
parent
cd1cfb9094
commit
7a49adef39
8 changed files with 470 additions and 402 deletions
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java,v $
|
||||
* $Date: 2000/04/22 01:25:10 $
|
||||
* $Revision: 1.26 $
|
||||
* $Date: 2000/04/25 01:42:58 $
|
||||
* $Revision: 1.27 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -20,17 +20,17 @@ import com.ibm.util.Utility;
|
|||
/**
|
||||
* <strong>RuleBasedTransliterator</strong> is a transliterator
|
||||
* that reads a set of rules in order to determine how to perform
|
||||
* translations. Rules are stored in resource bundles indexed by
|
||||
* name. Rules are separated by semicolons (';'). To include a
|
||||
* literal semicolon, prefix it with a backslash ('\'). Whitespace,
|
||||
* as defined by <code>Character.isWhitespace()</code>, is ignored.
|
||||
* If the first non-blank character on a line is '#', the entire
|
||||
* line is ignored as a comment. </p>
|
||||
* translations. Rule sets are stored in resource bundles indexed by
|
||||
* name. Rules within a rule set are separated by semicolons (';').
|
||||
* To include a literal semicolon, prefix it with a backslash ('\').
|
||||
* Whitespace, as defined by <code>Character.isWhitespace()</code>,
|
||||
* is ignored. If the first non-blank character on a line is '#',
|
||||
* the entire line is ignored as a comment. </p>
|
||||
*
|
||||
* <p>Each set of rules consists of two groups, one forward, and one
|
||||
* reverse. This is a convention that is not enforced; rules for one
|
||||
* direction may be omitted, with the result that translations in
|
||||
* that direction will not modify the source text. Alternatively,
|
||||
* that direction will not modify the source text. In addition,
|
||||
* bidirectional forward-reverse rules may be specified for
|
||||
* symmetrical transformations.</p>
|
||||
*
|
||||
|
@ -39,69 +39,27 @@ import com.ibm.util.Utility;
|
|||
* <p>Rule statements take one of the following forms: </p>
|
||||
*
|
||||
* <dl>
|
||||
* <dt><code>$alefmadda=\u0622</code></dt>
|
||||
* <dt><code>$alefmadda=\u0622;</code></dt>
|
||||
* <dd><strong>Variable definition.</strong> The name on the
|
||||
* left is assigned the character or expression on the
|
||||
* right. Names must begin with a letter and consist only of
|
||||
* letters, digits, and underscores. Case is significant.
|
||||
* Duplicate names (including duplicates of simple variables
|
||||
* or category names) cause an exception to be thrown. If
|
||||
* the right hand side consists of one character, then the
|
||||
* variable stands for that character. In this example,
|
||||
* left is assigned the text on the right. In this example,
|
||||
* after this statement, instances of the left hand name,
|
||||
* "<code>$alefmadda</code>", will be replaced by
|
||||
* the Unicode character U+0622. The right hand side must be
|
||||
* exactly one character long (current limitation).</dd>
|
||||
* <dt> </dt>
|
||||
* <dt><code>$softvowel=[eiyEIY]</code></dt>
|
||||
* <dd><strong>Category definition.</strong> The name on the
|
||||
* left is assigned to stand for a set of characters. The
|
||||
* same rules for names of simple variables apply. After
|
||||
* this statement, the left hand variable will be
|
||||
* interpreted as indicating a set of characters in
|
||||
* appropriate contexts. The pattern syntax defining sets of
|
||||
* characters is defined by {@link UnicodeSet}. Examples of
|
||||
* valid patterns are:<table border="0">
|
||||
* <tr>
|
||||
* <td valign="top" nowrap><code>[abc]</code></td>
|
||||
* <td valign="top">The set containing the
|
||||
* characters 'a', 'b', and 'c'.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td valign="top" nowrap><code>[^abc]</code></td>
|
||||
* <td valign="top">The set of all characters <em>except</em>
|
||||
* 'a', 'b', and 'c'.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td valign="top" nowrap><code>[A-Z]</code></td>
|
||||
* <td valign="top">The set of all characters from
|
||||
* 'A' to 'Z' in Unicode order.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td valign="top" nowrap><code>[:Lu:]</code></td>
|
||||
* <td valign="top">The set of Unicode uppercase
|
||||
* letters. See <a href="http://www.unicode.org">www.unicode.org</a>
|
||||
* for a complete list of categories and their
|
||||
* two-letter codes.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td valign="top" nowrap><code>[^a-z[:Lu:][:Ll:]]</code></td>
|
||||
* <td valign="top">The set of all characters <em>except</em>
|
||||
* 'a' through 'z' and uppercase or lowercase
|
||||
* letters.</td>
|
||||
* </tr>
|
||||
* </table>
|
||||
* <p>Patterns may contain variable references, such as
|
||||
* "<code>$a=[a-z];$not_a=[^$a]</code>". See
|
||||
* {@link UnicodeSet} for more documentation and examples. </p>
|
||||
* </dd>
|
||||
* <dt><code>ai>$alefmadda</code></dt>
|
||||
* the Unicode character U+0622. Variable names must begin
|
||||
* with a letter and consist only of letters, digits, and
|
||||
* underscores. Case is significant. Duplicate names cause
|
||||
* an exception to be thrown, that is, variables cannot be
|
||||
* redefined. The right hand side may contain well-formed
|
||||
* text of any length, including no text at all ("<code>$empty=;</code>").
|
||||
* The right hand side may contain embedded <code>UnicodeSet</code>
|
||||
* patterns, for example, "<code>$softvowel=[eiyEIY]</code>".</dd>
|
||||
* <dd> </dd>
|
||||
* <dt><code>ai>$alefmadda;</code></dt>
|
||||
* <dd><strong>Forward translation rule.</strong> This rule
|
||||
* states that the string on the left will be changed to the
|
||||
* string on the right when performing forward
|
||||
* transliteration.</dd>
|
||||
* <dt> </dt>
|
||||
* <dt><code>ai<$alefmadda</code></dt>
|
||||
* <dt><code>ai<$alefmadda;</code></dt>
|
||||
* <dd><strong>Reverse translation rule.</strong> This rule
|
||||
* states that the string on the right will be changed to
|
||||
* the string on the left when performing reverse
|
||||
|
@ -109,7 +67,7 @@ import com.ibm.util.Utility;
|
|||
* </dl>
|
||||
*
|
||||
* <dl>
|
||||
* <dt><code>ai<>$alefmadda</code></dt>
|
||||
* <dt><code>ai<>$alefmadda;</code></dt>
|
||||
* <dd><strong>Bidirectional translation rule.</strong> This
|
||||
* rule states that the string on the right will be changed
|
||||
* to the string on the left when performing forward
|
||||
|
@ -151,9 +109,16 @@ import com.ibm.util.Utility;
|
|||
* y and z</code></p>
|
||||
* </blockquote>
|
||||
*
|
||||
* <p>In addition to being defined in variables, <code>UnicodeSet</code>
|
||||
* patterns may be embedded directly into rule strings. Thus, the
|
||||
* following two rules are equivalent:</p>
|
||||
* <p><b>UnicodeSet</b></p>
|
||||
*
|
||||
* <p><code>UnicodeSet</code> patterns may appear anywhere that
|
||||
* makes sense. They may appear in variable definitions.
|
||||
* Contrariwise, <code>UnicodeSet</code> patterns may themselves
|
||||
* contain variable references, such as "<code>$a=[a-z];$not_a=[^$a]</code>",
|
||||
* or "<code>$range=a-z;$ll=[$range]</code>".</p>
|
||||
*
|
||||
* <p><code>UnicodeSet</code> patterns may also be embedded directly
|
||||
* into rule strings. Thus, the following two rules are equivalent:</p>
|
||||
*
|
||||
* <blockquote>
|
||||
* <p><code>$vowel=[aeiou]; $vowel>'*'; # One way to do this<br>
|
||||
|
@ -162,6 +127,8 @@ import com.ibm.util.Utility;
|
|||
* Another way</code></p>
|
||||
* </blockquote>
|
||||
*
|
||||
* <p>See {@link UnicodeSet} for more documentation and examples.</p>
|
||||
*
|
||||
* <p><b>Segments</b></p>
|
||||
*
|
||||
* <p>Segments of the input string can be matched and copied to the
|
||||
|
@ -169,7 +136,8 @@ import com.ibm.util.Utility;
|
|||
* general, and makes reordering possible. For example:</p>
|
||||
*
|
||||
* <blockquote>
|
||||
* <p><code>([a-z]) > $1 $1; #
|
||||
* <p><code>([a-z]) > $1 $1;
|
||||
* #
|
||||
* double lowercase letters<br>
|
||||
* ([:Lu:]) ([:Ll:]) > $2 $1; # reverse order of Lu-Ll pairs</code></p>
|
||||
* </blockquote>
|
||||
|
@ -284,7 +252,7 @@ import com.ibm.util.Utility;
|
|||
* <p>Copyright (c) IBM Corporation 1999-2000. All rights reserved.</p>
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.26 $ $Date: 2000/04/22 01:25:10 $
|
||||
* @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.27 $ $Date: 2000/04/25 01:42:58 $
|
||||
*/
|
||||
public class RuleBasedTransliterator extends Transliterator {
|
||||
|
||||
|
@ -455,15 +423,15 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||
public TransliterationRuleSet ruleSet;
|
||||
|
||||
/**
|
||||
* Map variable name (String) to variable (Character). A variable
|
||||
* name may correspond to a single literal character, in which
|
||||
* case the character is stored in this hash. It may also
|
||||
* correspond to a UnicodeSet, in which case a character is
|
||||
* again stored in this hash, but the character is a stand-in: it
|
||||
* is an index for a secondary lookup in data.setVariables. The stand-in
|
||||
* also represents the UnicodeSet in the stored rules.
|
||||
* Map variable name (String) to variable (char[]). A variable name
|
||||
* corresponds to zero or more characters, stored in a char[] array in
|
||||
* this hash. One or more of these chars may also correspond to a
|
||||
* UnicodeSet, in which case the character in the char[] in this hash is
|
||||
* a stand-in: it is an index for a secondary lookup in
|
||||
* data.setVariables. The stand-in also represents the UnicodeSet in
|
||||
* the stored rules.
|
||||
*/
|
||||
public Hashtable variableNames;
|
||||
private Hashtable variableNames;
|
||||
|
||||
/**
|
||||
* Map category variable (Character) to set (UnicodeSet).
|
||||
|
@ -474,30 +442,30 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||
* stored in the rule text to represent the set of characters.
|
||||
* setVariables[i] represents character (setVariablesBase + i).
|
||||
*/
|
||||
public UnicodeSet[] setVariables;
|
||||
private UnicodeSet[] setVariables;
|
||||
|
||||
/**
|
||||
* The character that represents setVariables[0]. Characters
|
||||
* setVariablesBase through setVariablesBase +
|
||||
* setVariables.length - 1 represent UnicodeSet objects.
|
||||
*/
|
||||
public char setVariablesBase;
|
||||
|
||||
/**
|
||||
* Return the UnicodeSet represented by the given character, or
|
||||
* null if none.
|
||||
*/
|
||||
public UnicodeSet lookup(char c) {
|
||||
int i = c - setVariablesBase;
|
||||
return (i >= 0 && i < setVariables.length)
|
||||
? setVariables[i] : null;
|
||||
}
|
||||
private char setVariablesBase;
|
||||
|
||||
/**
|
||||
* The character that represents segment 1. Characters segmentBase
|
||||
* through segmentBase + 8 represent segments 1 through 9.
|
||||
*/
|
||||
public char segmentBase;
|
||||
private char segmentBase;
|
||||
|
||||
/**
|
||||
* Return the UnicodeSet represented by the given character, or
|
||||
* null if none.
|
||||
*/
|
||||
public UnicodeSet lookupSet(char c) {
|
||||
int i = c - setVariablesBase;
|
||||
return (i >= 0 && i < setVariables.length)
|
||||
? setVariables[i] : null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the zero-based index of the segment represented by the given
|
||||
|
@ -531,18 +499,23 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||
private class ParseData implements SymbolTable {
|
||||
|
||||
/**
|
||||
* Implement SymbolTable API. Lookup a variable, returning
|
||||
* either a Character, a UnicodeSet, or null.
|
||||
* Implement SymbolTable API.
|
||||
*/
|
||||
public Object lookup(String name) {
|
||||
Character ch = (Character) data.variableNames.get(name);
|
||||
if (ch != null) {
|
||||
int i = ch.charValue() - data.setVariablesBase;
|
||||
if (i >= 0 && i < setVariablesVector.size()) {
|
||||
return setVariablesVector.elementAt(i);
|
||||
}
|
||||
public char[] lookup(String name) {
|
||||
return (char[]) data.variableNames.get(name);
|
||||
}
|
||||
|
||||
/**
|
||||
* Implement SymbolTable API.
|
||||
*/
|
||||
public UnicodeSet lookupSet(char ch) {
|
||||
// Note that we cannot use data.lookupSet() because the
|
||||
// set array has not been constructed yet.
|
||||
int i = ch - data.setVariablesBase;
|
||||
if (i >= 0 && i < setVariablesVector.size()) {
|
||||
return (UnicodeSet) setVariablesVector.elementAt(i);
|
||||
}
|
||||
return ch;
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -869,10 +842,13 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||
String name = parser.parseData.
|
||||
parseReference(rule, pp, limit);
|
||||
pos = pp.getIndex();
|
||||
// If this is a variable definition statement, then the LHS
|
||||
// variable will be undefined. In that case getVariableName()
|
||||
// will return the special placeholder variableLimit-1.
|
||||
buf.append(parser.getVariableDef(name));
|
||||
// If this is a variable definition statement,
|
||||
// then the LHS variable will be undefined. In
|
||||
// that case appendVariableDef() will append the
|
||||
// special placeholder char variableLimit-1.
|
||||
|
||||
//buf.append(parser.getVariableDef(name));
|
||||
parser.appendVariableDef(name, buf);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
@ -1035,11 +1011,12 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||
if (left.text.length() != 1 || left.text.charAt(0) != variableLimit) {
|
||||
syntaxError("Malformed LHS", rule, start);
|
||||
}
|
||||
if (right.text.length() != 1) {
|
||||
syntaxError("Malformed RHS", rule, start);
|
||||
}
|
||||
data.variableNames.put(undefinedVariableName,
|
||||
new Character(right.text.charAt(0)));
|
||||
// We allow anything on the right, including an empty string.
|
||||
int n = right.text.length();
|
||||
char[] value = new char[n];
|
||||
right.text.getChars(0, n, value, 0);
|
||||
data.variableNames.put(undefinedVariableName, value);
|
||||
|
||||
++variableLimit;
|
||||
return pos;
|
||||
}
|
||||
|
@ -1157,12 +1134,12 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||
}
|
||||
|
||||
/**
|
||||
* Returns the single character value of the given variable name. Defined
|
||||
* names are recognized.
|
||||
* Append the value of the given variable name to the given
|
||||
* StringBuffer.
|
||||
* @exception IllegalArgumentException if the name is unknown.
|
||||
*/
|
||||
private char getVariableDef(String name) {
|
||||
Character ch = (Character) data.variableNames.get(name);
|
||||
private void appendVariableDef(String name, StringBuffer buf) {
|
||||
char[] ch = (char[]) data.variableNames.get(name);
|
||||
if (ch == null) {
|
||||
// We allow one undefined variable so that variable definition
|
||||
// statements work. For the first undefined variable we return
|
||||
|
@ -1173,12 +1150,14 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||
if (variableNext >= variableLimit) {
|
||||
throw new RuntimeException("Private use variables exhausted");
|
||||
}
|
||||
return --variableLimit;
|
||||
buf.append((char) --variableLimit);
|
||||
} else {
|
||||
throw new IllegalArgumentException("Undefined variable $"
|
||||
+ name);
|
||||
}
|
||||
throw new IllegalArgumentException("Undefined variable $"
|
||||
+ name);
|
||||
} else {
|
||||
buf.append(ch);
|
||||
}
|
||||
return ch.charValue();
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -1346,6 +1325,9 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||
|
||||
/**
|
||||
* $Log: RuleBasedTransliterator.java,v $
|
||||
* Revision 1.27 2000/04/25 01:42:58 alan
|
||||
* Allow arbitrary length variable values. Clean up Data API. Update javadocs.
|
||||
*
|
||||
* Revision 1.26 2000/04/22 01:25:10 alan
|
||||
* Add support for cursor positioner '@'; update javadoc
|
||||
*
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/SymbolTable.java,v $
|
||||
* $Date: 2000/04/21 22:16:29 $
|
||||
* $Revision: 1.3 $
|
||||
* $Date: 2000/04/25 01:42:58 $
|
||||
* $Revision: 1.4 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -32,10 +32,17 @@ public interface SymbolTable {
|
|||
final char SYMBOL_REF = '$';
|
||||
|
||||
/**
|
||||
* Lookup the object associated with this string and return it.
|
||||
* Return <tt>null</tt> if no such name exists.
|
||||
* Look up the characters associated with this string and return them.
|
||||
* Return <tt>null</tt> if no such name exists. The resultant
|
||||
* array may have length zero.
|
||||
*/
|
||||
Object lookup(String s);
|
||||
char[] lookup(String s);
|
||||
|
||||
/**
|
||||
* Lookup the UnicodeSet associated with the given character, and
|
||||
* return it. Return <tt>null</tt> if not found.
|
||||
*/
|
||||
UnicodeSet lookupSet(char ch);
|
||||
|
||||
/**
|
||||
* Parse a symbol reference name from the given string, starting
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/TransliterationRule.java,v $
|
||||
* $Date: 2000/04/22 01:25:10 $
|
||||
* $Revision: 1.18 $
|
||||
* $Date: 2000/04/25 01:42:58 $
|
||||
* $Revision: 1.19 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -44,7 +44,7 @@ import com.ibm.util.Utility;
|
|||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.18 $ $Date: 2000/04/22 01:25:10 $
|
||||
* @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.19 $ $Date: 2000/04/25 01:42:58 $
|
||||
*/
|
||||
class TransliterationRule {
|
||||
/**
|
||||
|
@ -240,7 +240,7 @@ class TransliterationRule {
|
|||
return -1;
|
||||
}
|
||||
char c = pattern.charAt(anteContextLength);
|
||||
return variables.lookup(c) == null ? (c & 0xFF) : -1;
|
||||
return variables.lookupSet(c) == null ? (c & 0xFF) : -1;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -300,7 +300,7 @@ class TransliterationRule {
|
|||
return true;
|
||||
}
|
||||
char c = pattern.charAt(anteContextLength);
|
||||
UnicodeSet set = variables.lookup(c);
|
||||
UnicodeSet set = variables.lookupSet(c);
|
||||
return set == null ? (c & 0xFF) == v : set.containsIndexValue(v);
|
||||
}
|
||||
|
||||
|
@ -486,13 +486,16 @@ class TransliterationRule {
|
|||
UnicodeFilter filter) {
|
||||
UnicodeSet set = null;
|
||||
return (filter == null || filter.contains(textChar)) &&
|
||||
(((set = variables.lookup(keyChar)) == null) ?
|
||||
(((set = variables.lookupSet(keyChar)) == null) ?
|
||||
keyChar == textChar : set.contains(textChar));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* $Log: TransliterationRule.java,v $
|
||||
* Revision 1.19 2000/04/25 01:42:58 alan
|
||||
* Allow arbitrary length variable values. Clean up Data API. Update javadocs.
|
||||
*
|
||||
* Revision 1.18 2000/04/22 01:25:10 alan
|
||||
* Add support for cursor positioner '@'; update javadoc
|
||||
*
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/UnicodeSet.java,v $
|
||||
* $Date: 2000/04/21 22:16:29 $
|
||||
* $Revision: 1.18 $
|
||||
* $Date: 2000/04/25 01:42:58 $
|
||||
* $Revision: 1.19 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -241,7 +241,7 @@ import java.text.*;
|
|||
* *Unsupported by Java (and hence unsupported by UnicodeSet).
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.18 $ $Date: 2000/04/21 22:16:29 $
|
||||
* @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.19 $ $Date: 2000/04/25 01:42:58 $
|
||||
*/
|
||||
public class UnicodeSet implements UnicodeFilter {
|
||||
/**
|
||||
|
@ -774,7 +774,13 @@ public class UnicodeSet implements UnicodeFilter {
|
|||
int start = pos.getIndex();
|
||||
int i = start;
|
||||
int limit = pattern.length();
|
||||
for (; i<limit; ++i) {
|
||||
/* In the case of an embedded SymbolTable variable, we look it up and
|
||||
* then take characters from the resultant char[] array. These chars
|
||||
* are subjected to an extra level of lookup in the SymbolTable in case
|
||||
* they are stand-ins for a nested UnicodeSet. */
|
||||
char[] varValueBuffer = null;
|
||||
int ivarValueBuffer = 0;
|
||||
for (; i<limit; i+=((varValueBuffer==null)?1:0)) {
|
||||
/* If the next element is a single character, c will be set to it,
|
||||
* and nestedPairs will be null. In this case isLiteral indicates
|
||||
* whether the character should assume special meaning if it has
|
||||
|
@ -783,9 +789,23 @@ public class UnicodeSet implements UnicodeFilter {
|
|||
* nestedPairs will be set to the pairs list for the nested set, and
|
||||
* c's value should be ignored.
|
||||
*/
|
||||
char c = pattern.charAt(i);
|
||||
String nestedPairs = null;
|
||||
boolean isLiteral = false;
|
||||
char c;
|
||||
if (varValueBuffer != null) {
|
||||
if (ivarValueBuffer < varValueBuffer.length) {
|
||||
c = varValueBuffer[ivarValueBuffer++];
|
||||
UnicodeSet set = symbols.lookupSet(c);
|
||||
if (set != null) {
|
||||
nestedPairs = set.pairs.toString();
|
||||
}
|
||||
} else {
|
||||
varValueBuffer = null;
|
||||
c = pattern.charAt(i);
|
||||
}
|
||||
} else {
|
||||
c = pattern.charAt(i);
|
||||
}
|
||||
|
||||
// Ignore whitespace. This is not Unicode whitespace, but Java
|
||||
// whitespace, a subset of Unicode whitespace.
|
||||
|
@ -829,82 +849,97 @@ public class UnicodeSet implements UnicodeFilter {
|
|||
// will be 2 if we want a closing ']', or 3 if we should parse a
|
||||
// category and close with ":]".
|
||||
|
||||
/* Handle escapes. If a character is escaped, then it assumes its
|
||||
* literal value. This is true for all characters, both special
|
||||
* characters and characters with no special meaning. We also
|
||||
* interpret '\\uxxxx' Unicode escapes here (as literals).
|
||||
*/
|
||||
if (c == '\\') {
|
||||
++i;
|
||||
if (i < limit) {
|
||||
c = pattern.charAt(i);
|
||||
isLiteral = true;
|
||||
if (c == 'u') {
|
||||
if ((i+4) >= limit) {
|
||||
throw new IllegalArgumentException("Invalid \\u escape");
|
||||
}
|
||||
c = '\u0000';
|
||||
for (int j=(++i)+4; i<j; ++i) { // [sic]
|
||||
int digit = Character.digit(pattern.charAt(i), 16);
|
||||
if (digit<0) {
|
||||
// Only process escapes, variable references, and nested sets
|
||||
// if we are _not_ retrieving characters from the variable
|
||||
// buffer. Characters in the variable buffer have already
|
||||
// been through escape and variable reference processing.
|
||||
if (varValueBuffer == null) {
|
||||
/* Handle escapes. If a character is escaped, then it assumes its
|
||||
* literal value. This is true for all characters, both special
|
||||
* characters and characters with no special meaning. We also
|
||||
* interpret '\\uxxxx' Unicode escapes here (as literals).
|
||||
*/
|
||||
if (c == '\\') {
|
||||
++i;
|
||||
if (i < limit) {
|
||||
c = pattern.charAt(i);
|
||||
isLiteral = true;
|
||||
if (c == 'u') {
|
||||
if ((i+4) >= limit) {
|
||||
throw new IllegalArgumentException("Invalid \\u escape");
|
||||
}
|
||||
c = (char) ((c << 4) | digit);
|
||||
c = '\u0000';
|
||||
for (int j=(++i)+4; i<j; ++i) { // [sic]
|
||||
int digit = Character.digit(pattern.charAt(i), 16);
|
||||
if (digit<0) {
|
||||
throw new IllegalArgumentException("Invalid \\u escape");
|
||||
}
|
||||
c = (char) ((c << 4) | digit);
|
||||
}
|
||||
--i; // Move i back to last parsed character
|
||||
}
|
||||
--i; // Move i back to last parsed character
|
||||
} else {
|
||||
throw new IllegalArgumentException("Trailing '\\'");
|
||||
}
|
||||
} else {
|
||||
throw new IllegalArgumentException("Trailing '\\'");
|
||||
}
|
||||
}
|
||||
|
||||
/* Parse variable references. These are treated as literals. If a
|
||||
* variable refers to a UnicodeSet, nestedPairs is assigned here.
|
||||
* Variable names are only parsed if varNameToChar is not null.
|
||||
* Set variables are only looked up if varCharToSet is not null.
|
||||
*/
|
||||
else if (symbols != null && !isLiteral && c == SymbolTable.SYMBOL_REF) {
|
||||
pos.setIndex(++i);
|
||||
String name = symbols.parseReference(pattern, pos, limit);
|
||||
Object obj = symbols.lookup(name);
|
||||
if (obj == null) {
|
||||
throw new IllegalArgumentException("Undefined variable: "
|
||||
+ name);
|
||||
/* Parse variable references. These are treated as literals. If a
|
||||
* variable refers to a UnicodeSet, nestedPairs is assigned here.
|
||||
* Variable names are only parsed if symbols is not null.
|
||||
* Set variables are only looked up if symbols is not null.
|
||||
*/
|
||||
else if (symbols != null && !isLiteral && c == SymbolTable.SYMBOL_REF) {
|
||||
pos.setIndex(++i);
|
||||
String name = symbols.parseReference(pattern, pos, limit);
|
||||
/*
|
||||
Object obj = symbols.lookup(name);
|
||||
if (obj == null) {
|
||||
throw new IllegalArgumentException("Undefined variable: "
|
||||
+ name);
|
||||
}
|
||||
isLiteral = true;
|
||||
if (obj instanceof Character) {
|
||||
c = ((Character) obj).charValue();
|
||||
} else {
|
||||
nestedPairs = ((UnicodeSet) obj).pairs.toString();
|
||||
}
|
||||
*/
|
||||
varValueBuffer = symbols.lookup(name);
|
||||
if (varValueBuffer == null) {
|
||||
throw new IllegalArgumentException("Undefined variable: "
|
||||
+ name);
|
||||
}
|
||||
ivarValueBuffer = 0;
|
||||
i = pos.getIndex()-1; // Make i point at last char of var name
|
||||
continue; // Back to the top to get varValueBuffer[0]
|
||||
}
|
||||
isLiteral = true;
|
||||
if (obj instanceof Character) {
|
||||
c = ((Character) obj).charValue();
|
||||
} else {
|
||||
nestedPairs = ((UnicodeSet) obj).pairs.toString();
|
||||
}
|
||||
i = pos.getIndex()-1; // Make i point at last char of var name
|
||||
}
|
||||
|
||||
/* An opening bracket indicates the first bracket of a nested
|
||||
* subpattern, either a normal pattern or a category pattern. We
|
||||
* recognize these here and set nestedPairs accordingly.
|
||||
*/
|
||||
else if (!isLiteral && c == '[') {
|
||||
// Handle "[:...:]", representing a character category
|
||||
char d = charAfter(pattern, i);
|
||||
if (d == ':') {
|
||||
i += 2;
|
||||
int j = pattern.indexOf(":]", i);
|
||||
if (j < 0) {
|
||||
throw new IllegalArgumentException("Missing \":]\"");
|
||||
/* An opening bracket indicates the first bracket of a nested
|
||||
* subpattern, either a normal pattern or a category pattern. We
|
||||
* recognize these here and set nestedPairs accordingly.
|
||||
*/
|
||||
else if (!isLiteral && c == '[') {
|
||||
// Handle "[:...:]", representing a character category
|
||||
char d = charAfter(pattern, i);
|
||||
if (d == ':') {
|
||||
i += 2;
|
||||
int j = pattern.indexOf(":]", i);
|
||||
if (j < 0) {
|
||||
throw new IllegalArgumentException("Missing \":]\"");
|
||||
}
|
||||
nestedPairs = getCategoryPairs(pattern.substring(i, j));
|
||||
i = j+1; // Make i point to ']' in ":]"
|
||||
if (mode == 3) {
|
||||
// Entire pattern is a category; leave parse loop
|
||||
pairsBuf.append(nestedPairs);
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// Recurse to get the pairs for this nested set.
|
||||
pos.setIndex(i); // Add 2 to point AFTER op
|
||||
nestedPairs = parse(pattern, pos, symbols, ignoreWhitespace).toString();
|
||||
i = pos.getIndex() - 1; // - 1 to point at ']'
|
||||
}
|
||||
nestedPairs = getCategoryPairs(pattern.substring(i, j));
|
||||
i = j+1; // Make i point to ']' in ":]"
|
||||
if (mode == 3) {
|
||||
// Entire pattern is a category; leave parse loop
|
||||
pairsBuf.append(nestedPairs);
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// Recurse to get the pairs for this nested set.
|
||||
pos.setIndex(i); // Add 2 to point AFTER op
|
||||
nestedPairs = parse(pattern, pos, symbols, ignoreWhitespace).toString();
|
||||
i = pos.getIndex() - 1; // - 1 to point at ']'
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -994,6 +1029,13 @@ public class UnicodeSet implements UnicodeFilter {
|
|||
}
|
||||
pos.setIndex(i+1);
|
||||
|
||||
if (false) {
|
||||
// Debug parser
|
||||
System.out.println("UnicodeSet(" +
|
||||
pattern.substring(start, i+1) + ") -> " +
|
||||
pairsBuf.toString());
|
||||
}
|
||||
|
||||
return pairsBuf;
|
||||
}
|
||||
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/RuleBasedTransliterator.java,v $
|
||||
* $Date: 2000/04/22 01:25:10 $
|
||||
* $Revision: 1.26 $
|
||||
* $Date: 2000/04/25 01:42:58 $
|
||||
* $Revision: 1.27 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -20,17 +20,17 @@ import com.ibm.util.Utility;
|
|||
/**
|
||||
* <strong>RuleBasedTransliterator</strong> is a transliterator
|
||||
* that reads a set of rules in order to determine how to perform
|
||||
* translations. Rules are stored in resource bundles indexed by
|
||||
* name. Rules are separated by semicolons (';'). To include a
|
||||
* literal semicolon, prefix it with a backslash ('\'). Whitespace,
|
||||
* as defined by <code>Character.isWhitespace()</code>, is ignored.
|
||||
* If the first non-blank character on a line is '#', the entire
|
||||
* line is ignored as a comment. </p>
|
||||
* translations. Rule sets are stored in resource bundles indexed by
|
||||
* name. Rules within a rule set are separated by semicolons (';').
|
||||
* To include a literal semicolon, prefix it with a backslash ('\').
|
||||
* Whitespace, as defined by <code>Character.isWhitespace()</code>,
|
||||
* is ignored. If the first non-blank character on a line is '#',
|
||||
* the entire line is ignored as a comment. </p>
|
||||
*
|
||||
* <p>Each set of rules consists of two groups, one forward, and one
|
||||
* reverse. This is a convention that is not enforced; rules for one
|
||||
* direction may be omitted, with the result that translations in
|
||||
* that direction will not modify the source text. Alternatively,
|
||||
* that direction will not modify the source text. In addition,
|
||||
* bidirectional forward-reverse rules may be specified for
|
||||
* symmetrical transformations.</p>
|
||||
*
|
||||
|
@ -39,69 +39,27 @@ import com.ibm.util.Utility;
|
|||
* <p>Rule statements take one of the following forms: </p>
|
||||
*
|
||||
* <dl>
|
||||
* <dt><code>$alefmadda=\u0622</code></dt>
|
||||
* <dt><code>$alefmadda=\u0622;</code></dt>
|
||||
* <dd><strong>Variable definition.</strong> The name on the
|
||||
* left is assigned the character or expression on the
|
||||
* right. Names must begin with a letter and consist only of
|
||||
* letters, digits, and underscores. Case is significant.
|
||||
* Duplicate names (including duplicates of simple variables
|
||||
* or category names) cause an exception to be thrown. If
|
||||
* the right hand side consists of one character, then the
|
||||
* variable stands for that character. In this example,
|
||||
* left is assigned the text on the right. In this example,
|
||||
* after this statement, instances of the left hand name,
|
||||
* "<code>$alefmadda</code>", will be replaced by
|
||||
* the Unicode character U+0622. The right hand side must be
|
||||
* exactly one character long (current limitation).</dd>
|
||||
* <dt> </dt>
|
||||
* <dt><code>$softvowel=[eiyEIY]</code></dt>
|
||||
* <dd><strong>Category definition.</strong> The name on the
|
||||
* left is assigned to stand for a set of characters. The
|
||||
* same rules for names of simple variables apply. After
|
||||
* this statement, the left hand variable will be
|
||||
* interpreted as indicating a set of characters in
|
||||
* appropriate contexts. The pattern syntax defining sets of
|
||||
* characters is defined by {@link UnicodeSet}. Examples of
|
||||
* valid patterns are:<table border="0">
|
||||
* <tr>
|
||||
* <td valign="top" nowrap><code>[abc]</code></td>
|
||||
* <td valign="top">The set containing the
|
||||
* characters 'a', 'b', and 'c'.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td valign="top" nowrap><code>[^abc]</code></td>
|
||||
* <td valign="top">The set of all characters <em>except</em>
|
||||
* 'a', 'b', and 'c'.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td valign="top" nowrap><code>[A-Z]</code></td>
|
||||
* <td valign="top">The set of all characters from
|
||||
* 'A' to 'Z' in Unicode order.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td valign="top" nowrap><code>[:Lu:]</code></td>
|
||||
* <td valign="top">The set of Unicode uppercase
|
||||
* letters. See <a href="http://www.unicode.org">www.unicode.org</a>
|
||||
* for a complete list of categories and their
|
||||
* two-letter codes.</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td valign="top" nowrap><code>[^a-z[:Lu:][:Ll:]]</code></td>
|
||||
* <td valign="top">The set of all characters <em>except</em>
|
||||
* 'a' through 'z' and uppercase or lowercase
|
||||
* letters.</td>
|
||||
* </tr>
|
||||
* </table>
|
||||
* <p>Patterns may contain variable references, such as
|
||||
* "<code>$a=[a-z];$not_a=[^$a]</code>". See
|
||||
* {@link UnicodeSet} for more documentation and examples. </p>
|
||||
* </dd>
|
||||
* <dt><code>ai>$alefmadda</code></dt>
|
||||
* the Unicode character U+0622. Variable names must begin
|
||||
* with a letter and consist only of letters, digits, and
|
||||
* underscores. Case is significant. Duplicate names cause
|
||||
* an exception to be thrown, that is, variables cannot be
|
||||
* redefined. The right hand side may contain well-formed
|
||||
* text of any length, including no text at all ("<code>$empty=;</code>").
|
||||
* The right hand side may contain embedded <code>UnicodeSet</code>
|
||||
* patterns, for example, "<code>$softvowel=[eiyEIY]</code>".</dd>
|
||||
* <dd> </dd>
|
||||
* <dt><code>ai>$alefmadda;</code></dt>
|
||||
* <dd><strong>Forward translation rule.</strong> This rule
|
||||
* states that the string on the left will be changed to the
|
||||
* string on the right when performing forward
|
||||
* transliteration.</dd>
|
||||
* <dt> </dt>
|
||||
* <dt><code>ai<$alefmadda</code></dt>
|
||||
* <dt><code>ai<$alefmadda;</code></dt>
|
||||
* <dd><strong>Reverse translation rule.</strong> This rule
|
||||
* states that the string on the right will be changed to
|
||||
* the string on the left when performing reverse
|
||||
|
@ -109,7 +67,7 @@ import com.ibm.util.Utility;
|
|||
* </dl>
|
||||
*
|
||||
* <dl>
|
||||
* <dt><code>ai<>$alefmadda</code></dt>
|
||||
* <dt><code>ai<>$alefmadda;</code></dt>
|
||||
* <dd><strong>Bidirectional translation rule.</strong> This
|
||||
* rule states that the string on the right will be changed
|
||||
* to the string on the left when performing forward
|
||||
|
@ -151,9 +109,16 @@ import com.ibm.util.Utility;
|
|||
* y and z</code></p>
|
||||
* </blockquote>
|
||||
*
|
||||
* <p>In addition to being defined in variables, <code>UnicodeSet</code>
|
||||
* patterns may be embedded directly into rule strings. Thus, the
|
||||
* following two rules are equivalent:</p>
|
||||
* <p><b>UnicodeSet</b></p>
|
||||
*
|
||||
* <p><code>UnicodeSet</code> patterns may appear anywhere that
|
||||
* makes sense. They may appear in variable definitions.
|
||||
* Contrariwise, <code>UnicodeSet</code> patterns may themselves
|
||||
* contain variable references, such as "<code>$a=[a-z];$not_a=[^$a]</code>",
|
||||
* or "<code>$range=a-z;$ll=[$range]</code>".</p>
|
||||
*
|
||||
* <p><code>UnicodeSet</code> patterns may also be embedded directly
|
||||
* into rule strings. Thus, the following two rules are equivalent:</p>
|
||||
*
|
||||
* <blockquote>
|
||||
* <p><code>$vowel=[aeiou]; $vowel>'*'; # One way to do this<br>
|
||||
|
@ -162,6 +127,8 @@ import com.ibm.util.Utility;
|
|||
* Another way</code></p>
|
||||
* </blockquote>
|
||||
*
|
||||
* <p>See {@link UnicodeSet} for more documentation and examples.</p>
|
||||
*
|
||||
* <p><b>Segments</b></p>
|
||||
*
|
||||
* <p>Segments of the input string can be matched and copied to the
|
||||
|
@ -169,7 +136,8 @@ import com.ibm.util.Utility;
|
|||
* general, and makes reordering possible. For example:</p>
|
||||
*
|
||||
* <blockquote>
|
||||
* <p><code>([a-z]) > $1 $1; #
|
||||
* <p><code>([a-z]) > $1 $1;
|
||||
* #
|
||||
* double lowercase letters<br>
|
||||
* ([:Lu:]) ([:Ll:]) > $2 $1; # reverse order of Lu-Ll pairs</code></p>
|
||||
* </blockquote>
|
||||
|
@ -284,7 +252,7 @@ import com.ibm.util.Utility;
|
|||
* <p>Copyright (c) IBM Corporation 1999-2000. All rights reserved.</p>
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.26 $ $Date: 2000/04/22 01:25:10 $
|
||||
* @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.27 $ $Date: 2000/04/25 01:42:58 $
|
||||
*/
|
||||
public class RuleBasedTransliterator extends Transliterator {
|
||||
|
||||
|
@ -455,15 +423,15 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||
public TransliterationRuleSet ruleSet;
|
||||
|
||||
/**
|
||||
* Map variable name (String) to variable (Character). A variable
|
||||
* name may correspond to a single literal character, in which
|
||||
* case the character is stored in this hash. It may also
|
||||
* correspond to a UnicodeSet, in which case a character is
|
||||
* again stored in this hash, but the character is a stand-in: it
|
||||
* is an index for a secondary lookup in data.setVariables. The stand-in
|
||||
* also represents the UnicodeSet in the stored rules.
|
||||
* Map variable name (String) to variable (char[]). A variable name
|
||||
* corresponds to zero or more characters, stored in a char[] array in
|
||||
* this hash. One or more of these chars may also correspond to a
|
||||
* UnicodeSet, in which case the character in the char[] in this hash is
|
||||
* a stand-in: it is an index for a secondary lookup in
|
||||
* data.setVariables. The stand-in also represents the UnicodeSet in
|
||||
* the stored rules.
|
||||
*/
|
||||
public Hashtable variableNames;
|
||||
private Hashtable variableNames;
|
||||
|
||||
/**
|
||||
* Map category variable (Character) to set (UnicodeSet).
|
||||
|
@ -474,30 +442,30 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||
* stored in the rule text to represent the set of characters.
|
||||
* setVariables[i] represents character (setVariablesBase + i).
|
||||
*/
|
||||
public UnicodeSet[] setVariables;
|
||||
private UnicodeSet[] setVariables;
|
||||
|
||||
/**
|
||||
* The character that represents setVariables[0]. Characters
|
||||
* setVariablesBase through setVariablesBase +
|
||||
* setVariables.length - 1 represent UnicodeSet objects.
|
||||
*/
|
||||
public char setVariablesBase;
|
||||
|
||||
/**
|
||||
* Return the UnicodeSet represented by the given character, or
|
||||
* null if none.
|
||||
*/
|
||||
public UnicodeSet lookup(char c) {
|
||||
int i = c - setVariablesBase;
|
||||
return (i >= 0 && i < setVariables.length)
|
||||
? setVariables[i] : null;
|
||||
}
|
||||
private char setVariablesBase;
|
||||
|
||||
/**
|
||||
* The character that represents segment 1. Characters segmentBase
|
||||
* through segmentBase + 8 represent segments 1 through 9.
|
||||
*/
|
||||
public char segmentBase;
|
||||
private char segmentBase;
|
||||
|
||||
/**
|
||||
* Return the UnicodeSet represented by the given character, or
|
||||
* null if none.
|
||||
*/
|
||||
public UnicodeSet lookupSet(char c) {
|
||||
int i = c - setVariablesBase;
|
||||
return (i >= 0 && i < setVariables.length)
|
||||
? setVariables[i] : null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the zero-based index of the segment represented by the given
|
||||
|
@ -531,18 +499,23 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||
private class ParseData implements SymbolTable {
|
||||
|
||||
/**
|
||||
* Implement SymbolTable API. Lookup a variable, returning
|
||||
* either a Character, a UnicodeSet, or null.
|
||||
* Implement SymbolTable API.
|
||||
*/
|
||||
public Object lookup(String name) {
|
||||
Character ch = (Character) data.variableNames.get(name);
|
||||
if (ch != null) {
|
||||
int i = ch.charValue() - data.setVariablesBase;
|
||||
if (i >= 0 && i < setVariablesVector.size()) {
|
||||
return setVariablesVector.elementAt(i);
|
||||
}
|
||||
public char[] lookup(String name) {
|
||||
return (char[]) data.variableNames.get(name);
|
||||
}
|
||||
|
||||
/**
|
||||
* Implement SymbolTable API.
|
||||
*/
|
||||
public UnicodeSet lookupSet(char ch) {
|
||||
// Note that we cannot use data.lookupSet() because the
|
||||
// set array has not been constructed yet.
|
||||
int i = ch - data.setVariablesBase;
|
||||
if (i >= 0 && i < setVariablesVector.size()) {
|
||||
return (UnicodeSet) setVariablesVector.elementAt(i);
|
||||
}
|
||||
return ch;
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -869,10 +842,13 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||
String name = parser.parseData.
|
||||
parseReference(rule, pp, limit);
|
||||
pos = pp.getIndex();
|
||||
// If this is a variable definition statement, then the LHS
|
||||
// variable will be undefined. In that case getVariableName()
|
||||
// will return the special placeholder variableLimit-1.
|
||||
buf.append(parser.getVariableDef(name));
|
||||
// If this is a variable definition statement,
|
||||
// then the LHS variable will be undefined. In
|
||||
// that case appendVariableDef() will append the
|
||||
// special placeholder char variableLimit-1.
|
||||
|
||||
//buf.append(parser.getVariableDef(name));
|
||||
parser.appendVariableDef(name, buf);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
@ -1035,11 +1011,12 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||
if (left.text.length() != 1 || left.text.charAt(0) != variableLimit) {
|
||||
syntaxError("Malformed LHS", rule, start);
|
||||
}
|
||||
if (right.text.length() != 1) {
|
||||
syntaxError("Malformed RHS", rule, start);
|
||||
}
|
||||
data.variableNames.put(undefinedVariableName,
|
||||
new Character(right.text.charAt(0)));
|
||||
// We allow anything on the right, including an empty string.
|
||||
int n = right.text.length();
|
||||
char[] value = new char[n];
|
||||
right.text.getChars(0, n, value, 0);
|
||||
data.variableNames.put(undefinedVariableName, value);
|
||||
|
||||
++variableLimit;
|
||||
return pos;
|
||||
}
|
||||
|
@ -1157,12 +1134,12 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||
}
|
||||
|
||||
/**
|
||||
* Returns the single character value of the given variable name. Defined
|
||||
* names are recognized.
|
||||
* Append the value of the given variable name to the given
|
||||
* StringBuffer.
|
||||
* @exception IllegalArgumentException if the name is unknown.
|
||||
*/
|
||||
private char getVariableDef(String name) {
|
||||
Character ch = (Character) data.variableNames.get(name);
|
||||
private void appendVariableDef(String name, StringBuffer buf) {
|
||||
char[] ch = (char[]) data.variableNames.get(name);
|
||||
if (ch == null) {
|
||||
// We allow one undefined variable so that variable definition
|
||||
// statements work. For the first undefined variable we return
|
||||
|
@ -1173,12 +1150,14 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||
if (variableNext >= variableLimit) {
|
||||
throw new RuntimeException("Private use variables exhausted");
|
||||
}
|
||||
return --variableLimit;
|
||||
buf.append((char) --variableLimit);
|
||||
} else {
|
||||
throw new IllegalArgumentException("Undefined variable $"
|
||||
+ name);
|
||||
}
|
||||
throw new IllegalArgumentException("Undefined variable $"
|
||||
+ name);
|
||||
} else {
|
||||
buf.append(ch);
|
||||
}
|
||||
return ch.charValue();
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -1346,6 +1325,9 @@ public class RuleBasedTransliterator extends Transliterator {
|
|||
|
||||
/**
|
||||
* $Log: RuleBasedTransliterator.java,v $
|
||||
* Revision 1.27 2000/04/25 01:42:58 alan
|
||||
* Allow arbitrary length variable values. Clean up Data API. Update javadocs.
|
||||
*
|
||||
* Revision 1.26 2000/04/22 01:25:10 alan
|
||||
* Add support for cursor positioner '@'; update javadoc
|
||||
*
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/SymbolTable.java,v $
|
||||
* $Date: 2000/04/21 22:16:29 $
|
||||
* $Revision: 1.3 $
|
||||
* $Date: 2000/04/25 01:42:58 $
|
||||
* $Revision: 1.4 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -32,10 +32,17 @@ public interface SymbolTable {
|
|||
final char SYMBOL_REF = '$';
|
||||
|
||||
/**
|
||||
* Lookup the object associated with this string and return it.
|
||||
* Return <tt>null</tt> if no such name exists.
|
||||
* Look up the characters associated with this string and return them.
|
||||
* Return <tt>null</tt> if no such name exists. The resultant
|
||||
* array may have length zero.
|
||||
*/
|
||||
Object lookup(String s);
|
||||
char[] lookup(String s);
|
||||
|
||||
/**
|
||||
* Look up the UnicodeSet associated with the given character, and
|
||||
* return it. Return <tt>null</tt> if not found.
|
||||
*/
|
||||
UnicodeSet lookupSet(char ch);
|
||||
|
||||
/**
|
||||
* Parse a symbol reference name from the given string, starting
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/TransliterationRule.java,v $
|
||||
* $Date: 2000/04/22 01:25:10 $
|
||||
* $Revision: 1.18 $
|
||||
* $Date: 2000/04/25 01:42:58 $
|
||||
* $Revision: 1.19 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -44,7 +44,7 @@ import com.ibm.util.Utility;
|
|||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.18 $ $Date: 2000/04/22 01:25:10 $
|
||||
* @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.19 $ $Date: 2000/04/25 01:42:58 $
|
||||
*/
|
||||
class TransliterationRule {
|
||||
/**
|
||||
|
@ -240,7 +240,7 @@ class TransliterationRule {
|
|||
return -1;
|
||||
}
|
||||
char c = pattern.charAt(anteContextLength);
|
||||
return variables.lookup(c) == null ? (c & 0xFF) : -1;
|
||||
return variables.lookupSet(c) == null ? (c & 0xFF) : -1;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -300,7 +300,7 @@ class TransliterationRule {
|
|||
return true;
|
||||
}
|
||||
char c = pattern.charAt(anteContextLength);
|
||||
UnicodeSet set = variables.lookup(c);
|
||||
UnicodeSet set = variables.lookupSet(c);
|
||||
return set == null ? (c & 0xFF) == v : set.containsIndexValue(v);
|
||||
}
|
||||
|
||||
|
@ -486,13 +486,16 @@ class TransliterationRule {
|
|||
UnicodeFilter filter) {
|
||||
UnicodeSet set = null;
|
||||
return (filter == null || filter.contains(textChar)) &&
|
||||
(((set = variables.lookup(keyChar)) == null) ?
|
||||
(((set = variables.lookupSet(keyChar)) == null) ?
|
||||
keyChar == textChar : set.contains(textChar));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* $Log: TransliterationRule.java,v $
|
||||
* Revision 1.19 2000/04/25 01:42:58 alan
|
||||
* Allow arbitrary length variable values. Clean up Data API. Update javadocs.
|
||||
*
|
||||
* Revision 1.18 2000/04/22 01:25:10 alan
|
||||
* Add support for cursor positioner '@'; update javadoc
|
||||
*
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/UnicodeSet.java,v $
|
||||
* $Date: 2000/04/21 22:16:29 $
|
||||
* $Revision: 1.18 $
|
||||
* $Date: 2000/04/25 01:42:58 $
|
||||
* $Revision: 1.19 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -241,7 +241,7 @@ import java.text.*;
|
|||
* *Unsupported by Java (and hence unsupported by UnicodeSet).
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.18 $ $Date: 2000/04/21 22:16:29 $
|
||||
* @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.19 $ $Date: 2000/04/25 01:42:58 $
|
||||
*/
|
||||
public class UnicodeSet implements UnicodeFilter {
|
||||
/**
|
||||
|
@ -774,7 +774,13 @@ public class UnicodeSet implements UnicodeFilter {
|
|||
int start = pos.getIndex();
|
||||
int i = start;
|
||||
int limit = pattern.length();
|
||||
for (; i<limit; ++i) {
|
||||
/* In the case of an embedded SymbolTable variable, we look it up and
|
||||
* then take characters from the resultant char[] array. These chars
|
||||
* are subjected to an extra level of lookup in the SymbolTable in case
|
||||
* they are stand-ins for a nested UnicodeSet. */
|
||||
char[] varValueBuffer = null;
|
||||
int ivarValueBuffer = 0;
|
||||
for (; i<limit; i+=((varValueBuffer==null)?1:0)) {
|
||||
/* If the next element is a single character, c will be set to it,
|
||||
* and nestedPairs will be null. In this case isLiteral indicates
|
||||
* whether the character should assume special meaning if it has
|
||||
|
@ -783,9 +789,23 @@ public class UnicodeSet implements UnicodeFilter {
|
|||
* nestedPairs will be set to the pairs list for the nested set, and
|
||||
* c's value should be ignored.
|
||||
*/
|
||||
char c = pattern.charAt(i);
|
||||
String nestedPairs = null;
|
||||
boolean isLiteral = false;
|
||||
char c;
|
||||
if (varValueBuffer != null) {
|
||||
if (ivarValueBuffer < varValueBuffer.length) {
|
||||
c = varValueBuffer[ivarValueBuffer++];
|
||||
UnicodeSet set = symbols.lookupSet(c);
|
||||
if (set != null) {
|
||||
nestedPairs = set.pairs.toString();
|
||||
}
|
||||
} else {
|
||||
varValueBuffer = null;
|
||||
c = pattern.charAt(i);
|
||||
}
|
||||
} else {
|
||||
c = pattern.charAt(i);
|
||||
}
|
||||
|
||||
// Ignore whitespace. This is not Unicode whitespace, but Java
|
||||
// whitespace, a subset of Unicode whitespace.
|
||||
|
@ -829,82 +849,97 @@ public class UnicodeSet implements UnicodeFilter {
|
|||
// will be 2 if we want a closing ']', or 3 if we should parse a
|
||||
// category and close with ":]".
|
||||
|
||||
/* Handle escapes. If a character is escaped, then it assumes its
|
||||
* literal value. This is true for all characters, both special
|
||||
* characters and characters with no special meaning. We also
|
||||
* interpret '\\uxxxx' Unicode escapes here (as literals).
|
||||
*/
|
||||
if (c == '\\') {
|
||||
++i;
|
||||
if (i < limit) {
|
||||
c = pattern.charAt(i);
|
||||
isLiteral = true;
|
||||
if (c == 'u') {
|
||||
if ((i+4) >= limit) {
|
||||
throw new IllegalArgumentException("Invalid \\u escape");
|
||||
}
|
||||
c = '\u0000';
|
||||
for (int j=(++i)+4; i<j; ++i) { // [sic]
|
||||
int digit = Character.digit(pattern.charAt(i), 16);
|
||||
if (digit<0) {
|
||||
// Only process escapes, variable references, and nested sets
|
||||
// if we are _not_ retrieving characters from the variable
|
||||
// buffer. Characters in the variable buffer have already
|
||||
// been through escape and variable reference processing.
|
||||
if (varValueBuffer == null) {
|
||||
/* Handle escapes. If a character is escaped, then it assumes its
|
||||
* literal value. This is true for all characters, both special
|
||||
* characters and characters with no special meaning. We also
|
||||
* interpret '\\uxxxx' Unicode escapes here (as literals).
|
||||
*/
|
||||
if (c == '\\') {
|
||||
++i;
|
||||
if (i < limit) {
|
||||
c = pattern.charAt(i);
|
||||
isLiteral = true;
|
||||
if (c == 'u') {
|
||||
if ((i+4) >= limit) {
|
||||
throw new IllegalArgumentException("Invalid \\u escape");
|
||||
}
|
||||
c = (char) ((c << 4) | digit);
|
||||
c = '\u0000';
|
||||
for (int j=(++i)+4; i<j; ++i) { // [sic]
|
||||
int digit = Character.digit(pattern.charAt(i), 16);
|
||||
if (digit<0) {
|
||||
throw new IllegalArgumentException("Invalid \\u escape");
|
||||
}
|
||||
c = (char) ((c << 4) | digit);
|
||||
}
|
||||
--i; // Move i back to last parsed character
|
||||
}
|
||||
--i; // Move i back to last parsed character
|
||||
} else {
|
||||
throw new IllegalArgumentException("Trailing '\\'");
|
||||
}
|
||||
} else {
|
||||
throw new IllegalArgumentException("Trailing '\\'");
|
||||
}
|
||||
}
|
||||
|
||||
/* Parse variable references. These are treated as literals. If a
|
||||
* variable refers to a UnicodeSet, nestedPairs is assigned here.
|
||||
* Variable names are only parsed if varNameToChar is not null.
|
||||
* Set variables are only looked up if varCharToSet is not null.
|
||||
*/
|
||||
else if (symbols != null && !isLiteral && c == SymbolTable.SYMBOL_REF) {
|
||||
pos.setIndex(++i);
|
||||
String name = symbols.parseReference(pattern, pos, limit);
|
||||
Object obj = symbols.lookup(name);
|
||||
if (obj == null) {
|
||||
throw new IllegalArgumentException("Undefined variable: "
|
||||
+ name);
|
||||
/* Parse variable references. These are treated as literals. If a
|
||||
* variable is defined, its char[] value is stored in varValueBuffer and
|
||||
* read back one char at a time at the top of the loop, where stand-ins
|
||||
* for sets are resolved via symbols.lookupSet(). Only done if symbols != null.
|
||||
*/
|
||||
else if (symbols != null && !isLiteral && c == SymbolTable.SYMBOL_REF) {
|
||||
pos.setIndex(++i);
|
||||
String name = symbols.parseReference(pattern, pos, limit);
|
||||
/*
|
||||
Object obj = symbols.lookup(name);
|
||||
if (obj == null) {
|
||||
throw new IllegalArgumentException("Undefined variable: "
|
||||
+ name);
|
||||
}
|
||||
isLiteral = true;
|
||||
if (obj instanceof Character) {
|
||||
c = ((Character) obj).charValue();
|
||||
} else {
|
||||
nestedPairs = ((UnicodeSet) obj).pairs.toString();
|
||||
}
|
||||
*/
|
||||
varValueBuffer = symbols.lookup(name);
|
||||
if (varValueBuffer == null) {
|
||||
throw new IllegalArgumentException("Undefined variable: "
|
||||
+ name);
|
||||
}
|
||||
ivarValueBuffer = 0;
|
||||
i = pos.getIndex()-1; // Make i point at last char of var name
|
||||
continue; // Back to the top to get varValueBuffer[0]
|
||||
}
|
||||
isLiteral = true;
|
||||
if (obj instanceof Character) {
|
||||
c = ((Character) obj).charValue();
|
||||
} else {
|
||||
nestedPairs = ((UnicodeSet) obj).pairs.toString();
|
||||
}
|
||||
i = pos.getIndex()-1; // Make i point at last char of var name
|
||||
}
|
||||
|
||||
/* An opening bracket indicates the first bracket of a nested
|
||||
* subpattern, either a normal pattern or a category pattern. We
|
||||
* recognize these here and set nestedPairs accordingly.
|
||||
*/
|
||||
else if (!isLiteral && c == '[') {
|
||||
// Handle "[:...:]", representing a character category
|
||||
char d = charAfter(pattern, i);
|
||||
if (d == ':') {
|
||||
i += 2;
|
||||
int j = pattern.indexOf(":]", i);
|
||||
if (j < 0) {
|
||||
throw new IllegalArgumentException("Missing \":]\"");
|
||||
/* An opening bracket indicates the first bracket of a nested
|
||||
* subpattern, either a normal pattern or a category pattern. We
|
||||
* recognize these here and set nestedPairs accordingly.
|
||||
*/
|
||||
else if (!isLiteral && c == '[') {
|
||||
// Handle "[:...:]", representing a character category
|
||||
char d = charAfter(pattern, i);
|
||||
if (d == ':') {
|
||||
i += 2;
|
||||
int j = pattern.indexOf(":]", i);
|
||||
if (j < 0) {
|
||||
throw new IllegalArgumentException("Missing \":]\"");
|
||||
}
|
||||
nestedPairs = getCategoryPairs(pattern.substring(i, j));
|
||||
i = j+1; // Make i point to ']' in ":]"
|
||||
if (mode == 3) {
|
||||
// Entire pattern is a category; leave parse loop
|
||||
pairsBuf.append(nestedPairs);
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// Recurse to get the pairs for this nested set.
|
||||
pos.setIndex(i); // Add 2 to point AFTER op
|
||||
nestedPairs = parse(pattern, pos, symbols, ignoreWhitespace).toString();
|
||||
i = pos.getIndex() - 1; // - 1 to point at ']'
|
||||
}
|
||||
nestedPairs = getCategoryPairs(pattern.substring(i, j));
|
||||
i = j+1; // Make i point to ']' in ":]"
|
||||
if (mode == 3) {
|
||||
// Entire pattern is a category; leave parse loop
|
||||
pairsBuf.append(nestedPairs);
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// Recurse to get the pairs for this nested set.
|
||||
pos.setIndex(i); // Add 2 to point AFTER op
|
||||
nestedPairs = parse(pattern, pos, symbols, ignoreWhitespace).toString();
|
||||
i = pos.getIndex() - 1; // - 1 to point at ']'
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -994,6 +1029,13 @@ public class UnicodeSet implements UnicodeFilter {
|
|||
}
|
||||
pos.setIndex(i+1);
|
||||
|
||||
if (false) {
|
||||
// Debug parser
|
||||
System.out.println("UnicodeSet(" +
|
||||
pattern.substring(start, i+1) + ") -> " +
|
||||
pairsBuf.toString());
|
||||
}
|
||||
|
||||
return pairsBuf;
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue