From aa61987272d2668d43c0a5ec175aa94b6e9d7030 Mon Sep 17 00:00:00 2001 From: Alan Liu Date: Fri, 21 Apr 2000 22:16:29 +0000 Subject: [PATCH] Delegate variable name parsing to SymbolTable interface to consolidate parsing code. X-SVN-Rev: 1212 --- .../ibm/icu/text/RuleBasedTransliterator.java | 63 ++++++++++++++----- icu4j/src/com/ibm/icu/text/SymbolTable.java | 34 +++++++++- icu4j/src/com/ibm/icu/text/UnicodeSet.java | 62 +++--------------- .../com/ibm/text/RuleBasedTransliterator.java | 63 ++++++++++++++----- icu4j/src/com/ibm/text/SymbolTable.java | 34 +++++++++- icu4j/src/com/ibm/text/UnicodeSet.java | 62 +++--------------- 6 files changed, 168 insertions(+), 150 deletions(-) diff --git a/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java b/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java index a884f12f03c..ac4e32a4034 100755 --- a/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java +++ b/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java,v $ - * $Date: 2000/04/21 21:16:40 $ - * $Revision: 1.21 $ + * $Date: 2000/04/21 22:16:29 $ + * $Revision: 1.22 $ * ***************************************************************************************** */ @@ -274,7 +274,7 @@ import com.ibm.util.Utility; *

Copyright (c) IBM Corporation 1999-2000. All rights reserved.

* * @author Alan Liu - * @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.21 $ $Date: 2000/04/21 21:16:40 $ + * @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.22 $ $Date: 2000/04/21 22:16:29 $ */ public class RuleBasedTransliterator extends Transliterator { @@ -534,6 +534,31 @@ public class RuleBasedTransliterator extends Transliterator { } return ch; } + + /** + * Implement SymbolTable API. Parse out a symbol reference + * name. + */ + public String parseReference(String text, ParsePosition pos, int limit) { + int start = pos.getIndex(); + int i = start; + if (i < limit) { + char c = text.charAt(i); + if (Character.isUnicodeIdentifierStart(c)) { + ++i; + while (i < limit && + Character.isUnicodeIdentifierPart(text.charAt(i))) { + ++i; + } + } + } + if (i == start) { // No valid name chars + throw new IllegalArgumentException("Illegal variable reference " + + text.substring(start, limit)); + } + pos.setIndex(i); + return text.substring(start, i); + } } /** @@ -587,7 +612,6 @@ public class RuleBasedTransliterator extends Transliterator { private static final char END_OF_RULE = ';'; private static final char RULE_COMMENT_CHAR = '#'; - private static final char VARIABLE_REF = '$'; // also segment refs private static final char CONTEXT_ANTE = '{'; // ante{key private static final char CONTEXT_POST = '}'; // key}post private static final char SET_OPEN = '['; @@ -711,6 +735,7 @@ public class RuleBasedTransliterator extends Transliterator { RuleBasedTransliterator.Parser parser) { int start = pos; StringBuffer buf = new StringBuffer(); + ParsePosition pp = null; main: while (pos < limit) { @@ -781,7 +806,7 @@ public class RuleBasedTransliterator extends Transliterator { case END_OF_RULE: --pos; // Backup to point to END_OF_RULE break main; - case VARIABLE_REF: + case SymbolTable.SYMBOL_REF: // Handle variable references and segment references "$1" .. 
"$9" { // A variable reference must be followed immediately @@ -792,28 +817,26 @@ public class RuleBasedTransliterator extends Transliterator { syntaxError("Trailing " + c, rule, start); } // Parse "$1" "$2" .. "$9" - c = rule.charAt(pos++); + c = rule.charAt(pos); int r = Character.digit(c, 10); if (r >= 1 && r <= 9) { if (r > maxRef) { maxRef = r; } buf.append((char) (parser.data.segmentBase + r - 1)); - } else if (Character.isUnicodeIdentifierStart(c)) { - int j = pos; - while (j < limit && - Character.isUnicodeIdentifierPart(rule.charAt(j))) { - ++j; + ++pos; + } else { + if (pp == null) { // Lazy create + pp = new ParsePosition(0); } - String name = rule.substring(pos-1, j); - pos = j; + pp.setIndex(pos); + String name = parser.parseData. + parseReference(rule, pp, limit); + pos = pp.getIndex(); // If this is a variable definition statement, then the LHS // variable will be undefined. In that case getVariableName() // will return the special placeholder variableLimit-1. buf.append(parser.getVariableDef(name)); - } else { - syntaxError("Illegal char after " + VARIABLE_REF, - rule, start); } } break; @@ -830,7 +853,10 @@ public class RuleBasedTransliterator extends Transliterator { post = buf.length(); break; case SET_OPEN: - ParsePosition pp = new ParsePosition(pos-1); // Backup to opening '[' + if (pp == null) { + pp = new ParsePosition(0); + } + pp.setIndex(pos-1); // Backup to opening '[' buf.append(parser.registerSet(new UnicodeSet(rule, pp, parser.parseData))); pos = pp.getIndex(); break; @@ -1231,6 +1257,9 @@ public class RuleBasedTransliterator extends Transliterator { /** * $Log: RuleBasedTransliterator.java,v $ + * Revision 1.22 2000/04/21 22:16:29 alan + * Delete variable name parsing to SymbolTable interface to consolidate parsing code. 
+ * * Revision 1.21 2000/04/21 21:16:40 alan * Modify rule syntax * diff --git a/icu4j/src/com/ibm/icu/text/SymbolTable.java b/icu4j/src/com/ibm/icu/text/SymbolTable.java index 60c02b35052..c3f9a36f410 100755 --- a/icu4j/src/com/ibm/icu/text/SymbolTable.java +++ b/icu4j/src/com/ibm/icu/text/SymbolTable.java @@ -5,21 +5,49 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/SymbolTable.java,v $ - * $Date: 2000/03/10 04:07:24 $ - * $Revision: 1.2 $ + * $Date: 2000/04/21 22:16:29 $ + * $Revision: 1.3 $ * ***************************************************************************************** */ package com.ibm.text; +import java.text.ParsePosition; /** - * An interface that maps strings to objects. + * An interface that maps strings to objects. This interface defines + * both lookup protocol and parsing. This allows different components + * to share a symbol table and to handle name parsing uniformly. It + * is expected that client parse code look for the SYMBOL_REF + * character and, when seen, attempt to parse the characters after it + * using parseReference(). + * + *

Currently, RuleBasedTransliterator and UnicodeSet use this + * interface to share variable definitions. */ public interface SymbolTable { + /** + * The character preceding a symbol reference name. + */ + final char SYMBOL_REF = '$'; + /** * Lookup the object associated with this string and return it. * Return null if no such name exists. */ Object lookup(String s); + + /** + * Parse a symbol reference name from the given string, starting + * at the given position. If no valid symbol reference name is + * found, throw an exception. + * @param text the text to parse for the name + * @param pos on entry, the index of the first character to parse. + * This is the character following the SYMBOL_REF character. On + * exit, the index after the last parsed character. + * @param limit the index after the last character to be parsed. + * @return the parsed name. + * @exception IllegalArgumentException if no valid name is found. + */ + String parseReference(String text, ParsePosition pos, int limit); } diff --git a/icu4j/src/com/ibm/icu/text/UnicodeSet.java b/icu4j/src/com/ibm/icu/text/UnicodeSet.java index b1bf8e29945..b5316a4d2ae 100755 --- a/icu4j/src/com/ibm/icu/text/UnicodeSet.java +++ b/icu4j/src/com/ibm/icu/text/UnicodeSet.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/UnicodeSet.java,v $ - * $Date: 2000/04/21 21:16:40 $ - * $Revision: 1.17 $ + * $Date: 2000/04/21 22:16:29 $ + * $Revision: 1.18 $ * ***************************************************************************************** */ @@ -241,7 +241,7 @@ import java.text.*; * *Unsupported by Java (and hence unsupported by UnicodeSet). 
* * @author Alan Liu - * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.17 $ $Date: 2000/04/21 21:16:40 $ + * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.18 $ $Date: 2000/04/21 22:16:29 $ */ public class UnicodeSet implements UnicodeFilter { /** @@ -268,9 +268,6 @@ public class UnicodeSet implements UnicodeFilter { private static final int UNSUPPORTED_CATEGORY = 17; - private static final char VARIABLE_REF_OPEN = '{'; - private static final char VARIABLE_REF_CLOSE = '}'; - private static final int CATEGORY_COUNT = 29; /** @@ -866,13 +863,9 @@ public class UnicodeSet implements UnicodeFilter { * Variable names are only parsed if varNameToChar is not null. * Set variables are only looked up if varCharToSet is not null. */ - else if (symbols != null && !isLiteral && c == VARIABLE_REF_OPEN) { - ++i; - int j = pattern.indexOf(VARIABLE_REF_CLOSE, i); - if (i == j || j < 0) { // empty or unterminated - throw new IllegalArgumentException("Illegal variable reference"); - } - String name = pattern.substring(i, j); + else if (symbols != null && !isLiteral && c == SymbolTable.SYMBOL_REF) { + pos.setIndex(++i); + String name = symbols.parseReference(pattern, pos, limit); Object obj = symbols.lookup(name); if (obj == null) { throw new IllegalArgumentException("Undefined variable: " @@ -884,50 +877,9 @@ public class UnicodeSet implements UnicodeFilter { } else { nestedPairs = ((UnicodeSet) obj).pairs.toString(); } - i = j; // Make i point at closing '}' + i = pos.getIndex()-1; // Make i point at last char of var name } - /* Parse variable references. These are treated as literals. If a - * variable refers to a UnicodeSet, nestedPairs is assigned here. - * Variable names are only parsed if varNameToChar is not null. - * Set variables are only looked up if varCharToSet is not null. 
- */ - // TEMPORARY - // TEMPORARY - // TEMPORARY - else if (symbols != null && !isLiteral && c == '$') { - ++i; - c = pattern.charAt(i); - int j = i; - if (Character.isUnicodeIdentifierStart(c)) { - ++j; - while (j < limit && - Character.isUnicodeIdentifierPart(pattern.charAt(j))) { - ++j; - } - } - if (i == j || j < 0) { // empty or unterminated - throw new IllegalArgumentException("Illegal variable reference " + - pattern.substring(i-1, limit)); - } - String name = pattern.substring(i, j); - Object obj = symbols.lookup(name); - if (obj == null) { - throw new IllegalArgumentException("Undefined variable: " - + name); - } - isLiteral = true; - if (obj instanceof Character) { - c = ((Character) obj).charValue(); - } else { - nestedPairs = ((UnicodeSet) obj).pairs.toString(); - } - i = j-1; // Make i point at last char of var name - } - // TEMPORARY - // TEMPORARY - // TEMPORARY - /* An opening bracket indicates the first bracket of a nested * subpattern, either a normal pattern or a category pattern. We * recognize these here and set nestedPairs accordingly. diff --git a/icu4j/src/com/ibm/text/RuleBasedTransliterator.java b/icu4j/src/com/ibm/text/RuleBasedTransliterator.java index 2ea8b3e7162..d714fc9cad7 100755 --- a/icu4j/src/com/ibm/text/RuleBasedTransliterator.java +++ b/icu4j/src/com/ibm/text/RuleBasedTransliterator.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/RuleBasedTransliterator.java,v $ - * $Date: 2000/04/21 21:16:40 $ - * $Revision: 1.21 $ + * $Date: 2000/04/21 22:16:29 $ + * $Revision: 1.22 $ * ***************************************************************************************** */ @@ -274,7 +274,7 @@ import com.ibm.util.Utility; *

Copyright (c) IBM Corporation 1999-2000. All rights reserved.

* * @author Alan Liu - * @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.21 $ $Date: 2000/04/21 21:16:40 $ + * @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.22 $ $Date: 2000/04/21 22:16:29 $ */ public class RuleBasedTransliterator extends Transliterator { @@ -534,6 +534,31 @@ public class RuleBasedTransliterator extends Transliterator { } return ch; } + + /** + * Implement SymbolTable API. Parse out a symbol reference + * name. + */ + public String parseReference(String text, ParsePosition pos, int limit) { + int start = pos.getIndex(); + int i = start; + if (i < limit) { + char c = text.charAt(i); + if (Character.isUnicodeIdentifierStart(c)) { + ++i; + while (i < limit && + Character.isUnicodeIdentifierPart(text.charAt(i))) { + ++i; + } + } + } + if (i == start) { // No valid name chars + throw new IllegalArgumentException("Illegal variable reference " + + text.substring(start, limit)); + } + pos.setIndex(i); + return text.substring(start, i); + } } /** @@ -587,7 +612,6 @@ public class RuleBasedTransliterator extends Transliterator { private static final char END_OF_RULE = ';'; private static final char RULE_COMMENT_CHAR = '#'; - private static final char VARIABLE_REF = '$'; // also segment refs private static final char CONTEXT_ANTE = '{'; // ante{key private static final char CONTEXT_POST = '}'; // key}post private static final char SET_OPEN = '['; @@ -711,6 +735,7 @@ public class RuleBasedTransliterator extends Transliterator { RuleBasedTransliterator.Parser parser) { int start = pos; StringBuffer buf = new StringBuffer(); + ParsePosition pp = null; main: while (pos < limit) { @@ -781,7 +806,7 @@ public class RuleBasedTransliterator extends Transliterator { case END_OF_RULE: --pos; // Backup to point to END_OF_RULE break main; - case VARIABLE_REF: + case SymbolTable.SYMBOL_REF: // Handle variable references and segment references "$1" .. 
"$9" { // A variable reference must be followed immediately @@ -792,28 +817,26 @@ public class RuleBasedTransliterator extends Transliterator { syntaxError("Trailing " + c, rule, start); } // Parse "$1" "$2" .. "$9" - c = rule.charAt(pos++); + c = rule.charAt(pos); int r = Character.digit(c, 10); if (r >= 1 && r <= 9) { if (r > maxRef) { maxRef = r; } buf.append((char) (parser.data.segmentBase + r - 1)); - } else if (Character.isUnicodeIdentifierStart(c)) { - int j = pos; - while (j < limit && - Character.isUnicodeIdentifierPart(rule.charAt(j))) { - ++j; + ++pos; + } else { + if (pp == null) { // Lazy create + pp = new ParsePosition(0); } - String name = rule.substring(pos-1, j); - pos = j; + pp.setIndex(pos); + String name = parser.parseData. + parseReference(rule, pp, limit); + pos = pp.getIndex(); // If this is a variable definition statement, then the LHS // variable will be undefined. In that case getVariableName() // will return the special placeholder variableLimit-1. buf.append(parser.getVariableDef(name)); - } else { - syntaxError("Illegal char after " + VARIABLE_REF, - rule, start); } } break; @@ -830,7 +853,10 @@ public class RuleBasedTransliterator extends Transliterator { post = buf.length(); break; case SET_OPEN: - ParsePosition pp = new ParsePosition(pos-1); // Backup to opening '[' + if (pp == null) { + pp = new ParsePosition(0); + } + pp.setIndex(pos-1); // Backup to opening '[' buf.append(parser.registerSet(new UnicodeSet(rule, pp, parser.parseData))); pos = pp.getIndex(); break; @@ -1231,6 +1257,9 @@ public class RuleBasedTransliterator extends Transliterator { /** * $Log: RuleBasedTransliterator.java,v $ + * Revision 1.22 2000/04/21 22:16:29 alan + * Delete variable name parsing to SymbolTable interface to consolidate parsing code. 
+ * * Revision 1.21 2000/04/21 21:16:40 alan * Modify rule syntax * diff --git a/icu4j/src/com/ibm/text/SymbolTable.java b/icu4j/src/com/ibm/text/SymbolTable.java index 8ff6f86536e..714bae4b66d 100755 --- a/icu4j/src/com/ibm/text/SymbolTable.java +++ b/icu4j/src/com/ibm/text/SymbolTable.java @@ -5,21 +5,49 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/SymbolTable.java,v $ - * $Date: 2000/03/10 04:07:24 $ - * $Revision: 1.2 $ + * $Date: 2000/04/21 22:16:29 $ + * $Revision: 1.3 $ * ***************************************************************************************** */ package com.ibm.text; +import java.text.ParsePosition; /** - * An interface that maps strings to objects. + * An interface that maps strings to objects. This interface defines + * both lookup protocol and parsing. This allows different components + * to share a symbol table and to handle name parsing uniformly. It + * is expected that client parse code look for the SYMBOL_REF + * character and, when seen, attempt to parse the characters after it + * using parseReference(). + * + *

Currently, RuleBasedTransliterator and UnicodeSet use this + * interface to share variable definitions. */ public interface SymbolTable { + /** + * The character preceding a symbol reference name. + */ + final char SYMBOL_REF = '$'; + /** * Lookup the object associated with this string and return it. * Return null if no such name exists. */ Object lookup(String s); + + /** + * Parse a symbol reference name from the given string, starting + * at the given position. If no valid symbol reference name is + * found, throw an exception. + * @param text the text to parse for the name + * @param pos on entry, the index of the first character to parse. + * This is the character following the SYMBOL_REF character. On + * exit, the index after the last parsed character. + * @param limit the index after the last character to be parsed. + * @return the parsed name. + * @exception IllegalArgumentException if no valid name is found. + */ + String parseReference(String text, ParsePosition pos, int limit); } diff --git a/icu4j/src/com/ibm/text/UnicodeSet.java b/icu4j/src/com/ibm/text/UnicodeSet.java index bc32947ed7b..82ed7bbbe92 100755 --- a/icu4j/src/com/ibm/text/UnicodeSet.java +++ b/icu4j/src/com/ibm/text/UnicodeSet.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/UnicodeSet.java,v $ - * $Date: 2000/04/21 21:16:40 $ - * $Revision: 1.17 $ + * $Date: 2000/04/21 22:16:29 $ + * $Revision: 1.18 $ * ***************************************************************************************** */ @@ -241,7 +241,7 @@ import java.text.*; * *Unsupported by Java (and hence unsupported by UnicodeSet). 
* * @author Alan Liu - * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.17 $ $Date: 2000/04/21 21:16:40 $ + * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.18 $ $Date: 2000/04/21 22:16:29 $ */ public class UnicodeSet implements UnicodeFilter { /** @@ -268,9 +268,6 @@ public class UnicodeSet implements UnicodeFilter { private static final int UNSUPPORTED_CATEGORY = 17; - private static final char VARIABLE_REF_OPEN = '{'; - private static final char VARIABLE_REF_CLOSE = '}'; - private static final int CATEGORY_COUNT = 29; /** @@ -866,13 +863,9 @@ public class UnicodeSet implements UnicodeFilter { * Variable names are only parsed if varNameToChar is not null. * Set variables are only looked up if varCharToSet is not null. */ - else if (symbols != null && !isLiteral && c == VARIABLE_REF_OPEN) { - ++i; - int j = pattern.indexOf(VARIABLE_REF_CLOSE, i); - if (i == j || j < 0) { // empty or unterminated - throw new IllegalArgumentException("Illegal variable reference"); - } - String name = pattern.substring(i, j); + else if (symbols != null && !isLiteral && c == SymbolTable.SYMBOL_REF) { + pos.setIndex(++i); + String name = symbols.parseReference(pattern, pos, limit); Object obj = symbols.lookup(name); if (obj == null) { throw new IllegalArgumentException("Undefined variable: " @@ -884,50 +877,9 @@ public class UnicodeSet implements UnicodeFilter { } else { nestedPairs = ((UnicodeSet) obj).pairs.toString(); } - i = j; // Make i point at closing '}' + i = pos.getIndex()-1; // Make i point at last char of var name } - /* Parse variable references. These are treated as literals. If a - * variable refers to a UnicodeSet, nestedPairs is assigned here. - * Variable names are only parsed if varNameToChar is not null. - * Set variables are only looked up if varCharToSet is not null. 
- */ - // TEMPORARY - // TEMPORARY - // TEMPORARY - else if (symbols != null && !isLiteral && c == '$') { - ++i; - c = pattern.charAt(i); - int j = i; - if (Character.isUnicodeIdentifierStart(c)) { - ++j; - while (j < limit && - Character.isUnicodeIdentifierPart(pattern.charAt(j))) { - ++j; - } - } - if (i == j || j < 0) { // empty or unterminated - throw new IllegalArgumentException("Illegal variable reference " + - pattern.substring(i-1, limit)); - } - String name = pattern.substring(i, j); - Object obj = symbols.lookup(name); - if (obj == null) { - throw new IllegalArgumentException("Undefined variable: " - + name); - } - isLiteral = true; - if (obj instanceof Character) { - c = ((Character) obj).charValue(); - } else { - nestedPairs = ((UnicodeSet) obj).pairs.toString(); - } - i = j-1; // Make i point at last char of var name - } - // TEMPORARY - // TEMPORARY - // TEMPORARY - /* An opening bracket indicates the first bracket of a nested * subpattern, either a normal pattern or a category pattern. We * recognize these here and set nestedPairs accordingly.