ICU-64 allow ::ID blocks in rules

X-SVN-Rev: 5878
This commit is contained in:
Alan Liu 2001-09-21 21:24:04 +00:00
parent e186f84db0
commit e39cca2e96
8 changed files with 712 additions and 102 deletions

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java,v $
* $Date: 2001/09/20 21:21:10 $
* $Revision: 1.44 $
* $Date: 2001/09/21 21:23:34 $
* $Revision: 1.45 $
*
*****************************************************************************************
*/
@ -357,8 +357,8 @@ public class TransliteratorTest extends TestFmwk {
* Compose the hex transliterators forward and reverse.
*/
public void TestCompoundHex() {
Transliterator a = Transliterator.getInstance("Unicode-Hex");
Transliterator b = Transliterator.getInstance("Hex-Unicode");
Transliterator a = Transliterator.getInstance("Any-Hex");
Transliterator b = Transliterator.getInstance("Hex-Any");
Transliterator[] trans = { a, b };
Transliterator ab = new CompoundTransliterator(trans);
@ -379,7 +379,7 @@ public class TransliteratorTest extends TestFmwk {
* Do some basic tests of filtering.
*/
public void TestFiltering() {
Transliterator hex = Transliterator.getInstance("Unicode-Hex");
Transliterator hex = Transliterator.getInstance("Any-Hex");
hex.setFilter(new UnicodeFilter() {
public boolean contains(char c) {
return c != 'c';
@ -510,18 +510,18 @@ public class TransliteratorTest extends TestFmwk {
* Prefix, suffix support in hex transliterators
*/
public void TestJ243() {
    // Default Hex-Any (formerly named Hex-Unicode), which should
    // handle \\u, \\U, u+, and U+
    HexToUnicodeTransliterator defaultHexToAny = new HexToUnicodeTransliterator();
    expect(defaultHexToAny, "\\u0041+\\U0042,u+0043uu+0044z", "A+B,CuDz");

    // Custom Hex-Any (formerly Hex-Unicode) accepting
    // \\uXXXX and &#xXXXX;
    // The expected output keeps the five-digit entity &#x00033;
    // unconverted while the shorter entities become '0', '1', '2'.
    HexToUnicodeTransliterator customHexToAny =
        new HexToUnicodeTransliterator("\\\\u###0;&\\#x###0\\;");
    expect(customHexToAny,
           "\\u61\\u062\\u0063\\u00645\\u66x&#x30;&#x031;&#x0032;&#x00033;",
           "abcd5fx012&#x00033;");

    // Custom Any-Hex (formerly Unicode-Hex); the default is tested
    // elsewhere.  Each input character is expected to come out as an
    // &#x..; entity.
    UnicodeToHexTransliterator customAnyToHex =
        new UnicodeToHexTransliterator("&\\#x###0;");
    expect(customAnyToHex, "012", "&#x30;&#x31;&#x32;");
}
@ -748,13 +748,13 @@ public class TransliteratorTest extends TestFmwk {
*/
public void TestFilterIDs() {
String[] DATA = {
"Unicode[aeiou]-Hex",
"Hex[aeiou]-Unicode",
"Any[aeiou]-Hex",
"Hex[aeiou]-Any",
"quizzical",
"q\\u0075\\u0069zz\\u0069c\\u0061l",
"Unicode[aeiou]-Hex;Hex[^5]-Unicode",
"Unicode[^5]-Hex;Hex[aeiou]-Unicode",
"Any[aeiou]-Hex;Hex[^5]-Any",
"Any[^5]-Hex;Hex[aeiou]-Any",
"quizzical",
"q\\u0075izzical",
@ -961,6 +961,96 @@ public class TransliteratorTest extends TestFmwk {
}
}
/**
 * Test compound rule-based transliterators: rule text that mixes
 * ::ID entries with ordinary rules, the text regenerated by
 * toRules(), a round trip of that text through createFromRules(),
 * and compound Foo(Bar) IDs together with their inverses.
 */
public void TestCompoundRBT() {
    // This text is compared verbatim with the toRules() output, so
    // its spacing and ';' placement must be exactly what toRules()
    // produces.  Adjust it if the toRules() format ever changes.
    String compoundRules =
        "::Hex-Any;\n" +
        "::Any-Lower;\n" +
        "a > '.A.';\n" +
        "b > '.B.';\n" +
        "::Any[^t]-Upper;";
    Transliterator trans =
        Transliterator.createFromRules("Test", compoundRules, Transliterator.FORWARD);
    if (trans == null) {
        errln("FAIL: createFromRules failed");
        return;
    }
    expect(trans, "\u0043at in the hat, bat on the mat",
           "C.A.t IN tHE H.A.t, .B..A.t ON tHE M.A.t");
    String regenerated = trans.toRules(true);
    if (!regenerated.equals(compoundRules)) {
        errln("FAIL: toRules() => " + regenerated +
              ", expected " + compoundRules);
    } else {
        logln("OK: toRules() => " + regenerated);
    }

    // toRules() on a compound created from a plain ID list
    trans = Transliterator.getInstance("Greek-Latin; Latin-Cyrillic", Transliterator.FORWARD);
    if (trans == null) {
        errln("FAIL: createInstance failed");
        return;
    }
    String expected = "::Greek-Latin;\n::Latin-Cyrillic;";
    regenerated = trans.toRules(true);
    if (regenerated.equals(expected)) {
        logln("OK: toRules() => " + regenerated);
    } else {
        errln("FAIL: toRules() => " + regenerated +
              ", expected " + expected);
    }

    // Feed the regenerated rules back in (round trip)
    trans = Transliterator.createFromRules("Test", regenerated, Transliterator.FORWARD);
    if (trans == null) {
        errln("FAIL: createFromRules #2 failed");
        return;
    }
    logln("OK: createFromRules(" + regenerated + ") succeeded");

    // The round-tripped transliterator must regenerate the same rules
    regenerated = trans.toRules(true);
    if (regenerated.equals(expected)) {
        logln("OK: toRules() => " + regenerated);
    } else {
        errln("FAIL: toRules() => " + regenerated +
              ", expected " + expected);
    }

    // Foo(Bar) IDs.  The spacing in this ID must already match the
    // regenerated ID, because getID() is compared against it verbatim.
    String compoundID = "Upper(Lower);(NFKC)";
    trans = Transliterator.getInstance(compoundID, Transliterator.FORWARD);
    if (trans == null) {
        errln("FAIL: createInstance #2 failed");
        return;
    }
    if (!trans.getID().equals(compoundID)) {
        errln("FAIL: createInstance(" + compoundID +
              ").getID() => " + trans.getID());
    } else {
        logln("OK: created " + compoundID);
    }

    // The inverse is expected to reverse the entry order and swap
    // each Foo(Bar) pair.
    Transliterator inverse = trans.getInverse();
    if (inverse == null) {
        errln("FAIL: createInverse failed");
        return;
    }
    expected = "NFKC();Lower(Upper)";
    if (!inverse.getID().equals(expected)) {
        errln("FAIL: createInverse(" + compoundID + ") => " +
              inverse.getID());
    } else {
        logln("OK: createInverse(" + compoundID + ") => " +
              inverse.getID());
    }
}
/**
* Test inverse of Greek-Latin; Title()
*/

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/CompoundTransliterator.java,v $
* $Date: 2001/09/20 21:20:39 $
* $Revision: 1.13 $
* $Date: 2001/09/21 21:24:04 $
* $Revision: 1.14 $
*
*****************************************************************************************
*/
@ -35,7 +35,7 @@ import java.util.Vector;
* <p>Copyright &copy; IBM Corporation 1999. All rights reserved.
*
* @author Alan Liu
* @version $RCSfile: CompoundTransliterator.java,v $ $Revision: 1.13 $ $Date: 2001/09/20 21:20:39 $
* @version $RCSfile: CompoundTransliterator.java,v $ $Revision: 1.14 $ $Date: 2001/09/21 21:24:04 $
*/
public class CompoundTransliterator extends Transliterator {
@ -139,6 +139,19 @@ public class CompoundTransliterator extends Transliterator {
this(ID, FORWARD, null);
}
/**
* Package private constructor for compound RBTs. Construct a
* compound transliterator using the given idBlock, with the
* splitTrans inserted at the idSplitPoint.
* @param ID the ID handed to the superclass constructor; it is used
* as given and is not derived from idBlock
* @param idBlock the block of IDs that init() parses into the
* component transliterators
* @param idSplitPoint the index into idBlock at which splitTrans
* is inserted
* @param splitTrans the transliterator to insert at idSplitPoint
*/
CompoundTransliterator(String ID,
String idBlock,
int idSplitPoint,
Transliterator splitTrans) {
// NOTE(review): the null second argument is presumably "no filter" -- confirm
// against the Transliterator constructor.
super(ID, null);
// Always parsed in the FORWARD direction, with fixReverseID == false.
init(idBlock, FORWARD, idSplitPoint, splitTrans, false);
}
/**
* Package private constructor for Transliterator from a vector of
* transliterators. The vector order is FORWARD, so if dir is
@ -154,6 +167,39 @@ public class CompoundTransliterator extends Transliterator {
// assume caller will fixup ID
}
/**
 * Completes construction from an ID string.  For use by constructors
 * only; trans and filter are expected to be null when this is called.
 * The ID is parsed into a list of component transliterators, the
 * position at which splitTrans landed in that list is remembered in
 * compoundRBTIndex, and the list-based init() finishes the job.
 * @param id the id containing ';'-separated entries
 * @param direction either FORWARD or REVERSE
 * @param idSplitPoint the string index into id at which splitTrans
 * should be inserted, or -1 if there is nothing to insert
 * @param splitTrans the transliterator to insert ahead of the entry
 * found at offset idSplitPoint in id; null inserts nothing
 * @param fixReverseID if true, the IDs of reverse entries are rebuilt
 * from the component entries' getID() values; constructors that apply
 * a facade ID anyway can pass false
 */
private void init(String id,
                  int direction,
                  int idSplitPoint,
                  Transliterator splitTrans,
                  boolean fixReverseID) {
    // assert(trans == null);
    Vector components = new Vector();
    // parseCompoundID() requires a buffer for the regenerated ID; the
    // regenerated text itself is not used here.
    StringBuffer regeneratedID = new StringBuffer();
    // One-element array used as an out-parameter for the list index
    // at which splitTrans was stored.
    int[] splitIndexOut = new int[1];

    Transliterator.parseCompoundID(id, regeneratedID, direction,
                                   idSplitPoint, splitTrans,
                                   components, splitIndexOut);

    compoundRBTIndex = splitIndexOut[0];
    init(components, direction, fixReverseID);
}
/**
* Finish constructing a transliterator: only to be called by
* constructors. Before calling init(), set trans and filter to NULL.
@ -165,7 +211,6 @@ public class CompoundTransliterator extends Transliterator {
* @param fixReverseID if TRUE, then reconstruct the ID of reverse
* entries by calling getID() of component entries. Some constructors
* do not require this because they apply a facade ID anyway.
* @param status the error code indicating success or failure
*/
private void init(Vector list,
int direction,
@ -302,6 +347,34 @@ public class CompoundTransliterator extends Transliterator {
super.setFilter(f);
}
/**
 * Builds the rules text for this compound transliterator, one entry
 * per component.  Consecutive entries are separated by a newline and
 * every entry ends with ID_DELIM.
 *
 * Components are normally rendered with baseToRules() rather than
 * toRules(): calling toRules() on several rule-based components would
 * just concatenate their rules, which is not wanted.  The single
 * exception is the component at compoundRBTIndex (when that index is
 * >= 0), whose toRules() is called so its rules appear in the output.
 * @param escapeUnprintable passed through to each component
 * @return the assembled rules text
 */
public String toRules(boolean escapeUnprintable) {
    StringBuffer out = new StringBuffer();
    for (int index = 0; index < trans.length; ++index) {
        String piece = (index == compoundRBTIndex)
            ? trans[index].toRules(escapeUnprintable)
            : trans[index].baseToRules(escapeUnprintable);

        // Start each entry on its own line, except the very first.
        int len = out.length();
        if (len != 0 && out.charAt(len - 1) != '\n') {
            out.append('\n');
        }

        out.append(piece);

        // Terminate the entry unless it already ends with the delimiter.
        len = out.length();
        if (len != 0 && out.charAt(len - 1) != ID_DELIM) {
            out.append(ID_DELIM);
        }
    }
    return out.toString();
}
/**
* Implements {@link Transliterator#handleTransliterate}.
*/

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java,v $
* $Date: 2001/09/19 17:43:37 $
* $Revision: 1.43 $
* $Date: 2001/09/21 21:24:04 $
* $Revision: 1.44 $
*
*****************************************************************************************
*/
@ -279,12 +279,16 @@ import com.ibm.text.resources.ResourceReader;
* <p>Copyright (c) IBM Corporation 1999-2000. All rights reserved.</p>
*
* @author Alan Liu
* @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.43 $ $Date: 2001/09/19 17:43:37 $
* @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.44 $ $Date: 2001/09/21 21:24:04 $
*/
public class RuleBasedTransliterator extends Transliterator {
private Data data;
// Indicator for ID blocks
private static final String ID_TOKEN = "::";
private static final int ID_TOKEN_LEN = 2;
private static final String COPYRIGHT =
"\u00A9 IBM Corporation 1999. All rights reserved.";
@ -334,6 +338,31 @@ public class RuleBasedTransliterator extends Transliterator {
return new Parser(rules, direction).getData();
}
/**
 * Parses a rule string that may contain ::ID entries in addition to
 * ordinary rules, and reports the pieces to the caller.
 * @param rules the complete rule text
 * @param direction the direction handed to the parser
 * @param idBlockResult receives the parser's ::ID block; any previous
 * content is discarded first
 * @param idSplitPointResult element 0 receives the parser's
 * idSplitPoint, i.e. the string index within the ID block at which
 * the parsed rules belong
 * @return the parsed rule data, or null if no actual rules were
 * parsed (the text held only ::ID entries, or nothing at all)
 */
static Data parse(String rules,
                  int direction,
                  StringBuffer idBlockResult,
                  int[] idSplitPointResult) {
    String[] ruleLines = { rules };
    Parser ruleParser = new Parser(ruleLines, direction);

    idBlockResult.setLength(0);
    idBlockResult.append(ruleParser.idBlock);
    idSplitPointResult[0] = ruleParser.idSplitPoint;

    if (ruleParser.ruleCount == 0) {
        return null;
    }
    return ruleParser.getData();
}
/**
* Implements {@link Transliterator#handleTransliterate}.
*/
@ -474,15 +503,27 @@ public class RuleBasedTransliterator extends Transliterator {
private static class Parser {
/**
* Current rule being parsed.
*/
private String rules;
private int direction;
private Data data;
// In a compound RBT, the index at which the RBT rules are
// inserted into the ID block. Index 0 means before any IDs
// in the block. Index idBlock.length() means after all IDs
// in the block. Index is a string index.
int idSplitPoint;
// The block of ::IDs, both at the top and at the bottom.
// Inserted into these may be additional rules at the
// idSplitPoint.
String idBlock;
// The number of rules parsed. This tells us if there were
// any actual transliterator rules, or if there were just ::ID
// block IDs.
int ruleCount;
/**
* This class implements the SymbolTable interface. It is used
* during parsing to give UnicodeSet access to variables that
@ -717,6 +758,8 @@ public class RuleBasedTransliterator extends Transliterator {
* rules
*/
private void parseRules(RuleBody ruleArray) {
ruleCount = 0;
determineVariableRange(ruleArray);
setVariablesVector = new Vector();
parseData = new ParseData();
@ -725,6 +768,16 @@ public class RuleBasedTransliterator extends Transliterator {
int errorCount = 0;
ruleArray.reset();
StringBuffer idBlockResult = new StringBuffer();
idSplitPoint = -1;
// The mode marks whether we are in the header ::id block, the
// rule block, or the footer ::id block.
// mode == 0: start: rule->1, ::id->0
// mode == 1: in rules: rule->1, ::id->2
// mode == 2: in footer ::id block: rule->ERROR, ::id->2
int mode = 0;
main:
for (;;) {
String rule = ruleArray.nextLine();
@ -754,10 +807,54 @@ public class RuleBasedTransliterator extends Transliterator {
// at once. We keep parsing rules even after a failure, up
// to a specified limit, and report all errors at once.
try {
// We've found the start of a rule. c is its first
// character, and pos points past c. Lexically parse the
// rule into component pieces.
pos = parseRule(rule, --pos, limit);
// We've found the start of a rule or ID. c is its first
// character, and pos points past c.
--pos;
// Look for an ID token. Must have at least ID_TOKEN_LEN + 1
// chars left.
if ((pos + ID_TOKEN_LEN + 1) <= limit &&
rule.regionMatches(pos, ID_TOKEN, 0, ID_TOKEN_LEN)) {
pos += ID_TOKEN_LEN;
c = rule.charAt(pos);
while (UCharacter.isWhitespace(c) && pos < limit) {
++pos;
c = rule.charAt(pos);
}
int[] p = new int[] { pos };
boolean[] sawDelim = new boolean[1];
StringBuffer regenID = new StringBuffer();
Transliterator.parseID(rule, regenID, p, sawDelim, direction, false);
if (p[0] == pos || !sawDelim[0]) {
// Invalid ::id
int i1 = pos + 2;
while (i1 < rule.length() && rule.charAt(i1) != ';') {
++i1;
}
throw new IllegalArgumentException("Invalid ::ID " +
rule.substring(pos, i1));
} else {
if (mode == 1) {
mode = 2;
idSplitPoint = idBlockResult.length();
}
String str = rule.substring(pos, p[0]);
idBlockResult.append(str);
if (!sawDelim[0]) {
idBlockResult.append(';');
}
pos = p[0];
}
} else {
// Parse a rule
pos = parseRule(rule, pos, limit);
++ruleCount;
if (mode == 2) {
// ::id in illegal position (because a rule
// occurred after the ::id footer block)
throw new IllegalArgumentException("::ID in illegal position");
}
mode = 1;
}
} catch (IllegalArgumentException e) {
if (errorCount == 30) {
errors.append("\nMore than 30 errors; further messages squelched");
@ -774,6 +871,8 @@ public class RuleBasedTransliterator extends Transliterator {
}
}
idBlock = idBlockResult.toString();
// Convert the set vector to an array
data.setVariables = new UnicodeSet[setVariablesVector.size()];
setVariablesVector.copyInto(data.setVariables);
@ -1480,6 +1579,9 @@ public class RuleBasedTransliterator extends Transliterator {
/**
* $Log: RuleBasedTransliterator.java,v $
* Revision 1.44 2001/09/21 21:24:04 alan
* jitterbug 64: allow ::ID blocks in rules
*
* Revision 1.43 2001/09/19 17:43:37 alan
* jitterbug 60: initial implementation of toRules()
*

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/Transliterator.java,v $
* $Date: 2001/09/20 21:20:39 $
* $Revision: 1.39 $
* $Date: 2001/09/21 21:24:04 $
* $Revision: 1.40 $
*
*****************************************************************************************
*/
@ -241,7 +241,7 @@ import com.ibm.util.CaseInsensitiveString;
* <p>Copyright &copy; IBM Corporation 1999. All rights reserved.
*
* @author Alan Liu
* @version $RCSfile: Transliterator.java,v $ $Revision: 1.39 $ $Date: 2001/09/20 21:20:39 $
* @version $RCSfile: Transliterator.java,v $ $Revision: 1.40 $ $Date: 2001/09/21 21:24:04 $
*/
public abstract class Transliterator {
/**
@ -832,12 +832,52 @@ public abstract class Transliterator {
* NullTransliterator, if it contains ID blocks which parse as
* empty for the given direction.
*/
public static final Transliterator createFromRules(String ID, String rules, int direction) {
public static final Transliterator createFromRules(String ID, String rules, int dir) {
// TODO Flesh this out
return new RuleBasedTransliterator(ID, rules, direction, null);
//// return new RuleBasedTransliterator(ID, rules, direction, null);
StringBuffer idBlock = new StringBuffer();
int[] idSplitPoint = new int[] { -1 };
RuleBasedTransliterator.Data data = null;
data = RuleBasedTransliterator.parse(rules, dir,
idBlock, idSplitPoint);
// NOTE: The logic here matches that in TransliteratorRegistry.
if (idBlock.length() == 0) {
if (data == null) {
// No idBlock, no data -- this is just an
// alias for Null
return new NullTransliterator();
} else {
// No idBlock, data != 0 -- this is an
// ordinary RBT_DATA.
return new RuleBasedTransliterator(ID, data, null);
}
} else {
if (data == null) {
// idBlock, no data -- this is an alias
Transliterator t = getInstance(idBlock.toString(), dir);
if (t != null) {
t.setID(ID);
}
return t;
} else {
// idBlock and data -- this is a compound
// RBT
Transliterator t = new RuleBasedTransliterator("_", data, null);
t = new CompoundTransliterator(ID, idBlock.toString(), idSplitPoint[0],
t);
return t;
}
}
}
/**
* Returns the rules text for this transliterator. This base
* implementation simply delegates to baseToRules();
* CompoundTransliterator supplies its own version.
* @param escapeUnprintable passed through unchanged to baseToRules()
*/
public String toRules(boolean escapeUnprintable) {
return baseToRules(escapeUnprintable);
}
protected final String baseToRules(boolean escapeUnprintable) {
// The base class implementation of toRules munges the ID into
// the correct format. That is: foo => ::foo
// KEEP in sync with rbt_pars
@ -860,13 +900,13 @@ public abstract class Transliterator {
* 'result' at which the adoptedSplitTrans is stored, or -1 if
* adoptedSplitTrans == 0
*/
private static void parseCompoundID(String id,
StringBuffer regenID,
int dir,
int idSplitPoint,
Transliterator adoptedSplitTrans,
Vector result,
int[] splitTransIndex) {
static void parseCompoundID(String id,
StringBuffer regenID,
int dir,
int idSplitPoint,
Transliterator splitTrans,
Vector result,
int[] splitTransIndex) {
regenID.setLength(0);
splitTransIndex[0] = -1;
int pos = 0;
@ -874,10 +914,10 @@ public abstract class Transliterator {
while (pos < id.length()) {
// We compare (pos >= split), not (pos == split), so we can
// skip over whitespace (see below).
if (pos >= idSplitPoint && adoptedSplitTrans != null) {
if (pos >= idSplitPoint && splitTrans != null) {
splitTransIndex[0] = result.size();
result.addElement(adoptedSplitTrans);
adoptedSplitTrans = null;
result.addElement(splitTrans);
splitTrans = null;
}
int[] p = new int[] { pos };
boolean[] sawDelimiter = new boolean[1];
@ -903,10 +943,10 @@ public abstract class Transliterator {
}
// Handle case of idSplitPoint == id.length()
if (pos >= idSplitPoint && adoptedSplitTrans != null) {
if (pos >= idSplitPoint && splitTrans != null) {
splitTransIndex[0] = result.size();
result.addElement(adoptedSplitTrans);
adoptedSplitTrans = null;
result.addElement(splitTrans);
splitTrans = null;
}
}
@ -941,12 +981,12 @@ public abstract class Transliterator {
* determine if there was an error. Instead, check to see if pos
* moved.
*/
private static Transliterator parseID(String ID,
StringBuffer regenID,
int[] pos,
boolean[] sawDelimiter,
int dir,
boolean create) {
static Transliterator parseID(String ID,
StringBuffer regenID,
int[] pos,
boolean[] sawDelimiter,
int dir,
boolean create) {
int limit, preDelimLimit,
revStart, revLimit=0,
idStart, idLimit,

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/translit/Attic/TransliteratorTest.java,v $
* $Date: 2001/09/20 21:21:10 $
* $Revision: 1.44 $
* $Date: 2001/09/21 21:23:34 $
* $Revision: 1.45 $
*
*****************************************************************************************
*/
@ -357,8 +357,8 @@ public class TransliteratorTest extends TestFmwk {
* Compose the hex transliterators forward and reverse.
*/
public void TestCompoundHex() {
Transliterator a = Transliterator.getInstance("Unicode-Hex");
Transliterator b = Transliterator.getInstance("Hex-Unicode");
Transliterator a = Transliterator.getInstance("Any-Hex");
Transliterator b = Transliterator.getInstance("Hex-Any");
Transliterator[] trans = { a, b };
Transliterator ab = new CompoundTransliterator(trans);
@ -379,7 +379,7 @@ public class TransliteratorTest extends TestFmwk {
* Do some basic tests of filtering.
*/
public void TestFiltering() {
Transliterator hex = Transliterator.getInstance("Unicode-Hex");
Transliterator hex = Transliterator.getInstance("Any-Hex");
hex.setFilter(new UnicodeFilter() {
public boolean contains(char c) {
return c != 'c';
@ -510,18 +510,18 @@ public class TransliteratorTest extends TestFmwk {
* Prefix, suffix support in hex transliterators
*/
public void TestJ243() {
    // Default Hex-Any (formerly named Hex-Unicode), which should
    // handle \\u, \\U, u+, and U+
    HexToUnicodeTransliterator defaultHexToAny = new HexToUnicodeTransliterator();
    expect(defaultHexToAny, "\\u0041+\\U0042,u+0043uu+0044z", "A+B,CuDz");

    // Custom Hex-Any (formerly Hex-Unicode) accepting
    // \\uXXXX and &#xXXXX;
    // The expected output keeps the five-digit entity &#x00033;
    // unconverted while the shorter entities become '0', '1', '2'.
    HexToUnicodeTransliterator customHexToAny =
        new HexToUnicodeTransliterator("\\\\u###0;&\\#x###0\\;");
    expect(customHexToAny,
           "\\u61\\u062\\u0063\\u00645\\u66x&#x30;&#x031;&#x0032;&#x00033;",
           "abcd5fx012&#x00033;");

    // Custom Any-Hex (formerly Unicode-Hex); the default is tested
    // elsewhere.  Each input character is expected to come out as an
    // &#x..; entity.
    UnicodeToHexTransliterator customAnyToHex =
        new UnicodeToHexTransliterator("&\\#x###0;");
    expect(customAnyToHex, "012", "&#x30;&#x31;&#x32;");
}
@ -748,13 +748,13 @@ public class TransliteratorTest extends TestFmwk {
*/
public void TestFilterIDs() {
String[] DATA = {
"Unicode[aeiou]-Hex",
"Hex[aeiou]-Unicode",
"Any[aeiou]-Hex",
"Hex[aeiou]-Any",
"quizzical",
"q\\u0075\\u0069zz\\u0069c\\u0061l",
"Unicode[aeiou]-Hex;Hex[^5]-Unicode",
"Unicode[^5]-Hex;Hex[aeiou]-Unicode",
"Any[aeiou]-Hex;Hex[^5]-Any",
"Any[^5]-Hex;Hex[aeiou]-Any",
"quizzical",
"q\\u0075izzical",
@ -961,6 +961,96 @@ public class TransliteratorTest extends TestFmwk {
}
}
/**
 * Test compound rule-based transliterators: rule text that mixes
 * ::ID entries with ordinary rules, the text regenerated by
 * toRules(), a round trip of that text through createFromRules(),
 * and compound Foo(Bar) IDs together with their inverses.
 */
public void TestCompoundRBT() {
    // This text is compared verbatim with the toRules() output, so
    // its spacing and ';' placement must be exactly what toRules()
    // produces.  Adjust it if the toRules() format ever changes.
    String compoundRules =
        "::Hex-Any;\n" +
        "::Any-Lower;\n" +
        "a > '.A.';\n" +
        "b > '.B.';\n" +
        "::Any[^t]-Upper;";
    Transliterator trans =
        Transliterator.createFromRules("Test", compoundRules, Transliterator.FORWARD);
    if (trans == null) {
        errln("FAIL: createFromRules failed");
        return;
    }
    expect(trans, "\u0043at in the hat, bat on the mat",
           "C.A.t IN tHE H.A.t, .B..A.t ON tHE M.A.t");
    String regenerated = trans.toRules(true);
    if (!regenerated.equals(compoundRules)) {
        errln("FAIL: toRules() => " + regenerated +
              ", expected " + compoundRules);
    } else {
        logln("OK: toRules() => " + regenerated);
    }

    // toRules() on a compound created from a plain ID list
    trans = Transliterator.getInstance("Greek-Latin; Latin-Cyrillic", Transliterator.FORWARD);
    if (trans == null) {
        errln("FAIL: createInstance failed");
        return;
    }
    String expected = "::Greek-Latin;\n::Latin-Cyrillic;";
    regenerated = trans.toRules(true);
    if (regenerated.equals(expected)) {
        logln("OK: toRules() => " + regenerated);
    } else {
        errln("FAIL: toRules() => " + regenerated +
              ", expected " + expected);
    }

    // Feed the regenerated rules back in (round trip)
    trans = Transliterator.createFromRules("Test", regenerated, Transliterator.FORWARD);
    if (trans == null) {
        errln("FAIL: createFromRules #2 failed");
        return;
    }
    logln("OK: createFromRules(" + regenerated + ") succeeded");

    // The round-tripped transliterator must regenerate the same rules
    regenerated = trans.toRules(true);
    if (regenerated.equals(expected)) {
        logln("OK: toRules() => " + regenerated);
    } else {
        errln("FAIL: toRules() => " + regenerated +
              ", expected " + expected);
    }

    // Foo(Bar) IDs.  The spacing in this ID must already match the
    // regenerated ID, because getID() is compared against it verbatim.
    String compoundID = "Upper(Lower);(NFKC)";
    trans = Transliterator.getInstance(compoundID, Transliterator.FORWARD);
    if (trans == null) {
        errln("FAIL: createInstance #2 failed");
        return;
    }
    if (!trans.getID().equals(compoundID)) {
        errln("FAIL: createInstance(" + compoundID +
              ").getID() => " + trans.getID());
    } else {
        logln("OK: created " + compoundID);
    }

    // The inverse is expected to reverse the entry order and swap
    // each Foo(Bar) pair.
    Transliterator inverse = trans.getInverse();
    if (inverse == null) {
        errln("FAIL: createInverse failed");
        return;
    }
    expected = "NFKC();Lower(Upper)";
    if (!inverse.getID().equals(expected)) {
        errln("FAIL: createInverse(" + compoundID + ") => " +
              inverse.getID());
    } else {
        logln("OK: createInverse(" + compoundID + ") => " +
              inverse.getID());
    }
}
/**
* Test inverse of Greek-Latin; Title()
*/

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/CompoundTransliterator.java,v $
* $Date: 2001/09/20 21:20:39 $
* $Revision: 1.13 $
* $Date: 2001/09/21 21:24:04 $
* $Revision: 1.14 $
*
*****************************************************************************************
*/
@ -35,7 +35,7 @@ import java.util.Vector;
* <p>Copyright &copy; IBM Corporation 1999. All rights reserved.
*
* @author Alan Liu
* @version $RCSfile: CompoundTransliterator.java,v $ $Revision: 1.13 $ $Date: 2001/09/20 21:20:39 $
* @version $RCSfile: CompoundTransliterator.java,v $ $Revision: 1.14 $ $Date: 2001/09/21 21:24:04 $
*/
public class CompoundTransliterator extends Transliterator {
@ -139,6 +139,19 @@ public class CompoundTransliterator extends Transliterator {
this(ID, FORWARD, null);
}
/**
* Package private constructor for compound RBTs. Construct a
* compound transliterator using the given idBlock, with the
* splitTrans inserted at the idSplitPoint.
* @param ID the ID handed to the superclass constructor; it is used
* as given and is not derived from idBlock
* @param idBlock the block of IDs that init() parses into the
* component transliterators
* @param idSplitPoint the index into idBlock at which splitTrans
* is inserted
* @param splitTrans the transliterator to insert at idSplitPoint
*/
CompoundTransliterator(String ID,
String idBlock,
int idSplitPoint,
Transliterator splitTrans) {
// NOTE(review): the null second argument is presumably "no filter" -- confirm
// against the Transliterator constructor.
super(ID, null);
// Always parsed in the FORWARD direction, with fixReverseID == false.
init(idBlock, FORWARD, idSplitPoint, splitTrans, false);
}
/**
* Package private constructor for Transliterator from a vector of
* transliterators. The vector order is FORWARD, so if dir is
@ -154,6 +167,39 @@ public class CompoundTransliterator extends Transliterator {
// assume caller will fixup ID
}
/**
 * Completes construction from an ID string.  For use by constructors
 * only; trans and filter are expected to be null when this is called.
 * The ID is parsed into a list of component transliterators, the
 * position at which splitTrans landed in that list is remembered in
 * compoundRBTIndex, and the list-based init() finishes the job.
 * @param id the id containing ';'-separated entries
 * @param direction either FORWARD or REVERSE
 * @param idSplitPoint the string index into id at which splitTrans
 * should be inserted, or -1 if there is nothing to insert
 * @param splitTrans the transliterator to insert ahead of the entry
 * found at offset idSplitPoint in id; null inserts nothing
 * @param fixReverseID if true, the IDs of reverse entries are rebuilt
 * from the component entries' getID() values; constructors that apply
 * a facade ID anyway can pass false
 */
private void init(String id,
                  int direction,
                  int idSplitPoint,
                  Transliterator splitTrans,
                  boolean fixReverseID) {
    // assert(trans == null);
    Vector components = new Vector();
    // parseCompoundID() requires a buffer for the regenerated ID; the
    // regenerated text itself is not used here.
    StringBuffer regeneratedID = new StringBuffer();
    // One-element array used as an out-parameter for the list index
    // at which splitTrans was stored.
    int[] splitIndexOut = new int[1];

    Transliterator.parseCompoundID(id, regeneratedID, direction,
                                   idSplitPoint, splitTrans,
                                   components, splitIndexOut);

    compoundRBTIndex = splitIndexOut[0];
    init(components, direction, fixReverseID);
}
/**
* Finish constructing a transliterator: only to be called by
* constructors. Before calling init(), set trans and filter to NULL.
@ -165,7 +211,6 @@ public class CompoundTransliterator extends Transliterator {
* @param fixReverseID if TRUE, then reconstruct the ID of reverse
* entries by calling getID() of component entries. Some constructors
* do not require this because they apply a facade ID anyway.
* @param status the error code indicating success or failure
*/
private void init(Vector list,
int direction,
@ -302,6 +347,34 @@ public class CompoundTransliterator extends Transliterator {
super.setFilter(f);
}
/**
 * Builds the rules text for this compound transliterator, one entry
 * per component.  Consecutive entries are separated by a newline and
 * every entry ends with ID_DELIM.
 *
 * Components are normally rendered with baseToRules() rather than
 * toRules(): calling toRules() on several rule-based components would
 * just concatenate their rules, which is not wanted.  The single
 * exception is the component at compoundRBTIndex (when that index is
 * >= 0), whose toRules() is called so its rules appear in the output.
 * @param escapeUnprintable passed through to each component
 * @return the assembled rules text
 */
public String toRules(boolean escapeUnprintable) {
    StringBuffer out = new StringBuffer();
    for (int index = 0; index < trans.length; ++index) {
        String piece = (index == compoundRBTIndex)
            ? trans[index].toRules(escapeUnprintable)
            : trans[index].baseToRules(escapeUnprintable);

        // Start each entry on its own line, except the very first.
        int len = out.length();
        if (len != 0 && out.charAt(len - 1) != '\n') {
            out.append('\n');
        }

        out.append(piece);

        // Terminate the entry unless it already ends with the delimiter.
        len = out.length();
        if (len != 0 && out.charAt(len - 1) != ID_DELIM) {
            out.append(ID_DELIM);
        }
    }
    return out.toString();
}
/**
* Implements {@link Transliterator#handleTransliterate}.
*/

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/RuleBasedTransliterator.java,v $
* $Date: 2001/09/19 17:43:37 $
* $Revision: 1.43 $
* $Date: 2001/09/21 21:24:04 $
* $Revision: 1.44 $
*
*****************************************************************************************
*/
@ -279,12 +279,16 @@ import com.ibm.text.resources.ResourceReader;
* <p>Copyright (c) IBM Corporation 1999-2000. All rights reserved.</p>
*
* @author Alan Liu
* @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.43 $ $Date: 2001/09/19 17:43:37 $
* @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.44 $ $Date: 2001/09/21 21:24:04 $
*/
public class RuleBasedTransliterator extends Transliterator {
private Data data;
// Indicator for ID blocks
private static final String ID_TOKEN = "::";
private static final int ID_TOKEN_LEN = 2;
private static final String COPYRIGHT =
"\u00A9 IBM Corporation 1999. All rights reserved.";
@ -334,6 +338,31 @@ public class RuleBasedTransliterator extends Transliterator {
return new Parser(rules, direction).getData();
}
/**
 * Parses a rule string that may contain ::ID entries in addition to
 * ordinary rules, and reports the pieces to the caller.
 * @param rules the complete rule text
 * @param direction the direction handed to the parser
 * @param idBlockResult receives the parser's ::ID block; any previous
 * content is discarded first
 * @param idSplitPointResult element 0 receives the parser's
 * idSplitPoint, i.e. the string index within the ID block at which
 * the parsed rules belong
 * @return the parsed rule data, or null if no actual rules were
 * parsed (the text held only ::ID entries, or nothing at all)
 */
static Data parse(String rules,
                  int direction,
                  StringBuffer idBlockResult,
                  int[] idSplitPointResult) {
    String[] ruleLines = { rules };
    Parser ruleParser = new Parser(ruleLines, direction);

    idBlockResult.setLength(0);
    idBlockResult.append(ruleParser.idBlock);
    idSplitPointResult[0] = ruleParser.idSplitPoint;

    if (ruleParser.ruleCount == 0) {
        return null;
    }
    return ruleParser.getData();
}
/**
* Implements {@link Transliterator#handleTransliterate}.
*/
@ -474,15 +503,27 @@ public class RuleBasedTransliterator extends Transliterator {
private static class Parser {
/**
* Current rule being parsed.
*/
private String rules;
private int direction;
private Data data;
// In a compound RBT, the index at which the RBT rules are
// inserted into the ID block. Index 0 means before any IDs
// in the block. Index idBlock.length() means after all IDs
// in the block. Index is a string index.
int idSplitPoint;
// The block of ::IDs, both at the top and at the bottom.
// Inserted into these may be additional rules at the
// idSplitPoint.
String idBlock;
// The number of rules parsed. This tells us if there were
// any actual transliterator rules, or if there were just ::ID
// block IDs.
int ruleCount;
/**
* This class implements the SymbolTable interface. It is used
* during parsing to give UnicodeSet access to variables that
@ -717,6 +758,8 @@ public class RuleBasedTransliterator extends Transliterator {
* rules
*/
private void parseRules(RuleBody ruleArray) {
ruleCount = 0;
determineVariableRange(ruleArray);
setVariablesVector = new Vector();
parseData = new ParseData();
@ -725,6 +768,16 @@ public class RuleBasedTransliterator extends Transliterator {
int errorCount = 0;
ruleArray.reset();
StringBuffer idBlockResult = new StringBuffer();
idSplitPoint = -1;
// The mode marks whether we are in the header ::id block, the
// rule block, or the footer ::id block.
// mode == 0: start: rule->1, ::id->0
// mode == 1: in rules: rule->1, ::id->2
// mode == 2: in footer ::id block: rule->ERROR, ::id->2
int mode = 0;
main:
for (;;) {
String rule = ruleArray.nextLine();
@ -754,10 +807,54 @@ public class RuleBasedTransliterator extends Transliterator {
// at once. We keep parsing rules even after a failure, up
// to a specified limit, and report all errors at once.
try {
// We've found the start of a rule. c is its first
// character, and pos points past c. Lexically parse the
// rule into component pieces.
pos = parseRule(rule, --pos, limit);
// We've found the start of a rule or ID. c is its first
// character, and pos points past c.
--pos;
// Look for an ID token. Must have at least ID_TOKEN_LEN + 1
// chars left.
if ((pos + ID_TOKEN_LEN + 1) <= limit &&
rule.regionMatches(pos, ID_TOKEN, 0, ID_TOKEN_LEN)) {
pos += ID_TOKEN_LEN;
c = rule.charAt(pos);
while (UCharacter.isWhitespace(c) && pos < limit) {
++pos;
c = rule.charAt(pos);
}
int[] p = new int[] { pos };
boolean[] sawDelim = new boolean[1];
StringBuffer regenID = new StringBuffer();
Transliterator.parseID(rule, regenID, p, sawDelim, direction, false);
if (p[0] == pos || !sawDelim[0]) {
// Invalid ::id
int i1 = pos + 2;
while (i1 < rule.length() && rule.charAt(i1) != ';') {
++i1;
}
throw new IllegalArgumentException("Invalid ::ID " +
rule.substring(pos, i1));
} else {
if (mode == 1) {
mode = 2;
idSplitPoint = idBlockResult.length();
}
String str = rule.substring(pos, p[0]);
idBlockResult.append(str);
if (!sawDelim[0]) {
idBlockResult.append(';');
}
pos = p[0];
}
} else {
// Parse a rule
pos = parseRule(rule, pos, limit);
++ruleCount;
if (mode == 2) {
// ::id in illegal position (because a rule
// occurred after the ::id footer block)
throw new IllegalArgumentException("::ID in illegal position");
}
mode = 1;
}
} catch (IllegalArgumentException e) {
if (errorCount == 30) {
errors.append("\nMore than 30 errors; further messages squelched");
@ -774,6 +871,8 @@ public class RuleBasedTransliterator extends Transliterator {
}
}
idBlock = idBlockResult.toString();
// Convert the set vector to an array
data.setVariables = new UnicodeSet[setVariablesVector.size()];
setVariablesVector.copyInto(data.setVariables);
@ -1480,6 +1579,9 @@ public class RuleBasedTransliterator extends Transliterator {
/**
* $Log: RuleBasedTransliterator.java,v $
* Revision 1.44 2001/09/21 21:24:04 alan
* jitterbug 64: allow ::ID blocks in rules
*
* Revision 1.43 2001/09/19 17:43:37 alan
* jitterbug 60: initial implementation of toRules()
*

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/Transliterator.java,v $
* $Date: 2001/09/20 21:20:39 $
* $Revision: 1.39 $
* $Date: 2001/09/21 21:24:04 $
* $Revision: 1.40 $
*
*****************************************************************************************
*/
@ -241,7 +241,7 @@ import com.ibm.util.CaseInsensitiveString;
* <p>Copyright &copy; IBM Corporation 1999. All rights reserved.
*
* @author Alan Liu
* @version $RCSfile: Transliterator.java,v $ $Revision: 1.39 $ $Date: 2001/09/20 21:20:39 $
* @version $RCSfile: Transliterator.java,v $ $Revision: 1.40 $ $Date: 2001/09/21 21:24:04 $
*/
public abstract class Transliterator {
/**
@ -832,12 +832,52 @@ public abstract class Transliterator {
* NullTransliterator, if it contains ID blocks which parse as
* empty for the given direction.
*/
public static final Transliterator createFromRules(String ID, String rules, int direction) {
public static final Transliterator createFromRules(String ID, String rules, int dir) {
// TODO Flesh this out
return new RuleBasedTransliterator(ID, rules, direction, null);
//// return new RuleBasedTransliterator(ID, rules, direction, null);
StringBuffer idBlock = new StringBuffer();
int[] idSplitPoint = new int[] { -1 };
RuleBasedTransliterator.Data data = null;
data = RuleBasedTransliterator.parse(rules, dir,
idBlock, idSplitPoint);
// NOTE: The logic here matches that in TransliteratorRegistry.
if (idBlock.length() == 0) {
if (data == null) {
// No idBlock, no data -- this is just an
// alias for Null
return new NullTransliterator();
} else {
// No idBlock, data != 0 -- this is an
// ordinary RBT_DATA.
return new RuleBasedTransliterator(ID, data, null);
}
} else {
if (data == null) {
// idBlock, no data -- this is an alias
Transliterator t = getInstance(idBlock.toString(), dir);
if (t != null) {
t.setID(ID);
}
return t;
} else {
// idBlock and data -- this is a compound
// RBT
Transliterator t = new RuleBasedTransliterator("_", data, null);
t = new CompoundTransliterator(ID, idBlock.toString(), idSplitPoint[0],
t);
return t;
}
}
}
/**
 * Returns the rule-source form of this transliterator. This
 * implementation does nothing but delegate to baseToRules.
 * @param escapeUnprintable forwarded unchanged to baseToRules.
 * NOTE(review): presumably selects escaping of unprintable
 * characters in the result -- confirm against baseToRules.
 * @return whatever baseToRules returns for the same argument
 */
public String toRules(boolean escapeUnprintable) {
    final String ruleText = baseToRules(escapeUnprintable);
    return ruleText;
}
protected final String baseToRules(boolean escapeUnprintable) {
// The base class implementation of toRules munges the ID into
// the correct format. That is: foo => ::foo
// KEEP in sync with rbt_pars
@ -860,13 +900,13 @@ public abstract class Transliterator {
* 'result' at which the splitTrans is stored, or -1 if
* splitTrans == null
*/
private static void parseCompoundID(String id,
StringBuffer regenID,
int dir,
int idSplitPoint,
Transliterator adoptedSplitTrans,
Vector result,
int[] splitTransIndex) {
static void parseCompoundID(String id,
StringBuffer regenID,
int dir,
int idSplitPoint,
Transliterator splitTrans,
Vector result,
int[] splitTransIndex) {
regenID.setLength(0);
splitTransIndex[0] = -1;
int pos = 0;
@ -874,10 +914,10 @@ public abstract class Transliterator {
while (pos < id.length()) {
// We compare (pos >= split), not (pos == split), so we can
// skip over whitespace (see below).
if (pos >= idSplitPoint && adoptedSplitTrans != null) {
if (pos >= idSplitPoint && splitTrans != null) {
splitTransIndex[0] = result.size();
result.addElement(adoptedSplitTrans);
adoptedSplitTrans = null;
result.addElement(splitTrans);
splitTrans = null;
}
int[] p = new int[] { pos };
boolean[] sawDelimiter = new boolean[1];
@ -903,10 +943,10 @@ public abstract class Transliterator {
}
// Handle case of idSplitPoint == id.length()
if (pos >= idSplitPoint && adoptedSplitTrans != null) {
if (pos >= idSplitPoint && splitTrans != null) {
splitTransIndex[0] = result.size();
result.addElement(adoptedSplitTrans);
adoptedSplitTrans = null;
result.addElement(splitTrans);
splitTrans = null;
}
}
@ -941,12 +981,12 @@ public abstract class Transliterator {
* determine if there was an error. Instead, check to see if pos
* moved.
*/
private static Transliterator parseID(String ID,
StringBuffer regenID,
int[] pos,
boolean[] sawDelimiter,
int dir,
boolean create) {
static Transliterator parseID(String ID,
StringBuffer regenID,
int[] pos,
boolean[] sawDelimiter,
int dir,
boolean create) {
int limit, preDelimLimit,
revStart, revLimit=0,
idStart, idLimit,