diff --git a/icu4j/src/com/ibm/demo/translit/Demo.java b/icu4j/src/com/ibm/demo/translit/Demo.java
new file mode 100755
index 00000000000..d02953d5036
--- /dev/null
+++ b/icu4j/src/com/ibm/demo/translit/Demo.java
@@ -0,0 +1,253 @@
+import java.applet.*;
+import java.awt.*;
+import java.awt.event.*;
+import java.util.*;
+import com.ibm.text.components.*;
+import com.ibm.text.*;
+
+/**
+ * A frame that allows the user to experiment with keyboard
+ * transliteration. This class has a main() method so it can be run
+ * as an application. The frame contains an editable text component
+ * and uses keyboard transliteration to process keyboard events.
+ *
+ *
+ * Copyright (c) IBM Corporation 1999. All rights reserved.
+ *
+ * @author Alan Liu
+ * @version $RCSfile: Demo.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
+ */
+public class Demo extends Frame {
+
+ static final boolean DEBUG = false;
+
+ Transliterator translit = null;
+
+ boolean compound = false;
+ Transliterator[] compoundTranslit = new Transliterator[MAX_COMPOUND];
+ static final int MAX_COMPOUND = 128;
+ int compoundCount = 0;
+
+ TransliteratingTextComponent text = null;
+
+ Menu translitMenu;
+ CheckboxMenuItem translitItem;
+ CheckboxMenuItem noTranslitItem;
+
+ static final String NO_TRANSLITERATOR = "None";
+
+ private static final String COPYRIGHT =
+ "\u00A9 IBM Corporation 1999. All rights reserved.";
+
+ /**
+  * Application entry point: opens a 600 by 200 demo frame and exits
+  * the JVM when the user closes the window.
+  *
+  * @param args command-line arguments (not read)
+  */
+ public static void main(String[] args) {
+     Frame demoFrame = new Demo(600, 200);
+     WindowAdapter exitOnClose = new WindowAdapter() {
+         public void windowClosing(WindowEvent closing) {
+             System.exit(0);
+         }
+     };
+     demoFrame.addWindowListener(exitOnClose);
+     demoFrame.setVisible(true);
+ }
+
+ /**
+  * Builds the demo frame: installs the menus, wires the window-close
+  * handler, and adds a transliterating text component pre-filled with
+  * four sample characters.
+  *
+  * @param width  width applied to both the text component and the frame
+  * @param height height applied to both the text component and the frame
+  */
+ public Demo(int width, int height) {
+     super("Transliteration Demo");
+
+     initMenus();
+
+     // Closing the window goes through the same handler as the Quit menu item.
+     WindowAdapter closeHandler = new WindowAdapter() {
+         public void windowClosing(WindowEvent closing) {
+             handleClose();
+         }
+     };
+     addWindowListener(closeHandler);
+
+     text = new TransliteratingTextComponent();
+     text.setFont(new Font("serif", Font.PLAIN, 48));
+     text.setSize(width, height);
+     text.setVisible(true);
+     text.setText("\u03B1\u05D0\u3042\u4E80");
+     add(text);
+
+     setSize(width, height);
+ }
+
+ private void initMenus() {
+ MenuBar mbar;
+ Menu menu;
+ MenuItem mitem;
+ CheckboxMenuItem citem;
+
+ setMenuBar(mbar = new MenuBar());
+ mbar.add(menu = new Menu("File"));
+ menu.add(mitem = new MenuItem("Quit"));
+ mitem.addActionListener(new ActionListener() {
+ public void actionPerformed(ActionEvent e) {
+ handleClose();
+ }
+ });
+
+ final ItemListener setTransliteratorListener = new ItemListener() {
+ public void itemStateChanged(ItemEvent e) {
+ CheckboxMenuItem item = (CheckboxMenuItem) e.getSource();
+ if (e.getStateChange() == ItemEvent.DESELECTED) {
+ // Don't let the current transliterator be deselected.
+ // Just reselect it.
+ item.setState(true);
+ } else if (compound) {
+ // Adding an item to a compound transliterator
+ handleAddToCompound(item.getLabel());
+ } else if (item != translitItem) {
+ // Deselect previous choice. Don't need to call
+ // setState(true) on new choice.
+ translitItem.setState(false);
+ translitItem = item;
+ handleSetTransliterator(item.getLabel());
+ }
+ }
+ };
+
+ translit = null;
+ mbar.add(translitMenu = new Menu("Transliterator"));
+ translitMenu.add(translitItem = noTranslitItem =
+ new CheckboxMenuItem(NO_TRANSLITERATOR, true));
+ noTranslitItem.addItemListener(new ItemListener() {
+ public void itemStateChanged(ItemEvent e) {
+ // Can't uncheck None -- any action here sets None to true
+ setNoTransliterator();
+ }
+ });
+
+ translitMenu.addSeparator();
+
+ translitMenu.add(citem = new CheckboxMenuItem("Compound"));
+ citem.addItemListener(new ItemListener() {
+ public void itemStateChanged(ItemEvent e) {
+ CheckboxMenuItem item = (CheckboxMenuItem) e.getSource();
+ if (e.getStateChange() == ItemEvent.DESELECTED) {
+ // If compound gets deselected, then select NONE
+ setNoTransliterator();
+ } else if (!compound) {
+ // Switching from non-compound to compound
+ translitItem.setState(false);
+ translitItem = item;
+ translit = null;
+ compound = true;
+ compoundCount = 0;
+ for (int i=0; i 0) {
+ v.setElementAt(b, i);
+ v.setElementAt(a, j);
+ a = b;
+ }
+ }
+ }
+ return v;
+ }
+
+ private void setNoTransliterator() {
+ translitItem = noTranslitItem;
+ noTranslitItem.setState(true);
+ handleSetTransliterator(noTranslitItem.getLabel());
+ compound = false;
+ for (int i=0; i.
+ */
+ private static Transliterator decodeTranslitItem(String name) {
+     // The "None" menu label maps to no transliterator at all.
+     if (name.equals(NO_TRANSLITERATOR)) {
+         return null;
+     }
+     // Any other label is looked up as a transliterator ID.
+     return Transliterator.getInstance(name);
+ }
+
+ /**
+  * Runs the current transliterator over the selected text, replaces the
+  * selection with the result, and leaves the replacement selected.
+  * Does nothing when no transliterator is set.
+  */
+ private void handleBatchTransliterate() {
+     if (translit == null) {
+         return;
+     }
+
+     int selStart = text.getSelectionStart();
+     int selEnd = text.getSelectionEnd();
+     String selected = text.getText().substring(selStart, selEnd);
+     ReplaceableString buffer = new ReplaceableString(selected);
+
+     // The trace is only built when the DEBUG flag is on.
+     StringBuffer trace = null;
+     if (DEBUG) {
+         trace = new StringBuffer();
+         trace.append('"' + buffer.toString() + "\" (start " + selStart +
+                      ", end " + selEnd + ") -> \"");
+     }
+
+     translit.transliterate(buffer);
+     String converted = buffer.toString();
+
+     if (DEBUG) {
+         trace.append(converted + "\"");
+         System.out.println("Batch " + translit.getID() + ": " + trace.toString());
+     }
+
+     // The result may differ in length, so reselect using its own length.
+     text.replaceRange(converted, selStart, selEnd);
+     text.select(selStart, selStart + converted.length());
+ }
+
+ /**
+  * Disposes of this frame. Called from both the Quit menu item and the
+  * frame's own window-closing listener.
+  */
+ private void handleClose() {
+     this.dispose();
+ }
+}
diff --git a/icu4j/src/com/ibm/demo/translit/DemoApplet.java b/icu4j/src/com/ibm/demo/translit/DemoApplet.java
new file mode 100755
index 00000000000..21b256ebc26
--- /dev/null
+++ b/icu4j/src/com/ibm/demo/translit/DemoApplet.java
@@ -0,0 +1,62 @@
+
+import java.awt.*;
+import java.awt.event.*;
+import java.applet.*;
+import com.ibm.text.components.AppletFrame;
+
+/**
+ * A simple Applet that shows a button. When pressed, the button
+ * shows the Demo frame. This Applet is meant to be embedded
+ * in a web page.
+ *
+ * Copyright (c) IBM Corporation 1999. All rights reserved.
+ *
+ * @author Alan Liu
+ * @version $RCSfile: DemoApplet.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
+ */
+public class DemoApplet extends Applet {
+
+    /** The demo window currently open, or null when there is none. */
+    Demo frame = null;
+
+    private static final String COPYRIGHT =
+        "\u00A9 IBM Corporation 1999. All rights reserved.";
+
+    /**
+     * Runs the applet stand-alone by handing it to an AppletFrame created
+     * with the arguments 640 and 480 (presumably width and height).
+     *
+     * @param args command-line arguments (not read)
+     */
+    public static void main(String args[]) {
+        final DemoApplet applet = new DemoApplet();
+        new AppletFrame("Transliteration Demo", applet, 640, 480);
+    }
+
+    /**
+     * Adds the launch button and resizes the applet to the button's
+     * preferred size plus 10 in each dimension.
+     */
+    public void init() {
+        Button launcher = new Button("Transliteration Demo");
+        launcher.addActionListener(new ActionListener() {
+            public void actionPerformed(ActionEvent click) {
+                showDemoFrame();
+            }
+        });
+        add(launcher);
+
+        Dimension padded = launcher.getPreferredSize();
+        padded.width += 10;
+        padded.height += 10;
+        resize(padded);
+    }
+
+    /**
+     * Creates the demo frame if none is open, then shows it and brings it
+     * to the front.
+     */
+    private void showDemoFrame() {
+        if (frame == null) {
+            frame = new Demo(600, 200);
+            // Forget the frame once the user closes it so that the next
+            // button press builds a fresh one.
+            frame.addWindowListener(new WindowAdapter() {
+                public void windowClosing(WindowEvent we) {
+                    frame = null;
+                }
+            });
+        }
+        frame.setVisible(true);
+        frame.toFront();
+    }
+
+    /**
+     * Disposes of the demo frame, if one is open, when the applet stops.
+     */
+    public void stop() {
+        if (frame == null) {
+            return;
+        }
+        frame.dispose();
+        frame = null;
+    }
+}
diff --git a/icu4j/src/com/ibm/demo/translit/demo.bat b/icu4j/src/com/ibm/demo/translit/demo.bat
new file mode 100755
index 00000000000..88f63e3446f
--- /dev/null
+++ b/icu4j/src/com/ibm/demo/translit/demo.bat
@@ -0,0 +1,7 @@
+REM For best results, run the demo as an applet inside of Netscape
+REM with Bitstream Cyberbit installed.
+
+REM Set up your JDK 1.1.x path and classpath here:
+call JDK11
+REM Put translit.jar from the parent directory ahead of the existing classpath.
+set CLASSPATH=../translit.jar;%CLASSPATH%
+REM Launch the Demo class.
+javaw Demo
diff --git a/icu4j/src/com/ibm/demo/translit/demo.html b/icu4j/src/com/ibm/demo/translit/demo.html
new file mode 100755
index 00000000000..6327daf6504
--- /dev/null
+++ b/icu4j/src/com/ibm/demo/translit/demo.html
@@ -0,0 +1,8 @@
+
+
+Transliteration Demo
+
+
+
+
+
diff --git a/icu4j/src/com/ibm/icu/dev/demo/translit/Demo.java b/icu4j/src/com/ibm/icu/dev/demo/translit/Demo.java
new file mode 100755
index 00000000000..d02953d5036
--- /dev/null
+++ b/icu4j/src/com/ibm/icu/dev/demo/translit/Demo.java
@@ -0,0 +1,253 @@
+import java.applet.*;
+import java.awt.*;
+import java.awt.event.*;
+import java.util.*;
+import com.ibm.text.components.*;
+import com.ibm.text.*;
+
+/**
+ * A frame that allows the user to experiment with keyboard
+ * transliteration. This class has a main() method so it can be run
+ * as an application. The frame contains an editable text component
+ * and uses keyboard transliteration to process keyboard events.
+ *
+ * Copyright (c) IBM Corporation 1999. All rights reserved.
+ *
+ * @author Alan Liu
+ * @version $RCSfile: Demo.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
+ */
+public class Demo extends Frame {
+
+ static final boolean DEBUG = false;
+
+ Transliterator translit = null;
+
+ boolean compound = false;
+ Transliterator[] compoundTranslit = new Transliterator[MAX_COMPOUND];
+ static final int MAX_COMPOUND = 128;
+ int compoundCount = 0;
+
+ TransliteratingTextComponent text = null;
+
+ Menu translitMenu;
+ CheckboxMenuItem translitItem;
+ CheckboxMenuItem noTranslitItem;
+
+ static final String NO_TRANSLITERATOR = "None";
+
+ private static final String COPYRIGHT =
+ "\u00A9 IBM Corporation 1999. All rights reserved.";
+
+ /**
+  * Application entry point: opens a 600 by 200 demo frame and exits
+  * the JVM when the user closes the window.
+  *
+  * @param args command-line arguments (not read)
+  */
+ public static void main(String[] args) {
+     Frame demoFrame = new Demo(600, 200);
+     WindowAdapter exitOnClose = new WindowAdapter() {
+         public void windowClosing(WindowEvent closing) {
+             System.exit(0);
+         }
+     };
+     demoFrame.addWindowListener(exitOnClose);
+     demoFrame.setVisible(true);
+ }
+
+ /**
+  * Builds the demo frame: installs the menus, wires the window-close
+  * handler, and adds a transliterating text component pre-filled with
+  * four sample characters.
+  *
+  * @param width  width applied to both the text component and the frame
+  * @param height height applied to both the text component and the frame
+  */
+ public Demo(int width, int height) {
+     super("Transliteration Demo");
+
+     initMenus();
+
+     // Closing the window goes through the same handler as the Quit menu item.
+     WindowAdapter closeHandler = new WindowAdapter() {
+         public void windowClosing(WindowEvent closing) {
+             handleClose();
+         }
+     };
+     addWindowListener(closeHandler);
+
+     text = new TransliteratingTextComponent();
+     text.setFont(new Font("serif", Font.PLAIN, 48));
+     text.setSize(width, height);
+     text.setVisible(true);
+     text.setText("\u03B1\u05D0\u3042\u4E80");
+     add(text);
+
+     setSize(width, height);
+ }
+
+ private void initMenus() {
+ MenuBar mbar;
+ Menu menu;
+ MenuItem mitem;
+ CheckboxMenuItem citem;
+
+ setMenuBar(mbar = new MenuBar());
+ mbar.add(menu = new Menu("File"));
+ menu.add(mitem = new MenuItem("Quit"));
+ mitem.addActionListener(new ActionListener() {
+ public void actionPerformed(ActionEvent e) {
+ handleClose();
+ }
+ });
+
+ final ItemListener setTransliteratorListener = new ItemListener() {
+ public void itemStateChanged(ItemEvent e) {
+ CheckboxMenuItem item = (CheckboxMenuItem) e.getSource();
+ if (e.getStateChange() == ItemEvent.DESELECTED) {
+ // Don't let the current transliterator be deselected.
+ // Just reselect it.
+ item.setState(true);
+ } else if (compound) {
+ // Adding an item to a compound transliterator
+ handleAddToCompound(item.getLabel());
+ } else if (item != translitItem) {
+ // Deselect previous choice. Don't need to call
+ // setState(true) on new choice.
+ translitItem.setState(false);
+ translitItem = item;
+ handleSetTransliterator(item.getLabel());
+ }
+ }
+ };
+
+ translit = null;
+ mbar.add(translitMenu = new Menu("Transliterator"));
+ translitMenu.add(translitItem = noTranslitItem =
+ new CheckboxMenuItem(NO_TRANSLITERATOR, true));
+ noTranslitItem.addItemListener(new ItemListener() {
+ public void itemStateChanged(ItemEvent e) {
+ // Can't uncheck None -- any action here sets None to true
+ setNoTransliterator();
+ }
+ });
+
+ translitMenu.addSeparator();
+
+ translitMenu.add(citem = new CheckboxMenuItem("Compound"));
+ citem.addItemListener(new ItemListener() {
+ public void itemStateChanged(ItemEvent e) {
+ CheckboxMenuItem item = (CheckboxMenuItem) e.getSource();
+ if (e.getStateChange() == ItemEvent.DESELECTED) {
+ // If compound gets deselected, then select NONE
+ setNoTransliterator();
+ } else if (!compound) {
+ // Switching from non-compound to compound
+ translitItem.setState(false);
+ translitItem = item;
+ translit = null;
+ compound = true;
+ compoundCount = 0;
+ for (int i=0; i 0) {
+ v.setElementAt(b, i);
+ v.setElementAt(a, j);
+ a = b;
+ }
+ }
+ }
+ return v;
+ }
+
+ private void setNoTransliterator() {
+ translitItem = noTranslitItem;
+ noTranslitItem.setState(true);
+ handleSetTransliterator(noTranslitItem.getLabel());
+ compound = false;
+ for (int i=0; i.
+ */
+ private static Transliterator decodeTranslitItem(String name) {
+     // The "None" menu label maps to no transliterator at all.
+     if (name.equals(NO_TRANSLITERATOR)) {
+         return null;
+     }
+     // Any other label is looked up as a transliterator ID.
+     return Transliterator.getInstance(name);
+ }
+
+ /**
+  * Runs the current transliterator over the selected text, replaces the
+  * selection with the result, and leaves the replacement selected.
+  * Does nothing when no transliterator is set.
+  */
+ private void handleBatchTransliterate() {
+     if (translit == null) {
+         return;
+     }
+
+     int selStart = text.getSelectionStart();
+     int selEnd = text.getSelectionEnd();
+     String selected = text.getText().substring(selStart, selEnd);
+     ReplaceableString buffer = new ReplaceableString(selected);
+
+     // The trace is only built when the DEBUG flag is on.
+     StringBuffer trace = null;
+     if (DEBUG) {
+         trace = new StringBuffer();
+         trace.append('"' + buffer.toString() + "\" (start " + selStart +
+                      ", end " + selEnd + ") -> \"");
+     }
+
+     translit.transliterate(buffer);
+     String converted = buffer.toString();
+
+     if (DEBUG) {
+         trace.append(converted + "\"");
+         System.out.println("Batch " + translit.getID() + ": " + trace.toString());
+     }
+
+     // The result may differ in length, so reselect using its own length.
+     text.replaceRange(converted, selStart, selEnd);
+     text.select(selStart, selStart + converted.length());
+ }
+
+ /**
+  * Disposes of this frame. Called from both the Quit menu item and the
+  * frame's own window-closing listener.
+  */
+ private void handleClose() {
+     this.dispose();
+ }
+}
diff --git a/icu4j/src/com/ibm/icu/dev/demo/translit/DemoApplet.java b/icu4j/src/com/ibm/icu/dev/demo/translit/DemoApplet.java
new file mode 100755
index 00000000000..21b256ebc26
--- /dev/null
+++ b/icu4j/src/com/ibm/icu/dev/demo/translit/DemoApplet.java
@@ -0,0 +1,62 @@
+
+import java.awt.*;
+import java.awt.event.*;
+import java.applet.*;
+import com.ibm.text.components.AppletFrame;
+
+/**
+ * A simple Applet that shows a button. When pressed, the button
+ * shows the Demo frame. This Applet is meant to be embedded
+ * in a web page.
+ *
+ * Copyright (c) IBM Corporation 1999. All rights reserved.
+ *
+ * @author Alan Liu
+ * @version $RCSfile: DemoApplet.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
+ */
+public class DemoApplet extends Applet {
+
+    /** The demo window currently open, or null when there is none. */
+    Demo frame = null;
+
+    private static final String COPYRIGHT =
+        "\u00A9 IBM Corporation 1999. All rights reserved.";
+
+    /**
+     * Runs the applet stand-alone by handing it to an AppletFrame created
+     * with the arguments 640 and 480 (presumably width and height).
+     *
+     * @param args command-line arguments (not read)
+     */
+    public static void main(String args[]) {
+        final DemoApplet applet = new DemoApplet();
+        new AppletFrame("Transliteration Demo", applet, 640, 480);
+    }
+
+    /**
+     * Adds the launch button and resizes the applet to the button's
+     * preferred size plus 10 in each dimension.
+     */
+    public void init() {
+        Button launcher = new Button("Transliteration Demo");
+        launcher.addActionListener(new ActionListener() {
+            public void actionPerformed(ActionEvent click) {
+                showDemoFrame();
+            }
+        });
+        add(launcher);
+
+        Dimension padded = launcher.getPreferredSize();
+        padded.width += 10;
+        padded.height += 10;
+        resize(padded);
+    }
+
+    /**
+     * Creates the demo frame if none is open, then shows it and brings it
+     * to the front.
+     */
+    private void showDemoFrame() {
+        if (frame == null) {
+            frame = new Demo(600, 200);
+            // Forget the frame once the user closes it so that the next
+            // button press builds a fresh one.
+            frame.addWindowListener(new WindowAdapter() {
+                public void windowClosing(WindowEvent we) {
+                    frame = null;
+                }
+            });
+        }
+        frame.setVisible(true);
+        frame.toFront();
+    }
+
+    /**
+     * Disposes of the demo frame, if one is open, when the applet stops.
+     */
+    public void stop() {
+        if (frame == null) {
+            return;
+        }
+        frame.dispose();
+        frame = null;
+    }
+}
diff --git a/icu4j/src/com/ibm/icu/dev/demo/translit/demo.bat b/icu4j/src/com/ibm/icu/dev/demo/translit/demo.bat
new file mode 100755
index 00000000000..88f63e3446f
--- /dev/null
+++ b/icu4j/src/com/ibm/icu/dev/demo/translit/demo.bat
@@ -0,0 +1,7 @@
+REM For best results, run the demo as an applet inside of Netscape
+REM with Bitstream Cyberbit installed.
+
+REM Set up your JDK 1.1.x path and classpath here:
+call JDK11
+REM Put translit.jar from the parent directory ahead of the existing classpath.
+set CLASSPATH=../translit.jar;%CLASSPATH%
+REM Launch the Demo class.
+javaw Demo
diff --git a/icu4j/src/com/ibm/icu/dev/demo/translit/demo.html b/icu4j/src/com/ibm/icu/dev/demo/translit/demo.html
new file mode 100755
index 00000000000..6327daf6504
--- /dev/null
+++ b/icu4j/src/com/ibm/icu/dev/demo/translit/demo.html
@@ -0,0 +1,8 @@
+
+
+Transliteration Demo
+
+
+
+
+
diff --git a/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java b/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java
new file mode 100755
index 00000000000..96433f64a26
--- /dev/null
+++ b/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java
@@ -0,0 +1,763 @@
+import com.ibm.text.*;
+import java.text.*;
+import java.util.*;
+
+/**
+ * @test
+ * @summary General test of Transliterator
+ */
+public class TransliteratorTest extends IntlTest {
+
+ public static void main(String[] args) throws Exception {
+ new TransliteratorTest().run(args);
+ }
+
+ /**
+ * A CommonPoint legacy round-trip test for the Kana transliterator.
+ */
+// public void TestKanaRoundTrip() {
+// Transliterator t = Transliterator.getInstance("Kana");
+// StringTokenizer tok = new StringTokenizer(KANA_RT_DATA);
+// while (tok.hasMoreTokens()) {
+// String str = tok.nextToken();
+// ReplaceableString tmp = new ReplaceableString(str);
+// t.transliterate(tmp, Transliterator.FORWARD);
+//
+// str = tmp.toString();
+// tmp = new ReplaceableString(str);
+// t.transliterate(tmp, Transliterator.REVERSE);
+// t.transliterate(tmp, Transliterator.FORWARD);
+// if (!tmp.toString().equals(str)) {
+// tmp = new ReplaceableString(str);
+// t.transliterate(tmp, Transliterator.REVERSE);
+// String a = tmp.toString();
+// t.transliterate(tmp, Transliterator.FORWARD);
+// errln("FAIL: " + escape(str) + " -> " +
+// escape(a) + " -> " + escape(tmp.toString()));
+// }
+// }
+// }
+
+ /**
+  * Requests every available transliterator ID twice and reports an error
+  * if both requests return the same object, then checks that a bogus ID
+  * is rejected with IllegalArgumentException. The elapsed time is logged.
+  */
+ public void TestInstantiation() {
+     long startMillis = System.currentTimeMillis();
+     String ID;
+     Enumeration ids = Transliterator.getAvailableIDs();
+     while (ids.hasMoreElements()) {
+         ID = (String) ids.nextElement();
+         try {
+             Transliterator first = Transliterator.getInstance(ID);
+             // A second request is expected to yield a distinct instance.
+             Transliterator second = Transliterator.getInstance(ID);
+             if (first == second) {
+                 errln("FAIL: " + ID + " returned identical instances");
+             } else {
+                 logln(ID + ":" + first);
+             }
+         } catch (IllegalArgumentException ex) {
+             // Record which ID failed, then let the exception propagate.
+             errln("FAIL: " + ID);
+             throw ex;
+         }
+     }
+
+     // Failure path: getInstance is expected to throw for this ID.
+     try {
+         ID = "";
+         Transliterator bogus = Transliterator.getInstance(ID);
+         errln("FAIL: " + ID + " returned " + bogus);
+     } catch (IllegalArgumentException ex) {
+         logln("OK: Bogus ID handled properly");
+     }
+
+     long elapsedMillis = System.currentTimeMillis() - startMillis;
+     logln("Elapsed time: " + elapsedMillis + " ms");
+ }
+
+ /**
+  * Checks three small rule sets: cursor repositioning with two rules,
+  * a longer chain of four rules, and rules that use variables and a
+  * category set.
+  */
+ public void TestSimpleRules() {
+     /* Example: rules 1. ab>x|y
+      *                2. yc>z
+      *
+      * []|eabcd   start - no match, copy e to translated buffer
+      * [e]|abcd   match rule 1 - copy output & adjust cursor
+      * [ex|y]cd   match rule 2 - copy output & adjust cursor
+      * [exz]|d    no match, copy d to transliterated buffer
+      * [exzd]|    done
+      */
+     String cursorRules =
+         "ab>x|y\n" +
+         "yc>z";
+     expect(cursorRules, "eabcd", "exzd");
+
+     /* Another set of rules:
+      * 1. ab>x|yzacw
+      * 2. za>q
+      * 3. qc>r
+      * 4. cw>n
+      *
+      * []|ab       Rule 1
+      * [x|yzacw]   No match
+      * [xy|zacw]   Rule 2
+      * [xyq|cw]    Rule 4
+      * [xyqn]|     Done
+      */
+     String chainRules =
+         "ab>x|yzacw\n" +
+         "za>q\n" +
+         "qc>r\n" +
+         "cw>n";
+     expect(chainRules, "ab", "xyqn");
+
+     /* Test categories
+      */
+     String categoryRules =
+         "dummy=\uE100\n" +
+         "vowel=[aeiouAEIOU]\n" +
+         "lu=[:Lu:]\n" +
+         "{vowel}[{lu}>!\n" +
+         "{vowel}>&\n" +
+         "!]{lu}>^\n" +
+         "{lu}>*\n" +
+         "a>ERROR";
+     Transliterator categories = new RuleBasedTransliterator("", categoryRules);
+     expect(categories, "abcdefgABCDEFGU", "&bcd&fg!^**!^*&");
+ }
+
+ // Restore this test if/when it's been deciphered. In general,
+ // tests that depend on a specific transliterator are subject
+ // to the same fragility as tests that depend on resource data.
+
+// public void TestKana() {
+// String DATA[] = {
+// "a", "\u3042",
+// "A", "\u30A2",
+// "aA", "\u3042\u30A2",
+// "aaaa", "\u3042\u3042\u3042\u3042",
+// "akasata", "\u3042\u304B\u3055\u305F",
+// };
+//
+// Transliterator t = Transliterator.getInstance("Latin-Kana");
+// Transliterator rt = Transliterator.getInstance("Kana-Latin");
+// for (int i=0; izyx\n" +
+ "ab>yz\n" +
+ "bc>zx\n" +
+ "ca>xy\n" +
+ "a>x\n" +
+ "b>y\n" +
+ "c>z\n" +
+
+ "abc", RULES);
+ Transliterator rev = new RuleBasedTransliterator("", RULES,
+ RuleBasedTransliterator.REVERSE, null);
+ for (int i=0; i",
+ "psch>Y\n"
+ +"ps>y\n"
+ +"ch>x\n"
+ +"a>A\n");
+ String DATA[] = {
+ // insertion, buffer
+ "a", "A",
+ "p", "Ap",
+ "s", "Aps",
+ "c", "Apsc",
+ "a", "AycA",
+ "psch", "AycAY",
+ null, "AycAY", // null means finishKeyboardTransliteration
+ };
+
+ keyboardAux(t, DATA);
+ }
+
+ /**
+ * Basic test of keyboard with cursor.
+ */
+ public void TestKeyboard2() {
+ Transliterator t = new RuleBasedTransliterator("",
+ "ych>Y\n"
+ +"ps>|y\n"
+ +"ch>x\n"
+ +"a>A\n");
+ String DATA[] = {
+ // insertion, buffer
+ "a", "A",
+ "p", "Ap",
+ "s", "Ay",
+ "c", "Ayc",
+ "a", "AycA",
+ "p", "AycAp",
+ "s", "AycAy",
+ "c", "AycAyc",
+ "h", "AycAY",
+ null, "AycAY", // null means finishKeyboardTransliteration
+ };
+
+ keyboardAux(t, DATA);
+ }
+
+ /**
+  * Test keyboard transliteration with back-replacement.
+  */
+ public void TestKeyboard3() {
+     // Desired behavior: "th" ends up as "z" while a lone "t" ends up as
+     // "y". When typing, "t" first becomes "y"; a following "h" then
+     // turns that "y" plus "h" into "z".
+     String[] steps = {
+         // Column 1: characters added to the buffer (as if typed)
+         // Column 2: expected buffer contents after keyboard
+         //           transliteration.
+         "a", "a",
+         "b", "ab",
+         "t", "aby",
+         "c", "abyc",
+         "t", "abycy",
+         "h", "abycz",
+         null, "abycz", // null means finishKeyboardTransliteration
+     };
+
+     String rules =
+         "t>|y\n" +
+         "yh>z\n" +
+         "";
+
+     keyboardAux(new RuleBasedTransliterator("", rules), steps);
+ }
+
+ private void keyboardAux(Transliterator t, String[] DATA) {
+ int[] index = {0, 0, 0};
+ ReplaceableString s = new ReplaceableString();
+ for (int i=0; i ");
+ t.keyboardTransliterate(s, index, DATA[i]);
+ } else {
+ log = new StringBuffer(s.toString() + " => ");
+ t.finishKeyboardTransliteration(s, index);
+ }
+ String str = s.toString();
+ // Show the start index '{' and the cursor '|'
+ log.append(str.substring(0, index[Transliterator.START])).
+ append('{').
+ append(str.substring(index[Transliterator.START],
+ index[Transliterator.CURSOR])).
+ append('|').
+ append(str.substring(index[Transliterator.CURSOR]));
+ if (str.equals(DATA[i+1])) {
+ logln(log.toString());
+ } else {
+ errln("FAIL: " + log.toString() + ", expected " + DATA[i+1]);
+ }
+ }
+ }
+
+ public void TestArabic() {
+ String DATA[] = {
+ "Arabic", "\u062a\u062a\u0645\u062a\u0639\u0020"+
+ "\u0627\u0644\u0644\u063a\u0629\u0020"+
+ "\u0627\u0644\u0639\u0631\u0628\u0628\u064a\u0629\u0020"+
+ "\u0628\u0628\u0646\u0638\u0645\u0020"+
+ "\u0643\u062a\u0627\u0628\u0628\u064a\u0629\u0020"+
+ "\u062c\u0645\u064a\u0644\u0629",
+ };
+
+ Transliterator t = Transliterator.getInstance("Latin-Arabic");
+ for (int i=0; i", trans);
+
+ expect(t, "aaaaa", "aaaaa");
+ }
+
+ /**
+  * Compose the hex transliterators forward and reverse: Unicode-Hex
+  * followed by Hex-Unicode must leave plain text unchanged, and
+  * Hex-Unicode followed by Unicode-Hex must leave hex-escaped text
+  * unchanged.
+  */
+ public void TestCompoundHex() {
+     Transliterator toHex = Transliterator.getInstance("Unicode-Hex");
+     Transliterator fromHex = Transliterator.getInstance("Hex-Unicode");
+
+     Transliterator roundTrip =
+         new CompoundTransliterator("ab", new Transliterator[] { toHex, fromHex });
+     String plain = "abcde";
+     expect(roundTrip, plain, plain);
+
+     Transliterator reverseTrip =
+         new CompoundTransliterator("ba", new Transliterator[] { fromHex, toHex });
+     // Hex-escape the plain text first so the reverse pair has input to undo.
+     ReplaceableString hexForm = new ReplaceableString(plain);
+     toHex.transliterate(hexForm);
+     expect(reverseTrip, hexForm.toString(), hexForm.toString());
+ }
+
+ /**
+  * Do some basic tests of filtering: installs a filter on the
+  * Unicode-Hex transliterator that rejects the letter 'c' and checks
+  * that 'c' is left untouched while the other letters are hex-escaped.
+  */
+ public void TestFiltering() {
+     Transliterator hex = Transliterator.getInstance("Unicode-Hex");
+     // The filter accepts every character except 'c'.
+     hex.setFilter(new UnicodeFilter() {
+         public boolean isIn(char c) {
+             return c != 'c';
+         }
+     });
+     String s = "abcde";
+     String out = hex.transliterate(s);
+     String exp = "\\u0061\\u0062c\\u0064\\u0065";
+     if (out.equals(exp)) {
+         logln("Ok: \"" + exp + "\"");
+     } else {
+         // Report the mismatch with errln, as the other tests in this
+         // class do for failures; logln only logs and would let a wrong
+         // result pass silently.
+         errln("FAIL: \"" + out + "\", wanted \"" + exp + "\"");
+     }
+ }
+
+ //======================================================================
+ // Support methods
+ //======================================================================
+
+ /**
+  * Builds a rule-based transliterator from the given rules and checks
+  * that it turns source into expectedResult.
+  *
+  * @param rules          rule text handed to RuleBasedTransliterator
+  * @param source         input text
+  * @param expectedResult text the transliteration must produce
+  */
+ void expect(String rules, String source, String expectedResult) {
+     RuleBasedTransliterator ruleBased = new RuleBasedTransliterator("", rules);
+     expect(ruleBased, source, expectedResult);
+ }
+
+ /**
+  * Checks the forward transliteration and, when a reverse
+  * transliterator is supplied, also checks that it maps the expected
+  * result back to the source.
+  *
+  * @param t                     transliterator under test
+  * @param source                input text
+  * @param expectedResult        text the forward pass must produce
+  * @param reverseTransliterator transliterator for the return trip, or
+  *                              null to skip the reverse check
+  */
+ void expect(Transliterator t, String source, String expectedResult,
+             Transliterator reverseTransliterator) {
+     expect(t, source, expectedResult);
+     if (reverseTransliterator == null) {
+         return;
+     }
+     expect(reverseTransliterator, expectedResult, source);
+ }
+
+ void expect(Transliterator t, String source, String expectedResult) {
+ String result = t.transliterate(source);
+ expectAux(t.getID() + ":String", source, result, expectedResult);
+
+ ReplaceableString rsource = new ReplaceableString(source);
+ t.transliterate(rsource);
+ result = rsource.toString();
+ expectAux(t.getID() + ":Replaceable", source, result, expectedResult);
+
+ // Test keyboard (incremental) transliteration -- this result
+ // must be the same after we finalize (see below).
+ rsource.getStringBuffer().setLength(0);
+ int[] index = { 0, 0, 0 };
+ StringBuffer log = new StringBuffer();
+
+ for (int i=0; i ");
+ t.keyboardTransliterate(rsource, index,
+ String.valueOf(source.charAt(i)));
+ // Append the string buffer with a vertical bar '|' where
+ // the committed index is.
+ String s = rsource.toString();
+ log.append(s.substring(0, index[Transliterator.CURSOR])).
+ append('|').
+ append(s.substring(index[Transliterator.CURSOR]));
+ }
+
+ // As a final step in keyboard transliteration, we must call
+ // transliterate to finish off any pending partial matches that
+ // were waiting for more input.
+ t.finishKeyboardTransliteration(rsource, index);
+ result = rsource.toString();
+ log.append(" => ").append(rsource.toString());
+
+ expectAux(t.getID() + ":Keyboard", log.toString(),
+ result.equals(expectedResult),
+ expectedResult);
+ }
+
+ /**
+  * Compares result with expectedResult and reports the outcome with a
+  * summary of the form "source -> result".
+  *
+  * @param tag            label identifying the check being reported
+  * @param source         input text
+  * @param result         text actually produced
+  * @param expectedResult text that should have been produced
+  */
+ void expectAux(String tag, String source,
+                String result, String expectedResult) {
+     String summary = source + " -> " + result;
+     boolean matched = result.equals(expectedResult);
+     expectAux(tag, summary, matched, expectedResult);
+ }
+
+ /**
+  * Logs a passing check, or reports a failing one together with the
+  * expected text. Summary and expected text are passed through escape.
+  *
+  * @param tag            label identifying the check being reported
+  * @param summary        description of what was transliterated
+  * @param pass           true when the check succeeded
+  * @param expectedResult text that should have been produced
+  */
+ void expectAux(String tag, String summary, boolean pass,
+                String expectedResult) {
+     String label = "(" + tag + ") ";
+     if (!pass) {
+         errln("FAIL: " + label
+               + escape(summary)
+               + ", expected " + escape(expectedResult));
+         return;
+     }
+     logln(label + escape(summary));
+ }
+
+ /**
+ * Escape non-ASCII characters as Unicode.
+ */
+ public static final String escape(String s) {
+ StringBuffer buf = new StringBuffer();
+ for (int i=0; i= ' ' && c <= 0x007F) {
+ buf.append(c);
+ } else {
+ buf.append("\\u");
+ if (c < 0x1000) {
+ buf.append('0');
+ if (c < 0x100) {
+ buf.append('0');
+ if (c < 0x10) {
+ buf.append('0');
+ }
+ }
+ }
+ buf.append(Integer.toHexString(c));
+ }
+ }
+ return buf.toString();
+ }
+
+ /*
+ static final String KANA_RT_DATA =
+"a "+
+
+"ba bi bu be bo "+
+"bya byi byu bye byo "+
+"bba "+
+
+"da di du de do "+
+"dya dyi dyu dye dyo "+
+"dha dhi dhu dhe dho "+
+"dda "+
+
+"e "+
+
+"fa fi fe fo "+
+"fya fyu fyo "+
+"ffa "+
+
+"ga gi gu ge go "+
+"gya gyi gyu gye gyo "+
+"gwa gwi gwu gwe gwo "+
+"gga "+
+
+"ha hi hu he ho "+
+"hya hyi hyu hye hyo "+
+"hha "+
+
+"i "+
+
+"ka ki ku ke ko "+
+"kwa kwi kwu kwe kwo "+
+"kya kyi kyu kye kyo "+
+"kka "+
+
+"ma mi mu me mo "+
+"mya myi myu mye myo "+
+"mba mfa mma mpa mva "+
+"m'' "+
+
+"na ni nu ne no "+
+"nya nyi nyu nye nyo "+
+"nn n'' n "+
+
+"o "+
+
+"pa pi pu pe po "+
+"pya pyi pyu pye pyo "+
+"ppa "+
+
+"qa qi qu qe qo "+
+"qya qyi qyu qye qyo "+
+"qqa "+
+
+"ra ri ru re ro "+
+"rya ryi ryu rye ryo "+
+"rra "+
+
+"sa si su se so "+
+"sya syi syu sye syo "+
+"ssya ssa "+
+
+"ta ti tu te to "+
+"tha thi thu the tho "+
+"tsa tsi tse tso "+
+"tya tyi tyu tye tyo "+
+"ttsa "+
+"tta "+
+
+"u "+
+
+"va vi vu ve vo "+
+"vya vyi vyu vye vyo "+
+"vva "+
+
+"wa wi we wo "+
+"wwa "+
+
+"ya yu ye yo "+
+"yya "+
+
+"za zi zu ze zo "+
+"zya zyi zyu zye zyo "+
+"zza "+
+
+"xa xi xu xe xo "+
+"xka xke "+
+"xtu "+
+"xwa "+
+"xya xyu xyo "+
+
+ "akka akki akku akke akko "+
+ "akkya akkyu akkyo "+
+
+ "atta atti attu atte atto "+
+ "attya attyu attyo "+
+ "adda addi addu adde addo "+
+
+ "atcha atchi atchu atche atcho "+
+
+ "assa assi assu asse asso "+
+ "assya assyu assyo "+
+
+ "ahha ahhi ahhu ahhe ahho "+
+ "appa appi appu appe appo "+
+
+ "an "+
+ "ana ani anu ane ano "+
+ "anna anni annu anne anno "+
+ "an'a an'i an'u an'e an'o "+
+
+ "annna annni annnu annne annno "+
+ "an'na an'ni an'nu an'ne an'no "+
+
+ "anka anki anku anke anko "+
+ "anga angi angu ange ango "+
+
+ "ansa ansi ansu anse anso "+
+ "anza anzi anzu anze anzo "+
+ "anzya anzyu anzyo "+
+
+ "anta anti antu ante anto "+
+ "antya antyu antyo "+
+ "anda andi andu ande ando "+
+
+ "ancha anchi anchu anche ancho "+
+ "anja anji anju anje anjo "+
+ "antsa antsu antso "+
+
+ "anpa anpi anpu anpe anpo "+
+ "ampa ampi ampu ampe ampo "+
+
+ "anba anbi anbu anbe anbo "+
+ "amba ambi ambu ambe ambo "+
+
+ "anma anmi anmu anme anmo "+
+ "amma ammi ammu amme ammo "+
+
+ "anwa anwi anwu anwe anwo "+
+
+ "anha anhi anhu anhe anho "+
+
+ "anya anyi anyu anye anyo "+
+ "annya annyi annyu annye annyo "+
+ "an'ya an'yi an'yu an'ye an'yo "+
+
+ "kkk "+
+ "ggg "+
+ "sss "+
+ "zzz "+
+ "ttt "+
+ "ddd "+
+ "nnn "+
+ "hhh "+
+ "bbb "+
+ "ppp "+
+ "mmm "+
+ "yyy "+
+ "rrr "+
+ "www ";
+*/
+
+ /*+
+
+ "A I U E O "+
+ "XA XI XU XE XO "+
+
+ "KA KI KU KE KO "+
+ "KYA KYI KYU KYE KYO "+
+ "KWA KWI KWU KWE KWO "+
+ "QA QI QU QE QO "+
+ "QYA QYI QYU QYE QYO "+
+ "XKA XKE "+
+
+ "GA GI GU GE GO "+
+ "GYA GYI GYU GYE GYO "+
+ "GWA GWI GWU GWE GWO "+
+
+ "SA SI SU SE SO "+
+ "SHA SHI SHU SHE SHO "+
+ "SYA SYI SYU SYE SYO "+
+
+ "ZA ZI ZU ZE ZO "+
+ "ZYA ZYI ZYU ZYE ZYO "+
+ "JA JI JU JE JO "+
+ "JYA JYU JYO "+
+
+ "TA TI TU TE TO "+
+ "XTU XTSU "+
+ "TYA TYU TYO "+
+ "CYA CYU CYO "+
+ "CHA CHI CHU CHE CHO "+
+ "TSA TSI TSU TSE TSO "+
+ "DA DI DU DE DO "+
+ "DYA DYU DYO "+
+ "THA THI THU THE THO "+
+ "DHA DHI DHU DHE DHO "+
+
+ "NA NI NU NE NO "+
+ "NYA NYU NYO "+
+
+ "HA HI HU HE HO "+
+ "HYA HYU HYO "+
+ "FA FI FU FE FO "+
+ "FYA FYU FYO "+
+ "BA BI BU BE BO "+
+ "BYA BYU BYO "+
+ "PA PI PU PE PO "+
+ "PYA PYU PYO "+
+
+ "MA MI MU ME MO "+
+ "MYA MYU MYO "+
+ "YA YI YU YE YO "+
+ "XYA XYI XYU XYE XYO "+
+
+ "RA RI RU RE RO "+
+ "LA LI LU LE LO "+
+ "RYA RYI RYU RYE RYO "+
+ "LYA LYI LYU LYE LYO "+
+
+ "WA WI WU WE WO "+
+ "VA VI VU VE VO "+
+ "VYA VYU VYO "+
+
+ "CYA CYI CYU CYE CYO "+
+
+ "NN "+
+ "N' "+
+ "N "+
+
+ "AKKA AKKI AKKU AKKE AKKO "+
+ "AKKYA AKKYU AKKYO "+
+
+ "ATTA ATTI ATTU ATTE ATTO "+
+ "ATTYA ATTYU ATTYO "+
+ "ADDA ADDI ADDU ADDE ADDO "+
+
+ "ATCHA ATCHI ATCHU ATCHE ATCHO "+
+
+ "ASSA ASSI ASSU ASSE ASSO "+
+ "ASSYA ASSYU ASSYO "+
+
+ "AHHA AHHI AHHU AHHE AHHO "+
+ "APPA APPI APPU APPE APPO "+
+
+ "AN "+
+ "ANA ANI ANU ANE ANO "+
+ "ANNA ANNI ANNU ANNE ANNO "+
+ "AN'A AN'I AN'U AN'E AN'O "+
+
+ "ANNNA ANNNI ANNNU ANNNE ANNNO "+
+ "AN'NA AN'NI AN'NU AN'NE AN'NO "+
+
+ "ANKA ANKI ANKU ANKE ANKO "+
+ "ANGA ANGI ANGU ANGE ANGO "+
+
+ "ANSA ANSI ANSU ANSE ANSO "+
+ "ANZA ANZI ANZU ANZE ANZO "+
+ "ANZYA ANZYU ANZYO "+
+
+ "ANTA ANTI ANTU ANTE ANTO "+
+ "ANTYA ANTYU ANTYO "+
+ "ANDA ANDI ANDU ANDE ANDO "+
+
+ "ANCHA ANCHI ANCHU ANCHE ANCHO "+
+ "ANJA ANJI ANJU ANJE ANJO "+
+ "ANTSA ANTSU ANTSO "+
+
+ "ANPA ANPI ANPU ANPE ANPO "+
+ "AMPA AMPI AMPU AMPE AMPO "+
+
+ "ANBA ANBI ANBU ANBE ANBO "+
+ "AMBA AMBI AMBU AMBE AMBO "+
+
+ "ANMA ANMI ANMU ANME ANMO "+
+ "AMMA AMMI AMMU AMME AMMO "+
+
+ "ANWA ANWI ANWU ANWE ANWO "+
+
+ "ANHA ANHI ANHU ANHE ANHO "+
+
+ "ANYA ANYI ANYU ANYE ANYO "+
+ "ANNYA ANNYI ANNYU ANNYE ANNYO "+
+ "AN'YA AN'YI AN'YU AN'YE AN'YO "+
+
+ "KKK "+
+ "GGG "+
+ "SSS "+
+ "ZZZ "+
+ "TTT "+
+ "DDD "+
+ "NNN "+
+ "HHH "+
+ "BBB "+
+ "PPP "+
+ "MMM "+
+ "YYY "+
+ "RRR "+
+ "WWW";*/
+}
diff --git a/icu4j/src/com/ibm/icu/dev/test/translit/UnicodeSetTest.java b/icu4j/src/com/ibm/icu/dev/test/translit/UnicodeSetTest.java
new file mode 100755
index 00000000000..8417faf4b44
--- /dev/null
+++ b/icu4j/src/com/ibm/icu/dev/test/translit/UnicodeSetTest.java
@@ -0,0 +1,118 @@
+import com.ibm.text.*;
+import java.text.*;
+import java.util.*;
+
+/**
+ * General test of {@link UnicodeSet}: pattern parsing, complement, and
+ * adding/removing ranges and whole sets. Every check compares the string
+ * returned by <code>getPairs()</code> with an expected literal.
+ * NOTE(review): the expected literals read like concatenated inclusive
+ * range endpoints (e.g. "az" after <code>add('a', 'z')</code>) -- confirm
+ * against <code>UnicodeSet.getPairs()</code>.
+ *
+ * @test
+ * @summary General test of UnicodeSet
+ */
+public class UnicodeSetTest extends IntlTest {
+
+    /**
+     * Command-line entry point; hands the arguments to <code>run()</code>
+     * (presumably inherited from <code>IntlTest</code>).
+     */
+    public static void main(String[] args) throws Exception {
+        new UnicodeSetTest().run(args);
+    }
+
+    /**
+     * Applies several patterns (using '&amp;', '-', a literal or escaped
+     * '-', and nesting) to one set and checks the resulting pair list,
+     * then complements the set left by the last pattern and checks that.
+     */
+    public void TestPatterns() {
+        UnicodeSet set = new UnicodeSet();
+        expectPattern(set, "[[a-m]&[d-z]&[k-y]]", "km");
+        expectPattern(set, "[[a-z]-[m-y]-[d-r]]", "aczz");
+        expectPattern(set, "[a\\-z]", "--aazz");
+        expectPattern(set, "[-az]", "--aazz");
+        expectPattern(set, "[az-]", "--aazz");
+        expectPattern(set, "[[[a-z]-[aeiou]i]]", "bdfnptvz");
+
+        // Throw in a test of complement
+        set.complement();
+        String exp = '\u0000' + "aeeoouu" + (char)('z'+1) + '\uFFFF';
+        expectPairs(set, exp);
+    }
+
+    /**
+     * Adds and removes character ranges on one set, checking the pair list
+     * after every step, then exercises <code>removeAll</code> and
+     * <code>addAll</code> with a second set (in both operand orders).
+     */
+    public void TestAddRemove() {
+        UnicodeSet set = new UnicodeSet();
+        set.add('a', 'z');
+        expectPairs(set, "az");
+        set.remove('m', 'p');
+        expectPairs(set, "alqz");
+        set.remove('e', 'g');
+        expectPairs(set, "adhlqz");
+        set.remove('d', 'i');
+        expectPairs(set, "acjlqz");
+        set.remove('c', 'r');
+        expectPairs(set, "absz");
+        set.add('f', 'q');
+        expectPairs(set, "abfqsz");
+        set.remove('a', 'g');
+        expectPairs(set, "hqsz");
+        set.remove('a', 'z');
+        expectPairs(set, "");
+
+        // Try removing an entire set from another set
+        expectPattern(set, "[c-x]", "cx");
+        UnicodeSet set2 = new UnicodeSet();
+        expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
+        set.removeAll(set2);
+        expectPairs(set, "deluxx");
+
+        // Try adding an entire set to another set
+        expectPattern(set, "[jackiemclean]", "aacceein");
+        expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
+        set.addAll(set2);
+        expectPairs(set, "aacehort");
+
+        // Test commutativity
+        expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
+        expectPattern(set2, "[jackiemclean]", "aacceein");
+        set.addAll(set2);
+        expectPairs(set, "aacehort");
+    }
+
+    /**
+     * Applies <code>pattern</code> to <code>set</code> and reports an error
+     * if the resulting pair list differs from <code>expectedPairs</code>;
+     * otherwise logs the successful result.
+     * @param set the set to modify; its previous contents are replaced by
+     * whatever <code>applyPattern</code> produces
+     * @param pattern the pattern to apply
+     * @param expectedPairs the expected value of <code>set.getPairs()</code>
+     */
+    void expectPattern(UnicodeSet set,
+                       String pattern,
+                       String expectedPairs) {
+        set.applyPattern(pattern);
+        if (!set.getPairs().equals(expectedPairs)) {
+            errln("FAIL: applyPattern(\"" + pattern +
+                  "\") => pairs \"" +
+                  escape(set.getPairs()) + "\", expected \"" +
+                  escape(expectedPairs) + "\"");
+        } else {
+            logln("Ok: applyPattern(\"" + pattern +
+                  "\") => pairs \"" +
+                  escape(set.getPairs()) + "\"");
+        }
+    }
+
+    /**
+     * Reports an error if <code>set.getPairs()</code> differs from
+     * <code>expectedPairs</code>. Nothing is logged on success.
+     * @param set the set to inspect (not modified here)
+     * @param expectedPairs the expected pair list
+     */
+    void expectPairs(UnicodeSet set, String expectedPairs) {
+        if (!set.getPairs().equals(expectedPairs)) {
+            errln("FAIL: Expected pair list \"" +
+                  escape(expectedPairs) + "\", got \"" +
+                  escape(set.getPairs()) + "\"");
+        }
+    }
+
+    /**
+     * Escape non-ASCII characters as Unicode. Characters from ' ' through
+     * 0x007F are copied unchanged; every other character (including
+     * control characters below ' ') is written as a backslash, the letter
+     * 'u', and four hexadecimal digits.
+     * @param s the string to escape
+     * @return the escaped string
+     */
+    static final String escape(String s) {
+        StringBuffer buf = new StringBuffer();
+        for (int i=0; i<s.length(); ++i) {
+            char c = s.charAt(i);
+            if (c >= ' ' && c <= 0x007F) {
+                buf.append(c);
+            } else {
+                buf.append("\\u");
+                // Integer.toHexString() does not zero-pad, so add the
+                // leading zeros needed to reach four digits.
+                if (c < 0x1000) {
+                    buf.append('0');
+                    if (c < 0x100) {
+                        buf.append('0');
+                        if (c < 0x10) {
+                            buf.append('0');
+                        }
+                    }
+                }
+                buf.append(Integer.toHexString(c));
+            }
+        }
+        return buf.toString();
+    }
+}
diff --git a/icu4j/src/com/ibm/icu/text/CompoundTransliterator.java b/icu4j/src/com/ibm/icu/text/CompoundTransliterator.java
new file mode 100755
index 00000000000..c3582237d42
--- /dev/null
+++ b/icu4j/src/com/ibm/icu/text/CompoundTransliterator.java
@@ -0,0 +1,285 @@
+package com.ibm.text;
+
+import java.util.Enumeration;
+import java.util.Vector;
+
+/**
+ * A transliterator that is composed of two or more other
+ * transliterator objects linked together. For example, if one
+ * transliterator transliterates from script A to script B, and
+ * another transliterates from script B to script C, the two may be
+ * combined to form a new transliterator from A to C.
+ *
+ * Composed transliterators may not behave as expected. For
+ * example, inverses may not combine to form the identity
+ * transliterator. See the class documentation for {@link
+ * Transliterator} for details.
+ *
+ * <p>If a non-null UnicodeFilter is applied to a
+ * CompoundTransliterator, it has the effect of being
+ * logically anded with the filter of each transliterator in
+ * the chain.
+ *
+ * <p>Copyright &copy; IBM Corporation 1999. All rights reserved.
+ *
+ * @author Alan Liu
+ * @version $RCSfile: CompoundTransliterator.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
+ */
+public class CompoundTransliterator extends Transliterator {
+
+ private static final boolean DEBUG = false;
+
+ private Transliterator[] trans;
+
+ private static final String COPYRIGHT =
+ "\u00A9 IBM Corporation 1999. All rights reserved.";
+
+    /**
+     * Creates a compound transliterator from an array of component
+     * transliterators. Any array length is accepted, including zero or
+     * one, although a useful compound transliterator has at least two
+     * components. The array is copied, so later changes to the caller's
+     * array do not affect this object.
+     * @param ID the identifier passed to the superclass constructor
+     * @param transliterators array of <code>Transliterator</code>
+     * objects
+     * @param filter the filter. Any character for which
+     * <code>filter.isIn()</code> returns <code>false</code> will not be
+     * altered by this transliterator. If <code>filter</code> is
+     * <code>null</code> then no filtering is applied.
+     */
+    public CompoundTransliterator(String ID, Transliterator[] transliterators,
+                                  UnicodeFilter filter) {
+        super(ID, filter);
+        int count = transliterators.length;
+        trans = new Transliterator[count];
+        for (int i = 0; i < count; ++i) {
+            trans[i] = transliterators[i];
+        }
+    }
+
+ /**
+ * Constructs a new compound transliterator given an array of
+ * transliterators, with no filter. The array of transliterators may be
+ * of any length, including zero or one, however, useful compound
+ * transliterators have at least two components. This simply delegates
+ * to the three-argument constructor with a <code>null</code> filter.
+ * @param ID the identifier, passed through unchanged
+ * @param transliterators array of <code>Transliterator</code>
+ * objects
+ */
+ public CompoundTransliterator(String ID, Transliterator[] transliterators) {
+ this(ID, transliterators, null);
+ }
+
+ /**
+ * Returns the number of transliterators in this chain, i.e. the length
+ * of the internal array copied at construction time.
+ * @return number of transliterators in this chain.
+ */
+ public int getCount() {
+ return trans.length;
+ }
+
+ /**
+ * Returns the transliterator at the given index in this chain.
+ * The stored object itself is returned, not a copy.
+ * @param index index into chain, from 0 to <code>getCount() - 1</code>
+ * @return transliterator at the given index
+ * @exception ArrayIndexOutOfBoundsException if <code>index</code> is
+ * outside that range (plain array access, no explicit check)
+ */
+ public Transliterator getTransliterator(int index) {
+ return trans[index];
+ }
+
+ /**
+ * Transliterates a segment of a string. Transliterator
API.
+ * @param text the string to be transliterated
+ * @param start the beginning index, inclusive; 0 <= start
+ * <= limit
.
+ * @param limit the ending index, exclusive; start <= limit
+ * <= text.length()
.
+ * @return the new limit index
+ */
+ public int transliterate(Replaceable text, int start, int limit) {
+ for (int i=0; i abca/u
+ * S C L S C L gl=f->a
+ *
+ * 2. upup, changes "x" to "XX"
+ *
+ * 4 7 a 4 7 a
+ * abca/u => abcAA/u
+ * S CL S C
+ * L gl=a->b
+ * 3. u-h, changes Unicode to hex
+ *
+ * 4 7 a 4 7 a d 0 3
+ * abcAA/u => abc/u0041/u0041/u
+ * S C L S C
+ * L gl=b->15
+ * 4. return
+ *
+ * 4 7 a d 0 3
+ * abc/u0041/u0041/u
+ * S C L
+ */
+
+ /**
+ * One more wrinkle. If there is a filter F for the compound
+ * transliterator as a whole, then we need to modify every
+ * non-null filter f in the chain to be f' = F & f. Then,
+ * when we're done, we restore the original filters.
+ *
+ * A possible future optimization is to change f to f' at
+ * construction time, but then if anyone else is using the
+ * transliterators in the chain outside of this context, they
+ * will get unexpected results.
+ */
+ UnicodeFilter F = getFilter();
+ UnicodeFilter[] f = null;
+ if (F != null) {
+ f = new UnicodeFilter[trans.length];
+ for (int i=0; i \""));
+ }
+
+ trans[i].handleKeyboardTransliterate(text, index);
+
+ if (DEBUG) {
+ System.out.println(escape(
+ substring(text, index[START], index[CURSOR]) + '|' +
+ substring(text, index[CURSOR], index[LIMIT]) +
+ '"'));
+ }
+
+ // Adjust overall limit for insertions/deletions
+ globalLimit += index[LIMIT] - limit;
+ limit = index[CURSOR]; // Move limit to end of committed text
+ }
+ // Cursor is good where it is -- where the last
+ // transliterator left it. Limit needs to be put back
+ // where it was, modulo adjustments for deletions/insertions.
+ index[LIMIT] = globalLimit;
+
+ } finally {
+ // Fixup the transliterator filters, if we had to modify them.
+ if (f != null) {
+ for (int i=0; ipreceding context.
+ * @return maximum number of preceding context characters this
+ * transliterator needs to examine
+ */
+    protected int getMaximumContextLength() {
+        // The compound needs as much preceding context as its most
+        // demanding component: take the maximum over the whole chain.
+        // An empty chain yields 0.
+        int max = 0;
+        for (int i=0; i<trans.length; ++i) {
+            int len = trans[i].getMaximumContextLength();
+            if (len > max) {
+                max = len;
+            }
+        }
+        return max;
+    }
+
+    /**
+     * DEBUG
+     * Builds a String from the characters of a Replaceable in the range
+     * <code>start</code> (inclusive) to <code>limit</code> (exclusive).
+     * Returns the empty string when <code>start &gt;= limit</code>.
+     */
+    private static final String substring(Replaceable str, int start, int limit) {
+        StringBuffer chars = new StringBuffer();
+        for (int pos = start; pos < limit; ++pos) {
+            chars.append(str.charAt(pos));
+        }
+        return chars.toString();
+    }
+
+    /**
+     * DEBUG
+     * Escapes non-ASCII characters as Unicode. Characters from ' ' through
+     * 0x007F are copied unchanged; every other character (including
+     * control characters below ' ') is written as a backslash, the letter
+     * 'u', and four hexadecimal digits.
+     * @param s the string to escape
+     * @return the escaped string
+     */
+    private static final String escape(String s) {
+        StringBuffer buf = new StringBuffer();
+        for (int i=0; i<s.length(); ++i) {
+            char c = s.charAt(i);
+            if (c >= ' ' && c <= 0x007F) {
+                buf.append(c);
+            } else {
+                buf.append("\\u");
+                // Integer.toHexString() does not zero-pad, so add the
+                // leading zeros needed to reach four digits.
+                if (c < 0x1000) {
+                    buf.append('0');
+                    if (c < 0x100) {
+                        buf.append('0');
+                        if (c < 0x10) {
+                            buf.append('0');
+                        }
+                    }
+                }
+                buf.append(Integer.toHexString(c));
+            }
+        }
+        return buf.toString();
+    }
+}
diff --git a/icu4j/src/com/ibm/icu/text/HexToUnicodeTransliterator.java b/icu4j/src/com/ibm/icu/text/HexToUnicodeTransliterator.java
new file mode 100755
index 00000000000..18673e15fe7
--- /dev/null
+++ b/icu4j/src/com/ibm/icu/text/HexToUnicodeTransliterator.java
@@ -0,0 +1,130 @@
+package com.ibm.text;
+import java.util.*;
+
+/**
+ * A transliterator that converts from hexadecimal Unicode
+ * escape sequences to the characters they represent. For example, "U+0040"
+ * and '&#92;u0040'. It recognizes the
+ * prefixes "U+", "u+", "&#92;U", and "&#92;u". Hex values may be
+ * upper- or lowercase. An escape is always exactly six characters long:
+ * a two-character prefix followed by four hex digits.
+ *
+ * <p>Copyright &copy; IBM Corporation 1999. All rights reserved.
+ *
+ * @author Alan Liu
+ * @version $RCSfile: HexToUnicodeTransliterator.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
+ */
+public class HexToUnicodeTransliterator extends Transliterator {
+ private static final String COPYRIGHT =
+ "\u00A9 IBM Corporation 1999. All rights reserved.";
+
+ /**
+ * Package accessible ID for this transliterator.
+ * NOTE(review): this field is not final, so other classes in the
+ * package can reassign it -- confirm whether that is intended.
+ */
+ static String _ID = "Hex-Unicode";
+
+ /**
+ * Constructs a transliterator with the ID {@link #_ID} and no filter.
+ */
+ public HexToUnicodeTransliterator() {
+ super(_ID, null);
+ }
+
+ /**
+ * Transliterates a segment of a string. <code>Transliterator</code> API.
+ * Implemented by running {@link #handleKeyboardTransliterate} over the
+ * whole segment with the cursor placed at <code>start</code>.
+ * @param text the string to be transliterated
+ * @param start the beginning index, inclusive; <code>0 <= start
+ * <= limit</code>.
+ * @param limit the ending index, exclusive; <code>start <= limit
+ * <= text.length()</code>.
+ * @return the new limit index
+ */
+ public int transliterate(Replaceable text, int start, int limit) {
+ // NOTE(review): the literal order { start, limit, cursor } presumably
+ // matches the START, LIMIT and CURSOR index constants -- confirm.
+ int[] offsets = { start, limit, start };
+ handleKeyboardTransliterate(text, offsets);
+ return offsets[LIMIT];
+ }
+
+ /**
+ * Implements {@link Transliterator#handleKeyboardTransliterate}.
+ * Scans from <code>offsets[CURSOR]</code> up to <code>offsets[LIMIT]</code>,
+ * replacing each complete six-character escape with the single character
+ * it denotes. On return <code>offsets[LIMIT]</code> and
+ * <code>offsets[CURSOR]</code> are updated; the other entries are not
+ * touched.
+ * @param text the text to modify in place
+ * @param offsets index array holding the cursor and limit positions
+ */
+ protected void handleKeyboardTransliterate(Replaceable text,
+ int[] offsets) {
+ /**
+ * Performs transliteration changing Unicode hexadecimal
+ * escapes to characters. For example, "U+0040" -> '@'. A fixed
+ * set of prefixes is recognized: "&#92;u", "&#92;U", "u+", "U+".
+ */
+ int cursor = offsets[CURSOR];
+ int limit = offsets[LIMIT];
+
+ // Last cursor position at which a full six-character escape still fits.
+ int maxCursor = limit - 6;
+ loop:
+ while (cursor <= maxCursor) {
+ // Examine the candidate window cursor..cursor+5 from its last
+ // character backwards, so a mismatch lets us skip ahead.
+ char c = filteredCharAt(text, cursor + 5);
+ int digit0 = Character.digit(c, 16);
+ if (digit0 < 0) {
+ // No escape can start at cursor. Jump to the earliest position
+ // at which the character just seen could belong to a prefix.
+ if (c == '\\') {
+ // A backslash can only be the first prefix character.
+ cursor += 5;
+ } else if (c == 'U' || c == 'u' || c == '+') {
+ // These may be the second prefix character, so the prefix
+ // could start one position earlier.
+ cursor += 4;
+ } else {
+ cursor += 6;
+ }
+ continue;
+ }
+
+ // Accumulates the 16-bit value; digit0 is the lowest nibble.
+ int u = digit0;
+
+ for (int i=4; i>=2; --i) {
+ c = filteredCharAt(text, cursor + i);
+ int digit = Character.digit(c, 16);
+ if (digit < 0) {
+ if (c == 'U' || c == 'u' || c == '+') {
+ // Possibly the second character of a prefix that starts at
+ // cursor + i - 1; resume the scan there.
+ cursor += i-1;
+ } else {
+ cursor += 6;
+ }
+ continue loop;
+ }
+ // i == 4 fills bits 4-7, i == 3 bits 8-11, i == 2 bits 12-15.
+ u |= digit << (4 * (5-i));
+ }
+
+ // Four hex digits found; now require a valid two-character prefix.
+ c = filteredCharAt(text, cursor);
+ char d = filteredCharAt(text, cursor + 1);
+ if (((c == 'U' || c == 'u') && d == '+')
+ || (c == '\\' && (d == 'U' || d == 'u'))) {
+
+ // At this point, we have a match; replace cursor..cursor+5
+ // with u.
+ text.replace(cursor, cursor+6, String.valueOf((char) u));
+ // Six characters became one, so everything after shifts left by 5.
+ limit -= 5;
+ maxCursor -= 5;
+
+ ++cursor;
+ } else {
+ cursor += 6;
+ }
+ }
+
+ offsets[LIMIT] = limit;
+ offsets[CURSOR] = cursor;
+ }
+
+ /**
+ * Returns the character at index <code>i</code> of <code>text</code>, or
+ * '&#92;uFFFF' if this transliterator has a filter and the filter's
+ * <code>isIn()</code> rejects that character. The substitute is neither a
+ * hex digit nor a prefix character, so a rejected character can never
+ * take part in a match.
+ */
+ private char filteredCharAt(Replaceable text, int i) {
+ char c;
+ UnicodeFilter filter = getFilter();
+ return (filter == null) ? text.charAt(i) :
+ (filter.isIn(c = text.charAt(i)) ? c : '\uFFFF');
+ }
+
+ /**
+ * Return the length of the longest context required by this transliterator.
+ * This is <em>preceding</em> context. This implementation always
+ * returns 0.
+ * @return maximum number of preceding context characters this
+ * transliterator needs to examine
+ */
+ protected int getMaximumContextLength() {
+ return 0;
+ }
+}
diff --git a/icu4j/src/com/ibm/icu/text/Replaceable.java b/icu4j/src/com/ibm/icu/text/Replaceable.java
new file mode 100755
index 00000000000..b4c8519689c
--- /dev/null
+++ b/icu4j/src/com/ibm/icu/text/Replaceable.java
@@ -0,0 +1,77 @@
+package com.ibm.text;
+
+/**
+ * <code>Replaceable</code> is an interface that supports the
+ * operation of replacing a substring with another piece of text.
+ * <code>Replaceable</code> is needed in order to change a piece of
+ * text while retaining style attributes. For example, if the string
+ * "the <b>bold</b> font" has range (4, 8) replaced with "strong",
+ * then it becomes "the <b>strong</b> font".
+ *
+ * <p>Copyright &copy; IBM Corporation 1999. All rights reserved.
+ *
+ * @author Alan Liu
+ * @version $RCSfile: Replaceable.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
+ */
+public interface Replaceable {
+ /**
+ * Return the number of characters in the text.
+ * @return number of characters in text
+ */
+ int length();
+
+ /**
+ * Return the character at the given offset into the text.
+ * @param offset an integer between 0 and <code>length()</code>-1
+ * inclusive
+ * @return character of text at given offset
+ */
+ char charAt(int offset);
+
+ /**
+ * Copies characters from this object into the destination
+ * character array. The first character to be copied is at index
+ * <code>srcStart</code>; the last character to be copied is at
+ * index <code>srcLimit-1</code> (thus the total number of
+ * characters to be copied is <code>srcLimit-srcStart</code>). The
+ * characters are copied into the subarray of <code>dst</code>
+ * starting at index <code>dstStart</code> and ending at index
+ * <code>dstStart + (srcLimit-srcStart) - 1</code>.
+ *
+ * @param srcStart the beginning index to copy, inclusive; <code>0
+ * <= srcStart <= srcLimit</code>.
+ * @param srcLimit the ending index to copy, exclusive;
+ * <code>srcStart <= srcLimit <= length()</code>.
+ * @param dst the destination array.
+ * @param dstStart the start offset in the destination array.
+ */
+ void getChars(int srcStart, int srcLimit, char dst[], int dstStart);
+
+ /**
+ * Replace a substring of this object with the given text.
+ * @param start the beginning index, inclusive; <code>0 <= start
+ * <= limit</code>.
+ * @param limit the ending index, exclusive; <code>start <= limit
+ * <= length()</code>.
+ * @param text the text to replace characters <code>start</code>
+ * to <code>limit - 1</code>
+ */
+ void replace(int start, int limit, String text);
+
+ /**
+ * Replace a substring of this object with the given text.
+ * @param start the beginning index, inclusive; <code>0 <= start
+ * <= limit</code>.
+ * @param limit the ending index, exclusive; <code>start <= limit
+ * <= length()</code>.
+ * @param chars the text to replace characters <code>start</code>
+ * to <code>limit - 1</code>
+ * @param charsStart the beginning index into <code>chars</code>,
+ * inclusive; <code>0 <= start <= limit</code>.
+ * @param charsLen the number of characters of <code>chars</code>.
+ */
+ void replace(int start, int limit, char[] chars,
+ int charsStart, int charsLen);
+ // Note: We use length rather than limit to conform to StringBuffer
+ // and System.arraycopy.
+}
diff --git a/icu4j/src/com/ibm/icu/text/ReplaceableString.java b/icu4j/src/com/ibm/icu/text/ReplaceableString.java
new file mode 100755
index 00000000000..d6a7df06db5
--- /dev/null
+++ b/icu4j/src/com/ibm/icu/text/ReplaceableString.java
@@ -0,0 +1,159 @@
+package com.ibm.text;
+
+/**
+ * <code>ReplaceableString</code> is an adapter class that implements the
+ * <code>Replaceable</code> API around an ordinary <code>StringBuffer</code>.
+ *
+ * <p><em>Note:</em> This class does not support attributes and is not
+ * intended for general use. Most clients will need to implement
+ * {@link Replaceable} in their text representation class.
+ *
+ * <p>Copyright &copy; IBM Corporation 1999. All rights reserved.
+ *
+ * @see Replaceable
+ * @author Alan Liu
+ * @version $RCSfile: ReplaceableString.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
+ */
+public class ReplaceableString implements Replaceable {
+    private StringBuffer buf;
+
+    private static final String COPYRIGHT =
+        "\u00A9 IBM Corporation 1999. All rights reserved.";
+
+    /**
+     * Creates an object whose initial contents are a copy of the given
+     * string.
+     * @param str initial contents
+     */
+    public ReplaceableString(String str) {
+        buf = new StringBuffer(str);
+    }
+
+    /**
+     * Creates an object that uses <code>buf</code> directly as its
+     * internal storage; no copy is made. The contents of <code>buf</code>
+     * at the time of construction are the initial contents.
+     * <em>Note!</em> Modifications to <code>buf</code> will modify this
+     * object, and vice versa.
+     * @param buf object to be used as internal storage
+     */
+    public ReplaceableString(StringBuffer buf) {
+        this.buf = buf;
+    }
+
+    /**
+     * Creates an empty object.
+     */
+    public ReplaceableString() {
+        buf = new StringBuffer();
+    }
+
+    /**
+     * Returns the current contents as a <code>String</code>.
+     * @return string contents of this object
+     */
+    public String toString() {
+        return buf.toString();
+    }
+
+    /**
+     * Returns the internal storage of this object, not a copy.
+     * <em>Note!</em> Any changes made to the returned object affect this
+     * object's contents, and vice versa.
+     * @return internal buffer used by this object
+     */
+    public StringBuffer getStringBuffer() {
+        return buf;
+    }
+
+    /**
+     * Returns the number of characters contained in this object.
+     * <code>Replaceable</code> API.
+     */
+    public int length() {
+        return buf.length();
+    }
+
+    /**
+     * Returns the character at the given position in this object.
+     * <code>Replaceable</code> API.
+     * @param offset offset into the contents, from 0 to
+     * <code>length()</code> - 1
+     */
+    public char charAt(int offset) {
+        return buf.charAt(offset);
+    }
+
+    /**
+     * Copies the characters <code>srcStart</code> (inclusive) through
+     * <code>srcLimit</code> (exclusive) of this object into
+     * <code>dst</code>, beginning at <code>dst[dstStart]</code>. In total
+     * <code>srcLimit-srcStart</code> characters are copied, the last one
+     * landing at index <code>dstStart + (srcLimit-srcStart) - 1</code>.
+     *
+     * @param srcStart the beginning index to copy, inclusive; <code>0
+     * <= srcStart <= srcLimit</code>.
+     * @param srcLimit the ending index to copy, exclusive;
+     * <code>srcStart <= srcLimit <= length()</code>.
+     * @param dst the destination array.
+     * @param dstStart the start offset in the destination array.
+     */
+    public void getChars(int srcStart, int srcLimit, char dst[], int dstStart) {
+        buf.getChars(srcStart, srcLimit, dst, dstStart);
+    }
+
+    /**
+     * Replaces zero or more characters with new characters.
+     * <code>Replaceable</code> API.
+     * @param start the beginning index, inclusive; <code>0 <= start
+     * <= limit</code>.
+     * @param limit the ending index, exclusive; <code>start <= limit
+     * <= length()</code>.
+     * @param text new text to replace characters <code>start</code> to
+     * <code>limit - 1</code>
+     */
+    public void replace(int start, int limit, String text) {
+        if (start == limit) {
+            // Empty range: a pure insertion, nothing is removed.
+            buf.insert(start, text);
+            return;
+        }
+        // Save what follows the replaced range, cut the buffer back to
+        // start, then rebuild it as: head + text + saved tail.
+        char[] suffix = copyFrom(limit);
+        buf.setLength(start);
+        buf.append(text);
+        if (suffix != null) {
+            buf.append(suffix);
+        }
+    }
+
+    /**
+     * Replaces a substring of this object with a range of a character
+     * array.
+     * @param start the beginning index, inclusive; <code>0 <= start
+     * <= limit</code>.
+     * @param limit the ending index, exclusive; <code>start <= limit
+     * <= length()</code>.
+     * @param chars the text to replace characters <code>start</code>
+     * to <code>limit - 1</code>
+     * @param charsStart the beginning index into <code>chars</code>,
+     * inclusive
+     * @param charsLen the number of characters of <code>chars</code>
+     * to use
+     */
+    public void replace(int start, int limit, char[] chars,
+                        int charsStart, int charsLen) {
+        char[] suffix = copyFrom(limit);
+        buf.setLength(start);
+        buf.append(chars, charsStart, charsLen);
+        if (suffix != null) {
+            buf.append(suffix);
+        }
+    }
+
+    /**
+     * Returns a copy of the buffer contents from <code>limit</code> to the
+     * end, or <code>null</code> when <code>limit</code> is at or beyond
+     * the end of the buffer.
+     */
+    private char[] copyFrom(int limit) {
+        int end = buf.length();
+        if (limit >= end) {
+            return null;
+        }
+        char[] suffix = new char[end - limit];
+        buf.getChars(limit, end, suffix, 0);
+        return suffix;
+    }
+}
diff --git a/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java b/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java
new file mode 100755
index 00000000000..4a433e9479d
--- /dev/null
+++ b/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java
@@ -0,0 +1,1187 @@
+package com.ibm.text;
+
+import java.util.Hashtable;
+import java.util.Vector;
+
+/**
+ * A transliterator that reads a set of rules in order to determine how to
+ * perform translations. Rules are stored in resource bundles indexed by name.
+ * Rules are separated by newline characters ('\n'); to include a literal
+ * newline, prefix it with a backslash ('\\\n'). Whitespace is significant. If
+ * the first character on a line is '#', the entire line is ignored as a
+ * comment.
+ *
+ * <p>Each set of rules consists of two groups, one forward, and one reverse.
+ * This is a convention that is not enforced; rules for one direction may be
+ * omitted, with the result that translations in that direction will not modify
+ * the source text.
+ *
+ * <p><b>Rule syntax</b>
+ *
+ * <p>Rule statements take one of the following forms:
+ *
+ * alefmadda=\u0622
+ *
+ * - Variable definition. The name on the left is
+ * assigned the character or expression on the right. Names may not
+ * contain any special characters (see list below). Duplicate names
+ * (including duplicates of simple variables or category names)
+ * cause an exception to be thrown. If the right hand side consists
+ * of one character, then the variable stands for that character.
+ * In this example, after this statement, instances of the left hand
+ * name surrounded by braces, "<code>{alefmadda}</code>",
+ * will be replaced by the Unicode character U+0622. If the
+ * right hand side is longer than one character, then it is
+ * interpreted as a character category expression; see below for
+ * details.
+ *
+ * softvowel=[eiyEIY]
+ *
+ * - Category definition. The name on the left is assigned
+ * to stand for a set of characters. The same rules for names of simple
+ * variables apply. After this statement, the left hand variable will be
+ * interpreted as indicating a set of characters in appropriate contexts. The
+ * pattern syntax defining sets of characters is defined by {@link UnicodeSet}.
+ * Examples of valid patterns are:
+ *
+ * <table>
+ *   <tr>
+ *     <td><code>[abc]</code></td>
+ *     <td>The set containing the characters 'a', 'b', and 'c'.</td>
+ *   </tr>
+ *   <tr>
+ *     <td><code>[^abc]</code></td>
+ *     <td>The set of all characters <em>except</em> 'a', 'b', and 'c'.</td>
+ *   </tr>
+ *   <tr>
+ *     <td><code>[A-Z]</code></td>
+ *     <td>The set of all characters from 'A' to 'Z' in Unicode order.</td>
+ *   </tr>
+ *   <tr>
+ *     <td><code>[:Lu:]</code></td>
+ *     <td>The set of Unicode uppercase letters. See
+ *     <a href="http://www.unicode.org">www.unicode.org</a>
+ *     for a complete list of categories and their two-letter codes.</td>
+ *   </tr>
+ *   <tr>
+ *     <td><code>[^a-z[:Lu:][:Ll:]]</code></td>
+ *     <td>The set of all characters <em>except</em> 'a' through 'z' and
+ *     uppercase or lowercase letters.</td>
+ *   </tr>
+ * </table>
+ *
+ * See {@link UnicodeSet} for more documentation and examples.
+ *
+ *
+ * ai>{alefmadda}
+ *
+ * - Forward translation rule. This rule states that the
+ * string on the left will be changed to the string on the right when
+ * performing forward transliteration.
+ *
+ * ai<{alefmadda}
+ *
+ * - Reverse translation rule. This rule states that the
+ * string on the right will be changed to the string on the left when
+ * performing reverse transliteration.
+ *
+ *
+ *
+ * Forward and reverse translation rules consist of a match
+ * pattern and an output string. The match pattern consists
+ * of literal characters, optionally preceded by context, and optionally
+ * followed by context. Context characters, like literal pattern characters,
+ * must be matched in the text being transliterated. However, unlike literal
+ * pattern characters, they are not replaced by the output text. For example,
+ * the pattern "<code>[abc]def</code>" indicates the characters
+ * "<code>def</code>" must be preceded by "<code>abc</code>" for a successful
+ * match. If there is a successful match, "<code>def</code>" will be replaced,
+ * but not "<code>abc</code>". The initial '<code>[</code>' is optional, so
+ * "<code>abc]def</code>" is equivalent to "<code>[abc]def</code>". Another
+ * example is "<code>123[456]</code>" (or "<code>123[456</code>") in which the
+ * literal pattern "<code>123</code>" must be followed by "<code>456</code>".
+ *
+ * <p>The output string of a forward or reverse rule consists of characters to
+ * replace the literal pattern characters. If the output string contains the
+ * character '<code>|</code>', this is taken to indicate the location of the
+ * cursor after replacement. The cursor is the point in the text
+ * at which the next replacement, if any, will be applied.
+ *
+ * <p><b>Example</b>
+ *
+ * <p>The following example rules illustrate many of the features of the rule
+ * language.
+ *
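+ * <table>
+ *   <tr><td>Rule 1.</td><td><code>abc]def&gt;x|y</code></td></tr>
+ *   <tr><td>Rule 2.</td><td><code>xyz&gt;r</code></td></tr>
+ *   <tr><td>Rule 3.</td><td><code>yz&gt;q</code></td></tr>
+ * </table>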
+ *
+ *
+ * <p>Applying these rules to the string "<code>adefabcdefz</code>" yields the
+ * following results:
+ *
+ *
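+ * <table>
+ *   <tr><td><code>|adefabcdefz</code></td>
+ *     <td>Initial state, no rules match. Advance cursor.</td></tr>
+ *   <tr><td><code>a|defabcdefz</code></td>
+ *     <td>Still no match. Rule 1 does not match because the preceding
+ *     context is not present.</td></tr>
+ *   <tr><td><code>ad|efabcdefz</code></td>
+ *     <td>Still no match. Keep advancing until there is a match...</td></tr>
+ *   <tr><td><code>ade|fabcdefz</code></td>
+ *     <td>...</td></tr>
+ *   <tr><td><code>adef|abcdefz</code></td>
+ *     <td>...</td></tr>
+ *   <tr><td><code>adefa|bcdefz</code></td>
+ *     <td>...</td></tr>
+ *   <tr><td><code>adefab|cdefz</code></td>
+ *     <td>...</td></tr>
+ *   <tr><td><code>adefabc|defz</code></td>
+ *     <td>Rule 1 matches; replace "<code>def</code>" with "<code>xy</code>"
+ *     and back up the cursor to before the '<code>y</code>'.</td></tr>
+ *   <tr><td><code>adefabcx|yz</code></td>
+ *     <td>Although "<code>xyz</code>" is present, rule 2 does not match
+ *     because the cursor is before the '<code>y</code>', not before the
+ *     '<code>x</code>'. Rule 3 does match. Replace "<code>yz</code>" with
+ *     "<code>q</code>".</td></tr>
+ *   <tr><td><code>adefabcxq|</code></td>
+ *     <td>The cursor is at the end; transliteration is complete.</td></tr>
+ * </table>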
+ *
+ *
+ * The order of rules is significant. If multiple rules may match at some
+ * point, the first matching rule is applied.
+ *
+ * <p>Forward and reverse rules may have an empty output string. Otherwise, an
+ * empty left or right hand side of any statement is a syntax error.
+ *
+ * <p>Single quotes are used to quote the special characters
+ * <code>=&gt;&lt;{}[]|</code>. To specify a single quote itself, inside or
+ * outside of quotes, use two single quotes in a row. For example, the rule
+ * "<code>'&gt;'&gt;o''clock</code>" changes the string "<code>&gt;</code>" to
+ * the string "<code>o'clock</code>".
+ *
+ * <p><b>Notes</b>
+ *
+ * <p>While a RuleBasedTransliterator is being built, it checks that the rules
+ * are added in proper order. For example, if the rule "a>x" is followed by the
+ * rule "ab>y", then the second rule will throw an exception. The reason is
+ * that the second rule can never be triggered, since the first rule always
+ * matches anything it matches. In other words, the first rule masks
+ * the second rule. There is a cost of O(n^2) to make this check; in real-world
+ * tests it appears to approximately double build time.
+ *
+ * <p>One optimization that can be made is to add a pragma to the rule language,
+ * "#pragma order", that turns off ordering checking. This pragma can then be
+ * added to all of our resource-based rules (after we build these once and
+ * determine that there are no ordering errors). I haven't made this change yet
+ * in the interests of keeping the code from getting too byzantine.
+ *
+ * <p>Copyright &copy; IBM Corporation 1999. All rights reserved.
+ *
+ * @author Alan Liu
+ * @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
+ */
+public class RuleBasedTransliterator extends Transliterator {
+ /**
+ * Direction constant passed to constructor to create a transliterator
+ * using the forward rules.
+ */
+ public static final int FORWARD = 0;
+
+ /**
+ * Direction constant passed to constructor to create a transliterator
+ * using the reverse rules.
+ */
+ public static final int REVERSE = 1;
+
+ private Data data;
+
+ static final boolean DEBUG = false;
+
+ private static final String COPYRIGHT =
+ "\u00A9 IBM Corporation 1999. All rights reserved.";
+
+    /**
+     * Builds a transliterator by parsing the given rule text for one
+     * direction.
+     * @param ID the identifier passed to the superclass constructor
+     * @param rules rules, separated by '\n'
+     * @param direction either FORWARD or REVERSE.
+     * @param filter the filter passed to the superclass constructor
+     * @exception IllegalArgumentException if rules are malformed
+     * or direction is invalid.
+     */
+    public RuleBasedTransliterator(String ID, String rules, int direction,
+                                   UnicodeFilter filter) {
+        super(ID, filter);
+        boolean knownDirection = (direction == FORWARD || direction == REVERSE);
+        if (!knownDirection) {
+            throw new IllegalArgumentException("Invalid direction");
+        }
+        data = parse(rules, direction);
+    }
+
+ /**
+ * Constructs a new transliterator from the given rules in the
+ * <code>FORWARD</code> direction, with no filter. Delegates to the
+ * four-argument constructor.
+ * @param ID the identifier, passed through unchanged
+ * @param rules rules, separated by '\n'
+ * @exception IllegalArgumentException if rules are malformed
+ * or direction is invalid.
+ */
+ public RuleBasedTransliterator(String ID, String rules) {
+ this(ID, rules, FORWARD, null);
+ }
+
+ // Package-private constructor that adopts already-parsed rule data
+ // instead of parsing rule text. The Data object is stored as given,
+ // not copied.
+ RuleBasedTransliterator(String ID, Data data, UnicodeFilter filter) {
+ super(ID, filter);
+ this.data = data;
+ }
+
+ // Parses rule text for one direction by constructing a Parser and
+ // returning the Data it produces. Used by the public constructor.
+ static Data parse(String rules, int direction) {
+ return new Parser(rules, direction).getData();
+ }
+
+ /**
+ * Transliterates a segment of a string. <code>Transliterator</code> API.
+ * @param text the string to be transliterated
+ * @param start the beginning index, inclusive; <code>0 <= start
+ * <= limit</code>.
+ * @param limit the ending index, exclusive; <code>start <= limit
+ * <= text.length()</code>.
+ * @param result buffer to receive the transliterated text; previous
+ * contents are discarded
+ */
+ public void transliterate(String text, int start, int limit,
+ StringBuffer result) {
+ /* In the following loop there is a virtual buffer consisting of the
+ * text transliterated so far followed by the untransliterated text. There is
+ * also a cursor, which may be in the already transliterated buffer or just
+ * before the untransliterated text.
+ *
+ * Example: rules 1. ab>x|y
+ * 2. yc>z
+ *
+ * []|eabcd start - no match, copy e to translated buffer
+ * [e]|abcd match rule 1 - copy output & adjust cursor
+ * [ex|y]cd match rule 2 - copy output & adjust cursor
+ * [exz]|d no match, copy d to transliterated buffer
+ * [exzd]| done
+ *
+ * cursor: an index into the virtual buffer, 0..result.length()-1.
+ * Matches take place at the cursor. If there is no match, the cursor
+ * is advanced, and one character is moved from the source text to the
+ * result buffer.
+ *
+ * start, limit: these designate the substring of the source text which
+ * has not been processed yet. The range of offsets is start..limit-1.
+ * At any moment the virtual buffer consists of result +
+ * text.substring(start, limit).
+ */
+ int cursor = 0;
+ result.setLength(0);
+ // Continue while source text remains or the cursor still sits inside
+ // the already-built result.
+ while (start < limit || cursor < result.length()) {
+ TransliterationRule r = data.ruleSet.findMatch(text, start, limit, result,
+ cursor, data.setVariables, getFilter());
+ if (DEBUG) {
+ StringBuffer buf = new StringBuffer(
+ result.toString() + '#' + text.substring(start, limit));
+ buf.insert(cursor <= result.length()
+ ? cursor : (cursor + 1),
+ '|');
+ System.err.print((r == null ? "nomatch:" : ("match:" + r + ", "))
+ + buf);
+ }
+
+ if (r == null) {
+ // Only pull a new source character when the cursor is at the end
+ // of result; otherwise it just steps over existing output.
+ if (cursor == result.length()) {
+ result.append(text.charAt(start++));
+ }
+ ++cursor;
+ } else {
+ // resultPad is length of result to right of cursor; >= 0
+ int resultPad = result.length() - cursor;
+ char[] tail = null;
+ if (r.getKeyLength() > resultPad) {
+ // The key extends past result into the source text; consume
+ // that part of the source.
+ start += r.getKeyLength() - resultPad;
+ } else if (r.getKeyLength() < resultPad) {
+ // The key ends inside result; save what follows the key so it
+ // can be re-appended after the rule output.
+ tail = new char[resultPad - r.getKeyLength()];
+ result.getChars(cursor + r.getKeyLength(), result.length(),
+ tail, 0);
+ }
+ // Rebuild result as: text before cursor + rule output + saved tail.
+ result.setLength(cursor);
+ result.append(r.getOutput());
+ if (tail != null) {
+ result.append(tail);
+ }
+ cursor += r.getCursorPos();
+ }
+
+ if (DEBUG) {
+ StringBuffer buf = new StringBuffer(
+ result.toString() + '#' + text.substring(start, limit));
+ buf.insert(cursor <= result.length()
+ ? cursor : (cursor + 1),
+ '|');
+ System.err.println(" => " + buf);
+ }
+ }
+ }
+
+ /**
+ * Transliterates a segment of a string. <code>Transliterator</code> API.
+ * @param text the string to be transliterated
+ * @param start the beginning index, inclusive; <code>0 &lt;= start
+ * &lt;= limit</code>.
+ * @param limit the ending index, exclusive; <code>start &lt;= limit
+ * &lt;= text.length()</code>.
+ * @return The new limit index
+ */
+ public int transliterate(Replaceable text, int start, int limit) {
+     /* A Replaceable is edited in place, so there is a single buffer
+      * rather than a source string plus a result buffer. start stays
+      * fixed; limit keeps designating the same logical end of the
+      * segment, so its numeric value shifts whenever a replacement
+      * changes the length of the text. The scan position walks from
+      * start to limit, and replacements happen underneath it.
+      *
+      * Example: rules 1. ab>x|y
+      *                2. yc>z
+      *
+      * |eabcd   no match at 'e', step forward
+      * e|abcd   rule 1 matches - rewrite text, reposition
+      * ex|ycd   rule 2 matches - rewrite text, reposition
+      * exz|d    no match at 'd', step forward
+      * exzd|    finished
+      */
+     int pos = start;
+     while (pos < limit) {
+         TransliterationRule rule = data.ruleSet.findMatch(
+             text, start, limit, pos, data.setVariables, getFilter());
+         if (rule != null) {
+             // Swap the matched key for the rule output, then move limit
+             // by the change in length and pos by the rule's cursor offset.
+             final int keyLength = rule.getKeyLength();
+             text.replace(pos, pos + keyLength, rule.getOutput());
+             limit += rule.getOutput().length() - keyLength;
+             pos += rule.getCursorPos();
+         } else {
+             // Nothing matches here; look at the next character.
+             ++pos;
+         }
+     }
+     return limit;
+ }
+
+ /**
+ * Implements {@link Transliterator#handleKeyboardTransliterate}.
+ */
+ protected void handleKeyboardTransliterate(Replaceable text,
+ int[] index) {
+ int start = index[START];
+ int limit = index[LIMIT];
+ int cursor = index[CURSOR];
+
+ if (DEBUG) {
+ System.out.print("\"" +
+ escape(rsubstring(text, start, cursor)) + '|' +
+ escape(rsubstring(text, cursor, limit)) + "\"");
+ }
+
+ boolean partial[] = new boolean[1];
+
+ while (cursor < limit) {
+ TransliterationRule r = data.ruleSet.findIncrementalMatch(
+ text, start, limit, cursor, data.setVariables, partial, getFilter());
+ /* If we match a rule then apply it by replacing the key
+ * with the rule output and repositioning the cursor
+ * appropriately. If we get a partial match, then we
+ * can't do anything without more text; return with the
+ * cursor at the current position. If we get null, then
+ * there is no match at this position, and we can advance
+ * the cursor.
+ */
+ if (r == null) {
+ if (partial[0]) {
+ break;
+ } else {
+ ++cursor;
+ }
+ } else {
+ text.replace(cursor, cursor + r.getKeyLength(), r.getOutput());
+ limit += r.getOutput().length() - r.getKeyLength();
+ cursor += r.getCursorPos();
+ }
+ }
+
+ if (DEBUG) {
+ System.out.println(" -> \"" +
+ escape(rsubstring(text, start, cursor)) + '|' +
+ escape(rsubstring(text, cursor, cursor)) + '|' +
+ escape(rsubstring(text, cursor, limit)) + "\"");
+ }
+
+ index[LIMIT] = limit;
+ index[CURSOR] = cursor;
+ }
+
+ /**
+ * Returns the length of the longest context required by this transliterator.
+ * This is preceding context.
+ * @return Maximum number of preceding context characters this
+ * transliterator needs to examine
+ */
+ protected int getMaximumContextLength() {
+ return data.ruleSet.getMaximumContextLength();
+ }
+
+
+ /**
+ * FOR DEBUGGING: Return a substring of a Replaceable.
+ */
+ private static String rsubstring(Replaceable r, int start, int limit) {
+     // Copy the characters at offsets start..limit-1 one at a time;
+     // the result is empty when start >= limit.
+     StringBuffer chars = new StringBuffer();
+     for (int i = start; i < limit; ++i) {
+         chars.append(r.charAt(i));
+     }
+     return chars.toString();
+ }
+
+ /**
+ * FOR DEBUGGING: Escape non-ASCII characters as Unicode.
+ */
+ private static final String escape(String s) {
+ StringBuffer buf = new StringBuffer();
+ for (int i=0; i= ' ' && c <= 0x007F) {
+ if (c == '\\') {
+ buf.append("\\\\"); // That is, "\\"
+ } else {
+ buf.append(c);
+ }
+ } else {
+ buf.append("\\u");
+ if (c < 0x1000) {
+ buf.append('0');
+ if (c < 0x100) {
+ buf.append('0');
+ if (c < 0x10) {
+ buf.append('0');
+ }
+ }
+ }
+ buf.append(Integer.toHexString(c));
+ }
+ }
+ return buf.toString();
+ }
+
+
+
+
+
+ static class Data {
+     /**
+      * Rule table. May be empty.
+      */
+     public TransliterationRuleSet ruleSet;
+
+     /**
+      * Variable name (String) -> variable (Character). When a name stands
+      * for a single literal character, that character is the value stored
+      * here. When a name stands for a UnicodeSet, the value stored here is
+      * a stand-in character instead: it is the key for a second lookup in
+      * setVariables, and it also represents the UnicodeSet inside the
+      * stored rules.
+      */
+     public Hashtable variableNames;
+
+     /**
+      * Stand-in character (Character) -> set (UnicodeSet). A variable
+      * that denotes a set of characters is first mapped from its name to
+      * a stand-in character in variableNames; that stand-in is then the
+      * key used here to find the actual UnicodeSet object. The stand-in
+      * is also what appears in the rule text to represent the set.
+      */
+     public Hashtable setVariables;
+
+     /**
+      * Creates a Data with two empty tables and a new, empty rule set.
+      */
+     public Data() {
+         this.variableNames = new Hashtable();
+         this.setVariables = new Hashtable();
+         this.ruleSet = new TransliterationRuleSet();
+     }
+ }
+
+
+
+
+
+
+ private static class Parser {
+ private String rules; // complete rule text being parsed; assigned by the constructor
+
+ private int direction; // compared against FORWARD / REVERSE in applyRule() to pick which rules to keep
+
+ private Data data; // receives the parsed rules and variable tables; returned by getData()
+
+ /**
+ * The next available stand-in for variables. This starts at some point in
+ * the private use area (discovered dynamically) and increments up toward
+ * <code>variableLimit</code>. At any point during parsing, available
+ * variables are <code>variableNext..variableLimit-1</code>.
+ */
+ private char variableNext;
+
+ /**
+ * The exclusive upper bound of the stand-ins available for variables.
+ * This is discovered dynamically. At any point during parsing, available
+ * variables are <code>variableNext..variableLimit-1</code>.
+ */
+ private char variableLimit;
+
+ // Operators
+ private static final char VARIABLE_DEF_OP = '='; // name=definition
+ private static final char FORWARD_RULE_OP = '>'; // match pattern > output pattern
+ private static final char REVERSE_RULE_OP = '<'; // output pattern < match pattern
+
+ private static final String OPERATORS = "=><"; // every operator character; searched for by parseRule()
+
+ // Other special characters
+ private static final char QUOTE = '\'';
+ private static final char VARIABLE_REF_OPEN = '{';
+ private static final char VARIABLE_REF_CLOSE = '}';
+ private static final char CONTEXT_OPEN = '['; // begins the following context in a match pattern
+ private static final char CONTEXT_CLOSE = ']'; // ends the preceding context in a match pattern
+ private static final char CURSOR_POS = '|';
+ private static final char RULE_COMMENT_CHAR = '#'; // a rule line starting with this is skipped by parseRules()
+
+ /**
+ * Specials must be quoted in rules to be used as literals.
+ * Specials may not occur in variable names.
+ */
+ private static final String SPECIALS = "'{}[]|#" + OPERATORS;
+
+ /**
+ * Specials that must be quoted in variable definitions.
+ */
+ private static final String DEF_SPECIALS = "'{}";
+
+ /**
+ * @param rules list of rules, separated by newline characters
+ * @exception IllegalArgumentException if there is a syntax error in the
+ * rules
+ */
+ public Parser(String rules, int direction) {
+ this.rules = rules;
+ this.direction = direction;
+ data = new Data();
+ parseRules();
+ }
+
+ public Data getData() {
+     // Returns the instance populated during construction, not a copy.
+     return this.data;
+ }
+
+ /**
+ * Parse the given string as a sequence of rules, separated by newline
+ * characters ('\n'), and cause this object to implement those rules. Any
+ * previous rules are discarded. Typically this method is called exactly
+ * once, during construction.
+ * @exception IllegalArgumentException if there is a syntax error in the
+ * rules
+ */
+ private void parseRules() {
+     determineVariableRange();
+
+     int n = rules.length();
+     int i = 0;
+     while (i < n) {
+         // The current rule runs from i up to the next newline.
+         int limit = rules.indexOf('\n', i);
+
+         // A newline directly preceded by a backslash does not end the
+         // rule; keep looking for the next newline.
+         while (limit > 0 && rules.charAt(limit-1) == '\\') {
+             limit = rules.indexOf('\n', limit+1);
+         }
+
+         // No further newline: the last rule runs to the end of the text.
+         if (limit == -1) {
+             limit = n;
+         }
+         // Skip over empty lines and line starting with #
+         if (limit > i && rules.charAt(i) != RULE_COMMENT_CHAR) {
+             applyRule(i, limit);
+         }
+         i = limit + 1;
+     }
+
+     // All rules have been added to the rule set.
+     data.ruleSet.freeze();
+ }
+
+ /**
+ * Parse the given substring as a rule, and append it to the rules currently
+ * represented in this object.
+ * @param start the beginning index, inclusive; <code>0 &lt;= start
+ * &lt;= limit</code>.
+ * @param limit the ending index, exclusive; <code>start &lt;= limit
+ * &lt;= rules.length()</code>.
+ * @exception IllegalArgumentException if there is a syntax error in the
+ * rules
+ */
+ private void applyRule(int start, int limit) {
+     /* How a rule is parsed: raw rule text carries two kinds of quoted
+      * matter -- variable references such as "{alpha}" and quotes such
+      * as "'<'" or "''". Both are resolved early: quotes are stripped to
+      * leave literal text, and each variable reference is replaced by a
+      * single character, which either stands for itself or serves as a
+      * stand-in for a set of characters defined by a variable-definition
+      * statement.
+      *
+      * Each rule is also split into a left-hand side, an operator, and a
+      * right-hand side. Neither side may be empty, except for the output
+      * pattern of a forward or reverse rule. The match pattern of a
+      * forward or reverse rule is further divided into optional
+      * antecontext, key, and optional postcontext, and the cursor
+      * indicator '|' is removed from the output pattern with its
+      * position recorded.
+      *
+      * Because quoting lets special characters appear anywhere in the
+      * final text, quote removal, variable resolution, and splitting
+      * have to happen together (which makes the rule language awkward
+      * to change). What comes out is plain unquoted text plus an integer
+      * cursor position, so UnicodeSet and TransliterationRule never have
+      * to know about quoting or variables.
+      */
+     StringBuffer lhs = new StringBuffer();
+     StringBuffer rhs = new StringBuffer();
+     StringBuffer before = new StringBuffer();
+     StringBuffer after = new StringBuffer();
+     int[] cursorPos = new int[1];
+
+     char operator = parseRule(start, limit, lhs, rhs,
+                               before, after, cursorPos);
+
+     if (operator == VARIABLE_DEF_OP) {
+         applyVariableDef(lhs.toString(), rhs.toString());
+     } else if (operator == FORWARD_RULE_OP && direction == FORWARD) {
+         // Forward rule: the left side is matched, the right side is output.
+         data.ruleSet.addRule(new TransliterationRule(
+                                  lhs.toString(), rhs.toString(),
+                                  before.toString(), after.toString(),
+                                  cursorPos[0]));
+     } else if (operator == REVERSE_RULE_OP && direction == REVERSE) {
+         // Reverse rule: the right side is matched, the left side is output.
+         data.ruleSet.addRule(new TransliterationRule(
+                                  rhs.toString(), lhs.toString(),
+                                  before.toString(), after.toString(),
+                                  cursorPos[0]));
+     }
+     // A rule written for the other direction is ignored.
+ }
+
+ /**
+ * Add a variable definition.
+ * @param name the name of the variable. It must not already be defined.
+ * @param pattern the value of the variable. It may be a single character
+ * or a pattern describing a character set.
+ * @exception IllegalArgumentException if there is a syntax error
+ */
+ private final void applyVariableDef(String name, String pattern) {
+     validateVariableName(name);
+     if (data.variableNames.get(name) != null) {
+         throw new IllegalArgumentException("Duplicate variable definition: "
+                                            + name + '=' + pattern);
+     }
+     // A check that rejected Unicode category names as reserved used to
+     // sit here; it is disabled.
+     final int patternLength = pattern.length();
+     if (patternLength < 1) {
+         throw new IllegalArgumentException("Variable definition missing: "
+                                            + name);
+     }
+     if (patternLength > 1) {
+         // More than one character: treat the pattern as a set and give
+         // it the next free stand-in character.
+         if (variableNext >= variableLimit) {
+             throw new RuntimeException("Private use variables exhausted");
+         }
+         Character standIn = new Character(variableNext++);
+         data.variableNames.put(name, standIn);
+         data.setVariables.put(standIn, new UnicodeSet(pattern));
+         return;
+     }
+     // Exactly one character: the variable is that literal character.
+     data.variableNames.put(name, new Character(pattern.charAt(0)));
+ }
+
+ /**
+ * Given a rule, parses it into three pieces: The left side, the right side,
+ * and the operator. Returns the operator. Quotes and variable references
+ * are resolved; the output text in all <code>StringBuffer</code> parameters
+ * is literal text. This method delegates to other parsing methods to
+ * handle the match pattern, output pattern, and other sub-patterns in the
+ * rule.
+ * @param start the beginning index, inclusive; <code>0 &lt;= start
+ * &lt;= limit</code>.
+ * @param limit the ending index, exclusive; <code>start &lt;= limit
+ * &lt;= rules.length()</code>.
+ * @param left left side of rule is appended to this buffer
+ * with the quotes removed and variables resolved
+ * @param right right side of rule is appended to this buffer
+ * with the quotes removed and variables resolved
+ * @param anteContext the preceding context of the match pattern,
+ * if there is one, is appended to this buffer
+ * @param postContext the following context of the match pattern,
+ * if there is one, is appended to this buffer
+ * @param cursorPos if there is a cursor in the output pattern, its
+ * offset is stored in <code>cursorPos[0]</code>
+ * @return The operator character, one of the characters in OPERATORS.
+ */
+ private char parseRule(int start, int limit,
+ StringBuffer left, StringBuffer right,
+ StringBuffer anteContext,
+ StringBuffer postContext,
+ int[] cursorPos) {
+ if (false) {
+ System.err.println("Parsing " + rules.substring(start, limit));
+ }
+ /* Parse the rule into three pieces -- left, operator, and right,
+ * parsing out quotes. The result is that left and right will have
+ * unquoted text. E.g., "gt<'>'" will have right = ">". Unquoted
+ * operators throw an exception. Two quotes inside or outside
+ * quotes indicates a quote literal. E.g., "o''clock" -> "o'clock".
+ */
+ int i = quotedIndexOf(rules, start, limit, OPERATORS);
+ if (i < 0) {
+ throw new IllegalArgumentException(
+ "Syntax error: "
+ + rules.substring(start, limit));
+ }
+ char c = rules.charAt(i);
+ switch (c) {
+ case FORWARD_RULE_OP:
+ if (i == start) {
+ throw new IllegalArgumentException(
+ "Empty left side: "
+ + rules.substring(start, limit));
+ }
+ parseMatchPattern(start, i, left, anteContext, postContext);
+ if (i != (limit-1)) {
+ parseOutputPattern(i+1, limit, right, cursorPos);
+ }
+ break;
+ case REVERSE_RULE_OP:
+ if (i == (limit-1)) {
+ throw new IllegalArgumentException(
+ "Empty right side: "
+ + rules.substring(start, limit));
+ }
+ if (i != start) {
+ parseOutputPattern(start, i, left, cursorPos);
+ }
+ parseMatchPattern(i+1, limit, right, anteContext, postContext);
+ break;
+ default:
+ if (i == start || i == (limit-1)) {
+ throw new IllegalArgumentException(
+ "Empty left or right side: "
+ + rules.substring(start, limit));
+ }
+ parseSubPattern(start, i, left);
+ parseDefPattern(i+1, limit, right);
+ break;
+ }
+ return c;
+ }
+
+ /**
+ * Parses the match pattern of a forward or reverse rule. Given the raw
+ * match pattern, return the match text and the context on both sides, if
+ * any. Resolves all quotes and variables.
+ * @param start the beginning index, inclusive; <code>0 &lt;= start
+ * &lt;= limit</code>.
+ * @param limit the ending index, exclusive; <code>start &lt;= limit
+ * &lt;= rules.length()</code>.
+ * @param text the key to be matched will be appended to this buffer
+ * @param anteContext the preceding context, if any, will be appended
+ * to this buffer.
+ * @param postContext the following context, if any, will be appended
+ * to this buffer.
+ */
+ private void parseMatchPattern(int start, int limit,
+ StringBuffer text,
+ StringBuffer anteContext,
+ StringBuffer postContext) {
+ if (start >= limit) {
+ throw new IllegalArgumentException(
+ "Empty expression in rule: "
+ + rules.substring(start, limit));
+ }
+ if (anteContext != null) {
+ // Ignore optional opening and closing context characters
+ if (rules.charAt(start) == CONTEXT_OPEN) {
+ ++start;
+ }
+ if (rules.charAt(limit-1) == CONTEXT_CLOSE) {
+ --limit;
+ }
+ // The four possibilities are:
+ // key
+ // anteContext]key
+ // anteContext]key[postContext
+ // key[postContext
+ int ante = quotedIndexOf(rules, start, limit, String.valueOf(CONTEXT_CLOSE));
+ int post = quotedIndexOf(rules, start, limit, String.valueOf(CONTEXT_OPEN));
+ if (ante >= 0 && post >= 0 && ante > post) {
+ throw new IllegalArgumentException(
+ "Syntax error in context specifier: "
+ + rules.substring(start, limit));
+ }
+ if (ante >= 0) {
+ parseSubPattern(start, ante, anteContext);
+ start = ante+1;
+ }
+ if (post >= 0) {
+ parseSubPattern(post+1, limit, postContext);
+ limit = post;
+ }
+ }
+ parseSubPattern(start, limit, text);
+ }
+
+ private final void parseSubPattern(int start, int limit,
+                                    StringBuffer text) {
+     // Ordinary sub-pattern: no cursor allowed, full set of specials.
+     final int[] noCursor = null;
+     parseSubPattern(start, limit, text, noCursor, SPECIALS);
+ }
+
+ /**
+ * Parse a variable definition sub pattern. This kind of sub
+ * pattern differs in the set of characters that are considered
+ * special. In particular, the '[' and ']' characters are not
+ * special, since these are used in UnicodeSet patterns.
+ */
+ private final void parseDefPattern(int start, int limit,
+                                    StringBuffer text) {
+     // Variable definition: no cursor allowed, and only DEF_SPECIALS
+     // have to be quoted.
+     final int[] noCursor = null;
+     parseSubPattern(start, limit, text, noCursor, DEF_SPECIALS);
+ }
+
+ /**
+ * Parses the output pattern of a forward or reverse rule. Given the
+ * output pattern, return the output text and the position of the cursor,
+ * if any. Resolves all quotes and variables.
+ * @param rules the string to be parsed
+ * @param start the beginning index, inclusive; 0 <= start
+ * <= limit
.
+ * @param limit the ending index, exclusive; start <= limit
+ * <= rules.length()
.
+ * @param text the output text will be appended to this buffer
+ * @param cursorPos if this parameter is not null, then cursorPos[0]
+ * will be set to the cursor position, or -1 if there is none. If this
+ * parameter is null, then cursors will be disallowed.
+ */
+ private final void parseOutputPattern(int start, int limit,
+                                       StringBuffer text,
+                                       int[] cursorPos) {
+     // Output pattern: same specials as any other sub-pattern, with the
+     // caller's cursorPos passed through to the general parser.
+     final String mustBeQuoted = SPECIALS;
+     parseSubPattern(start, limit, text, cursorPos, mustBeQuoted);
+ }
+
+ /**
+ * Parses a sub-pattern of a rule. Return the text and the position of the cursor,
+ * if any. Resolves all quotes and variables.
+ * @param rules the string to be parsed
+ * @param start the beginning index, inclusive; 0 <= start
+ * <= limit
.
+ * @param limit the ending index, exclusive; start <= limit
+ * <= rules.length()
.
+ * @param text the output text will be appended to this buffer
+ * @param cursorPos if this parameter is not null, then cursorPos[0]
+ * will be set to the cursor position, or -1 if there is none. If this
+ * parameter is null, then cursors will be disallowed.
+ * @param specials characters that must be quoted; typically either
+ * SPECIALS or DEF_SPECIALS.
+ */
+ private void parseSubPattern(int start, int limit,
+ StringBuffer text,
+ int[] cursorPos,
+ String specials) {
+ boolean inQuote = false; // presumably tracks whether the scan is inside a quoted run; its uses are not visible in this text
+
+ if (start >= limit) { // an empty sub-pattern is rejected outright
+ throw new IllegalArgumentException("Empty expression in rule");
+ }
+ if (cursorPos != null) {
+ cursorPos[0] = -1; // -1 means no cursor has been seen yet
+ }
+ for (int i=start; i= 0) { // NOTE(review): this line is truncated -- the rest of the loop header, the declaration of c, and the branches before the cursor check (presumably quote and variable-reference handling) are missing; restore from the original file
+ throw new IllegalArgumentException("Multiple cursors: "
+ + rules.substring(start, limit));
+ }
+ cursorPos[0] = text.length(); // the cursor sits after the text appended so far
+ } else if (specials.indexOf(c) >= 0) { // any other special character must have been quoted
+ throw new IllegalArgumentException("Unquoted special character: "
+ + rules.substring(start, limit));
+ } else {
+ text.append(c); // ordinary literal character
+ }
+ }
+ }
+
+ private static void validateVariableName(String name) {
+ if (indexOf(name, SPECIALS) >= 0) {
+ throw new IllegalArgumentException(
+ "Special character in variable name: "
+ + name);
+ }
+ }
+
+ /**
+ * Returns the single character value of the given variable name. Defined
+ * names are recognized.
+ *
+ * NO LONGER SUPPORTED:
+ * If a Unicode category name is given, a standard character variable
+ * in the range firstCategoryVariable to lastCategoryVariable is returned,
+ * with value firstCategoryVariable + n, where n is the category
+ * number.
+ * @exception IllegalArgumentException if the name is unknown.
+ */
+ private Character getVariableDef(String name) {
+     // Only names stored in data.variableNames are recognized; the old
+     // fallback to Unicode category names is disabled.
+     Object standIn = data.variableNames.get(name);
+     if (standIn != null) {
+         return (Character) standIn;
+     }
+     throw new IllegalArgumentException("Undefined variable: "
+                                        + name);
+ }
+
+ /**
+ * Determines what part of the private use region of Unicode we can use for
+ * variable stand-ins. The correct way to do this is as follows: Parse each
+ * rule, and for forward and reverse rules, take the FROM expression, and
+ * make a hash of all characters used. The TO expression should be ignored.
+ * When done, everything not in the hash is available for use. In practice,
+ * this method may employ some other algorithm for improved speed.
+ */
+ private final void determineVariableRange() {
+     // Start from the whole private use area and keep the largest
+     // stretch of it that no character of the rule text falls in.
+     Range privateUse = new Range('\uE000', 0x1900); // Private use area
+     Range free = privateUse.largestUnusedSubrange(rules);
+
+     if (free == null) {
+         throw new RuntimeException(
+             "No private use characters available for variables");
+     }
+
+     // Stand-ins are handed out from variableNext up to, but not
+     // including, variableLimit.
+     variableNext = free.start;
+     variableLimit = (char) (free.start + free.length);
+
+     if (variableNext >= variableLimit) {
+         throw new RuntimeException(
+             "Too few private use characters available for variables");
+     }
+ }
+
+ /**
+ * Returns the index of the first character in a set, ignoring quoted text.
+ * For example, in the string "abc'hide'h", the 'h' in "hide" will not be
+ * found by a search for "h". Unlike String.indexOf(), this method searches
+ * not for a single character, but for any character of the string
+ * <code>setOfChars</code>.
+ * @param text text to be searched
+ * @param start the beginning index, inclusive; <code>0 &lt;= start
+ * &lt;= limit</code>.
+ * @param limit the ending index, exclusive; <code>start &lt;= limit
+ * &lt;= text.length()</code>.
+ * @param setOfChars string with one or more distinct characters
+ * @return Offset of the first character in <code>setOfChars</code>
+ * found, or -1 if not found.
+ * @see #indexOf
+ */
+ private static int quotedIndexOf(String text, int start, int limit,
+                                  String setOfChars) {
+     for (int i=start; i<limit; ++i) {
+         char c = text.charAt(i);
+         if (c == QUOTE) {
+             // Jump to the closing quote (or to limit if there is none),
+             // so that nothing inside the quotes can be reported. The
+             // loop increment then steps past the closing quote.
+             while (++i < limit
+                    && text.charAt(i) != QUOTE) {}
+         } else if (setOfChars.indexOf(c) >= 0) {
+             return i;
+         }
+     }
+     return -1;
+ }
+
+ /**
+ * Returns the index of the first character in a set. Unlike
+ * String.indexOf(), this method searches not for a single character, but
+ * for any character of the string <code>setOfChars</code>.
+ * @param text text to be searched
+ * @param start the beginning index, inclusive; <code>0 &lt;= start
+ * &lt;= limit</code>.
+ * @param limit the ending index, exclusive; <code>start &lt;= limit
+ * &lt;= text.length()</code>.
+ * @param setOfChars string with one or more distinct characters
+ * @return Offset of the first character in <code>setOfChars</code>
+ * found, or -1 if not found.
+ * @see #quotedIndexOf
+ */
+ private static int indexOf(String text, int start, int limit,
+                            String setOfChars) {
+     // Scan offsets start..limit-1 and report the first one whose
+     // character occurs anywhere in setOfChars.
+     for (int i=start; i<limit; ++i) {
+         if (setOfChars.indexOf(text.charAt(i)) >= 0) {
+             return i;
+         }
+     }
+     return -1;
+ }
+
+ /**
+ * Returns the index of the first character in a set. Unlike
+ * String.indexOf(), this method searches not for a single character, but
+ * for any character of the string <code>setOfChars</code>.
+ * @param text text to be searched
+ * @param setOfChars string with one or more distinct characters
+ * @return Offset of the first character in <code>setOfChars</code>
+ * found, or -1 if not found.
+ * @see #quotedIndexOf
+ */
+ private static int indexOf(String text, String setOfChars) {
+ return indexOf(text, 0, text.length(), setOfChars);
+ }
+
+
+
+ /**
+ * A range of Unicode characters. Support the operations of testing for
+ * inclusion (does this range contain this character?) and splitting.
+ * Splitting involves breaking a range into two smaller ranges around a
+ * character inside the original range. The split character is not included
+ * in either range. If the split character is at either extreme end of the
+ * range, one of the split products is an empty range.
+ *
+ * This class is used internally to determine the largest available private
+ * use character range for variable stand-ins.
+ */
+ private static class Range implements Cloneable {
+     /** First character of the range. */
+     char start;
+     /** Number of characters in the range; the range is start..start+length-1. */
+     int length;
+
+     Range(char start, int length) {
+         this.start = start;
+         this.length = length;
+     }
+
+     /** Returns a new Range with the same start and length. */
+     public Object clone() {
+         return new Range(start, length);
+     }
+
+     /** Returns true if c lies within start..start+length-1. */
+     boolean contains(char c) {
+         return c >= start && (c - start) < length;
+     }
+
+     /**
+      * Assume that contains(c) is true. Split this range into two new
+      * ranges around the character c. Make this range one of the new ranges
+      * (modify it in place) and return the other new range. The character
+      * itself is not included in either range. If the split results in an
+      * empty range (that is, if c == start or c == start + length - 1) then
+      * return null.
+      */
+     Range split(char c) {
+         if (c == start) {
+             // c is the first character: just drop it from the front.
+             ++start;
+             --length;
+             return null;
+         } else if (c - start == length - 1) {
+             // c is the last character: just drop it from the end.
+             --length;
+             return null;
+         } else {
+             // c is interior: this range keeps start..c-1, and the
+             // returned range covers c+1 through the old end.
+             ++c;
+             Range r = new Range(c, start + length - c);
+             length = --c - start;
+             return r;
+         }
+     }
+
+     /**
+      * Finds the largest unused subrange by the given string. A
+      * subrange is unused by a string if the string contains no
+      * characters in that range. If the given string contains no
+      * characters in this range, then a range equal to this one is
+      * returned. This range itself is not modified.
+      */
+     Range largestUnusedSubrange(String str) {
+         int n = str.length();
+
+         // Work on a copy, and split whichever piece contains each
+         // character of str that falls inside this range.
+         Vector v = new Vector(1);
+         v.addElement(clone());
+         for (int i=0; i<n; ++i) {
+             char c = str.charAt(i);
+             if (contains(c)) {
+                 for (int j=0; j<v.size(); ++j) {
+                     Range r = (Range) v.elementAt(j);
+                     if (r.contains(c)) {
+                         r = r.split(c);
+                         if (r != null) {
+                             v.addElement(r);
+                         }
+                         // The pieces are disjoint, so no other piece
+                         // can contain c.
+                         break;
+                     }
+                 }
+             }
+         }
+
+         // Pick the longest remaining piece; on a tie the earliest one
+         // in the vector wins.
+         Range bestRange = null;
+         for (int i=0; i<v.size(); ++i) {
+             Range r = (Range) v.elementAt(i);
+             if (bestRange == null || r.length > bestRange.length) {
+                 bestRange = r;
+             }
+         }
+
+         return bestRange;
+     }
+ }
+ }
+}
diff --git a/icu4j/src/com/ibm/icu/text/TransliterationRule.java b/icu4j/src/com/ibm/icu/text/TransliterationRule.java
new file mode 100755
index 00000000000..383c77ed340
--- /dev/null
+++ b/icu4j/src/com/ibm/icu/text/TransliterationRule.java
@@ -0,0 +1,530 @@
+package com.ibm.text;
+
+import java.util.Dictionary;
+
+/**
+ * A transliteration rule used by
+ * <code>RuleBasedTransliterator</code>.
+ * <code>TransliterationRule</code> is an immutable object.
+ *
+ * A rule consists of an input pattern and an output string. When
+ * the input pattern is matched, the output string is emitted. The
+ * input pattern consists of zero or more characters which are matched
+ * exactly (the key) and optional context. Context must match if it
+ * is specified. Context may be specified before the key, after the
+ * key, or both. The key, preceding context, and following context
+ * may contain variables. Variables represent a set of Unicode
+ * characters, such as the letters a through z.
+ * Variables are detected by looking up each character in a supplied
+ * variable list to see if it has been so defined.
+ *
+ * <p>Copyright &copy; IBM Corporation 1999. All rights reserved.
+ *
+ * @author Alan Liu
+ * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
+ */
+class TransliterationRule {
+ /**
+ * Constant returned by <code>getMatchDegree()</code> indicating a mismatch
+ * between the text and this rule. One or more characters of the context or
+ * key do not match the text.
+ * @see #getMatchDegree
+ */
+ public static final int MISMATCH = 0;
+
+ /**
+ * Constant returned by <code>getMatchDegree()</code> indicating a partial
+ * match between the text and this rule. All characters of the text match
+ * the corresponding context or key, but more characters are required for a
+ * complete match. There are some key or context characters at the end of
+ * the pattern that remain unmatched because the text isn't long enough.
+ * @see #getMatchDegree
+ */
+ public static final int PARTIAL_MATCH = 1;
+
+ /**
+ * Constant returned by <code>getMatchDegree()</code> indicating a complete
+ * match between the text and this rule. The text matches all context and
+ * key characters.
+ * @see #getMatchDegree
+ */
+ public static final int FULL_MATCH = 2;
+
+ /**
+ * The string that must be matched.
+ */
+ private String key;
+
+ /**
+ * The string that is emitted if the key, anteContext, and postContext
+ * are matched.
+ */
+ private String output;
+
+ /**
+ * The string that must match before the key. Must not be the empty string.
+ * May be null; if null, then there is no matching requirement before the
+ * key.
+ */
+ private String anteContext;
+
+ /**
+ * The string that must match after the key. Must not be the empty string.
+ * May be null; if null, then there is no matching requirement after the
+ * key.
+ */
+ private String postContext;
+
+ /**
+ * The position of the cursor after emitting the output string, from 0 to
+ * output.length(). For most rules with no special cursor specification,
+ * the cursorPos is output.length().
+ */
+ private int cursorPos;
+
+ /**
+ * A string used to implement masks().
+ */
+ private String maskKey;
+
+ private static final String COPYRIGHT =
+ "\u00A9 IBM Corporation 1999. All rights reserved.";
+
+    /**
+     * Construct a new rule with the given key, output text, and other
+     * attributes.  Zero, one, or two context strings may be specified.  A
+     * cursor position may be specified for the output text.
+     * @param key the string to match
+     * @param output the string to produce when the <code>key</code> is seen
+     * @param anteContext if not null and not empty, then it must be matched
+     * before the key
+     * @param postContext if not null and not empty, then it must be matched
+     * after the key
+     * @param cursorPos a position for the cursor after the
+     * <code>output</code> is emitted.  If less than zero, then the cursor is
+     * placed after the <code>output</code>; that is, -1 is equivalent to
+     * <code>output.length()</code>.  If greater than
+     * <code>output.length()</code> then an exception is thrown.
+     * @exception IllegalArgumentException if the cursor position is out of
+     * range.
+     */
+    public TransliterationRule(String key, String output,
+                               String anteContext, String postContext,
+                               int cursorPos) {
+        this.key = key;
+        this.output = output;
+
+        // An empty context string is stored as null ("no requirement").
+        boolean hasAnte = anteContext != null && anteContext.length() > 0;
+        boolean hasPost = postContext != null && postContext.length() > 0;
+        this.anteContext = hasAnte ? anteContext : null;
+        this.postContext = hasPost ? postContext : null;
+
+        // A negative cursor position means "just after the output".
+        if (cursorPos < 0) {
+            this.cursorPos = output.length();
+        } else {
+            this.cursorPos = cursorPos;
+        }
+        if (this.cursorPos > output.length()) {
+            throw new IllegalArgumentException("Illegal cursor position");
+        }
+
+        /* The mask key is needed when we are adding individual rules to a rule
+         * set, for performance.  Here are the numbers: Without mask key, 13.0
+         * seconds.  With mask key, 6.2 seconds.  However, once the rules have
+         * been added to the set, then they can be discarded to free up space.
+         * This is what the freeze() method does.  After freeze() has been
+         * called, the method masks() must NOT be called.
+         */
+        maskKey = (postContext == null) ? key : key + postContext;
+    }
+
+    /**
+     * Return the number of characters in the key.  Equivalent to
+     * <code>getKey().length()</code>.
+     * @return the length of the match key.
+     */
+    public int getKeyLength() {
+        final int keyLength = key.length();
+        return keyLength;
+    }
+
+    /**
+     * Return the string that must be matched at the cursor for this rule
+     * to apply.
+     * @return the match key.
+     */
+    public String getKey() {
+        return this.key;
+    }
+
+    /**
+     * Return the string that is emitted when this rule matches.
+     * @return the output string.
+     */
+    public String getOutput() {
+        return this.output;
+    }
+
+    /**
+     * Return the position of the cursor within the output string.
+     * @return a value from 0 to <code>getOutput().length()</code>, inclusive.
+     */
+    public int getCursorPos() {
+        return this.cursorPos;
+    }
+
+    /**
+     * Return the preceding context length, or zero if this rule has no
+     * preceding context.  This method is needed to support the
+     * <code>Transliterator</code> method
+     * <code>getMaximumContextLength()</code>.
+     * @return the length of the preceding context.
+     */
+    public int getAnteContextLength() {
+        if (anteContext == null) {
+            return 0;
+        }
+        return anteContext.length();
+    }
+
+ /**
+ * Return true if this rule masks another rule. If r1 masks r2 then
+ * r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks
+ * r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y".
+ * "[c]a>x" masks "[dc]a>y".
+ *
+ *
This method must not be called after freeze() is called.
+ */
+ public boolean masks(TransliterationRule r2) {
+ /* There are three cases of masking. In each instance, rule1
+ * masks rule2.
+ *
+ * 1. KEY mask: len(key1) < len(key2), key2 starts with key1.
+ *
+ * 2. PREFIX mask: key1 == key2, len(prefix1) < len(prefix2),
+ * prefix2 ends with prefix1, suffix2 starts with suffix1.
+ *
+ * 3. SUFFIX mask: key1 == key2, len(suffix1) < len(suffix2),
+ * prefix2 ends with prefix1, suffix2 starts with suffix1.
+ */
+
+ /* LIMITATION of the current mask algorithm: Some rule
+ * maskings are currently not detected. For example,
+ * "{Lu}]a>x" masks "A]a>y". To detect these sorts of masking,
+ * we need a subset operator on UnicodeSet objects, which we
+ * currently do not have. This can be added later.
+ */
+ return ((maskKey.length() < r2.maskKey.length() &&
+ r2.maskKey.startsWith(maskKey)) ||
+ (r2.anteContext != null && maskKey.equals(r2.maskKey) &&
+ ((anteContext == null) ||
+ (anteContext.length() < r2.anteContext.length() &&
+ r2.anteContext.endsWith(anteContext)))));
+ }
+
+    /**
+     * Free up space by discarding the mask key.  Once this method is
+     * called, masks() must NOT be called.  If it is called, an exception
+     * will be thrown.
+     */
+    public void freeze() {
+        this.maskKey = null;
+    }
+
+ /**
+ * Return a string representation of this object.
+ * @return string representation of this object
+ */
+ public String toString() {
+ return getClass().getName() + '['
+ + escape((anteContext != null ? ("[" + anteContext + ']') : "")
+ + key
+ + (postContext != null ? ("[" + postContext + ']') : "")
+ + " -> "
+ + (cursorPos < output.length()
+ ? (output.substring(0, cursorPos) + '|' + output.substring(cursorPos))
+ : output))
+ + ']';
+ }
+
+    /**
+     * Return true if this rule matches the given text.  The text being matched
+     * occupies a virtual buffer consisting of the contents of
+     * <code>result</code> concatenated to a substring of <code>text</code>.
+     * The substring is specified by <code>start</code> and <code>limit</code>.
+     * The value of <code>cursor</code> is an index into this virtual buffer,
+     * from 0 to the length of the buffer.  In terms of the parameters,
+     * <code>cursor</code> must be between 0 and <code>result.length() + limit -
+     * start</code>.
+     * @param text the untranslated text
+     * @param start the beginning index, inclusive; <code>0 &lt;= start
+     * &lt;= limit</code>.
+     * @param limit the ending index, exclusive; <code>start &lt;= limit
+     * &lt;= text.length()</code>.
+     * @param result translated text so far
+     * @param cursor position at which to translate next, an offset into result.
+     * If greater than or equal to result.length(), represents offset start +
+     * cursor - result.length() into text.
+     * @param variables a dictionary of variables mapping
+     * <code>Character</code> to <code>UnicodeSet</code>
+     * @param filter the filter.  Any character for which
+     * <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
+     * altered by this transliterator.  If <tt>filter</tt> is
+     * <tt>null</tt> then no filtering is applied.
+     * @return true if the preceding context (if any), the key, and the
+     * following context (if any) all match
+     */
+    public boolean matches(String text, int start, int limit,
+                           StringBuffer result, int cursor,
+                           Dictionary variables,
+                           UnicodeFilter filter) {
+        // Preceding context, if any, must end exactly at the cursor.
+        if (anteContext != null
+            && !regionMatches(text, start, limit, result,
+                              cursor - anteContext.length(),
+                              anteContext, variables, filter)) {
+            return false;
+        }
+        // The key must match starting at the cursor.
+        if (!regionMatches(text, start, limit, result, cursor,
+                           key, variables, filter)) {
+            return false;
+        }
+        // Following context, if any, must start right after the key.
+        if (postContext == null) {
+            return true;
+        }
+        return regionMatches(text, start, limit, result,
+                             cursor + key.length(),
+                             postContext, variables, filter);
+    }
+
+    /**
+     * Return true if this rule matches the given text.
+     * @param text the text, both translated and untranslated
+     * @param start the beginning index, inclusive; <code>0 &lt;= start
+     * &lt;= limit</code>.
+     * @param limit the ending index, exclusive; <code>start &lt;= limit
+     * &lt;= text.length()</code>.
+     * @param cursor position at which to translate next, representing offset
+     * into text.  This value must be between <code>start</code> and
+     * <code>limit</code>.
+     * @param variables a dictionary of variables mapping
+     * <code>Character</code> to <code>UnicodeSet</code>
+     * @param filter the filter.  Any character for which
+     * <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
+     * altered by this transliterator.  If <tt>filter</tt> is
+     * <tt>null</tt> then no filtering is applied.
+     * @return true if the preceding context (if any), the key, and the
+     * following context (if any) all match
+     */
+    public boolean matches(Replaceable text, int start, int limit,
+                           int cursor, Dictionary variables,
+                           UnicodeFilter filter) {
+        // Preceding context, if any, must end exactly at the cursor.
+        if (anteContext != null
+            && !regionMatches(text, start, limit,
+                              cursor - anteContext.length(),
+                              anteContext, variables, filter)) {
+            return false;
+        }
+        // The key must match starting at the cursor.
+        if (!regionMatches(text, start, limit, cursor,
+                           key, variables, filter)) {
+            return false;
+        }
+        // Following context, if any, must start right after the key.
+        if (postContext == null) {
+            return true;
+        }
+        return regionMatches(text, start, limit, cursor + key.length(),
+                             postContext, variables, filter);
+    }
+
+    /**
+     * Return the degree of match between this rule and the given text.  The
+     * degree of match may be mismatch, a partial match, or a full match.  A
+     * mismatch means at least one character of the text does not match the
+     * context or key.  A partial match means some context and key characters
+     * match, but the text is not long enough to match all of them.  A full
+     * match means all context and key characters match.
+     * @param text the text, both translated and untranslated
+     * @param start the beginning index, inclusive; <code>0 &lt;= start
+     * &lt;= limit</code>.
+     * @param limit the ending index, exclusive; <code>start &lt;= limit
+     * &lt;= text.length()</code>.
+     * @param cursor position at which to translate next, representing offset
+     * into text.  This value must be between <code>start</code> and
+     * <code>limit</code>.
+     * @param variables a dictionary of variables mapping
+     * <code>Character</code> to <code>UnicodeSet</code>
+     * @param filter the filter.  Any character for which
+     * <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
+     * altered by this transliterator.  If <tt>filter</tt> is
+     * <tt>null</tt> then no filtering is applied.
+     * @return one of <code>MISMATCH</code>, <code>PARTIAL_MATCH</code>, or
+     * <code>FULL_MATCH</code>.
+     * @see #MISMATCH
+     * @see #PARTIAL_MATCH
+     * @see #FULL_MATCH
+     */
+    public int getMatchDegree(Replaceable text, int start, int limit,
+                              int cursor, Dictionary variables,
+                              UnicodeFilter filter) {
+        // The preceding context is all-or-nothing: it lies before the
+        // cursor, so it is never reported as a partial match.
+        if (anteContext != null) {
+            int anteStart = cursor - anteContext.length();
+            if (!regionMatches(text, start, limit, anteStart,
+                               anteContext, variables, filter)) {
+                return MISMATCH;
+            }
+        }
+
+        int matched = getRegionMatchLength(text, start, limit, cursor,
+                                           key, variables, filter);
+        if (matched < 0) {
+            return MISMATCH;
+        }
+        if (matched < key.length()) {
+            // Ran out of text before the whole key was compared.
+            return PARTIAL_MATCH;
+        }
+        if (postContext == null) {
+            return FULL_MATCH;
+        }
+
+        matched = getRegionMatchLength(text, start, limit,
+                                       cursor + key.length(),
+                                       postContext, variables, filter);
+        if (matched < 0) {
+            return MISMATCH;
+        }
+        if (matched == postContext.length()) {
+            return FULL_MATCH;
+        }
+        return PARTIAL_MATCH;
+    }
+
+ /**
+ * Return true if a template matches the text. The entire length of the
+ * template is compared to the text at the cursor. As in
+ * matches()
, the text being matched occupies a virtual buffer
+ * consisting of the contents of result
concatenated to a
+ * substring of text
. See matches()
for details.
+ * @param text the untranslated text
+ * @param start the beginning index, inclusive; 0 <= start
+ * <= limit
.
+ * @param limit the ending index, exclusive; start <= limit
+ * <= text.length()
.
+ * @param result translated text so far
+ * @param cursor position at which to translate next, an offset into result.
+ * If greater than or equal to result.length(), represents offset start +
+ * cursor - result.length() into text.
+ * @param template the text to match against. All characters must match.
+ * @param variables a dictionary of variables mapping Character
+ * to UnicodeSet
+ * @param filter the filter. Any character for which
+ * filter.isIn() returns false will not be
+ * altered by this transliterator. If filter is
+ * null then no filtering is applied.
+ * @return true if there is a match
+ */
+ protected static boolean regionMatches(String text, int start, int limit,
+ StringBuffer result, int cursor,
+ String template,
+ Dictionary variables,
+ UnicodeFilter filter) {
+ int rlen = result.length();
+ if (cursor < 0
+ || (cursor + template.length()) > (rlen + limit - start)) {
+ return false;
+ }
+ for (int i=0; i0 <= start
+ * <= limit.
+ * @param limit the ending index, exclusive; start <= limit
+ * <= text.length()
.
+ * @param cursor position at which to translate next, representing offset
+ * into text. This value must be between start
and
+ * limit
.
+ * @param template the text to match against. All characters must match.
+ * @param variables a dictionary of variables mapping Character
+ * to UnicodeSet
+ * @param filter the filter. Any character for which
+ * filter.isIn() returns false will not be
+ * altered by this transliterator. If filter is
+ * null then no filtering is applied.
+ * @return true if there is a match
+ */
+ protected static boolean regionMatches(Replaceable text, int start, int limit,
+ int cursor,
+ String template, Dictionary variables,
+ UnicodeFilter filter) {
+ if (cursor < start
+ || (cursor + template.length()) > limit) {
+ return false;
+ }
+ for (int i=0; i0 <= start
+ * <= limit.
+ * @param limit the ending index, exclusive; start <= limit
+ * <= text.length()
.
+ * @param cursor position at which to translate next, representing offset
+ * into text. This value must be between start
and
+ * limit
.
+ * @param template the text to match against. All characters must match.
+ * @param variables a dictionary of variables mapping Character
+ * to UnicodeSet
+ * @param filter the filter. Any character for which
+ * filter.isIn() returns false will not be
+ * altered by this transliterator. If filter is
+ * null then no filtering is applied.
+ * @return -1 if there is a mismatch, 0 if the text is not long enough to
+ * match any characters, otherwise the number of characters of text that
+ * match this rule.
+ */
+ protected static int getRegionMatchLength(Replaceable text, int start,
+ int limit, int cursor,
+ String template,
+ Dictionary variables,
+ UnicodeFilter filter) {
+ if (cursor < start) {
+ return -1;
+ }
+ int i;
+ for (i=0; iCharacter
+ * to UnicodeSet
+ * @param filter the filter. Any character for which
+ * filter.isIn() returns false will not be
+ * altered by this transliterator. If filter is
+ * null then no filtering is applied.
+ */
+    protected static boolean charMatches(char keyChar, char textChar,
+                                         Dictionary variables, UnicodeFilter filter) {
+        // A character rejected by the filter never matches.  This is
+        // checked on its own because the former single expression
+        // "a && b ? x : y" parsed as "(a && b) ? x : y": a rejected
+        // character skipped the variable lookup and then dereferenced the
+        // still-null set, throwing NullPointerException instead of
+        // returning false.
+        if (filter != null && !filter.isIn(textChar)) {
+            return false;
+        }
+        // If keyChar is defined as a variable, match against the set it
+        // stands for; otherwise it is a literal and must match exactly.
+        UnicodeSet set = (UnicodeSet) variables.get(new Character(keyChar));
+        return (set == null) ? keyChar == textChar : set.contains(textChar);
+    }
+
+ /**
+ * Escape non-ASCII characters as Unicode.
+ */
+ public static final String escape(String s) {
+ StringBuffer buf = new StringBuffer();
+ for (int i=0; i= ' ' && c <= 0x007F) {
+ buf.append(c);
+ } else {
+ buf.append("\\u");
+ if (c < 0x1000) {
+ buf.append('0');
+ if (c < 0x100) {
+ buf.append('0');
+ if (c < 0x10) {
+ buf.append('0');
+ }
+ }
+ }
+ buf.append(Integer.toHexString(c));
+ }
+ }
+ return buf.toString();
+ }
+}
diff --git a/icu4j/src/com/ibm/icu/text/TransliterationRuleSet.java b/icu4j/src/com/ibm/icu/text/TransliterationRuleSet.java
new file mode 100755
index 00000000000..d57bf75464a
--- /dev/null
+++ b/icu4j/src/com/ibm/icu/text/TransliterationRuleSet.java
@@ -0,0 +1,218 @@
+package com.ibm.text;
+
+import java.util.*;
+
+/**
+ * A set of rules for a <code>RuleBasedTransliterator</code>. This set encodes
+ * the transliteration in one direction from one set of characters or short
+ * strings to another. A <code>RuleBasedTransliterator</code> consists of up to
+ * two such sets, one for the forward direction, and one for the reverse.
+ *
+ * <p>A <code>TransliterationRuleSet</code> has one important operation, that of
+ * finding a matching rule at a given point in the text. This is accomplished
+ * by the <code>findMatch()</code> method.
+ *
+ * <p>Copyright &copy; IBM Corporation 1999. All rights reserved.
+ *
+ * @author Alan Liu
+ * @version $RCSfile: TransliterationRuleSet.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
+ */
+class TransliterationRuleSet {
+ /* Note: There was an old implementation that indexed by first letter of
+ * key. Problem with this is that key may not have a meaningful first
+ * letter; e.g., {Lu}>*. One solution is to keep a separate vector of all
+ * rules whose initial key letter is a category variable. However, the
+ * problem is that they must be kept in order with respect to other rules.
+ * One solution -- add a sequence number to each rule. Do the usual
+ * first-letter lookup, and also a lookup from the spare bin with rules like
+ * {Lu}>*. Take the lower sequence number. This seems complex and not
+ * worth the trouble, but we may revisit this later. For documentation (or
+ * possible resurrection) the old code is included below, commented out
+ * with the remark "// OLD INDEXED IMPLEMENTATION". Under the old
+ * implementation, <code>rules</code> is a Hashtable, not a Vector.
+ */
+
+ /**
+ * Vector of rules, in the order added.
+ */
+ private Vector rules;
+
+ /**
+ * Length of the longest preceding context
+ */
+ private int maxContextLength;
+
+ private static final String COPYRIGHT =
+ "\u00A9 IBM Corporation 1999. All rights reserved.";
+
+    /**
+     * Construct a new empty rule set: no rules, and a maximum context
+     * length of zero.
+     */
+    public TransliterationRuleSet() {
+        this.maxContextLength = 0;
+        this.rules = new Vector();
+    }
+
+    /**
+     * Return the maximum context length recorded for this rule set.
+     * @return the length of the longest preceding context.
+     */
+    public int getMaximumContextLength() {
+        return this.maxContextLength;
+    }
+
+ /**
+ * Add a rule to this set. Rules are added in order, and order is
+ * significant.
+ *
+ *
Once freeze() is called, this method must not be called.
+ * @param rule the rule to add
+ */
+ public void addRule(TransliterationRule rule) {
+
+ // Build time, no checking : 3562 ms
+ // Build time, with checking: 6234 ms
+
+ for (int i=0; i maxContextLength) {
+ maxContextLength = len;
+ }
+ }
+
+ /**
+ * Free up space. Once this method is called, addRule() must NOT
+ * be called again.
+ */
+ public void freeze() {
+ for (int i=0; iresult concatenated to a substring of text
.
+ * The substring is specified by start
and limit
.
+ * The value of cursor
is an index into this virtual buffer,
+ * from 0 to the length of the buffer. In terms of the parameters,
+ * cursor
must be between 0 and result.length() + limit -
+ * start
.
+ * @param text the untranslated text
+ * @param start the beginning index, inclusive; 0 <= start
+ * <= limit
.
+ * @param limit the ending index, exclusive; start <= limit
+ * <= text.length()
.
+ * @param result tranlated text
+ * @param cursor position at which to translate next, an offset into result.
+ * If greater than or equal to result.length(), represents offset start +
+ * cursor - result.length() into text.
+ * @param variables a dictionary mapping variables to the sets they
+ * represent (maps Character
to UnicodeSet
)
+ * @param filter the filter. Any character for which
+ * filter.isIn() returns false will not be
+ * altered by this transliterator. If filter is
+ * null then no filtering is applied.
+ * @return the matching rule, or null if none found.
+ */
+    public TransliterationRule findMatch(String text, int start, int limit,
+                                         StringBuffer result, int cursor,
+                                         Dictionary variables,
+                                         UnicodeFilter filter) {
+        // Rules are tried in the order they were added; the first one that
+        // matches wins.
+        final int count = rules.size();
+        for (int i = 0; i < count; ++i) {
+            TransliterationRule candidate =
+                (TransliterationRule) rules.elementAt(i);
+            boolean hit = candidate.matches(text, start, limit, result,
+                                            cursor, variables, filter);
+            if (hit) {
+                return candidate;
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Attempt to find a matching rule at the specified point in the text.
+     * Rules are tried in the order in which they were added, and the first
+     * rule that matches is returned.
+     * @param text the text, both translated and untranslated
+     * @param start the beginning index, inclusive; <code>0 &lt;= start
+     * &lt;= limit</code>.
+     * @param limit the ending index, exclusive; <code>start &lt;= limit
+     * &lt;= text.length()</code>.
+     * @param cursor position at which to translate next, representing offset
+     * into text.  This value must be between <code>start</code> and
+     * <code>limit</code>.
+     * @param variables a dictionary mapping variables to the sets they
+     * represent (maps <code>Character</code> to <code>UnicodeSet</code>)
+     * @param filter the filter.  Any character for which
+     * <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
+     * altered by this transliterator.  If <tt>filter</tt> is
+     * <tt>null</tt> then no filtering is applied.
+     * @return the matching rule, or null if none found.
+     */
+    public TransliterationRule findMatch(Replaceable text, int start, int limit,
+                                         int cursor,
+                                         Dictionary variables,
+                                         UnicodeFilter filter) {
+        final int count = rules.size();
+        for (int i = 0; i < count; ++i) {
+            TransliterationRule candidate =
+                (TransliterationRule) rules.elementAt(i);
+            boolean hit = candidate.matches(text, start, limit, cursor,
+                                            variables, filter);
+            if (hit) {
+                return candidate;
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Attempt to find a matching rule at the specified point in the text.
+     * Unlike <code>findMatch()</code>, this method does an incremental match.
+     * An incremental match requires that there be no partial matches that might
+     * pre-empt the full match that is found.  If there are partial matches,
+     * then null is returned.  A non-null result indicates that a full match has
+     * been found, and that it cannot be pre-empted by a partial match
+     * regardless of what additional text is added to the translation buffer.
+     * @param text the text, both translated and untranslated
+     * @param start the beginning index, inclusive; <code>0 &lt;= start
+     * &lt;= limit</code>.
+     * @param limit the ending index, exclusive; <code>start &lt;= limit
+     * &lt;= text.length()</code>.
+     * @param cursor position at which to translate next, representing offset
+     * into text.  This value must be between <code>start</code> and
+     * <code>limit</code>.
+     * @param variables a dictionary mapping variables to the sets they
+     * represent (maps <code>Character</code> to <code>UnicodeSet</code>)
+     * @param partial output parameter.  <code>partial[0]</code> is reset to
+     * false on entry and set to true if the search stopped at a rule that
+     * partially matches.
+     * @param filter the filter.  Any character for which
+     * <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
+     * altered by this transliterator.  If <tt>filter</tt> is
+     * <tt>null</tt> then no filtering is applied.
+     * @return the matching rule, or null if none found, or if the text buffer
+     * does not have enough text yet to unambiguously match a rule.
+     */
+    public TransliterationRule findIncrementalMatch(Replaceable text, int start,
+                                                    int limit, int cursor,
+                                                    Dictionary variables,
+                                                    boolean partial[],
+                                                    UnicodeFilter filter) {
+        partial[0] = false;
+        final int count = rules.size();
+        for (int i = 0; i < count; ++i) {
+            TransliterationRule candidate =
+                (TransliterationRule) rules.elementAt(i);
+            int degree = candidate.getMatchDegree(text, start, limit, cursor,
+                                                  variables, filter);
+            if (degree == TransliterationRule.FULL_MATCH) {
+                return candidate;
+            }
+            if (degree == TransliterationRule.PARTIAL_MATCH) {
+                // An earlier rule could still win once more text arrives,
+                // so no later rule may be chosen yet.
+                partial[0] = true;
+                return null;
+            }
+        }
+        return null;
+    }
+}
diff --git a/icu4j/src/com/ibm/icu/text/Transliterator.java b/icu4j/src/com/ibm/icu/text/Transliterator.java
new file mode 100755
index 00000000000..83171a961e7
--- /dev/null
+++ b/icu4j/src/com/ibm/icu/text/Transliterator.java
@@ -0,0 +1,860 @@
+package com.ibm.text;
+
+import java.util.*;
+import java.text.MessageFormat;
+
+/**
+ * <code>Transliterator</code> is an abstract class that
+ * transliterates text from one format to another. The most common
+ * kind of transliterator is a script, or alphabet, transliterator.
+ * For example, a Russian to Latin transliterator changes Russian text
+ * written in Cyrillic characters to phonetically equivalent Latin
+ * characters. It does not translate Russian to English!
+ * Transliteration, unlike translation, operates on characters, without
+ * reference to the meanings of words and sentences.
+ *
+ * Although script conversion is its most common use, a
+ * transliterator can actually perform a more general class of tasks.
+ * In fact, Transliterator
defines a very general API
+ * which specifies only that a segment of the input text is replaced
+ * by new text. The particulars of this conversion are determined
+ * entirely by subclasses of Transliterator
.
+ *
+ *
Transliterators are stateless
+ *
+ *
Transliterator
objects are stateless; they
+ * retain no information between calls to
+ * transliterate()
. As a result, threads may share
+ * transliterators without synchronizing them. This might seem to
+ * limit the complexity of the transliteration operation. In
+ * practice, subclasses perform complex transliterations by delaying
+ * the replacement of text until it is known that no other
+ * replacements are possible. In other words, although the
+ * Transliterator
objects are stateless, the source text
+ * itself embodies all the needed information, and delayed operation
+ * allows arbitrary complexity.
+ *
+ *
Batch transliteration
+ *
+ *
The simplest way to perform transliteration is all at once, on a
+ * string of existing text. This is referred to as batch
+ * transliteration. For example, given a string input
+ * and a transliterator t
, the call
+ *
+ *
String result = t.transliterate(input);
+ *
+ *
+ * will transliterate it and return the result. Other methods allow
+ * the client to specify a substring to be transliterated and to use
+ * {@link Replaceable} objects instead of strings, in order to
+ * preserve out-of-band information (such as text styles).
+ *
+ * Keyboard transliteration
+ *
+ *
Somewhat more involved is keyboard, or incremental
+ * transliteration. This is the transliteration of text that is
+ * arriving from some source (typically the user's keyboard) one
+ * character at a time, or in some other piecemeal fashion.
+ *
+ *
In keyboard transliteration, a Replaceable
buffer
+ * stores the text. As text is inserted, as much as possible is
+ * transliterated on the fly. This means a GUI that displays the
+ * contents of the buffer may show text being modified as each new
+ * character arrives.
+ *
+ *
Consider the simple RuleBasedTransliterator
:
+ *
+ *
+ * th>{theta}
+ * t>{tau}
+ *
+ *
+ * When the user types 't', nothing will happen, since the
+ * transliterator is waiting to see if the next character is 'h'. To
+ * remedy this, we introduce the notion of a cursor, marked by a '|'
+ * in the output string:
+ *
+ *
+ * t>|{tau}
+ * {tau}h>{theta}
+ *
+ *
+ * Now when the user types 't', tau appears, and if the next character
+ * is 'h', the tau changes to a theta. This is accomplished by
+ * maintaining a cursor position (independent of the insertion point,
+ * and invisible in the GUI) across calls to
+ * keyboardTransliterate()
. Typically, the cursor will
+ * be coincident with the insertion point, but in a case like the one
+ * above, it will precede the insertion point.
+ *
+ * Keyboard transliteration methods maintain a set of three indices
+ * that are updated with each call to
+ * keyboardTransliterate()
, including the cursor, start,
+ * and limit. Since these indices are changed by the method, they are
+ * passed in an int[]
array. The START
index
+ * marks the beginning of the substring that the transliterator will
+ * look at. It is advanced as text becomes committed (but it is not
+ * the committed index; that's the CURSOR
). The
+ * CURSOR
index, described above, marks the point at
+ * which the transliterator last stopped, either because it reached
+ * the end, or because it required more characters to disambiguate
+ * between possible inputs. The CURSOR
can also be
+ * explicitly set by rules in a RuleBasedTransliterator
.
+ * Any characters before the CURSOR
index are frozen;
+ * future keyboard transliteration calls within this input sequence
+ * will not change them. New text is inserted at the
+ * LIMIT
index, which marks the end of the substring that
+ * the transliterator looks at.
+ *
+ *
Because keyboard transliteration assumes that more characters
+ * are to arrive, it is conservative in its operation. It only
+ * transliterates when it can do so unambiguously. Otherwise it waits
+ * for more characters to arrive. When the client code knows that no
+ * more characters are forthcoming, perhaps because the user has
+ * performed some input termination operation, then it should call
+ * finishKeyboardTransliteration()
to complete any
+ * pending transliterations.
+ *
+ *
Inverses
+ *
+ *
Pairs of transliterators may be inverses of one another. For
+ * example, if transliterator A transliterates characters by
+ * incrementing their Unicode value (so "abc" -> "def"), and
+ * transliterator B decrements character values, then A
+ * is an inverse of B and vice versa. If we compose A
+ * with B in a compound transliterator, the result is the
+ * identity transliterator, that is, a transliterator that does not
+ * change its input text.
+ *
+ * The Transliterator
method getInverse()
+ * returns a transliterator's inverse, if one exists, or
+ * null
otherwise. However, the result of
+ * getInverse()
usually will not be a true
+ * mathematical inverse. This is because true inverse transliterators
+ * are difficult to formulate. For example, consider two
+ * transliterators: AB, which transliterates the character 'A'
+ * to 'B', and BA, which transliterates 'B' to 'A'. It might
+ * seem that these are exact inverses, since
+ *
+ *
"A" x AB -> "B"
+ * "B" x BA -> "A"
+ *
+ * where 'x' represents transliteration. However,
+ *
+ * "ABCD" x AB -> "BBCD"
+ * "BBCD" x BA -> "AACD"
+ *
+ * so AB composed with BA is not the
+ * identity. Nonetheless, BA may be usefully considered to be
+ * AB's inverse, and it is on this basis that
+ * AB.getInverse()
could legitimately return
+ * BA.
+ *
+ * IDs and display names
+ *
+ *
A transliterator is designated by a short identifier string or
+ * ID. IDs follow the format source-destination,
+ * where source describes the entity being replaced, and
+ * destination describes the entity replacing
+ * source. The entities may be the names of scripts,
+ * particular sequences of characters, or whatever else it is that the
+ * transliterator converts to or from. For example, a transliterator
+ * from Russian to Latin might be named "Russian-Latin". A
+ * transliterator from keyboard escape sequences to Latin-1 characters
+ * might be named "KeyboardEscape-Latin1". By convention, system
+ * entity names are in English, with the initial letters of words
+ * capitalized; user entity names may follow any format so long as
+ * they do not contain dashes.
+ *
+ *
In addition to programmatic IDs, transliterator objects have
+ * display names for presentation in user interfaces, returned by
+ * {@link #getDisplayName}.
+ *
+ *
Factory methods and registration
+ *
+ *
In general, client code should use the factory method
+ * getInstance()
to obtain an instance of a
+ * transliterator given its ID. Valid IDs may be enumerated using
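+ * <code>getAvailableIDs()</code>. Since transliterators are
+ * stateless, instances may be shared freely. Note, however, that
+ * successive calls to <code>getInstance()</code> with the same ID
+ * are not guaranteed to return the same object; callers should not
+ * rely on object identity.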
+ *
+ *
In addition to the system transliterators registered at startup,
+ * user transliterators may be registered by calling
+ * registerInstance()
at run time. To register a
+ * transliterator subclass without instantiating it (until it is
+ * needed), users may call registerClass()
.
+ *
+ *
Subclassing
+ *
+ *
Subclasses must implement the abstract
+ * transliterate()
method. They should also override the
+ * transliterate()
method taking a String
+ * and StringBuffer
if the performance of these methods
+ * can be improved over the performance obtained by the default
+ * implementations in this class. Subclasses must also implement
+ * handleKeyboardTransliterate()
.
+ *
+ *
Copyright © IBM Corporation 1999. All rights reserved.
+ *
+ * @author Alan Liu
+ * @version $RCSfile: Transliterator.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
+ */
+public abstract class Transliterator {
+ /**
+ * In the <code>keyboardTransliterate()</code>
+ * <code>index[]</code> array, the beginning index, inclusive
+ * @see #keyboardTransliterate
+ */
+ public static final int START = 0;
+
+ /**
+ * In the <code>keyboardTransliterate()</code>
+ * <code>index[]</code> array, the ending index, exclusive
+ * @see #keyboardTransliterate
+ */
+ public static final int LIMIT = 1;
+
+ /**
+ * In the <code>keyboardTransliterate()</code>
+ * <code>index[]</code> array, the next character to be considered
+ * for transliteration
+ * @see #keyboardTransliterate
+ */
+ public static final int CURSOR = 2;
+
+ /**
+ * Programmatic name, e.g., "Latin-Arabic".
+ */
+ private String ID;
+
+ /**
+ * This transliterator's filter. Any character for which
+ * filter.isIn() returns false will not be
+ * altered by this transliterator. If filter is
+ * null then no filtering is applied.
+ */
+ private UnicodeFilter filter;
+
+ /**
+ * Dictionary of known transliterators. Keys are String
+ * names, values are one of the following:
+ *
+ *
Transliterator
objects
+ *
+ * Class
objects. Such objects must represent
+ * subclasses of Transliterator
, and must satisfy the
+ * constraints described in registerClass()
+ *
+ * RULE_BASED_PLACEHOLDER
, in which case the ID
+ * will have its first '-' removed and be appended to
+ * RB_RULE_BASED_PREFIX to form a resource bundle name from which
+ * the RB_RULE key is looked up to obtain the rule.
+ *
+ * REVERSE_RULE_BASED_PLACEHOLDER
. Like
+ * RULE_BASED_PLACEHOLDER
, except the entity names in
+ * the ID are reversed, and the argument
+ * RuleBasedTransliterator.REVERSE is passed to the
+ * RuleBasedTransliterator constructor.
+ *
+ */
+ private static Hashtable cache;
+
+ /**
+ * Internal object used to stand for instances of
+ * RuleBasedTransliterator
that have not been
+ * constructed yet in the cache
. When a
+ * getInstance()
call retrieves this object, it is
+ * replaced by the actual RuleBasedTransliterator
.
+ * This allows Transliterator
to delay instantiation
+ * of such transliterators until they are needed.
+ */
+ private static final Object RULE_BASED_PLACEHOLDER = new Object();
+
+ /**
+ * Internal object used to stand for instances of
+ * RuleBasedTransliterator
that have not been
+ * constructed yet in the cache
. These instances are
+ * constructed with an argument
+ * RuleBasedTransliterator.REVERSE
.
+ */
+ private static final Object REVERSE_RULE_BASED_PLACEHOLDER = new Object();
+
+ /**
+ * Prefix for resource bundle key for the display name for a
+ * transliterator. The ID is appended to this to form the key.
+ * The resource bundle value should be a String.
+ */
+ private static final String RB_DISPLAY_NAME_PREFIX = "T:";
+
+ /**
+ * Resource bundle key for display name pattern.
+ * The resource bundle value should be a String forming a
+ * MessageFormat pattern, e.g.:
+ * "{0,choice,0#|1#{1} Transliterator|2#{1} to {2} Transliterator}".
+ */
+ private static final String RB_DISPLAY_NAME_PATTERN = "TransliteratorNamePattern";
+
+ /**
+ * Resource bundle key for the list of RuleBasedTransliterator IDs.
+ * The resource bundle value should be a String[] with each element
+ * being a valid ID. The ID will be appended to RB_RULE_BASED_PREFIX
+ * to obtain the class name in which the RB_RULE key will be sought.
+ */
+ private static final String RB_RULE_BASED_IDS = "RuleBasedTransliteratorIDs";
+
+ /**
+ * Resource bundle containing display name keys and the
+ * RB_RULE_BASED_IDS array.
+ *
+ * If we ever integrate this with the Sun JDK, the resource bundle
+ * root will change to java.text.resources.LocaleElements
+ */
+ private static final String RB_LOCALE_ELEMENTS =
+ "com.ibm.text.resources.LocaleElements";
+
+ /**
+ * Prefix for resource bundle containing RuleBasedTransliterator
+ * RB_RULE string. The ID is munged to remove the first '-' then appended
+ * to this String to obtain the class name.
+ */
+ private static final String RB_RULE_BASED_PREFIX =
+ "com.ibm.text.resources.TransliterationRule";
+
+ /**
+ * Resource bundle key for the RuleBasedTransliterator rule.
+ */
+ private static final String RB_RULE = "Rule";
+
+ private static final String COPYRIGHT =
+ "\u00A9 IBM Corporation 1999. All rights reserved.";
+
+ /**
+ * Default constructor.
+ * @param ID the string identifier for this transliterator
+ * @param filter the filter. Any character for which
+ * filter.isIn() returns false will not be
+ * altered by this transliterator. If filter is
+ * null then no filtering is applied.
+ */
+    protected Transliterator(String ID, UnicodeFilter filter) {
+        // An identifier is mandatory; the filter may be null.
+        if (null == ID) {
+            throw new NullPointerException();
+        }
+        this.filter = filter;
+        this.ID = ID;
+    }
+
+ /**
+ * Transliterates the segment of a string that begins at the
+ * character at offset start
and extends to the
+ * character at offset limit - 1
, with optional
+ * filtering. A default implementation is provided here;
+ * subclasses should provide a more efficient implementation if
+ * possible.
+ * @param text the string to be transliterated
+ * @param start the beginning index, inclusive; 0 <= start
+ * <= limit
.
+ * @param limit the ending index, exclusive; start <= limit
+ * <= text.length()
.
+ * @param result buffer to receive the transliterated text; previous
+ * contents are discarded
+ */
+ public void transliterate(String text, int start, int limit,
+ StringBuffer result) {
+ /* This is a default implementation that should be replaced by
+ * a more efficient subclass implementation if possible.
+ */
+ result.setLength(0);
+ result.append(text.substring(start, limit));
+ transliterate(new ReplaceableString(result),
+ 0, result.length());
+ }
+
+ /**
+ * Transliterates a segment of a string, with optional filtering.
+ * Subclasses must override this abstract method. This method takes no
+ * filter argument; filtering is governed by this transliterator's own
+ * filter (see {@link #getFilter}). Any character for which
+ * <tt>filter.isIn()</tt> returns <tt>false</tt> is expected not to be
+ * altered by this transliterator. If the filter is <tt>null</tt> then
+ * no filtering is applied.
+ *
+ * @param text the string to be transliterated
+ * @param start the beginning index, inclusive; <code>0 &lt;= start
+ * &lt;= limit</code>.
+ * @param limit the ending index, exclusive; <code>start &lt;= limit
+ * &lt;= text.length()</code>.
+ * @return The new limit index. The text previously occupying [start,
+ * limit)
has been transliterated, possibly to a string of a different
+ * length, at [start,
new-limit)
, where
+ * new-limit is the return value.
+ */
+ public abstract int transliterate(Replaceable text, int start, int limit);
+
+ /**
+ * Transliterates an entire string. Convenience method.
+ * @param text the string to be transliterated
+ * @param result buffer to receive the transliterated text; previous
+ * contents are discarded
+ */
+    public final void transliterate(String text, StringBuffer result) {
+        // Whole-string case of the ranged overload.
+        final int end = text.length();
+        transliterate(text, 0, end, result);
+    }
+
+ /**
+ * Transliterate an entire string and returns the result. Convenience method.
+ *
+ * @param text the string to be transliterated
+ * @return The transliterated text
+ */
+    public final String transliterate(String text) {
+        // Collect the output in a scratch buffer and hand back its contents.
+        StringBuffer output = new StringBuffer();
+        final int end = text.length();
+        transliterate(text, 0, end, output);
+        return output.toString();
+    }
+
+ /**
+ * Transliterates an entire string in place. Convenience method.
+ * @param text the string to be transliterated
+ */
+    public final void transliterate(Replaceable text) {
+        // Whole-buffer case; the new limit returned by the ranged overload
+        // is not needed here.
+        final int end = text.length();
+        transliterate(text, 0, end);
+    }
+
+ /**
+ * Transliterates the portion of the text buffer that can be
+ * transliterated unambiguously after new text has been inserted,
+ * typically as a result of a keyboard event. The new text in
+ * insertion
will be inserted into text
+ * at index[LIMIT]
, advancing
+ * index[LIMIT]
by insertion.length()
.
+ * Then the transliterator will try to transliterate characters of
+ * text
between index[CURSOR]
and
+ * index[LIMIT]
. Characters before
+ * index[CURSOR]
will not be changed.
+ *
+ *
Upon return, values in index[]
will be updated.
+ * index[START]
will be advanced to the first
+ * character that future calls to this method will read.
+ * index[CURSOR]
and index[LIMIT]
will
+ * be adjusted to delimit the range of text that future calls to
+ * this method may change.
+ *
+ *
Typical usage of this method begins with an initial call
+ * with index[START]
and index[LIMIT]
+ * set to indicate the portion of text
to be
+ * transliterated, and index[CURSOR] == index[START]
.
+ * Thereafter, index[]
can be used without
+ * modification in future calls, provided that all changes to
+ * text
are made via this method.
+ *
+ *
This method assumes that future calls may be made that will
+ * insert new text into the buffer. As a result, it only performs
+ * unambiguous transliterations. After the last call to this
+ * method, there may be untransliterated text that is waiting for
+ * more input to resolve an ambiguity. In order to perform these
+ * pending transliterations, clients should call {@link
+ * #finishKeyboardTransliteration} after the last call to this
+ * method has been made.
+ *
+ * @param text the buffer holding transliterated and untransliterated text
+ * @param index an array of three integers.
+ *
+ *
index[START]
: the beginning index,
+ * inclusive; 0 <= index[START] <= index[LIMIT]
.
+ *
+ * index[LIMIT]
: the ending index, exclusive;
+ * index[START] <= index[LIMIT] <= text.length()
.
+ * insertion
is inserted at
+ * index[LIMIT]
.
+ *
+ * index[CURSOR]
: the next character to be
+ * considered for transliteration; index[START] <=
+ * index[CURSOR] <= index[LIMIT]
. Characters before
+ * index[CURSOR]
will not be changed by future calls
+ * to this method.
+ *
+ * @param insertion text to be inserted and possibly
+ * transliterated into the translation buffer at
+ * index[LIMIT]
. If null
then no text
+ * is inserted.
+ * @see #START
+ * @see #LIMIT
+ * @see #CURSOR
+ * @see #handleKeyboardTransliterate
+ * @exception IllegalArgumentException if index[]
+ * is invalid
+ */
+    public final void keyboardTransliterate(Replaceable text, int[] index,
+                                            String insertion) {
+        // The checks short-circuit in this order so that index[] is never
+        // read before its length has been verified.
+        boolean indexOk = index.length >= 3
+            && index[START] >= 0
+            && index[LIMIT] <= text.length()
+            && index[CURSOR] >= index[START]
+            && index[CURSOR] <= index[LIMIT];
+        if (!indexOk) {
+            throw new IllegalArgumentException("Invalid index array");
+        }
+
+        // Remember the caller's START; it is never moved before this point.
+        final int initialStart = index[START];
+
+        if (insertion != null) {
+            // Splice the new text in at LIMIT and extend LIMIT past it.
+            final int at = index[LIMIT];
+            text.replace(at, at, insertion);
+            index[LIMIT] += insertion.length();
+        }
+
+        handleKeyboardTransliterate(text, index);
+
+        // Advance START toward CURSOR, but leave room for the preceding
+        // context that the subclass says it needs.
+        int contextStart = index[CURSOR] - getMaximumContextLength();
+        index[START] = Math.max(contextStart, initialStart);
+    }
+
+ /**
+ * Transliterates the portion of the text buffer that can be
+ * transliterated unambiguously after a new character has been
+ * inserted, typically as a result of a keyboard event. This is a
+ * convenience method; see {@link
+ * #keyboardTransliterate(Replaceable, int[], String)} for details.
+ * @param text the buffer holding transliterated and
+ * untransliterated text
+ * @param index an array of three integers. See {@link
+ * #keyboardTransliterate(Replaceable, int[], String)}.
+ * @param insertion text to be inserted and possibly
+ * transliterated into the translation buffer at
+ * index[LIMIT]
.
+ * @see #keyboardTransliterate(Replaceable, int[], String)
+ */
+    public final void keyboardTransliterate(Replaceable text, int[] index,
+                                            char insertion) {
+        // Promote the single character to a String and delegate.
+        String insertedText = String.valueOf(insertion);
+        keyboardTransliterate(text, index, insertedText);
+    }
+
+ /**
+ * Transliterates the portion of the text buffer that can be
+ * transliterated unambiguously. This is a convenience method; see
+ * {@link #keyboardTransliterate(Replaceable, int[], String)} for
+ * details.
+ * @param text the buffer holding transliterated and
+ * untransliterated text
+ * @param index an array of three integers. See {@link
+ * #keyboardTransliterate(Replaceable, int[], String)}.
+ * @see #keyboardTransliterate(Replaceable, int[], String)
+ */
+    public final void keyboardTransliterate(Replaceable text, int[] index) {
+        // A null insertion tells the String overload to insert nothing.
+        final String noInsertion = null;
+        keyboardTransliterate(text, index, noInsertion);
+    }
+
+ /**
+ * Finishes any pending transliterations that were waiting for
+ * more characters. Clients should call this method as the last
+ * call after a sequence of one or more calls to
+ * keyboardTransliterate()
.
+ * @param text the buffer holding transliterated and
+ * untransliterated text.
+ * @param index the array of indices previously passed to {@link
+ * #keyboardTransliterate}
+ */
+    public final void finishKeyboardTransliteration(Replaceable text,
+                                                    int[] index) {
+        final int from = index[START];
+        final int to = index[LIMIT];
+        // Batch-transliterate the pending range. The new limit returned by
+        // transliterate() is not written back into index[].
+        transliterate(text, from, to);
+    }
+
+ /**
+ * Abstract method that concrete subclasses define to implement
+ * keyboard transliteration. This method should transliterate all
+ * characters between index[CURSOR]
and
+ * index[LIMIT]
that can be unambiguously
+ * transliterated, regardless of future insertions of text at
+ * index[LIMIT]
. index[CURSOR]
should
+ * be advanced past committed characters (those that will not
+ * change in future calls to this method).
+ * index[LIMIT]
should be updated to reflect text
+ * replacements that shorten or lengthen the text between
+ * index[CURSOR]
and index[LIMIT]
. Upon
+ * return, neither index[CURSOR]
nor
+ * index[LIMIT]
should be less than the initial value
+ * of index[CURSOR]
. index[START]
+ * should not be changed.
+ *
+ * @param text the buffer holding transliterated and
+ * untransliterated text
+ * @param index an array of three integers. See {@link
+ * #keyboardTransliterate(Replaceable, int[], String)}.
+ * @see #keyboardTransliterate
+ */
+ protected abstract void handleKeyboardTransliterate(Replaceable text,
+ int[] index);
+
+ /**
+ * Returns the length of the longest context required by this transliterator.
+ * This is preceding context. The default implementation supplied
+ * by Transliterator
returns zero; subclasses
+ * that use preceding context should override this method to return the
+ * correct value. For example, if a transliterator translates "ddd" (where
+ * d is any digit) to "555" when preceded by "(ddd)", then the preceding
+ * context length is 5, the length of "(ddd)".
+ *
+ * @return The maximum number of preceding context characters this
+ * transliterator needs to examine
+ */
+ protected int getMaximumContextLength() {
+ return 0;
+ }
+
+ /**
+ * Returns a programmatic identifier for this transliterator.
+ * If this identifier is passed to getInstance()
, it
+ * will return this object, if it has been registered.
+ * @see #registerInstance
+ * @see #registerClass
+ * @see #getAvailableIDs
+ */
+    public final String getID() {
+        // The ID is fixed at construction time.
+        return this.ID;
+    }
+
+ /**
+ * Returns a name for this transliterator that is appropriate for
+ * display to the user in the default locale. See {@link
+ * #getDisplayName(Locale)} for details.
+ */
+ public final String getDisplayName() {
+ return getDisplayName(Locale.getDefault());
+ }
+
+ /**
+ * Returns a name for this transliterator that is appropriate for
+ * display to the user in the given locale. This name is taken
+ * from the locale resource data in the standard manner of the
+ * java.text
package.
+ *
+ * If no localized names exist in the system resource bundles,
+ * a name is synthesized using a localized
+ * MessageFormat
pattern from the resource data. The
+ * arguments to this pattern are an integer followed by one or two
+ * strings. The integer is the number of strings, either 1 or 2.
+ * The strings are formed by splitting the ID for this
+ * transliterator at the first '-'. If there is no '-', then the
+ * entire ID forms the only string.
+ * @param inLocale the Locale in which the display name should be
+ * localized.
+ * @see java.text.MessageFormat
+ */
+    public String getDisplayName(Locale inLocale) {
+        ResourceBundle bundle = ResourceBundle.getBundle(
+            RB_LOCALE_ELEMENTS, inLocale);
+
+        // First choice: a name localized explicitly for this exact ID.
+        try {
+            return bundle.getString(RB_DISPLAY_NAME_PREFIX + ID);
+        } catch (MissingResourceException e) {
+            // No explicit name for this ID; fall back to the pattern below.
+        }
+
+        // Second choice: synthesize a name from the localized pattern.
+        try {
+            // Construct the formatter first; if getString() fails
+            // we'll exit the try block
+            MessageFormat format = new MessageFormat(
+                bundle.getString(RB_DISPLAY_NAME_PATTERN));
+            // Construct the argument array: the number of entity names,
+            // followed by the ID split at its first '-' (or the whole ID).
+            int i = ID.indexOf('-');
+            Object[] args = (i < 0)
+                ? new Object[] { new Integer(1), ID }
+                : new Object[] { new Integer(2), ID.substring(0, i),
+                                 ID.substring(i+1) };
+            // Format it using the pattern in the resource
+            return format.format(args);
+        } catch (MissingResourceException e2) {
+            // The pattern is missing as well; reported by the throw below.
+        }
+
+        // We should not reach this point unless there is something
+        // wrong with the build or the RB_DISPLAY_NAME_PATTERN has
+        // been deleted from the root RB_LOCALE_ELEMENTS resource.
+        // Say what was being looked up so the failure can be diagnosed.
+        throw new RuntimeException(
+            "No display name and no \"" + RB_DISPLAY_NAME_PATTERN
+            + "\" pattern found in " + RB_LOCALE_ELEMENTS
+            + " for transliterator ID: " + ID);
+    }
+
+ /**
+ * Returns the filter used by this transliterator, or null
+ * if this transliterator uses no filter.
+ */
+    public UnicodeFilter getFilter() {
+        // May be null, meaning no filter is set.
+        return this.filter;
+    }
+
+ /**
+ * Changes the filter used by this transliterator. If the filter
+ * is set to null then no filtering will occur.
+ *
+ *
Callers must take care if a transliterator is in use by
+ * multiple threads. The filter should not be changed by one
+ * thread while another thread may be transliterating.
+ */
+ public void setFilter(UnicodeFilter filter) {
+ // Plain field assignment with no synchronization; null clears the filter.
+ this.filter = filter;
+ }
+
+ /**
+ * Returns this transliterator's inverse. See the class
+ * documentation for details. This implementation simply inverts
+ * the two entities in the ID and attempts to retrieve the
+ * resulting transliterator. That is, if getID()
+ * returns "A-B", then this method will return the result of
+ * getInstance("B-A")
, or null
if that
+ * call fails.
+ *
+ *
This method does not take filtering into account. The
+ * returned transliterator will have no filter.
+ *
+ *
Subclasses with knowledge of their inverse may wish to
+ * override this method.
+ *
+ * @return a transliterator that is an inverse, not necessarily
+ * exact, of this transliterator, or null
if no such
+ * transliterator is registered.
+ * @see #registerInstance
+ */
+    public Transliterator getInverse() {
+        final int dash = ID.indexOf('-');
+        if (dash < 0) {
+            // An ID with a single entity has nothing to swap.
+            return null;
+        }
+        // Swap the entities around the first '-': "A-B" becomes "B-A".
+        String source = ID.substring(0, dash);
+        String target = ID.substring(dash + 1);
+        // internalGetInstance() returns null rather than throwing when the
+        // inverse ID cannot be resolved.
+        return internalGetInstance(target + '-' + source);
+    }
+
+ /**
+ * Returns a Transliterator
object given its ID.
+ * The ID must be either a system transliterator ID or a ID registered
+ * using registerInstance()
.
+ *
+ * @param ID a valid ID, as enumerated by getAvailableIDs()
+ * @return A Transliterator
object with the given ID
+ * @exception IllegalArgumentException if the given ID is invalid.
+ * @see #registerInstance
+ * @see #getAvailableIDs
+ * @see #getID
+ */
+    public static Transliterator getInstance(String ID) {
+        Transliterator found = internalGetInstance(ID);
+        if (found == null) {
+            // The lookup reports failure as null; callers get an exception.
+            throw new IllegalArgumentException("Unsupported transliterator: "
+                                               + ID);
+        }
+        return found;
+    }
+
+ /**
+ * Returns a transliterator object given its ID. Unlike getInstance(),
+ * this method returns null if it cannot make use of the given ID.
+ */
+    private static Transliterator internalGetInstance(String ID) {
+        Object entry = cache.get(ID);
+        RuleBasedTransliterator.Data ruleData = null;
+
+        if (entry instanceof RuleBasedTransliterator.Data) {
+            // Rules for this ID were parsed on an earlier call; reuse them.
+            ruleData = (RuleBasedTransliterator.Data) entry;
+        } else if (entry instanceof Class) {
+            // Registered class: create a fresh instance through its
+            // no-argument constructor. If that fails, control drops to the
+            // end of the method and null is returned.
+            try {
+                return (Transliterator) ((Class) entry).newInstance();
+            } catch (InstantiationException e) {
+            } catch (IllegalAccessException e2) {}
+        } else {
+            // Placeholder or any other entry (including none at all): try
+            // to load and parse the rules from a resource bundle.
+            synchronized (cache) {
+                boolean reversed = (entry == REVERSE_RULE_BASED_PLACEHOLDER);
+
+                // The bundle name is the prefix plus the ID with its first
+                // '-' removed; reversed entries swap the two entity names.
+                int dash = ID.indexOf('-');
+                String entityNames;
+                if (dash < 0) {
+                    entityNames = ID;
+                } else {
+                    String left = ID.substring(0, dash);
+                    String right = ID.substring(dash + 1);
+                    entityNames = reversed ? (right + left) : (left + right);
+                }
+                String bundleName = RB_RULE_BASED_PREFIX + entityNames;
+
+                try {
+                    ResourceBundle ruleBundle = ResourceBundle.getBundle(bundleName);
+                    String ruleText = ruleBundle.getString(RB_RULE);
+
+                    ruleData = RuleBasedTransliterator.parse(ruleText,
+                                   reversed
+                                   ? RuleBasedTransliterator.REVERSE
+                                   : RuleBasedTransliterator.FORWARD);
+
+                    // Keep the parsed rules so later calls skip the parse.
+                    cache.put(ID, ruleData);
+                } catch (MissingResourceException e) {}
+            }
+        }
+
+        if (ruleData == null) {
+            return null;
+        }
+        // A new transliterator, with no filter, is built on every call.
+        return new RuleBasedTransliterator(ID, ruleData, null);
+    }
+
+ /**
+ * Registers a subclass of Transliterator
with the
+ * system. This subclass must have a public constructor taking no
+ * arguments. When that constructor is called, the resulting
+ * object must return the ID
passed to this method if
+ * its getID()
method is called.
+ *
+ * @param ID the result of getID()
for this
+ * transliterator
+ * @param transClass a subclass of Transliterator
+ * @see #registerInstance
+ * @see #unregister
+ */
+ public static void registerClass(String ID, Class transClass) {
+ // Stores the Class object itself, replacing any entry already cached under
+ // this ID. No check is made here that transClass is a Transliterator
+ // subclass; instantiation and the cast happen in internalGetInstance().
+ cache.put(ID, transClass);
+ }
+
+ /**
+ * Unregisters a transliterator or class. This may be either
+ * a system transliterator or a user transliterator or class.
+ *
+ * @param ID the ID of the transliterator or class
+ * @return the Object
that was registered with
+ * ID
, or null
if none was
+ * @see #registerInstance
+ * @see #registerClass
+ */
+ public static Object unregister(String ID) {
+ return cache.remove(ID);
+ }
+
+ /**
+ * Returns an enumeration over the programmatic names of registered
+ * Transliterator
objects. This includes both system
+ * transliterators and user transliterators registered using
+ * registerInstance()
. The enumerated names may be
+ * passed to getInstance()
.
+ *
+ * @return An Enumeration
over String
objects
+ * @see #getInstance
+ * @see #registerInstance
+ */
+ public static final Enumeration getAvailableIDs() {
+ return cache.keys();
+ }
+
+ static {
+ ResourceBundle bundle = ResourceBundle.getBundle(RB_LOCALE_ELEMENTS);
+
+ try {
+ String[] ruleBasedIDs = bundle.getStringArray(RB_RULE_BASED_IDS);
+
+ cache = new Hashtable();
+
+ for (int i=0; iUnicodeFilter defines a protocol for selecting a
+ * subset of the full range (U+0000 to U+FFFF) of Unicode characters.
+ * Currently, filters are used in conjunction with classes like {@link
+ * Transliterator} to only process selected characters through a
+ * transformation.
+ *
+ * {@link UnicodeFilterLogic}
+ */
+
+public interface UnicodeFilter {
+
+ /**
+ * Returns <tt>true</tt> for characters that are in the selected
+ * subset. In other words, if a character is to be
+ * filtered, then <tt>isIn()</tt> returns
+ * <tt>false</tt>.
+ *
+ * @param c the character to test
+ * @return <tt>true</tt> if <tt>c</tt> is in the selected subset,
+ * <tt>false</tt> if it is to be filtered
+ */
+ public boolean isIn(char c);
+}
diff --git a/icu4j/src/com/ibm/icu/text/UnicodeFilterLogic.java b/icu4j/src/com/ibm/icu/text/UnicodeFilterLogic.java
new file mode 100755
index 00000000000..f9e6ec1c609
--- /dev/null
+++ b/icu4j/src/com/ibm/icu/text/UnicodeFilterLogic.java
@@ -0,0 +1,112 @@
+package com.ibm.text;
+
+/**
+ * UnicodeFilterLogic
provides logical operators on
+ * {@link UnicodeFilter} objects. This class cannot be instantiated;
+ * it consists only of static methods. The static methods return
+ * filter objects that perform logical inversion (not),
+ * intersection (and), or union (or) of the given
+ * filter objects.
+ */
+public final class UnicodeFilterLogic {
+
+ /**
+ * Returns a UnicodeFilter that implements the inverse of
+ * the given filter.
+ */
+ public static UnicodeFilter not(final UnicodeFilter f) {
+ return new UnicodeFilter() {
+ public boolean isIn(char c) {
+ return !f.isIn(c);
+ }
+ };
+ }
+
+    /**
+     * Combines two filters with a short-circuit AND. The returned
+     * filter accepts a character only when both <code>f</code> and
+     * <code>g</code> accept it; when <code>f.isIn()</code> is
+     * <code>false</code>, <code>g.isIn()</code> is never called.
+     *
+     * <p>Either <code>f</code> or <code>g</code> must be non-null. If
+     * one operand is <code>null</code>, the other operand is returned
+     * unchanged.
+     *
+     * @param f the first filter, evaluated first
+     * @param g the second filter, evaluated only if <code>f</code> accepts
+     * @return the combined filter, or the single non-null operand
+     */
+    public static UnicodeFilter and(final UnicodeFilter f,
+                                    final UnicodeFilter g) {
+        // A missing operand imposes no constraint: hand back the other one.
+        if (f == null || g == null) {
+            return (f == null) ? g : f;
+        }
+        return new UnicodeFilter() {
+            public boolean isIn(char c) {
+                if (!f.isIn(c)) {
+                    return false; // short circuit: g is not consulted
+                }
+                return g.isIn(c);
+            }
+        };
+    }
+
+ /**
+ * Returns a UnicodeFilter that implements a short
+ * circuit AND of the result of the given filters. That is, if
+ * f[i].isIn() is false, then
+ * f[j].isIn() is not called, where j > i, and
+ * isIn() returns false.
+ */
+ public static UnicodeFilter and(final UnicodeFilter[] f) {
+ return new UnicodeFilter() {
+ public boolean isIn(char c) {
+ for (int i=0; iUnicodeFilter that implements a short
+ * circuit OR of the result of the two given filters. That is, if
+ * f.isIn() is true, then g.isIn() is
+ * not called, and isIn() returns true.
+ *
+ * Either f or g must be non-null.
+ */
+    public static UnicodeFilter or(final UnicodeFilter f,
+                                   final UnicodeFilter g) {
+        // A missing operand contributes nothing: hand back the other one.
+        if (f == null || g == null) {
+            return (f == null) ? g : f;
+        }
+        return new UnicodeFilter() {
+            public boolean isIn(char c) {
+                if (f.isIn(c)) {
+                    return true; // short circuit: g is not consulted
+                }
+                return g.isIn(c);
+            }
+        };
+    }
+
+ /**
+ * Returns a UnicodeFilter that implements a short
+ * circuit OR of the result of the given filters. That is, if
+ * f[i].isIn() is true, then
+ * f[j].isIn() is not called, where j > i, and
+ * isIn() returns true.
+ */
+ public static UnicodeFilter or(final UnicodeFilter[] f) {
+ return new UnicodeFilter() {
+ public boolean isIn(char c) {
+ for (int i=0; icharacter classes used in regular expressions.
+ * Such classes specify a subset of the set of all Unicode characters,
+ * which in this implementation is the characters from U+0000 to
+ * U+FFFF, ignoring surrogates.
+ *
+ * This class supports two APIs. The first is modeled after Java 2's
+ * java.util.Set
interface, although this class does not
+ * implement that interface. All methods of Set
are
+ * supported, with the modification that they take a character range
+ * or single character instead of an Object
, and they
+ * take a UnicodeSet
instead of a Collection
.
+ *
+ *
The second API is the
+ * applyPattern()
/toPattern()
API from the
+ * java.text.Format
-derived classes. Unlike the
+ * methods that add characters, add categories, and control the logic
+ * of the set, the method applyPattern()
sets all
+ * attributes of a UnicodeSet
at once, based on a
+ * string pattern.
+ *
+ *
In addition, the set complement operation is supported through
+ * the complement()
method.
+ *
+ *
Pattern syntax
+ *
+ * Patterns are accepted by the constructors and the
+ * applyPattern()
methods and returned by the
+ * toPattern()
method. These patterns follow a syntax
+ * similar to that employed by version 8 regular expression character
+ * classes:
+ *
+ *
+ *
+ *
+ * pattern := |
+ * ('[' '^'? item* ']') |
+ * ('[:' '^'? category ':]') |
+ *
+ *
+ * item := |
+ * char | (char '-' char) | pattern-expr
+ * |
+ *
+ *
+ * pattern-expr := |
+ * pattern | pattern-expr pattern |
+ * pattern-expr op pattern
+ * |
+ *
+ *
+ * op := |
+ * '&' | '-'
+ * |
+ *
+ *
+ * special := |
+ * '[' | ']' | '-'
+ * |
+ *
+ *
+ * char := |
+ * any character that is not special
+ * | ('\u005C' any character)
+ * | ('\u005Cu' hex hex hex hex)
+ * |
+ *
+ *
+ * hex := |
+ * any character for which
+ * Character.digit(c, 16)
+ * returns a non-negative result |
+ *
+ *
+ * category := |
+ * 'M' | 'N' | 'Z' | 'C' | 'L' | 'P' |
+ * 'S' | 'Mn' | 'Mc' | 'Me' | 'Nd' | 'Nl' | 'No' | 'Zs' | 'Zl' |
+ * 'Zp' | 'Cc' | 'Cf' | 'Cs' | 'Co' | 'Cn' | 'Lu' | 'Ll' | 'Lt'
+ * | 'Lm' | 'Lo' | 'Pc' | 'Pd' | 'Ps' | 'Pe' | 'Po' | 'Sm' |
+ * 'Sc' | 'Sk' | 'So' |
+ *
+ *
+ *
+ *
+ *
+ * Legend:
+ *
+ * a := b |
+ * |
+ * a may be replaced by b |
+ *
+ *
+ * a? |
+ * |
+ * zero or one instance of a
+ * |
+ *
+ *
+ * a* |
+ * |
+ * zero or more instances of a
+ * |
+ *
+ *
+ * a | b |
+ * |
+ * either a or b
+ * |
+ *
+ *
+ * 'a' |
+ * |
+ * the literal string between the quotes |
+ *
+ *
+ * |
+ *
+ *
+ *
+ *
+ * Patterns specify individual characters, ranges of characters, and
+ * Unicode character categories. When elements are concatenated, they
+ * specify their union. To complement a set, place a '^' immediately
+ * after the opening '[' or '[:'. In any other location, '^' has no
+ * special meaning.
+ *
+ * Ranges are indicated by placing a '-' between two
+ * characters, as in "a-z". This specifies the range of all
+ * characters from the left to the right, in Unicode order. If the
+ * left and right characters are the same, then the range consists of
+ * just that character. If the left character is greater than the
+ * right character it is a syntax error. If a '-' occurs as the first
+ * character after the opening '[' or '[^', or if it occurs as the
+ * last character before the closing ']', then it is taken as a
+ * literal. Thus "[a\u005C-b]", "[-ab]", and "[ab-]" all indicate the same
+ * set of three characters, 'a', 'b', and '-'.
+ *
+ *
Sets may be intersected using the '&' operator or the asymmetric
+ * set difference may be taken using the '-' operator, for example,
+ * "[[:L:]&[\u005Cu0000-\u005Cu0FFF]]" indicates the set of all Unicode letters
+ * with values less than 4096. Operators ('&' and '-') have equal
+ * precedence and bind left-to-right. Thus
+ * "[[:L:]-[a-z]-[\u005Cu0100-\u005Cu01FF]]" is equivalent to
+ * "[[[:L:]-[a-z]]-[\u005Cu0100-\u005Cu01FF]]". This only really matters for
+ * difference; intersection is commutative.
+ *
+ *
+ * [a] | The set containing 'a'
+ * |
[a-z] | The set containing 'a'
+ * through 'z' and all letters in between, in Unicode order
+ * |
[^a-z] | The set containing
+ * all characters but 'a' through 'z',
+ * that is, U+0000 through 'a'-1 and 'z'+1 through U+FFFF
+ * |
[[pat1][pat2]]
+ * | The union of sets specified by pat1 and pat2
+ * |
[[pat1]&[pat2]]
+ * | The intersection of sets specified by pat1 and pat2
+ * |
[[pat1]-[pat2]]
+ * | The asymmetric difference of sets specified by pat1 and
+ * pat2
+ * |
[:Lu:]
+ * | The set of characters belonging to the given
+ * Unicode category, as defined by Character.getType() ; in
+ * this case, Unicode uppercase letters
+ * |
[:L:]
+ * | The set of characters belonging to all Unicode categories
+ * starting with 'L', that is, [[:Lu:][:Ll:][:Lt:][:Lm:][:Lo:]].
+ * |
+ *
+ * Character categories.
+ *
+ * Character categories are specified using the POSIX-like syntax
+ * '[:Lu:]'. The complement of a category is specified by inserting
+ * '^' after the opening '[:'. The following category names are
+ * recognized. Actual determination of category data uses
+ * Character.getType()
, so it reflects the underlying
+ * implementation used by Character
. As of Java 2 and
+ * JDK 1.1.8, this is Unicode 2.1.2.
+ *
+ *
+ * Normative
+ * Mn = Mark, Non-Spacing
+ * Mc = Mark, Spacing Combining
+ * Me = Mark, Enclosing
+ *
+ * Nd = Number, Decimal Digit
+ * Nl = Number, Letter
+ * No = Number, Other
+ *
+ * Zs = Separator, Space
+ * Zl = Separator, Line
+ * Zp = Separator, Paragraph
+ *
+ * Cc = Other, Control
+ * Cf = Other, Format
+ * Cs = Other, Surrogate
+ * Co = Other, Private Use
+ * Cn = Other, Not Assigned
+ *
+ * Informative
+ * Lu = Letter, Uppercase
+ * Ll = Letter, Lowercase
+ * Lt = Letter, Titlecase
+ * Lm = Letter, Modifier
+ * Lo = Letter, Other
+ *
+ * Pc = Punctuation, Connector
+ * Pd = Punctuation, Dash
+ * Ps = Punctuation, Open
+ * Pe = Punctuation, Close
+ * *Pi = Punctuation, Initial quote
+ * *Pf = Punctuation, Final quote
+ * Po = Punctuation, Other
+ *
+ * Sm = Symbol, Math
+ * Sc = Symbol, Currency
+ * Sk = Symbol, Modifier
+ * So = Symbol, Other
+ *
+ * *Unsupported by Java (and hence unsupported by UnicodeSet).
+ *
+ * @author Alan Liu
+ * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $ */
+public class UnicodeSet {
+ /**
+ * The internal representation is a StringBuffer of even length.
+ * Each pair of characters represents a range that is included in
+ * the set. A single character c is represented as cc. Thus, the
+ * ranges in the set are (a,b), a and b inclusive, where a =
+ * pairs.charAt(i) and b = pairs.charAt(i+1) for all even i, 0 <=
+ * i <= pairs.length()-2. Pairs are always stored in ascending
+ * Unicode order. Pairs are always stored in shortest form. For
+ * example, if the pair "hh", representing the single character
+ * 'h', is added to the pairs list "agik", representing the ranges
+ * 'a'-'g' and 'i'-'k', the result is "ak", not "aghhik".
+ *
+ * This representation format was originally used in Richard
+ * Gillam's CharSet class.
+ */
+ private StringBuffer pairs;
+
+ private static final String CATEGORY_NAMES =
+ // Ruler: the category index of each two-letter name (tens digit above ones digit).
+ //                    1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2
+ //0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8
+ "CnLuLlLtLmLoMnMeMcNdNlNoZsZlZpCcCf--CoCsPdPsPePcPoSmScSkSo";
+
+ private static final int UNSUPPORTED_CATEGORY = 17; // slot of the "--" placeholder in CATEGORY_NAMES
+
+ private static final int CATEGORY_COUNT = 29; // number of two-character slots in CATEGORY_NAMES
+
+ /**
+ * A cache mapping character category integers, as returned by
+ * Character.getType(), to pairs strings. Entries are initially
+ * null and are created on demand.
+ */
+ private static final String[] CATEGORY_PAIRS_CACHE =
+ new String[CATEGORY_COUNT];
+
+ //----------------------------------------------------------------
+ // Debugging and testing
+ //----------------------------------------------------------------
+
+ /**
+ * Return the representation of this set as a list of character
+ * ranges. Ranges are listed in ascending Unicode order. For
+ * example, the set [a-zA-M3] is represented as "33AMaz".
+ *
+ * @return a string copy of the internal pairs list, two characters
+ * (range start and range end) per range
+ */
+ public String getPairs() {
+ return pairs.toString();
+ }
+
+ //----------------------------------------------------------------
+ // Public API
+ //----------------------------------------------------------------
+
+ /**
+ * Constructs an empty set, that is, one whose pairs list has
+ * length zero.
+ */
+ public UnicodeSet() {
+ pairs = new StringBuffer();
+ }
+
+ /**
+ * Constructs a set from the given pattern. See the class description
+ * for the syntax of the pattern language. Spaces in the pattern are
+ * not ignored; the pattern is applied exactly as by
+ * <code>applyPattern(pattern, false)</code>.
+ * @param pattern a string specifying what characters are in the set
+ * @exception IllegalArgumentException if the pattern contains
+ * a syntax error.
+ */
+ public UnicodeSet(String pattern) {
+ applyPattern(pattern, false);
+ }
+
+ /**
+ * Constructs a set from the given pattern, optionally ignoring
+ * white space. See the class description for the syntax of the
+ * pattern language.
+ * @param pattern a string specifying what characters are in the set
+ * @param ignoreSpaces if <code>true</code>, all spaces in the
+ * pattern are ignored, except those preceded by '\u005C'. Spaces are
+ * those characters for which <code>Character.isSpaceChar()</code>
+ * is <code>true</code>.
+ * @exception IllegalArgumentException if the pattern
+ * contains a syntax error.
+ */
+ public UnicodeSet(String pattern, boolean ignoreSpaces) {
+ applyPattern(pattern, ignoreSpaces);
+ }
+
+    /**
+     * Constructs a set holding the characters of a single Unicode
+     * character category.
+     * @param category an integer indicating the character category as
+     * returned by <code>Character.getType()</code>.
+     * @exception IllegalArgumentException if the given category is
+     * negative, is not below <code>CATEGORY_COUNT</code>, or equals
+     * <code>UNSUPPORTED_CATEGORY</code>.
+     */
+    public UnicodeSet(int category) {
+        boolean inRange = (category >= 0 && category < CATEGORY_COUNT);
+        if (!inRange || category == UNSUPPORTED_CATEGORY) {
+            throw new IllegalArgumentException("Invalid category");
+        }
+        // Copy the category's pairs string into a private, mutable buffer.
+        String categoryPairs = getCategoryPairs(category);
+        pairs = new StringBuffer(categoryPairs);
+    }
+
+ /**
+ * Modifies this set to represent the set specified by the given
+ * pattern. See the class description for the syntax of the
+ * pattern language. Spaces in the pattern are not ignored; this
+ * call delegates to <code>applyPattern(pattern, false)</code>.
+ * @param pattern a string specifying what characters are in the set
+ * @exception IllegalArgumentException if the pattern
+ * contains a syntax error.
+ */
+ public final void applyPattern(String pattern) {
+ applyPattern(pattern, false);
+ }
+
+ /**
+ * Modifies this set to represent the set specified by the given
+ * pattern, optionally ignoring white space. See the class
+ * description for the syntax of the pattern language.
+ * @param pattern a string specifying what characters are in the set
+ * @param ignoreSpaces if true
, all spaces in the
+ * pattern are ignored. Spaces are those characters for which
+ * Character.isSpaceChar()
is true
.
+ * Characters preceded by '\\' are escaped, losing any special
+ * meaning they otherwise have. Spaces may be included by
+ * escaping them.
+ * @exception IllegalArgumentException
if the pattern
+ * contains a syntax error.
+ */
+ public void applyPattern(String pattern, boolean ignoreSpaces) {
+ ParsePosition pos = new ParsePosition(0);
+
+ // To ignore spaces, create a new pattern without spaces. We
+ // have to process all '\' escapes. If '\' is encountered,
+ // insert it and the following character (if any -- let parse
+ // deal with any syntax errors) in the pattern. This allows
+ // escaped spaces.
+ if (ignoreSpaces) {
+ StringBuffer pat = new StringBuffer();
+ for (int i=0; in, where 0 <=
n <= 65536
.
+ *
+ * @return the number of elements in this set (its cardinality).
+ */
+ public int size() {
+ int n = 0;
+ for (int i=0; itrue if this set contains no elements.
+ *
+ * @return true if this set contains no elements.
+ */
+ public boolean isEmpty() {
+ return pairs.length() == 0; // an empty pairs list means no ranges are stored
+ }
+
+ /**
+ * Returns true if this set contains the specified range
+ * of chars.
+ *
+ * @return true if this set contains the specified range
+ * of chars.
+ */
+ public boolean contains(char first, char last) {
+ // Set i to the end of the smallest range such that its end
+ // point >= last, or pairs.length() if no such range exists.
+ int i = 1;
+ while (ipairs.charAt(i)) i+=2;
+ return i=pairs.charAt(i-1);
+ }
+
+ /**
+ * Returns true if this set contains the specified char.
+ *
+ * @param c the character to look up
+ * @return true if this set contains the specified char.
+ */
+ public boolean contains(char c) {
+ return contains(c, c); // a single char is checked as the range c-c
+ }
+
+    /**
+     * Adds the specified range to this set if it is not already
+     * present. If this set already contains the specified range,
+     * the call leaves this set unchanged. If <code>first > last</code>
+     * the range is empty and the set is likewise left unchanged.
+     *
+     * @param first first character, inclusive, of range to be added
+     * to this set.
+     * @param last last character, inclusive, of range to be added
+     * to this set.
+     */
+    public void add(char first, char last) {
+        if (first > last) {
+            return; // inverted bounds denote an empty range
+        }
+        addPair(pairs, first, last);
+    }
+
+ /**
+ * Adds the specified character to this set if it is not already
+ * present. If this set already contains the specified character,
+ * the call leaves this set unchanged.
+ *
+ * @param c the character to add to this set
+ */
+ public final void add(char c) {
+ add(c, c); // a single char is added as the range c-c
+ }
+
+    /**
+     * Removes the specified range from this set if it is present.
+     * The set will not contain the specified range once the call
+     * returns. If <code>first > last</code> the range is empty and
+     * the set is left unchanged.
+     *
+     * @param first first character, inclusive, of range to be removed
+     * from this set.
+     * @param last last character, inclusive, of range to be removed
+     * from this set.
+     */
+    public void remove(char first, char last) {
+        if (first > last) {
+            return; // inverted bounds denote an empty range
+        }
+        removePair(pairs, first, last);
+    }
+
+ /**
+ * Removes the specified character from this set if it is present.
+ * The set will not contain the specified character once the call
+ * returns.
+ *
+ * @param c the character to remove from this set
+ */
+ public final void remove(char c) {
+ remove(c, c); // a single char is removed as the range c-c
+ }
+
+ /**
+ * Returns true if the specified set is a subset
+ * of this set.
+ *
+ * @param c set to be checked for containment in this set.
+ * @return true if this set contains all of the elements of the
+ * specified set.
+ */
+ public boolean containsAll(UnicodeSet c) {
+ // The specified set is a subset if all of its pairs are contained
+ // in this set.
+ int i = 1;
+ for (int j=0; j= last, or pairs.length() if no such range
+ // exists.
+ while (ipairs.charAt(i)) i+=2;
+ if (i>pairs.length() || c.pairs.charAt(j) < pairs.charAt(i-1)) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /**
+ * Adds all of the elements in the specified set to this set if
+ * they're not already present. This operation effectively
+ * modifies this set so that its value is the <i>union</i> of the two
+ * sets. The behavior of this operation is unspecified if the specified
+ * set is modified while the operation is in progress.
+ *
+ * @param c set whose elements are to be added to this set.
+ * @see #add(char, char)
+ */
+ public void addAll(UnicodeSet c) {
+ doUnion(pairs, c.pairs.toString()); // operates on a string copy of c's pairs list
+ }
+
+ /**
+ * Retains only the elements in this set that are contained in the
+ * specified set. In other words, removes from this set all of
+ * its elements that are not contained in the specified set. This
+ * operation effectively modifies this set so that its value is
+ * the <i>intersection</i> of the two sets.
+ *
+ * @param c set that defines which elements this set will retain.
+ */
+ public void retainAll(UnicodeSet c) {
+ doIntersection(pairs, c.pairs.toString()); // operates on a string copy of c's pairs list
+ }
+
+ /**
+ * Removes from this set all of its elements that are contained in the
+ * specified set. This operation effectively modifies this
+ * set so that its value is the <i>asymmetric set difference</i> of
+ * the two sets.
+ *
+ * @param c set that defines which elements will be removed from
+ * this set.
+ */
+ public void removeAll(UnicodeSet c) {
+ doDifference(pairs, c.pairs.toString()); // operates on a string copy of c's pairs list
+ }
+
+ /**
+ * Inverts this set. This operation modifies this set so that
+ * its value is its complement. This is equivalent to the pseudo code:
+ * <code>this = new UnicodeSet("[\u005Cu0000-\u005CuFFFF]").removeAll(this)</code>.
+ */
+ public void complement() {
+ doComplement(pairs); // edits the pairs list in place
+ }
+
+ /**
+ * Removes all of the elements from this set. This set will be
+ * empty after this call returns.
+ */
+ public void clear() {
+ pairs.setLength(0); // truncates the existing buffer rather than replacing it
+ }
+
+ /**
+ * Compares the specified object with this set for equality. Returns
+ * true if the specified object is also a set, the two sets
+ * have the same size, and every member of the specified set is
+ * contained in this set (or equivalently, every member of this set is
+ * contained in the specified set).
+ *
+ * @param o Object to be compared for equality with this set.
+ * @return true if the specified Object is equal to this set.
+ */
+ public boolean equals(Object o) {
+ return o instanceof UnicodeSet &&
+ pairs.equals(((UnicodeSet)o).pairs);
+ }
+
+ /**
+ * Returns the hash code value for this set.
+ *
+ * @return the hash code value for this set.
+ * @see Object#hashCode()
+ */
+ public int hashCode() {
+ return pairs.hashCode();
+ }
+
+ //----------------------------------------------------------------
+ // Implementation: Pattern parsing
+ //----------------------------------------------------------------
+
+ /**
+ * Parses the given pattern, starting at the given position. The
+ * character at pattern.charAt(pos.getIndex()) must be '[', or the
+ * parse fails. Parsing continues until the corresponding closing
+ * ']'. If a syntax error is encountered between the opening and
+ * closing brace, the parse fails. Upon return from a successful
+ * parse, the ParsePosition is updated to point to the character
+ * following the closing ']', and a StringBuffer containing a
+ * pairs list for the parsed pattern is returned. This method calls
+ * itself recursively to parse embedded subpatterns.
+ *
+ * @param pattern the string containing the pattern to be parsed.
+ * The portion of the string from pos.getIndex(), which must be a
+ * '[', to the corresponding closing ']', is parsed.
+ * @param pos upon entry, the position at which to begin parsing.
+ * The character at pattern.charAt(pos.getIndex()) must be a '['.
+ * Upon return from a successful parse, pos.getIndex() is either
+ * the character after the closing ']' of the parsed pattern, or
+ * pattern.length() if the closing ']' is the last character of
+ * the pattern string.
+ * @return a StringBuffer containing a pairs list for the parsed
+ * substring of pattern
+ * @exception IllegalArgumentException if the parse fails.
+ */
+ private static StringBuffer parse(String pattern, ParsePosition pos) {
+
+ boolean invert = false;
+ StringBuffer pairsBuf = new StringBuffer();
+
+ /**
+ * Nodes: 0 - idle, waiting for '['
+ * 10 - like 11, but immediately after "[" or "[^"
+ * 11 - awaiting x, "]", "[...]", or "[:...:]"
+ * 21 - after x
+ * 23 - after x-
+ *
+ * The parsing state machine moves from node 0 through zero or more
+ * other nodes back to node 0, in a successful parse.
+ */
+ int node = 0;
+ char first = 0;
+ int i;
+
+ /**
+ * This loop iterates over the characters in the pattern. We
+ * start at the position specified by pos. We exit the loop
+ * when either a matching closing ']' is seen, or we read all
+ * characters of the pattern.
+ */
+ for (i=pos.getIndex(); i= pattern.length()) {
+ throw new IllegalArgumentException("Invalid \\u escape");
+ }
+ c = '\u0000';
+ for (int j=(++i)+4; i "aq". addPair("ampz", 'n',
+ * 'o') => "az".
+ */
+ private static void addPair(StringBuffer pairs, char c, char d) {
+ char a = 0;
+ char b = 0;
+ for (int i=0; i "ak".
+ * removePair("ampz", 'l', 'q') => "akrz".
+ */
+ private static void removePair(StringBuffer pairs, char c, char d) {
+ // Iterate over pairs until we find a pair that overlaps
+ // with the given range.
+ for (int i=0; i= a.
+ // rangeEdited is set to true if we have modified the
+ // range a-b (the range at i) in place.
+ boolean rangeEdited = false;
+ if (c > a) {
+ // If c is after a and before b, then we have overlap
+ // of this sort: a--c==b--d or a--c==d--b, where a-b
+ // and c-d are the ranges of interest. We need to
+ // add the range a,c-1.
+ pairs.setCharAt(i+1, (char)(c-1));
+ // i is already a
+ rangeEdited = true;
+ }
+ if (d < b) {
+ // If d is after a and before b, we overlap like this:
+ // c--a==d--b or a--c==d--b, where a-b is the range at
+ // i and c-d is the range being removed. We need to
+ // add the range d+1,b.
+ if (rangeEdited) {
+ pairs.insert(i+2, new char[] { (char)(d+1), b });
+ i += 2;
+ } else {
+ pairs.setCharAt(i, (char)(d+1));
+ // i+1 is already b
+ rangeEdited = true;
+ }
+ }
+ if (!rangeEdited) {
+ // If we didn't add any ranges, that means the entire
+ // range a-b must be deleted, since we have
+ // c--a==b--d.
+ stringBufferDelete(pairs, i, i+2);
+ i -= 2;
+ }
+ }
+ }
+
+ //----------------------------------------------------------------
+ // Implementation: Fundamental operators
+ //----------------------------------------------------------------
+
+ /**
+ * Changes the pairs list to represent the complement of the set it
+ * currently represents. The pairs list will be normalized (in
+ * order and in shortest possible form) if the original pairs list
+ * was normalized.
+ */
+ private static void doComplement(StringBuffer pairs) {
+ if (pairs.length() == 0) {
+ pairs.append('\u0000').append('\uffff');
+ return;
+ }
+
+ // Change each end to a start and each start to an end of the
+ // gaps between the ranges. That is, 3-7 9-12 becomes x-2 8-8
+ // 13-x, where 'x' represents a range that must now be fixed
+ // up.
+ for (int i=0; i 0 && c1.charAt(i - 1) > ub)
+ ub = c1.charAt(i - 1);
+
+ // now advance j to the first character that is greater
+ // than "ub" plus one
+ while (j < c2.length() && c2.charAt(j) <= ub + 1)
+ ++j;
+
+ // if j points to the endpoint of a range, update "ub"
+ // to that character, or if j points to the start of
+ // a range and the endpoint of the preceding range is
+ // greater than "ub", update "ub" to _that_ character
+ if (j % 2 == 1)
+ ub = c2.charAt(j);
+ else if (j > 0 && c2.charAt(j - 1) > ub)
+ ub = c2.charAt(j - 1);
+ }
+ // when we finally fall out of this loop, we will have stitched
+ // together a series of ranges that overlap or touch, i and j
+ // will both point to starting points of ranges, and "ub" will
+ // be the endpoint of the range we're working on. Write "ub"
+ // to the result
+ result.append(ub);
+
+ // loop back around to create the next range in the result
+ }
+
+ // we fall out to here when we've exhausted all the characters in
+ // one of the operands. We can append all of the remaining characters
+ // in the other operand without doing any extra work.
+ if (i < c1.length())
+ result.append(c1.substring(i));
+ if (j < c2.length())
+ result.append(c2.substring(j));
+
+ pairs.setLength(0);
+ pairs.append(result.toString());
+ }
+
+    /**
+     * Replaces the contents of the first pairs list with the asymmetric
+     * difference of the two sets (first minus second).
+     *
+     * @param pairs the pairs list to modify in place
+     * @param pairs2 the pairs list of the set to subtract
+     */
+    private static void doDifference(StringBuffer pairs, String pairs2) {
+        // A - B is computed as A & ~B, using a scratch copy for ~B.
+        StringBuffer inverted = new StringBuffer(pairs2);
+        doComplement(inverted);
+        String mask = inverted.toString();
+        doIntersection(pairs, mask);
+    }
+
+ /**
+ * Given two pairs lists, changes the first in place to represent
+ * the intersection of the two sets. The result is assembled in a
+ * scratch buffer and copied over <code>pairs</code> at the end.
+ *
+ * This implementation format was adopted from Richard Gillam's
+ * CharSet class.
+ *
+ * @param pairs the first pairs list; receives the result
+ * @param c2 the second pairs list; only read
+ */
+ private static void doIntersection(StringBuffer pairs, String c2) {
+ StringBuffer result = new StringBuffer();
+ String c1 = pairs.toString();
+
+ int i = 0;
+ int j = 0;
+ int oldI;
+ int oldJ;
+
+ // iterate until we've exhausted one of the operands
+ while (i < c1.length() && j < c2.length()) {
+
+ // advance j until it points to a character that is larger than
+ // the one i points to. If this is the beginning of a one-
+ // character range, advance j to point to the end
+ if (i < c1.length() && i % 2 == 0) {
+ while (j < c2.length() && c2.charAt(j) < c1.charAt(i))
+ ++j;
+ if (j < c2.length() && j % 2 == 0 && c2.charAt(j) == c1.charAt(i))
+ ++j;
+ }
+
+ // if j points to the endpoint of a range, save the current
+ // value of i, then advance i until it reaches a character
+ // which is larger than the character pointed at
+ // by j. All of the characters we've advanced over (except
+ // the one currently pointed to by i) are added to the result
+ oldI = i;
+ while (j % 2 == 1 && i < c1.length() && c1.charAt(i) <= c2.charAt(j))
+ ++i;
+ result.append(c1.substring(oldI, i));
+
+ // if i points to the endpoint of a range, save the current
+ // value of j, then advance j until it reaches a character
+ // which is larger than the character pointed at
+ // by i. All of the characters we've advanced over (except
+ // the one currently pointed to by j) are added to the result
+ oldJ = j;
+ while (i % 2 == 1 && j < c2.length() && c2.charAt(j) <= c1.charAt(i))
+ ++j;
+ result.append(c2.substring(oldJ, j));
+
+ // advance i until it points to a character larger than j
+ // If it points at the beginning of a one-character range,
+ // advance it to the end of that range
+ if (j < c2.length() && j % 2 == 0) {
+ while (i < c1.length() && c1.charAt(i) < c2.charAt(j))
+ ++i;
+ if (i < c1.length() && i % 2 == 0 && c2.charAt(j) == c1.charAt(i))
+ ++i;
+ }
+ }
+
+ pairs.setLength(0);
+ pairs.append(result.toString());
+ }
+
+ //----------------------------------------------------------------
+ // Implementation: Generation of pairs for Unicode categories
+ //----------------------------------------------------------------
+
+ /**
+ * Returns a pairs string for the given category, given its name.
+ * The category name must be either a two-letter name, such as
+ * "Lu", or a one letter name, such as "L". One-letter names
+ * indicate the logical union of all two-letter names that start
+ * with that letter. Case is significant. If the name starts
+ * with the character '^' then the complement of the given
+ * character set is returned.
+ *
+ * Although individual categories such as "Lu" are cached, we do
+ * not currently cache single-letter categories such as "L" or
+ * complements such as "^Lu" or "^L". It would be easy to cache
+ * these as well in a hashtable should the need arise.
+ */
+ private static String getCategoryPairs(String catName) {
+ boolean invert = (catName.length() > 1 &&
+ catName.charAt(0) == '^');
+ if (invert) {
+ catName = catName.substring(1);
+ }
+
+ StringBuffer cat = null;
+
+ // if we have two characters, search the category map for that
+ // code and either construct and return a UnicodeSet from the
+ // data in the category map or throw an exception
+ if (catName.length() == 2) {
+ int i = CATEGORY_NAMES.indexOf(catName);
+ if (i>=0 && i%2==0) {
+ i /= 2;
+ if (i != UNSUPPORTED_CATEGORY) {
+ String pairs = getCategoryPairs(i);
+ if (!invert) {
+ return pairs;
+ }
+ cat = new StringBuffer(pairs);
+ }
+ }
+ } else if (catName.length() == 1) {
+ // if we have one character, search the category map for
+ // codes beginning with that letter, and union together
+ // all of the matching sets that we find (or throw an
+ // exception if there are no matches)
+ for (int i=0; i= 0) {
+ pairs.append((char)first).append((char)last);
+ }
+ first = last = i;
+ }
+ }
+ }
+ if (first >= 0) {
+ pairs.append((char)first).append((char)last);
+ }
+ CATEGORY_PAIRS_CACHE[cat] = pairs.toString();
+ }
+ return CATEGORY_PAIRS_CACHE[cat];
+ }
+
+ //----------------------------------------------------------------
+ // Implementation: Utility methods
+ //----------------------------------------------------------------
+
+ /**
+ * Returns the character after the given position, or '\uFFFF' if
+ * there is none.
+
+ */
+ private static final char charAfter(String str, int i) {
+ return ((++i) < str.length()) ? str.charAt(i) : '\uFFFF';
+ }
+
+    /**
+     * Deletes a range of characters from a StringBuffer, from start to
+     * limit-1. This is not part of JDK 1.1 StringBuffer, but is
+     * present in Java 2.
+     * @param buf the buffer to modify in place
+     * @param start inclusive start of range
+     * @param limit exclusive end of range
+     */
+    private static void stringBufferDelete(StringBuffer buf,
+                                           int start, int limit) {
+        // In Java 2 just use:
+        //   buf.delete(start, limit);
+        int length = buf.length();
+        if (length <= limit) {
+            // Nothing lies beyond the deleted range: truncating is enough.
+            buf.setLength(start);
+            return;
+        }
+        // Save the tail, cut the buffer back to start, re-attach the tail.
+        char[] tail = new char[length - limit];
+        buf.getChars(limit, length, tail, 0);
+        buf.setLength(start);
+        buf.append(tail);
+    }
+}
diff --git a/icu4j/src/com/ibm/icu/text/UnicodeToHexTransliterator.java b/icu4j/src/com/ibm/icu/text/UnicodeToHexTransliterator.java
new file mode 100755
index 00000000000..1e688f65fa9
--- /dev/null
+++ b/icu4j/src/com/ibm/icu/text/UnicodeToHexTransliterator.java
@@ -0,0 +1,172 @@
+package com.ibm.text;
+import java.util.*;
+
+/**
+ * A transliterator that converts from Unicode characters to
+ * hexadecimal Unicode escape sequences. It outputs a
+ * prefix specified in the constructor and optionally converts the hex
+ * digits to uppercase.
+ *
+ * Copyright © IBM Corporation 1999. All rights reserved.
+ *
+ * @author Alan Liu
+ * @version $RCSfile: UnicodeToHexTransliterator.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
+ */
+public class UnicodeToHexTransliterator extends Transliterator {
+
+ /**
+ * Package accessible ID for this transliterator.
+ */
+ static String _ID = "Unicode-Hex";
+
+ private String prefix;
+
+ private boolean uppercase;
+
+ private static final String COPYRIGHT =
+ "\u00A9 IBM Corporation 1999. All rights reserved.";
+
+ /**
+  * Constructs a transliterator that replaces each character with
+  * the given prefix followed by four hex digits.
+  * @param prefix the string that will precede the four hex digits
+  * @param uppercase if true, the hex digits are converted to
+  * uppercase; otherwise they are left as produced by
+  * Integer.toHexString
+  * @param filter handed to the superclass constructor; when
+  * non-null, characters for which filter.isIn() returns false are
+  * left unchanged by this transliterator
+  */
+ public UnicodeToHexTransliterator(String prefix, boolean uppercase,
+                                   UnicodeFilter filter) {
+     super(_ID, filter);
+     this.uppercase = uppercase;
+     this.prefix = prefix;
+ }
+
+ /**
+  * Constructs a transliterator with the default prefix
+  * "&#92;u" (a backslash followed by the letter u) that outputs
+  * uppercase hex digits and has no filter.
+  */
+ public UnicodeToHexTransliterator() {
+     // The prefix is spelled with an HTML entity in the Javadoc above
+     // because a bare backslash-u that is not followed by four hex
+     // digits is rejected by the compiler even inside a comment.
+     this("\\u", true, null);
+ }
+
+ /**
+  * Returns the string that this transliterator writes before the
+  * four hex digits of each converted character.
+  * @return the current prefix string
+  */
+ public String getPrefix() {
+     return this.prefix;
+ }
+
+ /**
+  * Sets the string that precedes the four hex digits.
+  *
+  * <p>Callers must take care if a transliterator is in use by
+  * multiple threads. The prefix should not be changed by one
+  * thread while another thread may be transliterating.
+  * @param prefix the new prefix string
+  */
+ public void setPrefix(String prefix) {
+     this.prefix = prefix;
+ }
+
+ /**
+  * Reports the hex-digit case used by this transliterator.
+  * @return true if hex digits are output in uppercase
+  */
+ public boolean isUppercase() {
+     return this.uppercase;
+ }
+
+ /**
+  * Sets whether this transliterator outputs uppercase hex digits.
+  *
+  * <p>Callers must take care if a transliterator is in use by
+  * multiple threads. The uppercase mode should not be changed by
+  * one thread while another thread may be transliterating.
+  * @param outputUppercase if true, then this transliterator
+  * outputs uppercase hex digits.
+  */
+ public void setUppercase(boolean outputUppercase) {
+     this.uppercase = outputUppercase;
+ }
+
+ /**
+  * Transliterates a segment of a string. <code>Transliterator</code>
+  * API. The work is delegated to handleKeyboardTransliterate with
+  * the cursor placed at start.
+  * @param text the string to be transliterated
+  * @param start the beginning index, inclusive; <code>0 &lt;= start
+  * &lt;= limit</code>.
+  * @param limit the ending index, exclusive; <code>start &lt;= limit
+  * &lt;= text.length()</code>.
+  * @return the new limit index
+  */
+ public int transliterate(Replaceable text, int start, int limit) {
+     int[] positions = new int[] { start, limit, start };
+     handleKeyboardTransliterate(text, positions);
+     return positions[LIMIT];
+ }
+
+ /**
+ * Implements {@link Transliterator#handleKeyboardTransliterate}.
+ */
+ protected void handleKeyboardTransliterate(Replaceable text,
+ int[] offsets) {
+ /**
+ * Performs transliteration changing all characters to
+ * Unicode hexadecimal escapes. For example, '@' -> "U+0040",
+ * assuming the prefix is "U+".
+ */
+ int cursor = offsets[CURSOR];
+ int limit = offsets[LIMIT];
+
+ UnicodeFilter filter = getFilter();
+
+ loop:
+ while (cursor < limit) {
+ char c = text.charAt(cursor);
+ if (filter != null && !filter.isIn(c)) {
+ ++cursor;
+ continue;
+ }
+ String hex = hex(c);
+ text.replace(cursor, cursor+1, hex);
+ int len = hex.length();
+ cursor += len; // Advance cursor by 1 and adjust for new text
+ --len;
+ limit += len;
+ }
+
+ offsets[LIMIT] = limit;
+ offsets[CURSOR] = cursor;
+ }
+
+ /**
+ * Return the length of the longest context required by this transliterator.
+ * This is preceding context.
+ * @param direction either FORWARD
or REVERSE
+ * @return maximum number of preceding context characters this
+ * transliterator needs to examine
+ */
+ protected int getMaximumContextLength() {
+ return 0;
+ }
+
+ /**
+  * Forms the escape sequence for one character: the current prefix
+  * followed by the character's value in hex, left-padded with
+  * zeros to four digits and uppercased when the uppercase flag is
+  * set.
+  * @param c the character to escape
+  * @return prefix plus four hex digits
+  */
+ private final String hex(char c) {
+     String digits = Integer.toHexString(c);
+     if (uppercase) {
+         digits = digits.toUpperCase();
+     }
+     StringBuffer out = new StringBuffer();
+     out.append(prefix);
+     // A char yields one to four hex digits; pad up to four.
+     for (int pad = 4 - digits.length(); pad > 0; --pad) {
+         out.append('0');
+     }
+     out.append(digits);
+     return out.toString();
+ }
+}
diff --git a/icu4j/src/com/ibm/test/translit/TransliteratorTest.java b/icu4j/src/com/ibm/test/translit/TransliteratorTest.java
new file mode 100755
index 00000000000..96433f64a26
--- /dev/null
+++ b/icu4j/src/com/ibm/test/translit/TransliteratorTest.java
@@ -0,0 +1,763 @@
+import com.ibm.text.*;
+import java.text.*;
+import java.util.*;
+
+/**
+ * @test
+ * @summary General test of Transliterator
+ */
+public class TransliteratorTest extends IntlTest {
+
+ public static void main(String[] args) throws Exception {
+ new TransliteratorTest().run(args);
+ }
+
+ /**
+ * A CommonPoint legacy round-trip test for the Kana transliterator.
+ */
+// public void TestKanaRoundTrip() {
+// Transliterator t = Transliterator.getInstance("Kana");
+// StringTokenizer tok = new StringTokenizer(KANA_RT_DATA);
+// while (tok.hasMoreTokens()) {
+// String str = tok.nextToken();
+// ReplaceableString tmp = new ReplaceableString(str);
+// t.transliterate(tmp, Transliterator.FORWARD);
+//
+// str = tmp.toString();
+// tmp = new ReplaceableString(str);
+// t.transliterate(tmp, Transliterator.REVERSE);
+// t.transliterate(tmp, Transliterator.FORWARD);
+// if (!tmp.toString().equals(str)) {
+// tmp = new ReplaceableString(str);
+// t.transliterate(tmp, Transliterator.REVERSE);
+// String a = tmp.toString();
+// t.transliterate(tmp, Transliterator.FORWARD);
+// errln("FAIL: " + escape(str) + " -> " +
+// escape(a) + " -> " + escape(tmp.toString()));
+// }
+// }
+// }
+
+ /**
+  * Instantiates every available transliterator ID twice and checks
+  * that the two calls return distinct objects, then checks that a
+  * bogus ID is rejected with IllegalArgumentException. Logs the
+  * elapsed time.
+  */
+ public void TestInstantiation() {
+     long startTime = System.currentTimeMillis();
+
+     Enumeration ids = Transliterator.getAvailableIDs();
+     while (ids.hasMoreElements()) {
+         String id = (String) ids.nextElement();
+         try {
+             Transliterator first = Transliterator.getInstance(id);
+             // We should get a new instance if we try again
+             Transliterator second = Transliterator.getInstance(id);
+             if (first == second) {
+                 errln("FAIL: " + id + " returned identical instances");
+             } else {
+                 logln(id + ":" + first);
+             }
+         } catch (IllegalArgumentException ex) {
+             errln("FAIL: " + id);
+             throw ex;
+         }
+     }
+
+     // Now test the failure path
+     try {
+         String bogusId = "";
+         Transliterator unexpected = Transliterator.getInstance(bogusId);
+         errln("FAIL: " + bogusId + " returned " + unexpected);
+     } catch (IllegalArgumentException ex) {
+         logln("OK: Bogus ID handled properly");
+     }
+
+     long elapsed = System.currentTimeMillis() - startTime;
+     logln("Elapsed time: " + elapsed + " ms");
+ }
+
+ /**
+  * Checks small hand-written rule sets against fixed inputs: two
+  * rule sets that reposition the cursor with '|', and one that
+  * uses named sets and a Unicode category.
+  */
+ public void TestSimpleRules() {
+ /* Example: rules 1. ab>x|y
+ * 2. yc>z
+ *
+ * []|eabcd start - no match, copy e to translated buffer
+ * [e]|abcd match rule 1 - copy output & adjust cursor
+ * [ex|y]cd match rule 2 - copy output & adjust cursor
+ * [exz]|d no match, copy d to transliterated buffer
+ * [exzd]| done
+ */
+ expect("ab>x|y\n" +
+ "yc>z",
+ "eabcd", "exzd");
+
+ /* Another set of rules:
+ * 1. ab>x|yzacw
+ * 2. za>q
+ * 3. qc>r
+ * 4. cw>n
+ *
+ * []|ab Rule 1
+ * [x|yzacw] No match
+ * [xy|zacw] Rule 2
+ * [xyq|cw] Rule 4
+ * [xyqn]| Done
+ */
+ expect("ab>x|yzacw\n" +
+ "za>q\n" +
+ "qc>r\n" +
+ "cw>n",
+ "ab", "xyqn");
+
+ /* Test categories
+ */
+ // NOTE(review): the empty ID string may have lost text in this
+ // copy of the file -- confirm against the original source.
+ Transliterator t = new RuleBasedTransliterator("",
+ "dummy=\uE100\n" +
+ "vowel=[aeiouAEIOU]\n" +
+ "lu=[:Lu:]\n" +
+ "{vowel}[{lu}>!\n" +
+ "{vowel}>&\n" +
+ "!]{lu}>^\n" +
+ "{lu}>*\n" +
+ "a>ERROR");
+ expect(t, "abcdefgABCDEFGU", "&bcd&fg!^**!^*&");
+ }
+
+ // Restore this test if/when it's been deciphered. In general,
+ // tests that depend on a specific transliterator are subject
+ // to the same fragility as tests that depend on resource data.
+
+// public void TestKana() {
+// String DATA[] = {
+// "a", "\u3042",
+// "A", "\u30A2",
+// "aA", "\u3042\u30A2",
+// "aaaa", "\u3042\u3042\u3042\u3042",
+// "akasata", "\u3042\u304B\u3055\u305F",
+// };
+//
+// Transliterator t = Transliterator.getInstance("Latin-Kana");
+// Transliterator rt = Transliterator.getInstance("Kana-Latin");
+// for (int i=0; izyx\n" +
+ "ab>yz\n" +
+ "bc>zx\n" +
+ "ca>xy\n" +
+ "a>x\n" +
+ "b>y\n" +
+ "c>z\n" +
+
+ "abc", RULES);
+ Transliterator rev = new RuleBasedTransliterator("", RULES,
+ RuleBasedTransliterator.REVERSE, null);
+ for (int i=0; i",
+ "psch>Y\n"
+ +"ps>y\n"
+ +"ch>x\n"
+ +"a>A\n");
+ String DATA[] = {
+ // insertion, buffer
+ "a", "A",
+ "p", "Ap",
+ "s", "Aps",
+ "c", "Apsc",
+ "a", "AycA",
+ "psch", "AycAY",
+ null, "AycAY", // null means finishKeyboardTransliteration
+ };
+
+ keyboardAux(t, DATA);
+ }
+
+ /**
+ * Basic test of keyboard with cursor.
+ */
+ public void TestKeyboard2() {
+ Transliterator t = new RuleBasedTransliterator("",
+ "ych>Y\n"
+ +"ps>|y\n"
+ +"ch>x\n"
+ +"a>A\n");
+ String DATA[] = {
+ // insertion, buffer
+ "a", "A",
+ "p", "Ap",
+ "s", "Ay",
+ "c", "Ayc",
+ "a", "AycA",
+ "p", "AycAp",
+ "s", "AycAy",
+ "c", "AycAyc",
+ "h", "AycAY",
+ null, "AycAY", // null means finishKeyboardTransliteration
+ };
+
+ keyboardAux(t, DATA);
+ }
+
+ /**
+  * Test keyboard transliteration with back-replacement.
+  *
+  * The goal is th>z but t>y: when t and then h are typed, t first
+  * becomes y and then yh becomes z.
+  */
+ public void TestKeyboard3() {
+     Transliterator keyboard = new RuleBasedTransliterator("",
+         "t>|y\n" +
+         "yh>z\n" +
+         "");
+
+     // Column 1: characters to add to buffer (as if typed)
+     // Column 2: expected appearance of buffer after
+     //           keyboard transliteration.
+     String[] steps = {
+         "a", "a",
+         "b", "ab",
+         "t", "aby",
+         "c", "abyc",
+         "t", "abycy",
+         "h", "abycz",
+         null, "abycz", // null means finishKeyboardTransliteration
+     };
+
+     keyboardAux(keyboard, steps);
+ }
+
+ /**
+  * Feeds a scripted sequence of insertions to a transliterator in
+  * keyboard mode and checks the buffer after each step.
+  * @param t the transliterator under test
+  * @param DATA pairs of strings; DATA[i] is passed to
+  * keyboardTransliterate (callers use null to request
+  * finishKeyboardTransliteration) and DATA[i+1] is the buffer
+  * content expected afterwards
+  */
+ private void keyboardAux(Transliterator t, String[] DATA) {
+ int[] index = {0, 0, 0};
+ ReplaceableString s = new ReplaceableString();
+ // NOTE(review): the loop header and the start of the if-branch on
+ // the next line appear truncated in this copy of the file (text
+ // between '<' and '>' is missing) -- restore from the original.
+ for (int i=0; i ");
+ t.keyboardTransliterate(s, index, DATA[i]);
+ } else {
+ log = new StringBuffer(s.toString() + " => ");
+ t.finishKeyboardTransliteration(s, index);
+ }
+ String str = s.toString();
+ // Show the start index '{' and the cursor '|'
+ log.append(str.substring(0, index[Transliterator.START])).
+ append('{').
+ append(str.substring(index[Transliterator.START],
+ index[Transliterator.CURSOR])).
+ append('|').
+ append(str.substring(index[Transliterator.CURSOR]));
+ // Pass or fail is decided on the raw buffer content only; the
+ // annotated log string is just for the report.
+ if (str.equals(DATA[i+1])) {
+ logln(log.toString());
+ } else {
+ errln("FAIL: " + log.toString() + ", expected " + DATA[i+1]);
+ }
+ }
+ }
+
+ public void TestArabic() {
+ String DATA[] = {
+ "Arabic", "\u062a\u062a\u0645\u062a\u0639\u0020"+
+ "\u0627\u0644\u0644\u063a\u0629\u0020"+
+ "\u0627\u0644\u0639\u0631\u0628\u0628\u064a\u0629\u0020"+
+ "\u0628\u0628\u0646\u0638\u0645\u0020"+
+ "\u0643\u062a\u0627\u0628\u0628\u064a\u0629\u0020"+
+ "\u062c\u0645\u064a\u0644\u0629",
+ };
+
+ Transliterator t = Transliterator.getInstance("Latin-Arabic");
+ for (int i=0; i", trans);
+
+ expect(t, "aaaaa", "aaaaa");
+ }
+
+ /**
+  * Compose the hex transliterators forward and reverse, and check
+  * that each composition leaves its input unchanged.
+  */
+ public void TestCompoundHex() {
+     Transliterator toHex = Transliterator.getInstance("Unicode-Hex");
+     Transliterator fromHex = Transliterator.getInstance("Hex-Unicode");
+
+     // Unicode -> hex -> Unicode applied to plain text.
+     Transliterator[] forward = { toHex, fromHex };
+     Transliterator roundTrip = new CompoundTransliterator("ab", forward);
+     String plain = "abcde";
+     expect(roundTrip, plain, plain);
+
+     // Hex -> Unicode -> hex applied to already-escaped text.
+     Transliterator[] backward = new Transliterator[] { fromHex, toHex };
+     Transliterator reverseTrip = new CompoundTransliterator("ba", backward);
+     ReplaceableString escaped = new ReplaceableString(plain);
+     toHex.transliterate(escaped);
+     expect(reverseTrip, escaped.toString(), escaped.toString());
+ }
+
+ /**
+  * Do some basic tests of filtering: install a filter on the
+  * Unicode-Hex transliterator that rejects 'c' and verify that 'c'
+  * is left alone while every other character is escaped.
+  */
+ public void TestFiltering() {
+     Transliterator hex = Transliterator.getInstance("Unicode-Hex");
+     hex.setFilter(new UnicodeFilter() {
+         public boolean isIn(char c) {
+             return c != 'c';
+         }
+     });
+     String s = "abcde";
+     String out = hex.transliterate(s);
+     String exp = "\\u0061\\u0062c\\u0064\\u0065";
+     if (out.equals(exp)) {
+         logln("Ok: \"" + exp + "\"");
+     } else {
+         // Report through errln, as the other tests in this class
+         // do, so that a mismatch actually fails the test instead
+         // of only appearing in the log.
+         errln("FAIL: \"" + out + "\", wanted \"" + exp + "\"");
+     }
+ }
+
+ //======================================================================
+ // Support methods
+ //======================================================================
+
+ /**
+  * Builds a RuleBasedTransliterator from the given rules and checks
+  * that it turns source into expectedResult.
+  */
+ void expect(String rules, String source, String expectedResult) {
+     Transliterator fromRules = new RuleBasedTransliterator("", rules);
+     expect(fromRules, source, expectedResult);
+ }
+
+ /**
+  * Checks that t turns source into expectedResult and, when a
+  * reverse transliterator is supplied, that it turns
+  * expectedResult back into source.
+  */
+ void expect(Transliterator t, String source, String expectedResult,
+             Transliterator reverseTransliterator) {
+     expect(t, source, expectedResult);
+     if (reverseTransliterator == null) {
+         return;
+     }
+     expect(reverseTransliterator, expectedResult, source);
+ }
+
+ /**
+  * Checks that t turns source into expectedResult through three
+  * entry points: transliterate(String), transliterate(Replaceable),
+  * and incremental keyboard transliteration followed by
+  * finishKeyboardTransliteration.
+  */
+ void expect(Transliterator t, String source, String expectedResult) {
+ String result = t.transliterate(source);
+ expectAux(t.getID() + ":String", source, result, expectedResult);
+
+ ReplaceableString rsource = new ReplaceableString(source);
+ t.transliterate(rsource);
+ result = rsource.toString();
+ expectAux(t.getID() + ":Replaceable", source, result, expectedResult);
+
+ // Test keyboard (incremental) transliteration -- this result
+ // must be the same after we finalize (see below).
+ rsource.getStringBuffer().setLength(0);
+ int[] index = { 0, 0, 0 };
+ StringBuffer log = new StringBuffer();
+
+ // NOTE(review): the loop header on the next line appears truncated
+ // in this copy of the file (text between '<' and '>' is missing);
+ // the visible body feeds source one character at a time.
+ for (int i=0; i ");
+ t.keyboardTransliterate(rsource, index,
+ String.valueOf(source.charAt(i)));
+ // Append the string buffer with a vertical bar '|' where
+ // the committed index is.
+ String s = rsource.toString();
+ log.append(s.substring(0, index[Transliterator.CURSOR])).
+ append('|').
+ append(s.substring(index[Transliterator.CURSOR]));
+ }
+
+ // As a final step in keyboard transliteration, we must call
+ // transliterate to finish off any pending partial matches that
+ // were waiting for more input.
+ t.finishKeyboardTransliteration(rsource, index);
+ result = rsource.toString();
+ log.append(" => ").append(rsource.toString());
+
+ expectAux(t.getID() + ":Keyboard", log.toString(),
+ result.equals(expectedResult),
+ expectedResult);
+ }
+
+ /**
+  * Compares result with expectedResult and reports the outcome
+  * with a "source -> result" summary.
+  */
+ void expectAux(String tag, String source,
+                String result, String expectedResult) {
+     String summary = source + " -> " + result;
+     boolean matches = result.equals(expectedResult);
+     expectAux(tag, summary, matches, expectedResult);
+ }
+
+ /**
+  * Reports one check: logs the escaped summary when pass is true,
+  * otherwise raises an error that also shows the escaped expected
+  * result.
+  */
+ void expectAux(String tag, String summary, boolean pass,
+                String expectedResult) {
+     if (!pass) {
+         errln("FAIL: ("+tag+") "
+               + escape(summary)
+               + ", expected " + escape(expectedResult));
+         return;
+     }
+     logln("("+tag+") " + escape(summary));
+ }
+
+ /**
+  * Escape non-ASCII characters as Unicode. Characters accepted by
+  * the range test in the loop are copied unchanged; every other
+  * character is written as a backslash, the letter u, and the
+  * character's value as produced by Integer.toHexString,
+  * left-padded with zeros to four digits.
+  * @param s the string to escape
+  * @return the escaped string
+  */
+ public static final String escape(String s) {
+ StringBuffer buf = new StringBuffer();
+ // NOTE(review): the loop header and the start of the range test on
+ // the next line appear truncated in this copy of the file (text
+ // between '<' and '>' is missing) -- restore from the original.
+ for (int i=0; i= ' ' && c <= 0x007F) {
+ buf.append(c);
+ } else {
+ buf.append("\\u");
+ // Zero-pad so that exactly four hex digits follow the prefix.
+ if (c < 0x1000) {
+ buf.append('0');
+ if (c < 0x100) {
+ buf.append('0');
+ if (c < 0x10) {
+ buf.append('0');
+ }
+ }
+ }
+ buf.append(Integer.toHexString(c));
+ }
+ }
+ return buf.toString();
+ }
+
+ /*
+ static final String KANA_RT_DATA =
+"a "+
+
+"ba bi bu be bo "+
+"bya byi byu bye byo "+
+"bba "+
+
+"da di du de do "+
+"dya dyi dyu dye dyo "+
+"dha dhi dhu dhe dho "+
+"dda "+
+
+"e "+
+
+"fa fi fe fo "+
+"fya fyu fyo "+
+"ffa "+
+
+"ga gi gu ge go "+
+"gya gyi gyu gye gyo "+
+"gwa gwi gwu gwe gwo "+
+"gga "+
+
+"ha hi hu he ho "+
+"hya hyi hyu hye hyo "+
+"hha "+
+
+"i "+
+
+"ka ki ku ke ko "+
+"kwa kwi kwu kwe kwo "+
+"kya kyi kyu kye kyo "+
+"kka "+
+
+"ma mi mu me mo "+
+"mya myi myu mye myo "+
+"mba mfa mma mpa mva "+
+"m'' "+
+
+"na ni nu ne no "+
+"nya nyi nyu nye nyo "+
+"nn n'' n "+
+
+"o "+
+
+"pa pi pu pe po "+
+"pya pyi pyu pye pyo "+
+"ppa "+
+
+"qa qi qu qe qo "+
+"qya qyi qyu qye qyo "+
+"qqa "+
+
+"ra ri ru re ro "+
+"rya ryi ryu rye ryo "+
+"rra "+
+
+"sa si su se so "+
+"sya syi syu sye syo "+
+"ssya ssa "+
+
+"ta ti tu te to "+
+"tha thi thu the tho "+
+"tsa tsi tse tso "+
+"tya tyi tyu tye tyo "+
+"ttsa "+
+"tta "+
+
+"u "+
+
+"va vi vu ve vo "+
+"vya vyi vyu vye vyo "+
+"vva "+
+
+"wa wi we wo "+
+"wwa "+
+
+"ya yu ye yo "+
+"yya "+
+
+"za zi zu ze zo "+
+"zya zyi zyu zye zyo "+
+"zza "+
+
+"xa xi xu xe xo "+
+"xka xke "+
+"xtu "+
+"xwa "+
+"xya xyu xyo "+
+
+ "akka akki akku akke akko "+
+ "akkya akkyu akkyo "+
+
+ "atta atti attu atte atto "+
+ "attya attyu attyo "+
+ "adda addi addu adde addo "+
+
+ "atcha atchi atchu atche atcho "+
+
+ "assa assi assu asse asso "+
+ "assya assyu assyo "+
+
+ "ahha ahhi ahhu ahhe ahho "+
+ "appa appi appu appe appo "+
+
+ "an "+
+ "ana ani anu ane ano "+
+ "anna anni annu anne anno "+
+ "an'a an'i an'u an'e an'o "+
+
+ "annna annni annnu annne annno "+
+ "an'na an'ni an'nu an'ne an'no "+
+
+ "anka anki anku anke anko "+
+ "anga angi angu ange ango "+
+
+ "ansa ansi ansu anse anso "+
+ "anza anzi anzu anze anzo "+
+ "anzya anzyu anzyo "+
+
+ "anta anti antu ante anto "+
+ "antya antyu antyo "+
+ "anda andi andu ande ando "+
+
+ "ancha anchi anchu anche ancho "+
+ "anja anji anju anje anjo "+
+ "antsa antsu antso "+
+
+ "anpa anpi anpu anpe anpo "+
+ "ampa ampi ampu ampe ampo "+
+
+ "anba anbi anbu anbe anbo "+
+ "amba ambi ambu ambe ambo "+
+
+ "anma anmi anmu anme anmo "+
+ "amma ammi ammu amme ammo "+
+
+ "anwa anwi anwu anwe anwo "+
+
+ "anha anhi anhu anhe anho "+
+
+ "anya anyi anyu anye anyo "+
+ "annya annyi annyu annye annyo "+
+ "an'ya an'yi an'yu an'ye an'yo "+
+
+ "kkk "+
+ "ggg "+
+ "sss "+
+ "zzz "+
+ "ttt "+
+ "ddd "+
+ "nnn "+
+ "hhh "+
+ "bbb "+
+ "ppp "+
+ "mmm "+
+ "yyy "+
+ "rrr "+
+ "www ";
+*/
+
+ /*+
+
+ "A I U E O "+
+ "XA XI XU XE XO "+
+
+ "KA KI KU KE KO "+
+ "KYA KYI KYU KYE KYO "+
+ "KWA KWI KWU KWE KWO "+
+ "QA QI QU QE QO "+
+ "QYA QYI QYU QYE QYO "+
+ "XKA XKE "+
+
+ "GA GI GU GE GO "+
+ "GYA GYI GYU GYE GYO "+
+ "GWA GWI GWU GWE GWO "+
+
+ "SA SI SU SE SO "+
+ "SHA SHI SHU SHE SHO "+
+ "SYA SYI SYU SYE SYO "+
+
+ "ZA ZI ZU ZE ZO "+
+ "ZYA ZYI ZYU ZYE ZYO "+
+ "JA JI JU JE JO "+
+ "JYA JYU JYO "+
+
+ "TA TI TU TE TO "+
+ "XTU XTSU "+
+ "TYA TYU TYO "+
+ "CYA CYU CYO "+
+ "CHA CHI CHU CHE CHO "+
+ "TSA TSI TSU TSE TSO "+
+ "DA DI DU DE DO "+
+ "DYA DYU DYO "+
+ "THA THI THU THE THO "+
+ "DHA DHI DHU DHE DHO "+
+
+ "NA NI NU NE NO "+
+ "NYA NYU NYO "+
+
+ "HA HI HU HE HO "+
+ "HYA HYU HYO "+
+ "FA FI FU FE FO "+
+ "FYA FYU FYO "+
+ "BA BI BU BE BO "+
+ "BYA BYU BYO "+
+ "PA PI PU PE PO "+
+ "PYA PYU PYO "+
+
+ "MA MI MU ME MO "+
+ "MYA MYU MYO "+
+ "YA YI YU YE YO "+
+ "XYA XYI XYU XYE XYO "+
+
+ "RA RI RU RE RO "+
+ "LA LI LU LE LO "+
+ "RYA RYI RYU RYE RYO "+
+ "LYA LYI LYU LYE LYO "+
+
+ "WA WI WU WE WO "+
+ "VA VI VU VE VO "+
+ "VYA VYU VYO "+
+
+ "CYA CYI CYU CYE CYO "+
+
+ "NN "+
+ "N' "+
+ "N "+
+
+ "AKKA AKKI AKKU AKKE AKKO "+
+ "AKKYA AKKYU AKKYO "+
+
+ "ATTA ATTI ATTU ATTE ATTO "+
+ "ATTYA ATTYU ATTYO "+
+ "ADDA ADDI ADDU ADDE ADDO "+
+
+ "ATCHA ATCHI ATCHU ATCHE ATCHO "+
+
+ "ASSA ASSI ASSU ASSE ASSO "+
+ "ASSYA ASSYU ASSYO "+
+
+ "AHHA AHHI AHHU AHHE AHHO "+
+ "APPA APPI APPU APPE APPO "+
+
+ "AN "+
+ "ANA ANI ANU ANE ANO "+
+ "ANNA ANNI ANNU ANNE ANNO "+
+ "AN'A AN'I AN'U AN'E AN'O "+
+
+ "ANNNA ANNNI ANNNU ANNNE ANNNO "+
+ "AN'NA AN'NI AN'NU AN'NE AN'NO "+
+
+ "ANKA ANKI ANKU ANKE ANKO "+
+ "ANGA ANGI ANGU ANGE ANGO "+
+
+ "ANSA ANSI ANSU ANSE ANSO "+
+ "ANZA ANZI ANZU ANZE ANZO "+
+ "ANZYA ANZYU ANZYO "+
+
+ "ANTA ANTI ANTU ANTE ANTO "+
+ "ANTYA ANTYU ANTYO "+
+ "ANDA ANDI ANDU ANDE ANDO "+
+
+ "ANCHA ANCHI ANCHU ANCHE ANCHO "+
+ "ANJA ANJI ANJU ANJE ANJO "+
+ "ANTSA ANTSU ANTSO "+
+
+ "ANPA ANPI ANPU ANPE ANPO "+
+ "AMPA AMPI AMPU AMPE AMPO "+
+
+ "ANBA ANBI ANBU ANBE ANBO "+
+ "AMBA AMBI AMBU AMBE AMBO "+
+
+ "ANMA ANMI ANMU ANME ANMO "+
+ "AMMA AMMI AMMU AMME AMMO "+
+
+ "ANWA ANWI ANWU ANWE ANWO "+
+
+ "ANHA ANHI ANHU ANHE ANHO "+
+
+ "ANYA ANYI ANYU ANYE ANYO "+
+ "ANNYA ANNYI ANNYU ANNYE ANNYO "+
+ "AN'YA AN'YI AN'YU AN'YE AN'YO "+
+
+ "KKK "+
+ "GGG "+
+ "SSS "+
+ "ZZZ "+
+ "TTT "+
+ "DDD "+
+ "NNN "+
+ "HHH "+
+ "BBB "+
+ "PPP "+
+ "MMM "+
+ "YYY "+
+ "RRR "+
+ "WWW";*/
+}
diff --git a/icu4j/src/com/ibm/test/translit/UnicodeSetTest.java b/icu4j/src/com/ibm/test/translit/UnicodeSetTest.java
new file mode 100755
index 00000000000..8417faf4b44
--- /dev/null
+++ b/icu4j/src/com/ibm/test/translit/UnicodeSetTest.java
@@ -0,0 +1,118 @@
+import com.ibm.text.*;
+import java.text.*;
+import java.util.*;
+
+/**
+ * @test
+ * @summary General test of UnicodeSet
+ */
+public class UnicodeSetTest extends IntlTest {
+
+ public static void main(String[] args) throws Exception {
+ new UnicodeSetTest().run(args);
+ }
+
+ /**
+  * Applies a series of set patterns (intersection, difference,
+  * literal hyphens, nesting) to one UnicodeSet and compares the
+  * resulting pair list each time, then complements the last set
+  * and checks its pair list as well.
+  */
+ public void TestPatterns() {
+ UnicodeSet set = new UnicodeSet();
+ expectPattern(set, "[[a-m]&[d-z]&[k-y]]", "km");
+ expectPattern(set, "[[a-z]-[m-y]-[d-r]]", "aczz");
+ expectPattern(set, "[a\\-z]", "--aazz");
+ expectPattern(set, "[-az]", "--aazz");
+ expectPattern(set, "[az-]", "--aazz");
+ expectPattern(set, "[[[a-z]-[aeiou]i]]", "bdfnptvz");
+
+ // Throw in a test of complement
+ // (operates on the set left by the last pattern above)
+ set.complement();
+ String exp = '\u0000' + "aeeoouu" + (char)('z'+1) + '\uFFFF';
+ expectPairs(set, exp);
+ }
+
+ /**
+  * Exercises add/remove of ranges and addAll/removeAll of whole
+  * sets, checking the pair list after each step. Every step
+  * depends on the state left by the previous one.
+  */
+ public void TestAddRemove() {
+ UnicodeSet set = new UnicodeSet();
+ set.add('a', 'z');
+ expectPairs(set, "az");
+ set.remove('m', 'p');
+ expectPairs(set, "alqz");
+ set.remove('e', 'g');
+ expectPairs(set, "adhlqz");
+ set.remove('d', 'i');
+ expectPairs(set, "acjlqz");
+ set.remove('c', 'r');
+ expectPairs(set, "absz");
+ set.add('f', 'q');
+ expectPairs(set, "abfqsz");
+ set.remove('a', 'g');
+ expectPairs(set, "hqsz");
+ set.remove('a', 'z');
+ expectPairs(set, "");
+
+ // Try removing an entire set from another set
+ expectPattern(set, "[c-x]", "cx");
+ UnicodeSet set2 = new UnicodeSet();
+ expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
+ set.removeAll(set2);
+ expectPairs(set, "deluxx");
+
+ // Try adding an entire set to another set
+ expectPattern(set, "[jackiemclean]", "aacceein");
+ expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
+ set.addAll(set2);
+ expectPairs(set, "aacehort");
+
+ // Test commutativity
+ // (same two patterns with the operands swapped; same result expected)
+ expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
+ expectPattern(set2, "[jackiemclean]", "aacceein");
+ set.addAll(set2);
+ expectPairs(set, "aacehort");
+ }
+
+ /**
+  * Applies the pattern to the set and checks the resulting pair
+  * list, logging success or raising an error on mismatch.
+  * @param set the set to modify
+  * @param pattern the pattern passed to applyPattern
+  * @param expectedPairs the pair list the set should then report
+  */
+ void expectPattern(UnicodeSet set,
+                    String pattern,
+                    String expectedPairs) {
+     set.applyPattern(pattern);
+     String prefix = "applyPattern(\"" + pattern + "\") => pairs \"";
+     if (set.getPairs().equals(expectedPairs)) {
+         logln("Ok: " + prefix + escape(set.getPairs()) + "\"");
+     } else {
+         errln("FAIL: " + prefix +
+               escape(set.getPairs()) + "\", expected \"" +
+               escape(expectedPairs) + "\"");
+     }
+ }
+
+ /**
+  * Raises an error if the set's pair list differs from
+  * expectedPairs; says nothing on success.
+  */
+ void expectPairs(UnicodeSet set, String expectedPairs) {
+     if (set.getPairs().equals(expectedPairs)) {
+         return;
+     }
+     errln("FAIL: Expected pair list \"" +
+           escape(expectedPairs) + "\", got \"" +
+           escape(set.getPairs()) + "\"");
+ }
+
+ /**
+  * Escape non-ASCII characters as Unicode. Characters accepted by
+  * the range test in the loop are copied unchanged; every other
+  * character is written as a backslash, the letter u, and the
+  * character's value as produced by Integer.toHexString,
+  * left-padded with zeros to four digits.
+  * @param s the string to escape
+  * @return the escaped string
+  */
+ static final String escape(String s) {
+ StringBuffer buf = new StringBuffer();
+ // NOTE(review): the loop header and the start of the range test on
+ // the next line appear truncated in this copy of the file (text
+ // between '<' and '>' is missing) -- restore from the original.
+ for (int i=0; i= ' ' && c <= 0x007F) {
+ buf.append(c);
+ } else {
+ buf.append("\\u");
+ // Zero-pad so that exactly four hex digits follow the prefix.
+ if (c < 0x1000) {
+ buf.append('0');
+ if (c < 0x100) {
+ buf.append('0');
+ if (c < 0x10) {
+ buf.append('0');
+ }
+ }
+ }
+ buf.append(Integer.toHexString(c));
+ }
+ }
+ return buf.toString();
+ }
+}
diff --git a/icu4j/src/com/ibm/text/CompoundTransliterator.java b/icu4j/src/com/ibm/text/CompoundTransliterator.java
new file mode 100755
index 00000000000..c3582237d42
--- /dev/null
+++ b/icu4j/src/com/ibm/text/CompoundTransliterator.java
@@ -0,0 +1,285 @@
+package com.ibm.text;
+
+import java.util.Enumeration;
+import java.util.Vector;
+
+/**
+ * A transliterator that is composed of two or more other
+ * transliterator objects linked together. For example, if one
+ * transliterator transliterates from script A to script B, and
+ * another transliterates from script B to script C, the two may be
+ * combined to form a new transliterator from A to C.
+ *
+ * Composed transliterators may not behave as expected. For
+ * example, inverses may not combine to form the identity
+ * transliterator. See the class documentation for {@link
+ * Transliterator} for details.
+ *
+ * <p>If a non-<tt>null</tt> <tt>UnicodeFilter</tt> is applied to a
+ * <tt>CompoundTransliterator</tt>, it has the effect of being
+ * logically <b>and</b>ed with the filter of each transliterator in
+ * the chain.
+ *
+ * <p>Copyright &copy; IBM Corporation 1999. All rights reserved.
+ *
+ * @author Alan Liu
+ * @version $RCSfile: CompoundTransliterator.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
+ */
+public class CompoundTransliterator extends Transliterator {
+
+ private static final boolean DEBUG = false;
+
+ private Transliterator[] trans;
+
+ private static final String COPYRIGHT =
+ "\u00A9 IBM Corporation 1999. All rights reserved.";
+
+ /**
+  * Constructs a new compound transliterator given an array of
+  * transliterators. The array of transliterators may be of any
+  * length, including zero or one, however, useful compound
+  * transliterators have at least two components. The array is
+  * copied, so later changes to the caller's array do not affect
+  * this object.
+  * @param ID the identifier passed to the superclass constructor
+  * @param transliterators array of <code>Transliterator</code>
+  * objects
+  * @param filter the filter. Any character for which
+  * <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
+  * altered by this transliterator. If <tt>filter</tt> is
+  * <tt>null</tt> then no filtering is applied.
+  */
+ public CompoundTransliterator(String ID, Transliterator[] transliterators,
+                               UnicodeFilter filter) {
+     super(ID, filter);
+     int count = transliterators.length;
+     Transliterator[] chain = new Transliterator[count];
+     System.arraycopy(transliterators, 0, chain, 0, count);
+     trans = chain;
+ }
+
+ /**
+  * Constructs a new compound transliterator with no filter, given
+  * an array of transliterators. The array of transliterators may
+  * be of any length, including zero or one, however, useful
+  * compound transliterators have at least two components.
+  * @param ID the identifier for this transliterator
+  * @param transliterators array of <code>Transliterator</code>
+  * objects
+  */
+ public CompoundTransliterator(String ID, Transliterator[] transliterators) {
+     this(ID, transliterators, (UnicodeFilter) null);
+ }
+
+ /**
+  * Returns the length of this transliterator's chain.
+  * @return number of transliterators in this chain.
+  */
+ public int getCount() {
+     return this.trans.length;
+ }
+
+ /**
+  * Returns one member of this chain.
+  * @param index index into chain, from 0 to <code>getCount() - 1</code>
+  * @return transliterator at the given index
+  */
+ public Transliterator getTransliterator(int index) {
+     return this.trans[index];
+ }
+
+ /**
+ * Transliterates a segment of a string. Transliterator
API.
+ * @param text the string to be transliterated
+ * @param start the beginning index, inclusive; 0 <= start
+ * <= limit
.
+ * @param limit the ending index, exclusive; start <= limit
+ * <= text.length()
.
+ * @return the new limit index
+ */
+ public int transliterate(Replaceable text, int start, int limit) {
+ for (int i=0; i abca/u
+ * S C L S C L gl=f->a
+ *
+ * 2. upup, changes "x" to "XX"
+ *
+ * 4 7 a 4 7 a
+ * abca/u => abcAA/u
+ * S CL S C
+ * L gl=a->b
+ * 3. u-h, changes Unicode to hex
+ *
+ * 4 7 a 4 7 a d 0 3
+ * abcAA/u => abc/u0041/u0041/u
+ * S C L S C
+ * L gl=b->15
+ * 4. return
+ *
+ * 4 7 a d 0 3
+ * abc/u0041/u0041/u
+ * S C L
+ */
+
+ /**
+ * One more wrinkle. If there is a filter F for the compound
+ * transliterator as a whole, then we need to modify every
+ * non-null filter f in the chain to be f' = F & f. Then,
+ * when we're done, we restore the original filters.
+ *
+ * A possible future optimization is to change f to f' at
+ * construction time, but then if anyone else is using the
+ * transliterators in the chain outside of this context, they
+ * will get unexpected results.
+ */
+ UnicodeFilter F = getFilter();
+ UnicodeFilter[] f = null;
+ if (F != null) {
+ f = new UnicodeFilter[trans.length];
+ for (int i=0; i \""));
+ }
+
+ trans[i].handleKeyboardTransliterate(text, index);
+
+ if (DEBUG) {
+ System.out.println(escape(
+ substring(text, index[START], index[CURSOR]) + '|' +
+ substring(text, index[CURSOR], index[LIMIT]) +
+ '"'));
+ }
+
+ // Adjust overall limit for insertions/deletions
+ globalLimit += index[LIMIT] - limit;
+ limit = index[CURSOR]; // Move limit to end of committed text
+ }
+ // Cursor is good where it is -- where the last
+ // transliterator left it. Limit needs to be put back
+ // where it was, modulo adjustments for deletions/insertions.
+ index[LIMIT] = globalLimit;
+
+ } finally {
+ // Fixup the transliterator filters, if we had to modify them.
+ if (f != null) {
+ for (int i=0; ipreceding context.
+ * @return maximum number of preceding context characters this
+ * transliterator needs to examine
+ */
+ protected int getMaximumContextLength() {
+ // Running maximum; returned as the answer for this transliterator.
+ int max = 0;
+ // NOTE(review): the loop header and the definition of 'len' on the
+ // next line appear truncated in this copy of the file (text between
+ // '<' and '>' is missing). Presumably it walks the chain and asks
+ // each member for its context length -- confirm against the
+ // original source.
+ for (int i=0; i max) {
+ max = len;
+ }
+ }
+ return max;
+ }
+
+ /**
+  * DEBUG
+  * Copies the characters of a Replaceable from start (inclusive)
+  * to limit (exclusive) into a new String.
+  */
+ private static final String substring(Replaceable str, int start, int limit) {
+     StringBuffer copy = new StringBuffer();
+     for (int pos = start; pos < limit; ++pos) {
+         copy.append(str.charAt(pos));
+     }
+     return copy.toString();
+ }
+
+ /**
+  * DEBUG
+  * Escapes non-ASCII characters as Unicode. Characters accepted by
+  * the range test in the loop are copied unchanged; every other
+  * character is written as a backslash, the letter u, and the
+  * character's value as produced by Integer.toHexString,
+  * left-padded with zeros to four digits.
+  */
+ private static final String escape(String s) {
+ StringBuffer buf = new StringBuffer();
+ // NOTE(review): the loop header and the start of the range test on
+ // the next line appear truncated in this copy of the file (text
+ // between '<' and '>' is missing) -- restore from the original.
+ for (int i=0; i= ' ' && c <= 0x007F) {
+ buf.append(c);
+ } else {
+ buf.append("\\u");
+ // Zero-pad so that exactly four hex digits follow the prefix.
+ if (c < 0x1000) {
+ buf.append('0');
+ if (c < 0x100) {
+ buf.append('0');
+ if (c < 0x10) {
+ buf.append('0');
+ }
+ }
+ }
+ buf.append(Integer.toHexString(c));
+ }
+ }
+ return buf.toString();
+ }
+}
diff --git a/icu4j/src/com/ibm/text/HexToUnicodeTransliterator.java b/icu4j/src/com/ibm/text/HexToUnicodeTransliterator.java
new file mode 100755
index 00000000000..18673e15fe7
--- /dev/null
+++ b/icu4j/src/com/ibm/text/HexToUnicodeTransliterator.java
@@ -0,0 +1,130 @@
+package com.ibm.text;
+import java.util.*;
+
+/**
+ * A transliterator that converts from hexadecimal Unicode
+ * escape sequences to the characters they represent. For example, "U+0040"
+ * and '&#92;u0040'. It recognizes the
+ * prefixes "U+", "u+", "&#92;U", and "&#92;u". Hex values may be
+ * upper- or lowercase.
+ *
+ * Copyright © IBM Corporation 1999. All rights reserved.
+ *
+ * @author Alan Liu
+ * @version $RCSfile: HexToUnicodeTransliterator.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
+ */
+public class HexToUnicodeTransliterator extends Transliterator {
+ private static final String COPYRIGHT =
+ "\u00A9 IBM Corporation 1999. All rights reserved.";
+
+ /**
+ * Package accessible ID for this transliterator.
+ */
+ static String _ID = "Hex-Unicode";
+
+ /**
+  * Constructs a transliterator with the ID "Hex-Unicode" and no
+  * filter.
+  */
+ public HexToUnicodeTransliterator() {
+     super(_ID, (UnicodeFilter) null);
+ }
+
+ /**
+  * Transliterates a segment of a string. <code>Transliterator</code>
+  * API. The work is delegated to handleKeyboardTransliterate with
+  * the cursor placed at start.
+  * @param text the string to be transliterated
+  * @param start the beginning index, inclusive; <code>0 &lt;= start
+  * &lt;= limit</code>.
+  * @param limit the ending index, exclusive; <code>start &lt;= limit
+  * &lt;= text.length()</code>.
+  * @return the new limit index
+  */
+ public int transliterate(Replaceable text, int start, int limit) {
+     int[] positions = new int[] { start, limit, start };
+     handleKeyboardTransliterate(text, positions);
+     return positions[LIMIT];
+ }
+
+ /**
+ * Implements {@link Transliterator#handleKeyboardTransliterate}.
+ * Scans from offsets[CURSOR] toward offsets[LIMIT] for six-character
+ * escapes (a two-character prefix plus four hex digits), replaces
+ * each one with the single character it denotes, and stores the
+ * updated cursor and limit back into offsets.
+ */
+ protected void handleKeyboardTransliterate(Replaceable text,
+ int[] offsets) {
+ /**
+ * Performs transliteration changing Unicode hexadecimal
+ * escapes to characters. For example, "U+0040" -> '@'. A fixed
+ * set of prefixes is recognized: backslash-u, backslash-U,
+ * "u+", "U+".
+ */
+ int cursor = offsets[CURSOR];
+ int limit = offsets[LIMIT];
+
+ // An escape occupies six characters, so limit - 6 is the last
+ // index at which a complete one can start.
+ int maxCursor = limit - 6;
+ loop:
+ while (cursor <= maxCursor) {
+ // Examine the last of the six characters first. If it is not a
+ // hex digit no escape can start at cursor, and the kind of
+ // character found decides how far ahead it is safe to jump.
+ char c = filteredCharAt(text, cursor + 5);
+ int digit0 = Character.digit(c, 16);
+ if (digit0 < 0) {
+ if (c == '\\') {
+ // A backslash can only be the first prefix character.
+ cursor += 5;
+ } else if (c == 'U' || c == 'u' || c == '+') {
+ // These can be the first or second prefix character, so
+ // stop one position earlier.
+ cursor += 4;
+ } else {
+ cursor += 6;
+ }
+ continue;
+ }
+
+ // Accumulates the value; digit0 is the lowest four bits.
+ int u = digit0;
+
+ // Remaining three digits, checked from right to left.
+ for (int i=4; i>=2; --i) {
+ c = filteredCharAt(text, cursor + i);
+ int digit = Character.digit(c, 16);
+ if (digit < 0) {
+ if (c == 'U' || c == 'u' || c == '+') {
+ cursor += i-1;
+ } else {
+ cursor += 6;
+ }
+ continue loop;
+ }
+ u |= digit << (4 * (5-i));
+ }
+
+ // Four digits found; now verify the two prefix characters.
+ c = filteredCharAt(text, cursor);
+ char d = filteredCharAt(text, cursor + 1);
+ if (((c == 'U' || c == 'u') && d == '+')
+ || (c == '\\' && (d == 'U' || d == 'u'))) {
+
+ // At this point, we have a match; replace cursor..cursor+5
+ // with u.
+ text.replace(cursor, cursor+6, String.valueOf((char) u));
+ // Six characters became one, so both bounds shrink by five.
+ limit -= 5;
+ maxCursor -= 5;
+
+ ++cursor;
+ } else {
+ cursor += 6;
+ }
+ }
+
+ offsets[LIMIT] = limit;
+ offsets[CURSOR] = cursor;
+ }
+
+ /**
+  * Returns the character of text at index i, or '\uFFFF' if this
+  * transliterator has a filter and the filter rejects that
+  * character.
+  */
+ private char filteredCharAt(Replaceable text, int i) {
+     UnicodeFilter filter = getFilter();
+     char c = text.charAt(i);
+     if (filter != null && !filter.isIn(c)) {
+         return '\uFFFF';
+     }
+     return c;
+ }
+
+ /**
+ * Return the length of the longest context required by this transliterator.
+ * This is preceding context.
+ * @param direction either FORWARD
or REVERSE
+ * @return maximum number of preceding context characters this
+ * transliterator needs to examine
+ */
+ protected int getMaximumContextLength() {
+ return 0;
+ }
+}
diff --git a/icu4j/src/com/ibm/text/Replaceable.java b/icu4j/src/com/ibm/text/Replaceable.java
new file mode 100755
index 00000000000..b4c8519689c
--- /dev/null
+++ b/icu4j/src/com/ibm/text/Replaceable.java
@@ -0,0 +1,77 @@
+package com.ibm.text;
+
+/**
+ * <code>Replaceable</code> is an interface that supports the
+ * operation of replacing a substring with another piece of text.
+ * <code>Replaceable</code> is needed in order to change a piece of
+ * text while retaining style attributes.  For example, if the string
+ * "the <b>bold</b> font" has range (4, 8) replaced with "strong",
+ * then it becomes "the <b>strong</b> font".
+ *
+ * <p>Copyright &copy; IBM Corporation 1999.  All rights reserved.
+ *
+ * @author Alan Liu
+ * @version $RCSfile: Replaceable.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
+ */
+public interface Replaceable {
+ /**
+ * Return the number of characters in the text.
+ * @return number of characters in text
+ */
+ int length();
+
+ /**
+ * Return the character at the given offset into the text.
+ * @param offset an integer between 0 and <code>length()</code>-1
+ * inclusive
+ * @return character of text at given offset
+ */
+ char charAt(int offset);
+
+ /**
+ * Copies characters from this object into the destination
+ * character array.  The first character to be copied is at index
+ * <code>srcStart</code>; the last character to be copied is at
+ * index <code>srcLimit-1</code> (thus the total number of
+ * characters to be copied is <code>srcLimit-srcStart</code>). The
+ * characters are copied into the subarray of <code>dst</code>
+ * starting at index <code>dstStart</code> and ending at index
+ * <code>dstStart + (srcLimit-srcStart) - 1</code>.
+ *
+ * @param srcStart the beginning index to copy, inclusive; <code>0
+ * &lt;= srcStart &lt;= srcLimit</code>.
+ * @param srcLimit the ending index to copy, exclusive;
+ * <code>srcStart &lt;= srcLimit &lt;= length()</code>.
+ * @param dst the destination array.
+ * @param dstStart the start offset in the destination array.
+ */
+ void getChars(int srcStart, int srcLimit, char dst[], int dstStart);
+
+ /**
+ * Replace a substring of this object with the given text.
+ * @param start the beginning index, inclusive; <code>0 &lt;= start
+ * &lt;= limit</code>.
+ * @param limit the ending index, exclusive; <code>start &lt;= limit
+ * &lt;= length()</code>.
+ * @param text the text to replace characters <code>start</code>
+ * to <code>limit - 1</code>
+ */
+ void replace(int start, int limit, String text);
+
+ /**
+ * Replace a substring of this object with the given text, taken
+ * from a range of a character array.
+ * @param start the beginning index, inclusive; <code>0 &lt;= start
+ * &lt;= limit</code>.
+ * @param limit the ending index, exclusive; <code>start &lt;= limit
+ * &lt;= length()</code>.
+ * @param chars the text to replace characters <code>start</code>
+ * to <code>limit - 1</code>
+ * @param charsStart the beginning index into <code>chars</code>,
+ * inclusive.
+ * @param charsLen the number of characters of <code>chars</code>
+ * to use, counted from <code>charsStart</code>.
+ */
+ void replace(int start, int limit, char[] chars,
+ int charsStart, int charsLen);
+ // Note: We use length rather than limit to conform to StringBuffer
+ // and System.arraycopy.
+}
diff --git a/icu4j/src/com/ibm/text/ReplaceableString.java b/icu4j/src/com/ibm/text/ReplaceableString.java
new file mode 100755
index 00000000000..d6a7df06db5
--- /dev/null
+++ b/icu4j/src/com/ibm/text/ReplaceableString.java
@@ -0,0 +1,159 @@
+package com.ibm.text;
+
+/**
+ * <code>ReplaceableString</code> is an adapter class that implements the
+ * <code>Replaceable</code> API around an ordinary <code>StringBuffer</code>.
+ *
+ * <p><em>Note:</em> This class does not support attributes and is not
+ * intended for general use.  Most clients will need to implement
+ * {@link Replaceable} in their text representation class.
+ *
+ * <p>Copyright &copy; IBM Corporation 1999.  All rights reserved.
+ *
+ * @see Replaceable
+ * @author Alan Liu
+ * @version $RCSfile: ReplaceableString.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
+ */
+public class ReplaceableString implements Replaceable {
+    private static final String COPYRIGHT =
+        "\u00A9 IBM Corporation 1999. All rights reserved.";
+
+    /** Backing storage; may be shared with the caller, see the constructors. */
+    private StringBuffer buf;
+
+    /**
+     * Construct a new empty object.
+     */
+    public ReplaceableString() {
+        buf = new StringBuffer();
+    }
+
+    /**
+     * Construct a new object with the given initial contents.
+     * @param str initial contents
+     */
+    public ReplaceableString(String str) {
+        buf = new StringBuffer(str);
+    }
+
+    /**
+     * Construct a new object using <code>buf</code> for internal
+     * storage.  The contents of <code>buf</code> at the time of
+     * construction are used as the initial contents.  <em>Note!
+     * Modifications to <code>buf</code> will modify this object, and
+     * vice versa.</em>
+     * @param buf object to be used as internal storage
+     */
+    public ReplaceableString(StringBuffer buf) {
+        this.buf = buf;
+    }
+
+    /**
+     * Return the contents of this object as a <code>String</code>.
+     * @return string contents of this object
+     */
+    public String toString() {
+        return buf.toString();
+    }
+
+    /**
+     * Return the internal storage of this object.  <em>Note! Any
+     * changes made to the returned object affect this object's
+     * contents, and vice versa.</em>
+     * @return internal buffer used by this object
+     */
+    public StringBuffer getStringBuffer() {
+        return buf;
+    }
+
+    /**
+     * Return the number of characters contained in this object.
+     * <code>Replaceable</code> API.
+     * @return length of the underlying buffer
+     */
+    public int length() {
+        return buf.length();
+    }
+
+    /**
+     * Return the character at the given position in this object.
+     * <code>Replaceable</code> API.
+     * @param offset offset into the contents, from 0 to
+     * <code>length()</code> - 1
+     * @return the character stored at <code>offset</code>
+     */
+    public char charAt(int offset) {
+        return buf.charAt(offset);
+    }
+
+    /**
+     * Copies characters <code>srcStart</code> through
+     * <code>srcLimit-1</code> of this object into <code>dst</code>,
+     * beginning at <code>dst[dstStart]</code>.  In total
+     * <code>srcLimit-srcStart</code> characters are copied.
+     *
+     * @param srcStart the beginning index to copy, inclusive;
+     * <code>0 &lt;= srcStart &lt;= srcLimit</code>.
+     * @param srcLimit the ending index to copy, exclusive;
+     * <code>srcStart &lt;= srcLimit &lt;= length()</code>.
+     * @param dst the destination array.
+     * @param dstStart the start offset in the destination array.
+     */
+    public void getChars(int srcStart, int srcLimit, char dst[], int dstStart) {
+        buf.getChars(srcStart, srcLimit, dst, dstStart);
+    }
+
+    /**
+     * Replace zero or more characters with new characters.
+     * <code>Replaceable</code> API.
+     * @param start the beginning index, inclusive; <code>0 &lt;= start
+     * &lt;= limit</code>.
+     * @param limit the ending index, exclusive; <code>start &lt;= limit
+     * &lt;= length()</code>.
+     * @param text new text to replace characters <code>start</code> to
+     * <code>limit - 1</code>
+     */
+    public void replace(int start, int limit, String text) {
+        if (start == limit) {
+            // Empty range: nothing is removed, so this is a plain insertion.
+            buf.insert(start, text);
+            return;
+        }
+        char[] tail = copyTail(limit);
+        buf.setLength(start);
+        buf.append(text);
+        appendTail(tail);
+    }
+
+    /**
+     * Replace a substring of this object with a range of a character
+     * array.
+     * @param start the beginning index, inclusive; <code>0 &lt;= start
+     * &lt;= limit</code>.
+     * @param limit the ending index, exclusive; <code>start &lt;= limit
+     * &lt;= length()</code>.
+     * @param chars the text to replace characters <code>start</code>
+     * to <code>limit - 1</code>
+     * @param charsStart the beginning index into <code>chars</code>,
+     * inclusive.
+     * @param charsLen the number of characters of <code>chars</code>
+     * to use, counted from <code>charsStart</code>.
+     */
+    public void replace(int start, int limit, char[] chars,
+                        int charsStart, int charsLen) {
+        char[] tail = copyTail(limit);
+        buf.setLength(start);
+        buf.append(chars, charsStart, charsLen);
+        appendTail(tail);
+    }
+
+    /**
+     * Copies the characters from <code>limit</code> to the end of the
+     * buffer, so they can be re-appended after the buffer is truncated.
+     * @return the copied characters, or <code>null</code> when
+     * <code>limit</code> is not below the current length
+     */
+    private char[] copyTail(int limit) {
+        int length = buf.length();
+        if (limit >= length) {
+            return null;
+        }
+        char[] tail = new char[length - limit];
+        buf.getChars(limit, length, tail, 0);
+        return tail;
+    }
+
+    /**
+     * Appends a tail previously obtained from <code>copyTail</code>;
+     * does nothing for <code>null</code>.
+     */
+    private void appendTail(char[] tail) {
+        if (tail != null) {
+            buf.append(tail);
+        }
+    }
+}
diff --git a/icu4j/src/com/ibm/text/RuleBasedTransliterator.java b/icu4j/src/com/ibm/text/RuleBasedTransliterator.java
new file mode 100755
index 00000000000..4a433e9479d
--- /dev/null
+++ b/icu4j/src/com/ibm/text/RuleBasedTransliterator.java
@@ -0,0 +1,1187 @@
+package com.ibm.text;
+
+import java.util.Hashtable;
+import java.util.Vector;
+
+/**
+ * A transliterator that reads a set of rules in order to determine how to
+ * perform translations. Rules are stored in resource bundles indexed by name.
+ * Rules are separated by newline characters ('\n'); to include a literal
+ * newline, prefix it with a backslash ('\\\n'). Whitespace is significant. If
+ * the first character on a line is '#', the entire line is ignored as a
+ * comment.
+ *
+ *
+ * <p>Each set of rules consists of two groups, one forward, and one reverse.
+ * This is a convention that is not enforced; rules for one direction may be
+ * omitted, with the result that translations in that direction will not modify
+ * the source text.
+ *
+ * <p><b>Rule syntax</b>
+ *
+ * <p>Rule statements take one of the following forms:
+ * <dl>
+ * <dt><code>alefmadda=\u0622</code></dt>
+ * <dd><strong>Variable definition.</strong> The name on the left is
+ * assigned the character or expression on the right. Names may not
+ * contain any special characters (see list below). Duplicate names
+ * (including duplicates of simple variables or category names)
+ * cause an exception to be thrown. If the right hand side consists
+ * of one character, then the variable stands for that character.
+ * In this example, after this statement, instances of the left hand
+ * name surrounded by braces, "<code>{alefmadda}</code>",
+ * will be replaced by the Unicode character U+0622. If the
+ * right hand side is longer than one character, then it is
+ * interpreted as a character category expression; see below for
+ * details.</dd>
+ * </dl>
+ *
+ * <dl>
+ * <dt><code>softvowel=[eiyEIY]</code></dt>
+ * <dd><strong>Category definition.</strong> The name on the left is assigned
+ * to stand for a set of characters. The same rules for names of simple
+ * variables apply. After this statement, the left hand variable will be
+ * interpreted as indicating a set of characters in appropriate contexts. The
+ * pattern syntax defining sets of characters is defined by {@link UnicodeSet}.
+ * Examples of valid patterns are:
+ * <blockquote>
+ * <table>
+ * <tr valign=top>
+ * <td nowrap><code>[abc]</code></td>
+ * <td>The set containing the characters 'a', 'b', and 'c'.</td>
+ * </tr>
+ * <tr valign=top>
+ * <td nowrap><code>[^abc]</code></td>
+ * <td>The set of all characters <em>except</em> 'a', 'b', and 'c'.</td>
+ * </tr>
+ * <tr valign=top>
+ * <td nowrap><code>[A-Z]</code></td>
+ * <td>The set of all characters from 'A' to 'Z' in Unicode order.</td>
+ * </tr>
+ * <tr valign=top>
+ * <td nowrap><code>[:Lu:]</code></td>
+ * <td>The set of Unicode uppercase letters. See
+ * <a href="http://www.unicode.org">www.unicode.org</a>
+ * for a complete list of categories and their two-letter codes.</td>
+ * </tr>
+ * <tr valign=top>
+ * <td nowrap><code>[^a-z[:Lu:][:Ll:]]</code></td>
+ * <td>The set of all characters <em>except</em> 'a' through 'z' and
+ * uppercase or lowercase letters.</td>
+ * </tr>
+ * </table>
+ * </blockquote>
+ * <p>See {@link UnicodeSet} for more documentation and examples.
+ * </dd>
+ * </dl>
+ *
+ *
+ * <dl>
+ * <dt><code>ai&gt;{alefmadda}</code></dt>
+ * <dd><strong>Forward translation rule.</strong> This rule states that the
+ * string on the left will be changed to the string on the right when
+ * performing forward transliteration.</dd>
+ * <dt><code>ai&lt;{alefmadda}</code></dt>
+ * <dd><strong>Reverse translation rule.</strong> This rule states that the
+ * string on the right will be changed to the string on the left when
+ * performing reverse transliteration.</dd>
+ * </dl>
+ *
+ *
+ *
+ * <p>Forward and reverse translation rules consist of a <em>match
+ * pattern</em> and an <em>output string</em>. The match pattern consists
+ * of literal characters, optionally preceded by context, and optionally
+ * followed by context. Context characters, like literal pattern characters,
+ * must be matched in the text being transliterated. However, unlike literal
+ * pattern characters, they are not replaced by the output text. For example,
+ * the pattern "<code>[abc]def</code>" indicates the characters
+ * "<code>def</code>" must be preceded by "<code>abc</code>" for a successful
+ * match. If there is a successful match, "<code>def</code>" will be replaced,
+ * but not "<code>abc</code>". The initial '<code>[</code>' is optional, so
+ * "<code>abc]def</code>" is equivalent to "<code>[abc]def</code>". Another
+ * example is "<code>123[456]</code>" (or "<code>123[456</code>") in which the
+ * literal pattern "<code>123</code>" must be followed by "<code>456</code>".
+ *
+ * <p>The output string of a forward or reverse rule consists of characters to
+ * replace the literal pattern characters. If the output string contains the
+ * character '<code>|</code>', this is taken to indicate the location of the
+ * <em>cursor</em> after replacement. The cursor is the point in the text
+ * at which the next replacement, if any, will be applied.
+ *
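+ * <p><b>Example</b>
+ *
+ * <p>The following example rules illustrate many of the features of the rule
+ * language.
+ * <table cellpadding="4">
+ * <tr valign=top><td>Rule 1.</td>
+ * <td nowrap><code>abc]def&gt;x|y</code></td></tr>
+ * <tr valign=top><td>Rule 2.</td>
+ * <td nowrap><code>xyz&gt;r</code></td></tr>
+ * <tr valign=top><td>Rule 3.</td>
+ * <td nowrap><code>yz&gt;q</code></td></tr>
+ * </table>
+ *
+ * <p>Applying these rules to the string "<code>adefabcdefz</code>" yields the
+ * following results:
+ *
+ * <table cellpadding="4">
+ * <tr valign=top><td nowrap><code>|adefabcdefz</code></td>
+ * <td>Initial state, no rules match. Advance cursor.</td></tr>
+ * <tr valign=top><td nowrap><code>a|defabcdefz</code></td>
+ * <td>Still no match. Rule 1 does not match because the preceding
+ * context is not present.</td></tr>
+ * <tr valign=top><td nowrap><code>ad|efabcdefz</code></td>
+ * <td>Still no match. Keep advancing until there is a match...</td></tr>
+ * <tr valign=top><td nowrap><code>ade|fabcdefz</code></td>
+ * <td>...</td></tr>
+ * <tr valign=top><td nowrap><code>adef|abcdefz</code></td>
+ * <td>...</td></tr>
+ * <tr valign=top><td nowrap><code>adefa|bcdefz</code></td>
+ * <td>...</td></tr>
+ * <tr valign=top><td nowrap><code>adefab|cdefz</code></td>
+ * <td>...</td></tr>
+ * <tr valign=top><td nowrap><code>adefabc|defz</code></td>
+ * <td>Rule 1 matches; replace "<code>def</code>" with "<code>xy</code>"
+ * and back up the cursor to before the '<code>y</code>'.</td></tr>
+ * <tr valign=top><td nowrap><code>adefabcx|yz</code></td>
+ * <td>Although "<code>xyz</code>" is present, rule 2 does not match
+ * because the cursor is before the '<code>y</code>', not before the
+ * '<code>x</code>'. Rule 3 does match. Replace "<code>yz</code>" with
+ * "<code>q</code>".</td></tr>
+ * <tr valign=top><td nowrap><code>adefabcxq|</code></td>
+ * <td>The cursor is at the end; transliteration is complete.</td></tr>
+ * </table>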
+ *
+ *
+ * <p>The order of rules is significant. If multiple rules may match at some
+ * point, the first matching rule is applied.
+ *
+ * <p>Forward and reverse rules may have an empty output string. Otherwise, an
+ * empty left or right hand side of any statement is a syntax error.
+ *
+ * <p>Single quotes are used to quote the special characters
+ * <code>=&gt;&lt;{}[]|</code>. To specify a single quote itself, inside or
+ * outside of quotes, use two single quotes in a row. For example, the rule
+ * "<code>'&gt;'&gt;o''clock</code>" changes the string "<code>&gt;</code>" to
+ * the string "<code>o'clock</code>".
+ *
+ * <p><b>Notes</b>
+ *
+ * <p>While a RuleBasedTransliterator is being built, it checks that the rules
+ * are added in proper order. For example, if the rule "a&gt;x" is followed by the
+ * rule "ab&gt;y", then the second rule will throw an exception. The reason is
+ * that the second rule can never be triggered, since the first rule always
+ * matches anything it matches. In other words, the first rule <em>masks</em>
+ * the second rule. There is a cost of O(n^2) to make this check; in real-world
+ * tests it appears to approximately double build time.
+ *
+ * <p>One optimization that can be made is to add a pragma to the rule language,
+ * "#pragma order", that turns off ordering checking. This pragma can then be
+ * added to all of our resource-based rules (after we build these once and
+ * determine that there are no ordering errors). I haven't made this change yet
+ * in the interests of keeping the code from getting too byzantine.
+ *
+ * <p>Copyright &copy; IBM Corporation 1999. All rights reserved.
+ *
+ * @author Alan Liu
+ * @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
+ */
+public class RuleBasedTransliterator extends Transliterator {
+ /**
+ * Direction constant passed to constructor to create a transliterator
+ * using the forward rules.
+ */
+ public static final int FORWARD = 0;
+
+ /**
+ * Direction constant passed to constructor to create a transliterator
+ * using the reverse rules.
+ */
+ public static final int REVERSE = 1;
+
+ private Data data;
+
+ static final boolean DEBUG = false;
+
+ private static final String COPYRIGHT =
+ "\u00A9 IBM Corporation 1999. All rights reserved.";
+
+ /**
+  * Constructs a new transliterator from the given rules.
+  * @param ID identifier handed on to the superclass constructor
+  * @param rules rules, separated by '\n'
+  * @param direction either FORWARD or REVERSE.
+  * @param filter filter handed on to the superclass constructor; the
+  * two-argument constructor of this class passes <code>null</code> here
+  * @exception IllegalArgumentException if rules are malformed
+  * or direction is invalid.
+  */
+ public RuleBasedTransliterator(String ID, String rules, int direction,
+                                UnicodeFilter filter) {
+     super(ID, filter);
+     boolean knownDirection = (direction == FORWARD) || (direction == REVERSE);
+     if (!knownDirection) {
+         throw new IllegalArgumentException("Invalid direction");
+     }
+     this.data = parse(rules, direction);
+ }
+
+ /**
+ * Constructs a new transliterator from the given rules in the
+ * <code>FORWARD</code> direction, with <code>null</code> as the filter.
+ * Delegates to the four-argument constructor.
+ * @param ID identifier forwarded to the four-argument constructor
+ * @param rules rules, separated by '\n'
+ * @exception IllegalArgumentException if rules are malformed.
+ */
+ public RuleBasedTransliterator(String ID, String rules) {
+ this(ID, rules, FORWARD, null);
+ }
+
+ RuleBasedTransliterator(String ID, Data data, UnicodeFilter filter) { // package-private: builds from already-parsed rule data, no parsing here
+ super(ID, filter);
+ this.data = data; // kept by reference, not copied
+ }
+
+ /**
+  * Parses a rule string by running a new <code>Parser</code> over it.
+  * @param rules rules, separated by '\n'
+  * @param direction either FORWARD or REVERSE
+  * @return the <code>Data</code> object filled in by the parser
+  */
+ static Data parse(String rules, int direction) {
+     Parser parser = new Parser(rules, direction);
+     return parser.getData();
+ }
+
+ /**
+ * Transliterates a segment of a string. <code>Transliterator</code> API.
+ * @param text the string to be transliterated
+ * @param start the beginning index, inclusive; <code>0 &lt;= start
+ * &lt;= limit</code>.
+ * @param limit the ending index, exclusive; <code>start &lt;= limit
+ * &lt;= text.length()</code>.
+ * @param result buffer to receive the transliterated text; previous
+ * contents are discarded
+ */
+ public void transliterate(String text, int start, int limit,
+ StringBuffer result) {
+ /* In the following loop there is a virtual buffer consisting of the
+ * text transliterated so far followed by the untransliterated text. There is
+ * also a cursor, which may be in the already transliterated buffer or just
+ * before the untransliterated text.
+ *
+ * Example: rules 1. ab>x|y
+ *                2. yc>z
+ *
+ * []|eabcd  start - no match, copy e to transliterated buffer
+ * [e]|abcd  match rule 1 - copy output & adjust cursor
+ * [ex|y]cd  match rule 2 - copy output & adjust cursor
+ * [exz]|d   no match, copy d to transliterated buffer
+ * [exzd]|   done
+ *
+ * cursor: an index into the virtual buffer, 0..result.length().
+ * Matches take place at the cursor. If there is no match, the cursor
+ * is advanced, and, when the cursor sits at the end of the result
+ * buffer, one character is moved from the source text to the result
+ * buffer.
+ *
+ * start, limit: these designate the substring of the source text which
+ * has not been processed yet. The range of offsets is start..limit-1.
+ * At any moment the virtual buffer consists of result +
+ * text.substring(start, limit).
+ */
+ int cursor = 0;
+ result.setLength(0);
+ while (start < limit || cursor < result.length()) {
+ TransliterationRule r = data.ruleSet.findMatch(text, start, limit, result,
+ cursor, data.setVariables, getFilter());
+ if (DEBUG) {
+ StringBuffer buf = new StringBuffer(
+ result.toString() + '#' + text.substring(start, limit));
+ buf.insert(cursor <= result.length()
+ ? cursor : (cursor + 1),
+ '|');
+ System.err.print((r == null ? "nomatch:" : ("match:" + r + ", "))
+ + buf);
+ }
+
+ if (r == null) {
+ if (cursor == result.length()) {
+ result.append(text.charAt(start++));
+ }
+ ++cursor;
+ } else {
+ // resultPad is length of result to right of cursor; >= 0
+ int resultPad = result.length() - cursor;
+ char[] tail = null;
+ if (r.getKeyLength() > resultPad) {
+ start += r.getKeyLength() - resultPad;
+ } else if (r.getKeyLength() < resultPad) {
+ tail = new char[resultPad - r.getKeyLength()];
+ result.getChars(cursor + r.getKeyLength(), result.length(),
+ tail, 0);
+ }
+ result.setLength(cursor);
+ result.append(r.getOutput());
+ if (tail != null) {
+ result.append(tail);
+ }
+ cursor += r.getCursorPos();
+ }
+
+ if (DEBUG) {
+ StringBuffer buf = new StringBuffer(
+ result.toString() + '#' + text.substring(start, limit));
+ buf.insert(cursor <= result.length()
+ ? cursor : (cursor + 1),
+ '|');
+ System.err.println(" => " + buf);
+ }
+ }
+ }
+
+ /**
+  * Transliterates a segment of a string in place.
+  * <code>Transliterator</code> API.
+  * @param text the string to be transliterated
+  * @param start the beginning index, inclusive; <code>0 &lt;= start
+  * &lt;= limit</code>.
+  * @param limit the ending index, exclusive; <code>start &lt;= limit
+  * &lt;= text.length()</code>.
+  * @return The new limit index
+  */
+ public int transliterate(Replaceable text, int start, int limit) {
+     /* With a Replaceable there is only one buffer, so the algorithm is
+      * simple.  start stays fixed; limit keeps marking the same place in
+      * the text but changes numerically whenever a replacement inserts
+      * or removes characters.  The cursor walks from start to limit and
+      * replacements are made underneath it.
+      *
+      * Example: rules 1. ab>x|y
+      *                2. yc>z
+      *
+      * |eabcd   start - no match, advance cursor
+      * e|abcd   match rule 1 - change text & adjust cursor
+      * ex|ycd   match rule 2 - change text & adjust cursor
+      * exz|d    no match, advance cursor
+      * exzd|    done
+      */
+     for (int cursor = start; cursor < limit; ) {
+         TransliterationRule match = data.ruleSet.findMatch(
+                 text, start, limit, cursor, data.setVariables, getFilter());
+         if (match != null) {
+             text.replace(cursor, cursor + match.getKeyLength(),
+                          match.getOutput());
+             limit += match.getOutput().length() - match.getKeyLength();
+             cursor += match.getCursorPos();
+         } else {
+             ++cursor;
+         }
+     }
+     return limit;
+ }
+
+ /**
+  * Implements {@link Transliterator#handleKeyboardTransliterate}.
+  * Reads <code>index[START]</code>, <code>index[LIMIT]</code> and
+  * <code>index[CURSOR]</code>, applies rules from the cursor onward, and
+  * writes the updated limit and cursor back into <code>index</code>.
+  * Processing stops early when only a partial match is available.
+  * @param text the text being edited; modified in place
+  * @param index offsets array, read and updated as described above
+  */
+ protected void handleKeyboardTransliterate(Replaceable text,
+                                            int[] index) {
+     int start = index[START];
+     int limit = index[LIMIT];
+     int cursor = index[CURSOR];
+
+     if (DEBUG) {
+         String beforeCursor = escape(rsubstring(text, start, cursor));
+         String afterCursor = escape(rsubstring(text, cursor, limit));
+         System.out.print("\"" + beforeCursor + '|' + afterCursor + "\"");
+     }
+
+     boolean[] partial = new boolean[1];
+
+     while (cursor < limit) {
+         TransliterationRule rule = data.ruleSet.findIncrementalMatch(
+                 text, start, limit, cursor, data.setVariables, partial,
+                 getFilter());
+         if (rule != null) {
+             // Full match: substitute the rule output for the key and
+             // move the cursor to where the rule asks for it.
+             text.replace(cursor, cursor + rule.getKeyLength(),
+                          rule.getOutput());
+             limit += rule.getOutput().length() - rule.getKeyLength();
+             cursor += rule.getCursorPos();
+             continue;
+         }
+         if (partial[0]) {
+             // Partial match: nothing can be decided without more text,
+             // so leave the cursor where it is and stop.
+             break;
+         }
+         // No match at all at this position.
+         ++cursor;
+     }
+
+     if (DEBUG) {
+         String done = escape(rsubstring(text, start, cursor));
+         String atCursor = escape(rsubstring(text, cursor, cursor));
+         String pending = escape(rsubstring(text, cursor, limit));
+         System.out.println(" -> \"" + done + '|' + atCursor + '|'
+                            + pending + "\"");
+     }
+
+     index[LIMIT] = limit;
+     index[CURSOR] = cursor;
+ }
+
+ /**
+ * Returns the length of the longest context required by this transliterator.
+ * This is <em>preceding</em> context.  The value is taken from the rule
+ * set held in this transliterator's data.
+ * @return Maximum number of preceding context characters this
+ * transliterator needs to examine
+ */
+ protected int getMaximumContextLength() {
+ return data.ruleSet.getMaximumContextLength();
+ }
+
+
+ /**
+  * FOR DEBUGGING: Return a substring of a Replaceable.
+  * @param r the text to read from
+  * @param start first offset to copy, inclusive
+  * @param limit offset to stop at, exclusive; an empty string results
+  * when <code>start</code> is not below <code>limit</code>
+  * @return the characters <code>start..limit-1</code> of <code>r</code>
+  */
+ private static String rsubstring(Replaceable r, int start, int limit) {
+     StringBuffer collected = new StringBuffer();
+     for (int pos = start; pos < limit; ++pos) {
+         collected.append(r.charAt(pos));
+     }
+     return collected.toString();
+ }
+
+ /**
+ * FOR DEBUGGING: Escape non-ASCII characters as Unicode.
+ */
+ private static final String escape(String s) {
+ StringBuffer buf = new StringBuffer();
+ for (int i=0; i= ' ' && c <= 0x007F) {
+ if (c == '\\') {
+ buf.append("\\\\"); // That is, "\\"
+ } else {
+ buf.append(c);
+ }
+ } else {
+ buf.append("\\u");
+ if (c < 0x1000) {
+ buf.append('0');
+ if (c < 0x100) {
+ buf.append('0');
+ if (c < 0x10) {
+ buf.append('0');
+ }
+ }
+ }
+ buf.append(Integer.toHexString(c));
+ }
+ }
+ return buf.toString();
+ }
+
+
+
+
+
+ /**
+  * Holder for the parsed form of a rule set: the rules themselves plus
+  * the two variable tables described on the fields below.
+  */
+ static class Data {
+     /**
+      * Map variable name (String) to variable (Character).  A name
+      * may stand for a single literal character, in which case that
+      * character is stored here.  It may also stand for a UnicodeSet;
+      * then the Character stored here is a stand-in that serves as the
+      * key for a second lookup in <code>setVariables</code>, and also
+      * represents the set inside the stored rules.
+      */
+     public Hashtable variableNames = new Hashtable();
+
+     /**
+      * Map category variable (Character) to set (UnicodeSet).  The
+      * key is the stand-in character recorded in
+      * <code>variableNames</code> for a set-valued variable; the value
+      * is the actual UnicodeSet object.  The stand-in is also what
+      * appears in the rule text to represent the set.
+      */
+     public Hashtable setVariables = new Hashtable();
+
+     /**
+      * Rule table.  May be empty.
+      */
+     public TransliterationRuleSet ruleSet = new TransliterationRuleSet();
+
+     /**
+      * Creates an object with empty tables and an empty rule set.
+      */
+     public Data() {
+     }
+ }
+
+
+
+
+
+
+ private static class Parser {
+ private String rules;
+
+ private int direction;
+
+ private Data data;
+
+ /**
+ * The next available stand-in for variables. This starts at some point in
+ * the private use area (discovered dynamically) and increments up toward
+ * <code>variableLimit</code>. At any point during parsing, available
+ * variables are <code>variableNext..variableLimit-1</code>.
+ */
+ private char variableNext;
+
+ /**
+ * The last available stand-in for variables. This is discovered
+ * dynamically. At any point during parsing, available variables are
+ * <code>variableNext..variableLimit-1</code>.
+ */
+ private char variableLimit;
+
+ // Operators
+ private static final char VARIABLE_DEF_OP = '=';
+ private static final char FORWARD_RULE_OP = '>';
+ private static final char REVERSE_RULE_OP = '<';
+
+ private static final String OPERATORS = "=><";
+
+ // Other special characters
+ private static final char QUOTE = '\'';
+ private static final char VARIABLE_REF_OPEN = '{';
+ private static final char VARIABLE_REF_CLOSE = '}';
+ private static final char CONTEXT_OPEN = '[';
+ private static final char CONTEXT_CLOSE = ']';
+ private static final char CURSOR_POS = '|';
+ private static final char RULE_COMMENT_CHAR = '#';
+
+ /**
+ * Specials must be quoted in rules to be used as literals.
+ * Specials may not occur in variable names.
+ */
+ private static final String SPECIALS = "'{}[]|#" + OPERATORS;
+
+ /**
+ * Specials that must be quoted in variable definitions.
+ */
+ private static final String DEF_SPECIALS = "'{}";
+
+ /**
+ * Constructs a parser and immediately parses <code>rules</code>; the
+ * outcome is stored in a new <code>Data</code> object.
+ * @param rules list of rules, separated by newline characters
+ * @param direction the direction to build for; forward and reverse rules
+ * are only added to the rule set when they match this direction
+ * @exception IllegalArgumentException if there is a syntax error in the
+ * rules
+ */
+ public Parser(String rules, int direction) {
+ this.rules = rules;
+ this.direction = direction;
+ data = new Data();
+ parseRules();
+ }
+
+ public Data getData() { // the Data object created in the constructor and filled in by parsing
+ return data;
+ }
+
+ /**
+  * Parse the rule string as a sequence of rules, separated by newline
+  * characters ('\n'), and add each one to this parser's data.  A newline
+  * that directly follows a backslash does not end a rule.  Empty lines
+  * and lines whose first character is '#' are skipped.  The rule set is
+  * frozen once all lines have been processed.  Typically this method is
+  * called exactly once, during construction.
+  * @exception IllegalArgumentException if there is a syntax error in the
+  * rules
+  */
+ private void parseRules() {
+     determineVariableRange();
+
+     int n = rules.length();
+     int i = 0;
+     while (i < n) {
+         int limit = rules.indexOf('\n', i);
+
+         // A newline preceded by a backslash is escaped: keep looking
+         // for the next newline that really terminates the rule.
+         while (limit > 0 && rules.charAt(limit-1) == '\\') {
+             limit = rules.indexOf('\n', limit+1);
+         }
+
+         // No terminating newline: the rule runs to the end of the text.
+         if (limit == -1) {
+             limit = n;
+         }
+         // Skip over empty lines and line starting with #
+         if (limit > i && rules.charAt(i) != RULE_COMMENT_CHAR) {
+             applyRule(i, limit);
+         }
+         i = limit + 1;
+     }
+
+     data.ruleSet.freeze();
+ }
+
+ /**
+ * Parse the given substring as a rule, and append it to the rules currently
+ * represented in this object.
+ * @param start the beginning index, inclusive; 0 <= start
+ * <= limit
.
+ * @param limit the ending index, exclusive; start <= limit
+ * <= rules.length()
.
+ * @exception IllegalArgumentException if there is a syntax error in the
+ * rules
+ */
+ private void applyRule(int start, int limit) {
+     /* General description of parsing: A raw rule contains two kinds of
+      * quoted matter: variable references such as "{alpha}", and quotes
+      * such as "'<'" or "''".  Both are resolved early.  Quotes are
+      * removed, leaving literal text; variable references are replaced
+      * by single characters, which either represent themselves or stand
+      * for a category of characters defined by a statement such as
+      * "vowels=[aeiouAEIOU]".
+      *
+      * Each rule is also split into a left-hand side, an operator, and
+      * a right-hand side.  Neither side may be empty, except for the
+      * output pattern of a forward or reverse rule.  The match pattern
+      * of such a rule is further split into antecontext, literal
+      * pattern, and postcontext (the contexts are optional), and the
+      * cursor indicator '|' is removed from the output pattern with its
+      * position recorded.
+      *
+      * Because quoting lets special characters appear anywhere in the
+      * final text, quote removal, variable resolution, and splitting
+      * have to happen together.  (This makes changes to the rule
+      * language somewhat clumsy: the parsing methods have to be
+      * reviewed as a whole.)
+      *
+      * What comes out is plain unquoted text plus an integer cursor
+      * position, so classes such as UnicodeSet and TransliterationRule
+      * need know nothing of quoting or variables.
+      */
+     StringBuffer left = new StringBuffer();
+     StringBuffer right = new StringBuffer();
+     StringBuffer anteContext = new StringBuffer();
+     StringBuffer postContext = new StringBuffer();
+     int[] cursorPos = new int[1];
+
+     char operator = parseRule(start, limit, left, right,
+                               anteContext, postContext, cursorPos);
+
+     if (operator == VARIABLE_DEF_OP) {
+         applyVariableDef(left.toString(), right.toString());
+         return;
+     }
+
+     // For a forward rule the left side is the pattern and the right
+     // side the output; for a reverse rule the roles are swapped.  A
+     // rule for the direction we are not building is ignored.
+     String pattern;
+     String output;
+     if (operator == FORWARD_RULE_OP && direction == FORWARD) {
+         pattern = left.toString();
+         output = right.toString();
+     } else if (operator == REVERSE_RULE_OP && direction == REVERSE) {
+         pattern = right.toString();
+         output = left.toString();
+     } else {
+         return;
+     }
+     data.ruleSet.addRule(new TransliterationRule(
+             pattern, output,
+             anteContext.toString(), postContext.toString(),
+             cursorPos[0]));
+ }
+
+ /**
+ * Add a variable definition.
+ * @param name the name of the variable. It must not already be defined.
+ * @param pattern the value of the variable. It may be a single character
+ * or a pattern describing a character set.
+ * @exception IllegalArgumentException if there is a syntax error
+ */
+ private final void applyVariableDef(String name, String pattern) {
+ validateVariableName(name);
+ if (data.variableNames.get(name) != null) {
+ throw new IllegalArgumentException("Duplicate variable definition: "
+ + name + '=' + pattern);
+ }
+//! if (UnicodeSet.getCategoryID(name) >= 0) {
+//! throw new IllegalArgumentException("Reserved variable name: "
+//! + name);
+//! }
+ if (pattern.length() < 1) {
+ throw new IllegalArgumentException("Variable definition missing: "
+ + name);
+ }
+ if (pattern.length() == 1) {
+ // Got a single character variable definition
+ data.variableNames.put(name, new Character(pattern.charAt(0)));
+ } else {
+ // Got more than one character; parse it as a category
+ if (variableNext >= variableLimit) {
+ throw new RuntimeException("Private use variables exhausted");
+ }
+ Character c = new Character(variableNext++);
+ data.variableNames.put(name, c);
+ data.setVariables.put(c, new UnicodeSet(pattern));
+ }
+ }
+
+ /**
+ * Given a rule, parses it into three pieces: The left side, the right side,
+ * and the operator. Returns the operator. Quotes and variable references
+ * are resolved; the output text in all <code>StringBuffer</code> parameters
+ * is literal text. This method delegates to other parsing methods to
+ * handle the match pattern, output pattern, and other sub-patterns in the
+ * rule.
+ * @param start the beginning index, inclusive; <code>0 &lt;= start
+ * &lt;= limit</code>.
+ * @param limit the ending index, exclusive; <code>start &lt;= limit
+ * &lt;= rules.length()</code>.
+ * @param left left side of rule is appended to this buffer
+ * with the quotes removed and variables resolved
+ * @param right right side of rule is appended to this buffer
+ * with the quotes removed and variables resolved
+ * @param anteContext the preceding context of the match pattern,
+ * if there is one, is appended to this buffer
+ * @param postContext the following context of the match pattern,
+ * if there is one, is appended to this buffer
+ * @param cursorPos if there is a cursor in the output pattern, its
+ * offset is stored in cursorPos[0]
+ * @return The operator character, one of the characters in OPERATORS.
+ * @exception IllegalArgumentException if no unquoted operator is found in
+ * the rule, or if a side of the rule that must be present is empty
+ */
+ private char parseRule(int start, int limit,
+ StringBuffer left, StringBuffer right,
+ StringBuffer anteContext,
+ StringBuffer postContext,
+ int[] cursorPos) {
+ // Debug trace, disabled by the constant condition.
+ if (false) {
+ System.err.println("Parsing " + rules.substring(start, limit));
+ }
+ /* Parse the rule into three pieces -- left, operator, and right,
+ * parsing out quotes. The result is that left and right will have
+ * unquoted text. E.g., "gt<'>'" will have right = ">". Unquoted
+ * operators throw an exception. Two quotes inside or outside
+ * quotes indicates a quote literal. E.g., "o''clock" -> "o'clock".
+ */
+ // Locate the first operator character that is not inside quotes.
+ int i = quotedIndexOf(rules, start, limit, OPERATORS);
+ if (i < 0) {
+ throw new IllegalArgumentException(
+ "Syntax error: "
+ + rules.substring(start, limit));
+ }
+ char c = rules.charAt(i);
+ switch (c) {
+ case FORWARD_RULE_OP:
+ // Forward rule: the left side is the match pattern and is required;
+ // the right side is the output pattern and may be empty.
+ if (i == start) {
+ throw new IllegalArgumentException(
+ "Empty left side: "
+ + rules.substring(start, limit));
+ }
+ parseMatchPattern(start, i, left, anteContext, postContext);
+ if (i != (limit-1)) {
+ parseOutputPattern(i+1, limit, right, cursorPos);
+ }
+ break;
+ case REVERSE_RULE_OP:
+ // Reverse rule: the right side is the match pattern and is required;
+ // the left side is the output pattern and may be empty.
+ if (i == (limit-1)) {
+ throw new IllegalArgumentException(
+ "Empty right side: "
+ + rules.substring(start, limit));
+ }
+ if (i != start) {
+ parseOutputPattern(start, i, left, cursorPos);
+ }
+ parseMatchPattern(i+1, limit, right, anteContext, postContext);
+ break;
+ default:
+ // Any other operator: both sides are required. The left side is
+ // parsed as an ordinary sub-pattern and the right side as a
+ // variable-definition pattern (see parseDefPattern).
+ if (i == start || i == (limit-1)) {
+ throw new IllegalArgumentException(
+ "Empty left or right side: "
+ + rules.substring(start, limit));
+ }
+ parseSubPattern(start, i, left);
+ parseDefPattern(i+1, limit, right);
+ break;
+ }
+ return c;
+ }
+
+ /**
+ * Parses the match pattern of a forward or reverse rule. Given the raw
+ * match pattern, return the match text and the context on both sides, if
+ * any. Resolves all quotes and variables.
+ * @param start the beginning index, inclusive; <code>0 &lt;= start
+ * &lt;= limit</code>.
+ * @param limit the ending index, exclusive; <code>start &lt;= limit
+ * &lt;= rules.length()</code>.
+ * @param text the key to be matched will be appended to this buffer
+ * @param anteContext the preceding context, if any, will be appended
+ * to this buffer. If this parameter is null, no context parsing is done
+ * and the whole span is parsed as the key.
+ * @param postContext the following context, if any, will be appended
+ * to this buffer.
+ * @exception IllegalArgumentException if the span is empty or the context
+ * delimiters appear in the wrong order
+ */
+ private void parseMatchPattern(int start, int limit,
+ StringBuffer text,
+ StringBuffer anteContext,
+ StringBuffer postContext) {
+ if (start >= limit) {
+ throw new IllegalArgumentException(
+ "Empty expression in rule: "
+ + rules.substring(start, limit));
+ }
+ if (anteContext != null) {
+ // Ignore optional opening and closing context characters
+ if (rules.charAt(start) == CONTEXT_OPEN) {
+ ++start;
+ }
+ if (rules.charAt(limit-1) == CONTEXT_CLOSE) {
+ --limit;
+ }
+ // The four possibilities are:
+ // key
+ // anteContext]key
+ // anteContext]key[postContext
+ // key[postContext
+ // Find the first unquoted close and open delimiters in what remains.
+ int ante = quotedIndexOf(rules, start, limit, String.valueOf(CONTEXT_CLOSE));
+ int post = quotedIndexOf(rules, start, limit, String.valueOf(CONTEXT_OPEN));
+ // The close delimiter (end of anteContext) must precede the open
+ // delimiter (start of postContext) when both are present.
+ if (ante >= 0 && post >= 0 && ante > post) {
+ throw new IllegalArgumentException(
+ "Syntax error in context specifier: "
+ + rules.substring(start, limit));
+ }
+ if (ante >= 0) {
+ parseSubPattern(start, ante, anteContext);
+ start = ante+1;
+ }
+ if (post >= 0) {
+ parseSubPattern(post+1, limit, postContext);
+ limit = post;
+ }
+ }
+ // Whatever is left between start and limit is the key.
+ parseSubPattern(start, limit, text);
+ }
+
+ /**
+ * Parses a sub-pattern of a rule. Delegates to the five-argument
+ * overload, passing null for the cursor position and SPECIALS as the set
+ * of characters that must be quoted.
+ * @param start the beginning index into the rules, inclusive
+ * @param limit the ending index into the rules, exclusive
+ * @param text the parsed text will be appended to this buffer
+ */
+ private final void parseSubPattern(int start, int limit,
+ StringBuffer text) {
+ parseSubPattern(start, limit, text, null, SPECIALS);
+ }
+
+ /**
+ * Parse a variable definition sub pattern. This kind of sub
+ * pattern differs in the set of characters that are considered
+ * special. In particular, the '[' and ']' characters are not
+ * special, since these are used in UnicodeSet patterns.
+ * Delegates to the five-argument <code>parseSubPattern()</code>, passing
+ * null for the cursor position and DEF_SPECIALS as the special characters.
+ * @param start the beginning index into the rules, inclusive
+ * @param limit the ending index into the rules, exclusive
+ * @param text the parsed text will be appended to this buffer
+ */
+ private final void parseDefPattern(int start, int limit,
+ StringBuffer text) {
+ parseSubPattern(start, limit, text, null, DEF_SPECIALS);
+ }
+
+ /**
+ * Parses the output pattern of a forward or reverse rule. Given the
+ * output pattern, return the output text and the position of the cursor,
+ * if any. Resolves all quotes and variables. The indices refer to the
+ * <code>rules</code> string of this parser. Delegates to the
+ * five-argument <code>parseSubPattern()</code> with SPECIALS as the
+ * special characters.
+ * @param start the beginning index, inclusive; <code>0 &lt;= start
+ * &lt;= limit</code>.
+ * @param limit the ending index, exclusive; <code>start &lt;= limit
+ * &lt;= rules.length()</code>.
+ * @param text the output text will be appended to this buffer
+ * @param cursorPos if this parameter is not null, then cursorPos[0]
+ * will be set to the cursor position, or -1 if there is none. If this
+ * parameter is null, then cursors will be disallowed.
+ */
+ private final void parseOutputPattern(int start, int limit,
+ StringBuffer text,
+ int[] cursorPos) {
+ parseSubPattern(start, limit, text, cursorPos, SPECIALS);
+ }
+
+ /**
+ * Parses a sub-pattern of a rule. Return the text and the position of the cursor,
+ * if any. Resolves all quotes and variables. The indices refer to the
+ * <code>rules</code> string of this parser.
+ * @param start the beginning index, inclusive; <code>0 &lt;= start
+ * &lt;= limit</code>.
+ * @param limit the ending index, exclusive; <code>start &lt;= limit
+ * &lt;= rules.length()</code>.
+ * @param text the output text will be appended to this buffer
+ * @param cursorPos if this parameter is not null, then cursorPos[0]
+ * will be set to the cursor position, or -1 if there is none. If this
+ * parameter is null, then cursors will be disallowed.
+ * @param specials characters that must be quoted; typically either
+ * SPECIALS or DEF_SPECIALS.
+ * @exception IllegalArgumentException if the span is empty, if more than
+ * one cursor is found, or if a character in <code>specials</code> appears
+ * unquoted
+ */
+ private void parseSubPattern(int start, int limit,
+ StringBuffer text,
+ int[] cursorPos,
+ String specials) {
+ boolean inQuote = false;
+
+ if (start >= limit) {
+ throw new IllegalArgumentException("Empty expression in rule");
+ }
+ // Start from "no cursor seen" when the caller wants the cursor reported.
+ if (cursorPos != null) {
+ cursorPos[0] = -1;
+ }
+ // NOTE(review): the loop header and the first part of the loop body on
+ // the next line are truncated in this copy of the patch (text between
+ // '<' and '>' was lost). Restore from the original source before use.
+ for (int i=start; i= 0) {
+ throw new IllegalArgumentException("Multiple cursors: "
+ + rules.substring(start, limit));
+ }
+ // The cursor offset is measured in the output text parsed so far.
+ cursorPos[0] = text.length();
+ } else if (specials.indexOf(c) >= 0) {
+ throw new IllegalArgumentException("Unquoted special character: "
+ + rules.substring(start, limit));
+ } else {
+ text.append(c);
+ }
+ }
+ }
+
+ /**
+ * Checks that a variable name contains none of the characters in
+ * SPECIALS.
+ * @param name the candidate variable name
+ * @exception IllegalArgumentException if the name contains a special
+ * character
+ */
+ private static void validateVariableName(String name) {
+     final boolean hasSpecial = indexOf(name, SPECIALS) >= 0;
+     if (!hasSpecial) {
+         return;
+     }
+     throw new IllegalArgumentException(
+         "Special character in variable name: "
+         + name);
+ }
+
+ /**
+ * Looks up the single-character value registered for a variable name.
+ * Only names present in <code>data.variableNames</code> are recognized.
+ * The former fallback that resolved Unicode category names to stand-in
+ * characters is no longer supported.
+ * @param name the variable name to look up
+ * @return the character registered for the name
+ * @exception IllegalArgumentException if the name is unknown.
+ */
+ private Character getVariableDef(String name) {
+     final Object value = data.variableNames.get(name);
+     if (value != null) {
+         return (Character) value;
+     }
+     throw new IllegalArgumentException("Undefined variable: "
+                                        + name);
+ }
+
+ /**
+ * Chooses the part of the Unicode private use region that will supply
+ * variable stand-in characters. The ideal approach would parse each rule
+ * and collect only the characters used in FROM expressions, ignoring TO
+ * expressions. This implementation instead takes the largest subrange of
+ * the private use area that contains no character of the rules at all.
+ * Sets <code>variableNext</code> to the first character of that subrange
+ * and <code>variableLimit</code> to the character just past its end.
+ * @exception RuntimeException if no usable subrange exists or the
+ * subrange found is empty
+ */
+ private final void determineVariableRange() {
+     // Private use area: 0x1900 characters starting at U+E000.
+     final Range privateUse = new Range('\uE000', 0x1900);
+     final Range unused = privateUse.largestUnusedSubrange(rules);
+
+     if (unused == null) {
+         throw new RuntimeException(
+             "No private use characters available for variables");
+     }
+
+     variableNext = unused.start;
+     variableLimit = (char) (unused.start + unused.length);
+
+     if (variableNext < variableLimit) {
+         return;
+     }
+     throw new RuntimeException(
+         "Too few private use characters available for variables");
+ }
+
+ /**
+ * Returns the index of the first character in a set, ignoring quoted text.
+ * For example, in the string "abc'hide'h", the 'h' in "hide" will not be
+ * found by a search for "h". Unlike String.indexOf(), this method searches
+ * not for a single character, but for any character of the string
+ * <code>setOfChars</code>.
+ * @param text text to be searched
+ * @param start the beginning index, inclusive; <code>0 &lt;= start
+ * &lt;= limit</code>.
+ * @param limit the ending index, exclusive; <code>start &lt;= limit
+ * &lt;= text.length()</code>.
+ * @param setOfChars string with one or more distinct characters
+ * @return Offset of the first character in <code>setOfChars</code>
+ * found, or -1 if not found.
+ * @see #indexOf
+ */
+ private static int quotedIndexOf(String text, int start, int limit,
+ String setOfChars) {
+ // NOTE(review): the loop header and the start of the loop body on the
+ // next line are truncated in this copy of the patch (text between '<'
+ // and '>' was lost). Restore from the original source before use.
+ for (int i=start; i= 0) {
+ return i;
+ }
+ }
+ return -1;
+ }
+
+ /**
+ * Returns the index of the first character in a set. Unlike
+ * String.indexOf(), this method searches not for a single character, but
+ * for any character of the string <code>setOfChars</code>.
+ * @param text text to be searched
+ * @param start the beginning index, inclusive; <code>0 &lt;= start
+ * &lt;= limit</code>.
+ * @param limit the ending index, exclusive; <code>start &lt;= limit
+ * &lt;= text.length()</code>.
+ * @param setOfChars string with one or more distinct characters
+ * @return Offset of the first character in <code>setOfChars</code>
+ * found, or -1 if not found.
+ * @see #quotedIndexOf
+ */
+ private static int indexOf(String text, int start, int limit,
+ String setOfChars) {
+ // NOTE(review): the loop header and the start of the loop body on the
+ // next line are truncated in this copy of the patch (text between '<'
+ // and '>' was lost). Restore from the original source before use.
+ for (int i=start; i= 0) {
+ return i;
+ }
+ }
+ return -1;
+ }
+
+ /**
+ * Returns the index of the first character in a set. Unlike
+ * String.indexOf(), this method searches not for a single character, but
+ * for any character of the string <code>setOfChars</code>. This overload
+ * searches the whole of <code>text</code>.
+ * @param text text to be searched
+ * @param setOfChars string with one or more distinct characters
+ * @return Offset of the first character in <code>setOfChars</code>
+ * found, or -1 if not found.
+ * @see #quotedIndexOf
+ */
+ private static int indexOf(String text, String setOfChars) {
+ return indexOf(text, 0, text.length(), setOfChars);
+ }
+
+
+
+ /**
+ * A range of Unicode characters. Support the operations of testing for
+ * inclusion (does this range contain this character?) and splitting.
+ * Splitting involves breaking a range into two smaller ranges around a
+ * character inside the original range. The split character is not included
+ * in either range. If the split character is at either extreme end of the
+ * range, one of the split products is an empty range.
+ *
+ * This class is used internally to determine the largest available private
+ * use character range for variable stand-ins.
+ */
+ private static class Range implements Cloneable {
+ // First character of the range.
+ char start;
+ // Number of characters in the range, counted from start.
+ int length;
+
+ /**
+ * Constructs a range of <code>length</code> characters beginning at
+ * <code>start</code>.
+ */
+ Range(char start, int length) {
+ this.start = start;
+ this.length = length;
+ }
+
+ /**
+ * Returns a new Range with the same start and length as this one.
+ */
+ public Object clone() {
+ return new Range(start, length);
+ }
+
+ /**
+ * Returns true if c lies within this range, that is, if
+ * <code>start &lt;= c &lt; start + length</code>.
+ */
+ boolean contains(char c) {
+ return c >= start && (c - start) < length;
+ }
+
+ /**
+ * Assume that contains(c) is true. Split this range into two new
+ * ranges around the character c. Make this range one of the new ranges
+ * (modify it in place) and return the other new range. The character
+ * itself is not included in either range. If the split results in an
+ * empty range (that is, if c == start or c == start + length - 1) then
+ * return null.
+ */
+ Range split(char c) {
+ if (c == start) {
+ // c is the first character: drop it from the front.
+ ++start;
+ --length;
+ return null;
+ } else if (c - start == length - 1) {
+ // c is the last character: drop it from the end.
+ --length;
+ return null;
+ } else {
+ // c is interior: the returned range covers everything after c,
+ // and this range is shortened to everything before c.
+ ++c;
+ Range r = new Range(c, start + length - c);
+ length = --c - start;
+ return r;
+ }
+ }
+
+ /**
+ * Finds the largest unused subrange by the given string. A
+ * subrange is unused by a string if the string contains no
+ * characters in that range. If the given string contains no
+ * characters in this range, then this range itself is
+ * returned.
+ */
+ Range largestUnusedSubrange(String str) {
+ int n = str.length();
+
+ Vector v = new Vector(1);
+ v.addElement(clone());
+ // NOTE(review): the loop over str and the start of the selection
+ // loop on the next line are truncated in this copy of the patch
+ // (text between '<' and '>' was lost). Restore from the original
+ // source before use.
+ for (int i=0; i bestRange.length) {
+ bestRange = r;
+ }
+ }
+
+ return bestRange;
+ }
+ }
+ }
+}
diff --git a/icu4j/src/com/ibm/text/TransliterationRule.java b/icu4j/src/com/ibm/text/TransliterationRule.java
new file mode 100755
index 00000000000..383c77ed340
--- /dev/null
+++ b/icu4j/src/com/ibm/text/TransliterationRule.java
@@ -0,0 +1,530 @@
+package com.ibm.text;
+
+import java.util.Dictionary;
+
+/**
+ * A transliteration rule used by
+ * <code>RuleBasedTransliterator</code>.
+ * <code>TransliterationRule</code> is an immutable object.
+ *
+ * <p>A rule consists of an input pattern and an output string. When
+ * the input pattern is matched, the output string is emitted. The
+ * input pattern consists of zero or more characters which are matched
+ * exactly (the key) and optional context. Context must match if it
+ * is specified. Context may be specified before the key, after the
+ * key, or both. The key, preceding context, and following context
+ * may contain variables. Variables represent a set of Unicode
+ * characters, such as the letters <i>a</i> through <i>z</i>.
+ * Variables are detected by looking up each character in a supplied
+ * variable list to see if it has been so defined.
+ *
+ * <p>Copyright &copy; IBM Corporation 1999. All rights reserved.
+ *
+ * @author Alan Liu
+ * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
+ */
+class TransliterationRule {
+ /**
+ * Constant returned by <code>getMatchDegree()</code> indicating a mismatch
+ * between the text and this rule. One or more characters of the context or
+ * key do not match the text.
+ * @see #getMatchDegree
+ */
+ public static final int MISMATCH = 0;
+
+ /**
+ * Constant returned by <code>getMatchDegree()</code> indicating a partial
+ * match between the text and this rule. All characters of the text match
+ * the corresponding context or key, but more characters are required for a
+ * complete match. There are some key or context characters at the end of
+ * the pattern that remain unmatched because the text isn't long enough.
+ * @see #getMatchDegree
+ */
+ public static final int PARTIAL_MATCH = 1;
+
+ /**
+ * Constant returned by <code>getMatchDegree()</code> indicating a complete
+ * match between the text and this rule. The text matches all context and
+ * key characters.
+ * @see #getMatchDegree
+ */
+ public static final int FULL_MATCH = 2;
+
+ /**
+ * The string that must be matched.
+ */
+ private String key;
+
+ /**
+ * The string that is emitted if the key, anteContext, and postContext
+ * are matched.
+ */
+ private String output;
+
+ /**
+ * The string that must match before the key. Must not be the empty string.
+ * May be null; if null, then there is no matching requirement before the
+ * key. The constructor stores null when given an empty string.
+ */
+ private String anteContext;
+
+ /**
+ * The string that must match after the key. Must not be the empty string.
+ * May be null; if null, then there is no matching requirement after the
+ * key. The constructor stores null when given an empty string.
+ */
+ private String postContext;
+
+ /**
+ * The position of the cursor after emitting the output string, from 0 to
+ * output.length(). For most rules with no special cursor specification,
+ * the cursorPos is output.length().
+ */
+ private int cursorPos;
+
+ /**
+ * A string used to implement masks(). The constructor sets it to the key
+ * followed by the postContext argument, if one is given. freeze() sets it
+ * to null.
+ */
+ private String maskKey;
+
+ /**
+ * Copyright notice.
+ */
+ private static final String COPYRIGHT =
+ "\u00A9 IBM Corporation 1999. All rights reserved.";
+
+ /**
+ * Creates a rule from a key, its output text, optional context strings
+ * and an optional cursor position within the output.
+ * @param key the string to match
+ * @param output the string to produce when the <code>key</code> is seen
+ * @param anteContext if not null and not empty, then it must be matched
+ * before the <code>key</code>
+ * @param postContext if not null and not empty, then it must be matched
+ * after the <code>key</code>
+ * @param cursorPos a position for the cursor after the <code>output</code>
+ * is emitted. If less than zero, then the cursor is placed after the
+ * <code>output</code>; that is, -1 is equivalent to
+ * <code>output.length()</code>. If greater than
+ * <code>output.length()</code> then an exception is thrown.
+ * @exception IllegalArgumentException if the cursor position is out of
+ * range.
+ */
+ public TransliterationRule(String key, String output,
+                            String anteContext, String postContext,
+                            int cursorPos) {
+     this.key = key;
+     this.output = output;
+
+     // Empty context strings are normalized to null ("no requirement").
+     if (anteContext == null || anteContext.length() == 0) {
+         this.anteContext = null;
+     } else {
+         this.anteContext = anteContext;
+     }
+     if (postContext == null || postContext.length() == 0) {
+         this.postContext = null;
+     } else {
+         this.postContext = postContext;
+     }
+
+     // A negative cursor position means "after the output".
+     if (cursorPos < 0) {
+         this.cursorPos = output.length();
+     } else {
+         this.cursorPos = cursorPos;
+     }
+     if (this.cursorPos > output.length()) {
+         throw new IllegalArgumentException("Illegal cursor position");
+     }
+
+     /* The mask key speeds up adding individual rules to a rule set
+      * (measured: 13.0 seconds without it, 6.2 seconds with it). Once the
+      * rules have been added it can be discarded to free space, which is
+      * what freeze() does. After freeze() has been called, masks() must
+      * NOT be called.
+      */
+     maskKey = (postContext != null) ? key + postContext : key;
+ }
+
+ /**
+ * Return the length of the key. Equivalent to <code>getKey().length()</code>.
+ * @return the length of the match key.
+ */
+ public int getKeyLength() {
+ return key.length();
+ }
+
+ /**
+ * Return the key, the string that this rule requires to be matched.
+ * @return the match key.
+ */
+ public String getKey() {
+ return key;
+ }
+
+ /**
+ * Return the output string, the text emitted when the key and any
+ * context are matched.
+ * @return the output string.
+ */
+ public String getOutput() {
+ return output;
+ }
+
+ /**
+ * Return the position of the cursor within the output string.
+ * @return a value from 0 to <code>getOutput().length()</code>, inclusive.
+ */
+ public int getCursorPos() {
+ return cursorPos;
+ }
+
+ /**
+ * Return the preceding context length. This method is needed to
+ * support the <code>Transliterator</code> method
+ * <code>getMaximumContextLength()</code>.
+ * @return the length of the preceding context, or 0 if this rule has
+ * none.
+ */
+ public int getAnteContextLength() {
+ return anteContext == null ? 0 : anteContext.length();
+ }
+
+ /**
+ * Return true if this rule masks another rule. If r1 masks r2 then
+ * r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks
+ * r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y".
+ * "[c]a>x" masks "[dc]a>y".
+ *
+ * <p>This method must not be called after freeze() is called.
+ * @param r2 the rule that may be masked by this rule
+ * @return true if this rule masks <code>r2</code>
+ */
+ public boolean masks(TransliterationRule r2) {
+     /* Masking is decided on the mask key (key followed by the following
+      * context) and on the preceding context. This rule masks r2 when:
+      *
+      * 1. its mask key is a proper prefix of r2's mask key; or
+      *
+      * 2. the mask keys are equal, r2 has a preceding context, and this
+      *    rule either has no preceding context or has one that is a
+      *    shorter suffix of r2's.
+      *
+      * LIMITATION: Some maskings are not detected. For example,
+      * "{Lu}]a>x" masks "A]a>y". Detecting these needs a subset operator
+      * on UnicodeSet objects, which does not exist yet.
+      */
+
+     // Case 1: shorter mask key that r2's mask key starts with.
+     if (maskKey.length() < r2.maskKey.length() &&
+         r2.maskKey.startsWith(maskKey)) {
+         return true;
+     }
+
+     // Case 2 requires r2 to have a preceding context and equal mask keys.
+     if (r2.anteContext == null || !maskKey.equals(r2.maskKey)) {
+         return false;
+     }
+     if (anteContext == null) {
+         return true;
+     }
+     return anteContext.length() < r2.anteContext.length() &&
+            r2.anteContext.endsWith(anteContext);
+ }
+
+ /**
+ * Free up space by discarding the mask key. Once this method is called,
+ * masks() must NOT be called. If it is called, an exception will be
+ * thrown, because masks() dereferences the mask key.
+ */
+ public void freeze() {
+ maskKey = null;
+ }
+
+ /**
+ * Return a string representation of this object: the class name followed
+ * by the rule in brackets. Contexts are shown in square brackets around
+ * the key, and a cursor that is not at the end of the output is shown as
+ * '|'. The rule text is passed through <code>escape()</code>.
+ * @return string representation of this object
+ */
+ public String toString() {
+     StringBuffer rule = new StringBuffer();
+     if (anteContext != null) {
+         rule.append('[').append(anteContext).append(']');
+     }
+     rule.append(key);
+     if (postContext != null) {
+         rule.append('[').append(postContext).append(']');
+     }
+     rule.append(" -> ");
+     if (cursorPos < output.length()) {
+         // Mark the cursor position inside the output.
+         rule.append(output.substring(0, cursorPos));
+         rule.append('|');
+         rule.append(output.substring(cursorPos));
+     } else {
+         rule.append(output);
+     }
+     return getClass().getName() + '[' + escape(rule.toString()) + ']';
+ }
+
+ /**
+ * Return true if this rule matches the given text. The text being matched
+ * occupies a virtual buffer consisting of the contents of
+ * <code>result</code> concatenated to a substring of <code>text</code>.
+ * The substring is specified by <code>start</code> and <code>limit</code>.
+ * The value of <code>cursor</code> is an index into this virtual buffer,
+ * from 0 to the length of the buffer. In terms of the parameters,
+ * <code>cursor</code> must be between 0 and <code>result.length() + limit -
+ * start</code>.
+ * @param text the untranslated text
+ * @param start the beginning index, inclusive; <code>0 &lt;= start
+ * &lt;= limit</code>.
+ * @param limit the ending index, exclusive; <code>start &lt;= limit
+ * &lt;= text.length()</code>.
+ * @param result translated text so far
+ * @param cursor position at which to translate next, an offset into result.
+ * If greater than or equal to result.length(), represents offset start +
+ * cursor - result.length() into text.
+ * @param variables a dictionary of variables mapping <code>Character</code>
+ * to <code>UnicodeSet</code>
+ * @param filter the filter. Any character for which
+ * <code>filter.isIn()</code> returns <code>false</code> will not be
+ * altered by this transliterator. If <code>filter</code> is
+ * <code>null</code> then no filtering is applied.
+ * @return true if the preceding context, key and following context all
+ * match
+ */
+ public boolean matches(String text, int start, int limit,
+                        StringBuffer result, int cursor,
+                        Dictionary variables,
+                        UnicodeFilter filter) {
+     // Preceding context, if any, must match just before the cursor.
+     if (anteContext != null
+         && !regionMatches(text, start, limit, result,
+                           cursor - anteContext.length(),
+                           anteContext, variables, filter)) {
+         return false;
+     }
+     // The key must match at the cursor.
+     if (!regionMatches(text, start, limit, result, cursor,
+                        key, variables, filter)) {
+         return false;
+     }
+     // Following context, if any, must match right after the key.
+     if (postContext == null) {
+         return true;
+     }
+     return regionMatches(text, start, limit, result,
+                          cursor + key.length(),
+                          postContext, variables, filter);
+ }
+
+ /**
+ * Return true if this rule matches the given text.
+ * @param text the text, both translated and untranslated
+ * @param start the beginning index, inclusive; <code>0 &lt;= start
+ * &lt;= limit</code>.
+ * @param limit the ending index, exclusive; <code>start &lt;= limit
+ * &lt;= text.length()</code>.
+ * @param cursor position at which to translate next, representing offset
+ * into text. This value must be between <code>start</code> and
+ * <code>limit</code>.
+ * @param variables a dictionary of variables mapping <code>Character</code>
+ * to <code>UnicodeSet</code>
+ * @param filter the filter. Any character for which
+ * <code>filter.isIn()</code> returns <code>false</code> will not be
+ * altered by this transliterator. If <code>filter</code> is
+ * <code>null</code> then no filtering is applied.
+ * @return true if the preceding context, key and following context all
+ * match
+ */
+ public boolean matches(Replaceable text, int start, int limit,
+                        int cursor, Dictionary variables,
+                        UnicodeFilter filter) {
+     // Preceding context, if any, must match just before the cursor.
+     if (anteContext != null
+         && !regionMatches(text, start, limit,
+                           cursor - anteContext.length(),
+                           anteContext, variables, filter)) {
+         return false;
+     }
+     // The key must match at the cursor.
+     if (!regionMatches(text, start, limit, cursor,
+                        key, variables, filter)) {
+         return false;
+     }
+     // Following context, if any, must match right after the key.
+     if (postContext == null) {
+         return true;
+     }
+     return regionMatches(text, start, limit, cursor + key.length(),
+                          postContext, variables, filter);
+ }
+
+ /**
+ * Return the degree of match between this rule and the given text. The
+ * degree of match may be mismatch, a partial match, or a full match. A
+ * mismatch means at least one character of the text does not match the
+ * context or key. A partial match means some context and key characters
+ * match, but the text is not long enough to match all of them. A full
+ * match means all context and key characters match.
+ * @param text the text, both translated and untranslated
+ * @param start the beginning index, inclusive; <code>0 &lt;= start
+ * &lt;= limit</code>.
+ * @param limit the ending index, exclusive; <code>start &lt;= limit
+ * &lt;= text.length()</code>.
+ * @param cursor position at which to translate next, representing offset
+ * into text. This value must be between <code>start</code> and
+ * <code>limit</code>.
+ * @param variables a dictionary of variables mapping <code>Character</code>
+ * to <code>UnicodeSet</code>
+ * @param filter the filter. Any character for which
+ * <code>filter.isIn()</code> returns <code>false</code> will not be
+ * altered by this transliterator. If <code>filter</code> is
+ * <code>null</code> then no filtering is applied.
+ * @return one of <code>MISMATCH</code>, <code>PARTIAL_MATCH</code>, or
+ * <code>FULL_MATCH</code>.
+ * @see #MISMATCH
+ * @see #PARTIAL_MATCH
+ * @see #FULL_MATCH
+ */
+ public int getMatchDegree(Replaceable text, int start, int limit,
+                           int cursor, Dictionary variables,
+                           UnicodeFilter filter) {
+     // The preceding context must match completely or not at all.
+     if (anteContext != null
+         && !regionMatches(text, start, limit, cursor - anteContext.length(),
+                           anteContext, variables, filter)) {
+         return MISMATCH;
+     }
+
+     // How much of the key matches at the cursor?
+     final int keyMatched = getRegionMatchLength(text, start, limit, cursor,
+                                                 key, variables, filter);
+     if (keyMatched < 0) {
+         return MISMATCH;
+     }
+     if (keyMatched < key.length()) {
+         return PARTIAL_MATCH;
+     }
+     if (postContext == null) {
+         return FULL_MATCH;
+     }
+
+     // The key matched fully; now measure the following context.
+     final int postMatched = getRegionMatchLength(text, start, limit,
+                                                  cursor + key.length(),
+                                                  postContext, variables, filter);
+     if (postMatched < 0) {
+         return MISMATCH;
+     }
+     if (postMatched == postContext.length()) {
+         return FULL_MATCH;
+     }
+     return PARTIAL_MATCH;
+ }
+
+ /**
+ * Return true if a template matches the text. The entire length of the
+ * template is compared to the text at the cursor. As in
+ * <code>matches()</code>, the text being matched occupies a virtual buffer
+ * consisting of the contents of <code>result</code> concatenated to a
+ * substring of <code>text</code>. See <code>matches()</code> for details.
+ * @param text the untranslated text
+ * @param start the beginning index, inclusive; <code>0 &lt;= start
+ * &lt;= limit</code>.
+ * @param limit the ending index, exclusive; <code>start &lt;= limit
+ * &lt;= text.length()</code>.
+ * @param result translated text so far
+ * @param cursor position at which to translate next, an offset into result.
+ * If greater than or equal to result.length(), represents offset start +
+ * cursor - result.length() into text.
+ * @param template the text to match against. All characters must match.
+ * @param variables a dictionary of variables mapping <code>Character</code>
+ * to <code>UnicodeSet</code>
+ * @param filter the filter. Any character for which
+ * <code>filter.isIn()</code> returns <code>false</code> will not be
+ * altered by this transliterator. If <code>filter</code> is
+ * <code>null</code> then no filtering is applied.
+ * @return true if there is a match
+ */
+ protected static boolean regionMatches(String text, int start, int limit,
+ StringBuffer result, int cursor,
+ String template,
+ Dictionary variables,
+ UnicodeFilter filter) {
+ int rlen = result.length();
+ // No match if the cursor is negative or the template would run past
+ // the end of the virtual buffer (result followed by text[start, limit)).
+ if (cursor < 0
+ || (cursor + template.length()) > (rlen + limit - start)) {
+ return false;
+ }
+ // NOTE(review): the rest of this method and the opening of the next
+ // method's javadoc are truncated on the following line in this copy of
+ // the patch (text between '<' and '>' was lost). Restore from the
+ // original source before use.
+ for (int i=0; i0 <= start
+ * <= limit.
+ * @param limit the ending index, exclusive; <code>start &lt;= limit
+ * &lt;= text.length()</code>.
+ * @param cursor position at which to translate next, representing offset
+ * into text. This value must be between <code>start</code> and
+ * <code>limit</code>.
+ * @param template the text to match against. All characters must match.
+ * @param variables a dictionary of variables mapping <code>Character</code>
+ * to <code>UnicodeSet</code>
+ * @param filter the filter. Any character for which
+ * <code>filter.isIn()</code> returns <code>false</code> will not be
+ * altered by this transliterator. If <code>filter</code> is
+ * <code>null</code> then no filtering is applied.
+ * @return true if there is a match
+ */
+ protected static boolean regionMatches(Replaceable text, int start, int limit,
+ int cursor,
+ String template, Dictionary variables,
+ UnicodeFilter filter) {
+ // No match if the cursor is before start or the template would run
+ // past limit.
+ if (cursor < start
+ || (cursor + template.length()) > limit) {
+ return false;
+ }
+ // NOTE(review): the rest of this method and the opening of the next
+ // method's javadoc are truncated on the following line in this copy of
+ // the patch (text between '<' and '>' was lost). Restore from the
+ // original source before use.
+ for (int i=0; i0 <= start
+ * <= limit.
+ * @param limit the ending index, exclusive; <code>start &lt;= limit
+ * &lt;= text.length()</code>.
+ * @param cursor position at which to translate next, representing offset
+ * into text. This value must be between <code>start</code> and
+ * <code>limit</code>.
+ * @param template the text to match against. All characters must match.
+ * @param variables a dictionary of variables mapping <code>Character</code>
+ * to <code>UnicodeSet</code>
+ * @param filter the filter. Any character for which
+ * <code>filter.isIn()</code> returns <code>false</code> will not be
+ * altered by this transliterator. If <code>filter</code> is
+ * <code>null</code> then no filtering is applied.
+ * @return -1 if there is a mismatch, 0 if the text is not long enough to
+ * match any characters, otherwise the number of characters of text that
+ * match this rule.
+ */
+ protected static int getRegionMatchLength(Replaceable text, int start,
+ int limit, int cursor,
+ String template,
+ Dictionary variables,
+ UnicodeFilter filter) {
+ // A cursor before start is treated as a mismatch.
+ if (cursor < start) {
+ return -1;
+ }
+ int i;
+ // NOTE(review): the rest of this method and the opening of the next
+ // method's javadoc are truncated on the following line in this copy of
+ // the patch (text between '<' and '>' was lost). Restore from the
+ // original source before use.
+ for (i=0; iCharacter
+ /**
+ * Return true if the given key (pattern) character matches the given text
+ * character. If <code>keyChar</code> is mapped to a
+ * <code>UnicodeSet</code> in <code>variables</code>, the text character
+ * matches when that set contains it; otherwise the two characters must be
+ * equal. A text character rejected by the filter never matches.
+ * @param keyChar a character of the match key or context
+ * @param textChar a character of the text being matched
+ * @param variables a dictionary of variables mapping <code>Character</code>
+ * to <code>UnicodeSet</code>
+ * @param filter the filter. Any character for which
+ * <code>filter.isIn()</code> returns <code>false</code> will not be
+ * altered by this transliterator. If <code>filter</code> is
+ * <code>null</code> then no filtering is applied.
+ * @return true if <code>textChar</code> passes the filter and matches
+ * <code>keyChar</code>
+ */
+ protected static boolean charMatches(char keyChar, char textChar,
+                                      Dictionary variables, UnicodeFilter filter) {
+     // Test the filter on its own. The previous single expression
+     // "filterOk && set == null ? a : b" parsed as
+     // "(filterOk && set == null) ? a : b", because ?: binds looser than
+     // &&. A rejected character therefore skipped the lookup and then
+     // dereferenced a null set instead of returning false.
+     if (filter != null && !filter.isIn(textChar)) {
+         return false;
+     }
+     // A key character defined as a variable stands for a set of
+     // characters; any other key character must match literally.
+     UnicodeSet set = (UnicodeSet) variables.get(new Character(keyChar));
+     return (set == null) ? keyChar == textChar : set.contains(textChar);
+ }
+
+ /**
+ * Escape non-ASCII characters as Unicode.
+ * @param s the string to escape
+ * @return the contents of the buffer built from <code>s</code>, in which
+ * escaped characters appear as a backslash, the letter u and a hexadecimal
+ * value left-padded with zeros to four digits
+ */
+ public static final String escape(String s) {
+ StringBuffer buf = new StringBuffer();
+ // NOTE(review): the loop header and the start of the condition on the
+ // next line are truncated in this copy of the patch (text between '<'
+ // and '>' was lost). Restore from the original source before use.
+ for (int i=0; i= ' ' && c <= 0x007F) {
+ buf.append(c);
+ } else {
+ buf.append("\\u");
+ // Integer.toHexString() does not pad, so add leading zeros to
+ // reach four hex digits.
+ if (c < 0x1000) {
+ buf.append('0');
+ if (c < 0x100) {
+ buf.append('0');
+ if (c < 0x10) {
+ buf.append('0');
+ }
+ }
+ }
+ buf.append(Integer.toHexString(c));
+ }
+ }
+ return buf.toString();
+ }
+}
diff --git a/icu4j/src/com/ibm/text/TransliterationRuleSet.java b/icu4j/src/com/ibm/text/TransliterationRuleSet.java
new file mode 100755
index 00000000000..d57bf75464a
--- /dev/null
+++ b/icu4j/src/com/ibm/text/TransliterationRuleSet.java
@@ -0,0 +1,218 @@
+package com.ibm.text;
+
+import java.util.*;
+
+/**
+ * A set of rules for a RuleBasedTransliterator
. This set encodes
+ * the transliteration in one direction from one set of characters or short
+ * strings to another. A RuleBasedTransliterator
consists of up to
+ * two such sets, one for the forward direction, and one for the reverse.
+ *
+ * A TransliterationRuleSet
has one important operation, that of
+ * finding a matching rule at a given point in the text. This is accomplished
+ * by the findMatch()
method.
+ *
+ *
Copyright © IBM Corporation 1999. All rights reserved.
+ *
+ * @author Alan Liu
+ * @version $RCSfile: TransliterationRuleSet.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
+ */
+class TransliterationRuleSet {
+ /* Note: There was an old implementation that indexed by first letter of
+ * key. Problem with this is that key may not have a meaningful first
+ * letter; e.g., {Lu}>*. One solution is to keep a separate vector of all
+ * rules whose initial key letter is a category variable. However, the
+ * problem is that they must be kept in order with respect to other rules.
+ * One solution -- add a sequence number to each rule. Do the usual
+ * first-letter lookup, and also a lookup from the spare bin with rules like
+ * {Lu}>*. Take the lower sequence number. This seems complex and not
+ * worth the trouble, but we may revisit this later. For documentation (or
+ * possible resurrection) the old code is included below, commented out
+ * with the remark "// OLD INDEXED IMPLEMENTATION". Under the old
+ * implementation, rules
is a Hashtable, not a Vector.
+ */
+
+ /**
+ * Vector of rules, in the order added.
+ */
+ private Vector rules;
+
+ /**
+ * Length of the longest preceding context
+ */
+ private int maxContextLength;
+
+ private static final String COPYRIGHT =
+ "\u00A9 IBM Corporation 1999. All rights reserved.";
+
+ /**
+ * Construct a new empty rule set.
+ */
+ public TransliterationRuleSet() {
+     // An empty set has no rules and hence no preceding context.
+     this.maxContextLength = 0;
+     this.rules = new Vector();
+ }
+
+ /**
+ * Return the maximum context length.
+ * @return the length of the longest preceding context.
+ */
+ public int getMaximumContextLength() {
+     // Plain accessor for the running maximum kept in maxContextLength.
+     return this.maxContextLength;
+ }
+
+ /**
+ * Add a rule to this set. Rules are added in order, and order is
+ * significant.
+ *
+ *
Once freeze() is called, this method must not be called.
+ * @param rule the rule to add
+ */
+ public void addRule(TransliterationRule rule) {
+
+ // Build time, no checking : 3562 ms
+ // Build time, with checking: 6234 ms
+
+ for (int i=0; i maxContextLength) {
+ maxContextLength = len;
+ }
+ }
+
+ /**
+ * Free up space. Once this method is called, addRule() must NOT
+ * be called again.
+ */
+ public void freeze() {
+ for (int i=0; iresult concatenated to a substring of text
.
+ * The substring is specified by start
and limit
.
+ * The value of cursor
is an index into this virtual buffer,
+ * from 0 to the length of the buffer. In terms of the parameters,
+ * cursor
must be between 0 and result.length() + limit -
+ * start
.
+ * @param text the untranslated text
+ * @param start the beginning index, inclusive; 0 <= start
+ * <= limit
.
+ * @param limit the ending index, exclusive; start <= limit
+ * <= text.length()
.
+ * @param result translated text
+ * @param cursor position at which to translate next, an offset into result.
+ * If greater than or equal to result.length(), represents offset start +
+ * cursor - result.length() into text.
+ * @param variables a dictionary mapping variables to the sets they
+ * represent (maps Character
to UnicodeSet
)
+ * @param filter the filter. Any character for which
+ * filter.isIn() returns false will not be
+ * altered by this transliterator. If filter is
+ * null then no filtering is applied.
+ * @return the matching rule, or null if none found.
+ */
+ public TransliterationRule findMatch(String text, int start, int limit,
+         StringBuffer result, int cursor,
+         Dictionary variables,
+         UnicodeFilter filter) {
+     // Walk the rules in enumeration order; the first rule that matches wins.
+     Enumeration candidates = rules.elements();
+     while (candidates.hasMoreElements()) {
+         TransliterationRule candidate =
+             (TransliterationRule) candidates.nextElement();
+         boolean hit = candidate.matches(text, start, limit, result, cursor,
+                                         variables, filter);
+         if (hit) {
+             return candidate;
+         }
+     }
+     // No rule matched.
+     return null;
+ }
+
+ /**
+ * Attempt to find a matching rule at the specified point in the text.
+ * @param text the text, both translated and untranslated
+ * @param start the beginning index, inclusive; 0 <= start
+ * <= limit
.
+ * @param limit the ending index, exclusive; start <= limit
+ * <= text.length()
.
+ * @param cursor position at which to translate next, representing offset
+ * into text. This value must be between start
and
+ * limit
.
+ * @param variables a dictionary mapping variables to the sets they
+ * represent (maps Character
to UnicodeSet
)
+ * @param filter the filter. Any character for which
+ * filter.isIn() returns false will not be
+ * altered by this transliterator. If filter is
+ * null then no filtering is applied.
+ * @return the matching rule, or null if none found.
+ */
+ public TransliterationRule findMatch(Replaceable text, int start, int limit,
+         int cursor,
+         Dictionary variables,
+         UnicodeFilter filter) {
+     // Walk the rules in enumeration order and stop at the first match.
+     Enumeration remaining = rules.elements();
+     while (remaining.hasMoreElements()) {
+         TransliterationRule current =
+             (TransliterationRule) remaining.nextElement();
+         if (!current.matches(text, start, limit, cursor, variables, filter)) {
+             continue;
+         }
+         return current;
+     }
+     // Every rule was tried and none matched.
+     return null;
+ }
+
+ /**
+ * Attempt to find a matching rule at the specified point in the text.
+ * Unlike findMatch()
, this method does an incremental match.
+ * An incremental match requires that there be no partial matches that might
+ * pre-empt the full match that is found. If there are partial matches,
+ * then null is returned. A non-null result indicates that a full match has
+ * been found, and that it cannot be pre-empted by a partial match
+ * regardless of what additional text is added to the translation buffer.
+ * @param text the text, both translated and untranslated
+ * @param start the beginning index, inclusive; 0 <= start
+ * <= limit
.
+ * @param limit the ending index, exclusive; start <= limit
+ * <= text.length()
.
+ * @param cursor position at which to translate next, representing offset
+ * into text. This value must be between start
and
+ * limit
.
+ * @param variables a dictionary mapping variables to the sets they
+ * represent (maps Character
to UnicodeSet
)
+ * @param partial output parameter. partial[0] is set to true if a
+ * partial match is found (in which case null is returned), and to
+ * false otherwise.
+ * @param filter the filter. Any character for which
+ * filter.isIn() returns false will not be
+ * altered by this transliterator. If filter is
+ * null then no filtering is applied.
+ * @return the matching rule, or null if none found, or if the text buffer
+ * does not have enough text yet to unambiguously match a rule.
+ */
+ public TransliterationRule findIncrementalMatch(Replaceable text, int start,
+         int limit, int cursor,
+         Dictionary variables,
+         boolean partial[],
+         UnicodeFilter filter) {
+     // Assume no partial match until a rule reports one.
+     partial[0] = false;
+     Enumeration candidates = rules.elements();
+     while (candidates.hasMoreElements()) {
+         TransliterationRule candidate =
+             (TransliterationRule) candidates.nextElement();
+         int degree = candidate.getMatchDegree(text, start, limit, cursor,
+                                               variables, filter);
+         if (degree == TransliterationRule.FULL_MATCH) {
+             // A full match reached before any partial match is returned.
+             return candidate;
+         }
+         if (degree == TransliterationRule.PARTIAL_MATCH) {
+             // A partial match stops the search: flag it and report
+             // "no rule" to the caller.
+             partial[0] = true;
+             return null;
+         }
+         // Any other degree: keep looking.
+     }
+     return null;
+ }
+}
diff --git a/icu4j/src/com/ibm/text/Transliterator.java b/icu4j/src/com/ibm/text/Transliterator.java
new file mode 100755
index 00000000000..83171a961e7
--- /dev/null
+++ b/icu4j/src/com/ibm/text/Transliterator.java
@@ -0,0 +1,860 @@
+package com.ibm.text;
+
+import java.util.*;
+import java.text.MessageFormat;
+
+/**
+ * Transliterator
is an abstract class that
+ * transliterates text from one format to another. The most common
+ * kind of transliterator is a script, or alphabet, transliterator.
+ * For example, a Russian to Latin transliterator changes Russian text
+ * written in Cyrillic characters to phonetically equivalent Latin
+ * characters. It does not translate Russian to English!
+ * Transliteration, unlike translation, operates on characters, without
+ * reference to the meanings of words and sentences.
+ *
+ * Although script conversion is its most common use, a
+ * transliterator can actually perform a more general class of tasks.
+ * In fact, Transliterator
defines a very general API
+ * which specifies only that a segment of the input text is replaced
+ * by new text. The particulars of this conversion are determined
+ * entirely by subclasses of Transliterator
.
+ *
+ *
Transliterators are stateless
+ *
+ *
Transliterator
objects are stateless; they
+ * retain no information between calls to
+ * transliterate()
. As a result, threads may share
+ * transliterators without synchronizing them. This might seem to
+ * limit the complexity of the transliteration operation. In
+ * practice, subclasses perform complex transliterations by delaying
+ * the replacement of text until it is known that no other
+ * replacements are possible. In other words, although the
+ * Transliterator
objects are stateless, the source text
+ * itself embodies all the needed information, and delayed operation
+ * allows arbitrary complexity.
+ *
+ *
Batch transliteration
+ *
+ *
The simplest way to perform transliteration is all at once, on a
+ * string of existing text. This is referred to as batch
+ * transliteration. For example, given a string input
+ * and a transliterator t
, the call
+ *
+ *
String result = t.transliterate(input);
+ *
+ *
+ * will transliterate it and return the result. Other methods allow
+ * the client to specify a substring to be transliterated and to use
+ * {@link Replaceable} objects instead of strings, in order to
+ * preserve out-of-band information (such as text styles).
+ *
+ * Keyboard transliteration
+ *
+ *
Somewhat more involved is keyboard, or incremental
+ * transliteration. This is the transliteration of text that is
+ * arriving from some source (typically the user's keyboard) one
+ * character at a time, or in some other piecemeal fashion.
+ *
+ *
In keyboard transliteration, a Replaceable
buffer
+ * stores the text. As text is inserted, as much as possible is
+ * transliterated on the fly. This means a GUI that displays the
+ * contents of the buffer may show text being modified as each new
+ * character arrives.
+ *
+ *
Consider the simple RuleBasedTransliterator
:
+ *
+ *
+ * th>{theta}
+ * t>{tau}
+ *
+ *
+ * When the user types 't', nothing will happen, since the
+ * transliterator is waiting to see if the next character is 'h'. To
+ * remedy this, we introduce the notion of a cursor, marked by a '|'
+ * in the output string:
+ *
+ *
+ * t>|{tau}
+ * {tau}h>{theta}
+ *
+ *
+ * Now when the user types 't', tau appears, and if the next character
+ * is 'h', the tau changes to a theta. This is accomplished by
+ * maintaining a cursor position (independent of the insertion point,
+ * and invisible in the GUI) across calls to
+ * keyboardTransliterate()
. Typically, the cursor will
+ * be coincident with the insertion point, but in a case like the one
+ * above, it will precede the insertion point.
+ *
+ * Keyboard transliteration methods maintain a set of three indices
+ * that are updated with each call to
+ * keyboardTransliterate()
, including the cursor, start,
+ * and limit. Since these indices are changed by the method, they are
+ * passed in an int[]
array. The START
index
+ * marks the beginning of the substring that the transliterator will
+ * look at. It is advanced as text becomes committed (but it is not
+ * the committed index; that's the CURSOR
). The
+ * CURSOR
index, described above, marks the point at
+ * which the transliterator last stopped, either because it reached
+ * the end, or because it required more characters to disambiguate
+ * between possible inputs. The CURSOR
can also be
+ * explicitly set by rules in a RuleBasedTransliterator
.
+ * Any characters before the CURSOR
index are frozen;
+ * future keyboard transliteration calls within this input sequence
+ * will not change them. New text is inserted at the
+ * LIMIT
index, which marks the end of the substring that
+ * the transliterator looks at.
+ *
+ *
Because keyboard transliteration assumes that more characters
+ * are to arrive, it is conservative in its operation. It only
+ * transliterates when it can do so unambiguously. Otherwise it waits
+ * for more characters to arrive. When the client code knows that no
+ * more characters are forthcoming, perhaps because the user has
+ * performed some input termination operation, then it should call
+ * finishKeyboardTransliteration()
to complete any
+ * pending transliterations.
+ *
+ *
Inverses
+ *
+ *
Pairs of transliterators may be inverses of one another. For
+ * example, if transliterator A transliterates characters by
+ * incrementing their Unicode value (so "abc" -> "def"), and
+ * transliterator B decrements character values, then A
+ * is an inverse of B and vice versa. If we compose A
+ * with B in a compound transliterator, the result is the
+ * identity transliterator, that is, a transliterator that does not
+ * change its input text.
+ *
+ * The Transliterator
method getInverse()
+ * returns a transliterator's inverse, if one exists, or
+ * null
otherwise. However, the result of
+ * getInverse()
usually will not be a true
+ * mathematical inverse. This is because true inverse transliterators
+ * are difficult to formulate. For example, consider two
+ * transliterators: AB, which transliterates the character 'A'
+ * to 'B', and BA, which transliterates 'B' to 'A'. It might
+ * seem that these are exact inverses, since
+ *
+ *
"A" x AB -> "B"
+ * "B" x BA -> "A"
+ *
+ * where 'x' represents transliteration. However,
+ *
+ * "ABCD" x AB -> "BBCD"
+ * "BBCD" x BA -> "AACD"
+ *
+ * so AB composed with BA is not the
+ * identity. Nonetheless, BA may be usefully considered to be
+ * AB's inverse, and it is on this basis that
+ * AB.getInverse()
could legitimately return
+ * BA.
+ *
+ * IDs and display names
+ *
+ *
A transliterator is designated by a short identifier string or
+ * ID. IDs follow the format source-destination,
+ * where source describes the entity being replaced, and
+ * destination describes the entity replacing
+ * source. The entities may be the names of scripts,
+ * particular sequences of characters, or whatever else it is that the
+ * transliterator converts to or from. For example, a transliterator
+ * from Russian to Latin might be named "Russian-Latin". A
+ * transliterator from keyboard escape sequences to Latin-1 characters
+ * might be named "KeyboardEscape-Latin1". By convention, system
+ * entity names are in English, with the initial letters of words
+ * capitalized; user entity names may follow any format so long as
+ * they do not contain dashes.
+ *
+ *
In addition to programmatic IDs, transliterator objects have
+ * display names for presentation in user interfaces, returned by
+ * {@link #getDisplayName}.
+ *
+ *
Factory methods and registration
+ *
+ *
In general, client code should use the factory method
+ * getInstance()
to obtain an instance of a
+ * transliterator given its ID. Valid IDs may be enumerated using
+ * getAvailableIDs()
. Since transliterators are
+ * stateless, multiple calls to getInstance()
with the
+ * same ID will return the same object.
+ *
+ *
In addition to the system transliterators registered at startup,
+ * user transliterators may be registered by calling
+ * registerInstance()
at run time. To register a
+ * transliterator subclass without instantiating it (until it is
+ * needed), users may call registerClass()
.
+ *
+ *
Subclassing
+ *
+ *
Subclasses must implement the abstract
+ * transliterate()
method. They should also override the
+ * transliterate()
method taking a String
+ * and StringBuffer
if the performance of these methods
+ * can be improved over the performance obtained by the default
+ * implementations in this class. Subclasses must also implement
+ * handleKeyboardTransliterate()
.
+ *
+ *
Copyright © IBM Corporation 1999. All rights reserved.
+ *
+ * @author Alan Liu
+ * @version $RCSfile: Transliterator.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
+ */
+public abstract class Transliterator {
+ /**
+ * In the keyboardTransliterate()
+ * index[]
array, the beginning index, inclusive
+ * @see #keyboardTransliterate
+ */
+ public static final int START = 0;
+
+ /**
+ * In the keyboardTransliterate()
+ * index[]
array, the ending index, exclusive
+ * @see #keyboardTransliterate
+ */
+ public static final int LIMIT = 1;
+
+ /**
+ * In the keyboardTransliterate()
+ * index[]
array, the next character to be considered
+ * for transliteration
+ * @see #keyboardTransliterate
+ */
+ public static final int CURSOR = 2;
+
+ /**
+ * Programmatic name, e.g., "Latin-Arabic".
+ */
+ private String ID;
+
+ /**
+ * This transliterator's filter. Any character for which
+ * filter.isIn() returns false will not be
+ * altered by this transliterator. If filter is
+ * null then no filtering is applied.
+ */
+ private UnicodeFilter filter;
+
+ /**
+ * Dictionary of known transliterators. Keys are String
+ * names, values are one of the following:
+ *
+ *
Transliterator
objects
+ *
+ * Class
objects. Such objects must represent
+ * subclasses of Transliterator
, and must satisfy the
+ * constraints described in registerClass()
+ *
+ * RULE_BASED_PLACEHOLDER
, in which case the ID
+ * will have its first '-' removed and be appended to
+ * RB_RULE_BASED_PREFIX to form a resource bundle name from which
+ * the RB_RULE key is looked up to obtain the rule.
+ *
+ * REVERSE_RULE_BASED_PLACEHOLDER
. Like
+ * RULE_BASED_PLACEHOLDER
, except the entity names in
+ * the ID are reversed, and the argument
+ * RuleBasedTransliterator.REVERSE is passed to the
+ * RuleBasedTransliterator constructor.
+ *
+ */
+ private static Hashtable cache;
+
+ /**
+ * Internal object used to stand for instances of
+ * RuleBasedTransliterator
that have not been
+ * constructed yet in the cache
. When a
+ * getInstance()
call retrieves this object, it is
+ * replaced by the actual RuleBasedTransliterator
.
+ * This allows Transliterator
to delay instantiation
+ * of such transliterators until they are needed.
+ */
+ private static final Object RULE_BASED_PLACEHOLDER = new Object();
+
+ /**
+ * Internal object used to stand for instances of
+ * RuleBasedTransliterator
that have not been
+ * constructed yet in the cache
. These instances are
+ * constructed with an argument
+ * RuleBasedTransliterator.REVERSE
.
+ */
+ private static final Object REVERSE_RULE_BASED_PLACEHOLDER = new Object();
+
+ /**
+ * Prefix for resource bundle key for the display name for a
+ * transliterator. The ID is appended to this to form the key.
+ * The resource bundle value should be a String.
+ */
+ private static final String RB_DISPLAY_NAME_PREFIX = "T:";
+
+ /**
+ * Resource bundle key for display name pattern.
+ * The resource bundle value should be a String forming a
+ * MessageFormat pattern, e.g.:
+ * "{0,choice,0#|1#{1} Transliterator|2#{1} to {2} Transliterator}".
+ */
+ private static final String RB_DISPLAY_NAME_PATTERN = "TransliteratorNamePattern";
+
+ /**
+ * Resource bundle key for the list of RuleBasedTransliterator IDs.
+ * The resource bundle value should be a String[] with each element
+ * being a valid ID. The ID will be appended to RB_RULE_BASED_PREFIX
+ * to obtain the class name in which the RB_RULE key will be sought.
+ */
+ private static final String RB_RULE_BASED_IDS = "RuleBasedTransliteratorIDs";
+
+ /**
+ * Resource bundle containing display name keys and the
+ * RB_RULE_BASED_IDS array.
+ *
+ * If we ever integrate this with the Sun JDK, the resource bundle
+ * root will change to java.text.resources.LocaleElements
+ */
+ private static final String RB_LOCALE_ELEMENTS =
+ "com.ibm.text.resources.LocaleElements";
+
+ /**
+ * Prefix for resource bundle containing RuleBasedTransliterator
+ * RB_RULE string. The ID is munged to remove the first '-' then appended
+ * to this String to obtain the class name.
+ */
+ private static final String RB_RULE_BASED_PREFIX =
+ "com.ibm.text.resources.TransliterationRule";
+
+ /**
+ * Resource bundle key for the RuleBasedTransliterator rule.
+ */
+ private static final String RB_RULE = "Rule";
+
+ private static final String COPYRIGHT =
+ "\u00A9 IBM Corporation 1999. All rights reserved.";
+
+ /**
+ * Default constructor.
+ * @param ID the string identifier for this transliterator
+ * @param filter the filter. Any character for which
+ * filter.isIn() returns false will not be
+ * altered by this transliterator. If filter is
+ * null then no filtering is applied.
+ */
+ protected Transliterator(String ID, UnicodeFilter filter) {
+     // The identifier is mandatory; the filter is stored as given.
+     if (null == ID) {
+         throw new NullPointerException();
+     }
+     this.filter = filter;
+     this.ID = ID;
+ }
+
+ /**
+ * Transliterates the segment of a string that begins at the
+ * character at offset start
and extends to the
+ * character at offset limit - 1
, with optional
+ * filtering. A default implementation is provided here;
+ * subclasses should provide a more efficient implementation if
+ * possible.
+ * @param text the string to be transliterated
+ * @param start the beginning index, inclusive; 0 <= start
+ * <= limit
.
+ * @param limit the ending index, exclusive; start <= limit
+ * <= text.length()
.
+ * @param result buffer to receive the transliterated text; previous
+ * contents are discarded
+ */
+ public void transliterate(String text, int start, int limit,
+         StringBuffer result) {
+     // Generic fallback: load the requested segment into the caller's
+     // buffer, then run the Replaceable-based transliteration over the
+     // whole buffer.  Subclasses may replace this with something faster.
+     // NOTE(review): presumably ReplaceableString edits the StringBuffer it
+     // is given rather than a copy -- confirm against ReplaceableString.
+     result.setLength(0);
+     String segment = text.substring(start, limit);
+     result.append(segment);
+     ReplaceableString wrapped = new ReplaceableString(result);
+     transliterate(wrapped, 0, result.length());
+ }
+
+ /**
+ * Transliterates a segment of a string, with optional filtering.
+ * Subclasses must override this abstract method.
+ *
+ * @param text the string to be transliterated
+ * @param start the beginning index, inclusive; 0 <= start
+ * <= limit
.
+ * @param limit the ending index, exclusive; start <= limit
+ * <= text.length()
.
+ * @param filter the filter. Any character for which
+ * filter.isIn() returns false will not be
+ * altered by this transliterator. If filter is
+ * null then no filtering is applied.
+ * @return The new limit index. The text previously occupying [start,
+ * limit)
has been transliterated, possibly to a string of a different
+ * length, at [start,
new-limit)
, where
+ * new-limit is the return value.
+ */
+ public abstract int transliterate(Replaceable text, int start, int limit); // NOTE(review): the preceding Javadoc documents a "filter" parameter that this signature does not declare
+
+ /**
+ * Transliterates an entire string. Convenience method.
+ * @param text the string to be transliterated
+ * @param result buffer to receive the transliterated text; previous
+ * contents are discarded
+ */
+ public final void transliterate(String text, StringBuffer result) {
+     // Whole-string convenience form: the segment is [0, text.length()).
+     final int end = text.length();
+     transliterate(text, 0, end, result);
+ }
+
+ /**
+ * Transliterate an entire string and returns the result. Convenience method.
+ *
+ * @param text the string to be transliterated
+ * @return The transliterated text
+ */
+ public final String transliterate(String text) {
+     // Transliterate the whole string into a scratch buffer and hand back
+     // the buffer's contents.
+     StringBuffer scratch = new StringBuffer();
+     final int end = text.length();
+     transliterate(text, 0, end, scratch);
+     String transliterated = scratch.toString();
+     return transliterated;
+ }
+
+ /**
+ * Transliterates an entire string in place. Convenience method.
+ * @param text the string to be transliterated
+ */
+ public final void transliterate(Replaceable text) {
+     // In-place, whole-buffer form; the new limit returned by the
+     // three-argument method is not needed here.
+     final int end = text.length();
+     transliterate(text, 0, end);
+ }
+
+ /**
+ * Transliterates the portion of the text buffer that can be
+ * transliterated unambiguously after new text has been inserted,
+ * typically as a result of a keyboard event. The new text in
+ * insertion
will be inserted into text
+ * at index[LIMIT]
, advancing
+ * index[LIMIT]
by insertion.length()
.
+ * Then the transliterator will try to transliterate characters of
+ * text
between index[CURSOR]
and
+ * index[LIMIT]
. Characters before
+ * index[CURSOR]
will not be changed.
+ *
+ *
Upon return, values in index[]
will be updated.
+ * index[START]
will be advanced to the first
+ * character that future calls to this method will read.
+ * index[CURSOR]
and index[LIMIT]
will
+ * be adjusted to delimit the range of text that future calls to
+ * this method may change.
+ *
+ *
Typical usage of this method begins with an initial call
+ * with index[START]
and index[LIMIT]
+ * set to indicate the portion of text
to be
+ * transliterated, and index[CURSOR] == index[START]
.
+ * Thereafter, index[]
can be used without
+ * modification in future calls, provided that all changes to
+ * text
are made via this method.
+ *
+ *
This method assumes that future calls may be made that will
+ * insert new text into the buffer. As a result, it only performs
+ * unambiguous transliterations. After the last call to this
+ * method, there may be untransliterated text that is waiting for
+ * more input to resolve an ambiguity. In order to perform these
+ * pending transliterations, clients should call {@link
+ * #finishKeyboardTransliteration} after the last call to this
+ * method has been made.
+ *
+ * @param text the buffer holding transliterated and untransliterated text
+ * @param index an array of three integers.
+ *
+ *
index[START]
: the beginning index,
+ * inclusive; 0 <= index[START] <= index[LIMIT]
.
+ *
+ * index[LIMIT]
: the ending index, exclusive;
+ * index[START] <= index[LIMIT] <= text.length()
.
+ * insertion
is inserted at
+ * index[LIMIT]
.
+ *
+ * index[CURSOR]
: the next character to be
+ * considered for transliteration; index[START] <=
+ * index[CURSOR] <= index[LIMIT]
. Characters before
+ * index[CURSOR]
will not be changed by future calls
+ * to this method.
+ *
+ * @param insertion text to be inserted and possibly
+ * transliterated into the translation buffer at
+ * index[LIMIT]
. If null
then no text
+ * is inserted.
+ * @see #START
+ * @see #LIMIT
+ * @see #CURSOR
+ * @see #handleKeyboardTransliterate
+ * @exception IllegalArgumentException if index[]
+ * is invalid
+ */
+ public final void keyboardTransliterate(Replaceable text, int[] index,
+         String insertion) {
+     // Validate the index triple: 0 <= START <= CURSOR <= LIMIT <= length.
+     boolean valid = index.length >= 3 &&
+         index[START] >= 0 &&
+         index[LIMIT] <= text.length() &&
+         index[CURSOR] >= index[START] &&
+         index[CURSOR] <= index[LIMIT];
+     if (!valid) {
+         throw new IllegalArgumentException("Invalid index array");
+     }
+
+     // Remember where the caller started; START is never moved before it.
+     final int initialStart = index[START];
+
+     // Append the new text at LIMIT and extend LIMIT to cover it.
+     if (insertion != null) {
+         text.replace(index[LIMIT], index[LIMIT], insertion);
+         index[LIMIT] += insertion.length();
+     }
+
+     // Let the subclass transliterate what it unambiguously can.
+     handleKeyboardTransliterate(text, index);
+
+     // Keep just enough text before the cursor to serve as preceding
+     // context on the next call.
+     int contextStart = index[CURSOR] - getMaximumContextLength();
+     index[START] = Math.max(contextStart, initialStart);
+ }
+
+ /**
+ * Transliterates the portion of the text buffer that can be
+ * transliterated unambiguously after a new character has been
+ * inserted, typically as a result of a keyboard event. This is a
+ * convenience method; see {@link
+ * #keyboardTransliterate(Replaceable, int[], String)} for details.
+ * @param text the buffer holding transliterated and
+ * untransliterated text
+ * @param index an array of three integers. See {@link
+ * #keyboardTransliterate(Replaceable, int[], String)}.
+ * @param insertion text to be inserted and possibly
+ * transliterated into the translation buffer at
+ * index[LIMIT]
.
+ * @see #keyboardTransliterate(Replaceable, int[], String)
+ */
+ public final void keyboardTransliterate(Replaceable text, int[] index,
+         char insertion) {
+     // Wrap the single character in a String and reuse the String form.
+     String asString = String.valueOf(insertion);
+     keyboardTransliterate(text, index, asString);
+ }
+
+ /**
+ * Transliterates the portion of the text buffer that can be
+ * transliterated unambiguously. This is a convenience method; see
+ * {@link #keyboardTransliterate(Replaceable, int[], String)} for
+ * details.
+ * @param text the buffer holding transliterated and
+ * untransliterated text
+ * @param index an array of three integers. See {@link
+ * #keyboardTransliterate(Replaceable, int[], String)}.
+ * @see #keyboardTransliterate(Replaceable, int[], String)
+ */
+ public final void keyboardTransliterate(Replaceable text, int[] index) {
+     // A null insertion tells the String form to insert nothing.
+     String nothingToInsert = null;
+     keyboardTransliterate(text, index, nothingToInsert);
+ }
+
+ /**
+ * Finishes any pending transliterations that were waiting for
+ * more characters. Clients should call this method as the last
+ * call after a sequence of one or more calls to
+ * keyboardTransliterate()
.
+ * @param text the buffer holding transliterated and
+ * untransliterated text.
+ * @param index the array of indices previously passed to {@link
+ * #keyboardTransliterate}
+ */
+ public final void finishKeyboardTransliteration(Replaceable text,
+         int[] index) {
+     // Run a batch transliteration over [START, LIMIT).
+     // NOTE(review): the new limit returned by transliterate() is discarded
+     // and index[] is left unchanged -- confirm that this is intended.
+     final int from = index[START];
+     final int to = index[LIMIT];
+     transliterate(text, from, to);
+ }
+
+ /**
+ * Abstract method that concrete subclasses define to implement
+ * keyboard transliteration. This method should transliterate all
+ * characters between index[CURSOR]
and
+ * index[LIMIT]
that can be unambiguously
+ * transliterated, regardless of future insertions of text at
+ * index[LIMIT]
. index[CURSOR]
should
+ * be advanced past committed characters (those that will not
+ * change in future calls to this method).
+ * index[LIMIT]
should be updated to reflect text
+ * replacements that shorten or lengthen the text between
+ * index[CURSOR]
and index[LIMIT]
. Upon
+ * return, neither index[CURSOR]
nor
+ * index[LIMIT]
should be less than the initial value
+ * of index[CURSOR]
. index[START]
+ * should not be changed.
+ *
+ * @param text the buffer holding transliterated and
+ * untransliterated text
+ * @param index an array of three integers. See {@link
+ * #keyboardTransliterate(Replaceable, int[], String)}.
+ * @see #keyboardTransliterate
+ */
+ protected abstract void handleKeyboardTransliterate(Replaceable text, // invoked by keyboardTransliterate(Replaceable, int[], String) after index[] is validated and any insertion is applied
+ int[] index); // that caller recomputes index[START] after this method returns
+
+ /**
+ * Returns the length of the longest context required by this transliterator.
+ * This is preceding context. The default implementation supplied
+ * by Transliterator
returns zero; subclasses
+ * that use preceding context should override this method to return the
+ * correct value. For example, if a transliterator translates "ddd" (where
+ * d is any digit) to "555" when preceded by "(ddd)", then the preceding
+ * context length is 5, the length of "(ddd)".
+ *
+ * @return The maximum number of preceding context characters this
+ * transliterator needs to examine
+ */
+ protected int getMaximumContextLength() { // keyboardTransliterate() subtracts this value from index[CURSOR] when recomputing index[START]
+ return 0; // default: no preceding context is needed
+ }
+
+ /**
+ * Returns a programmatic identifier for this transliterator.
+ * If this identifier is passed to getInstance()
, it
+ * will return this object, if it has been registered.
+ * @see #registerInstance
+ * @see #registerClass
+ * @see #getAvailableIDs
+ */
+ public final String getID() {
+     // The identifier is fixed at construction time and never null.
+     return this.ID;
+ }
+
+ /**
+ * Returns a name for this transliterator that is appropriate for
+ * display to the user in the default locale. See {@link
+ * #getDisplayName(Locale)} for details.
+ */
+ public final String getDisplayName() {
+     // Delegate to the locale-specific form using the default locale.
+     Locale defaultLocale = Locale.getDefault();
+     return getDisplayName(defaultLocale);
+ }
+
+ /**
+ * Returns a name for this transliterator that is appropriate for
+ * display to the user in the given locale. This name is taken
+ * from the locale resource data in the standard manner of the
+ * java.text
package.
+ *
+ * If no localized names exist in the system resource bundles,
+ * a name is synthesized using a localized
+ * MessageFormat
pattern from the resource data. The
+ * arguments to this pattern are an integer followed by one or two
+ * strings. The integer is the number of strings, either 1 or 2.
+ * The strings are formed by splitting the ID for this
+ * transliterator at the first '-'. If there is no '-', then the
+ * entire ID forms the only string.
+ * @param inLocale the Locale in which the display name should be
+ * localized.
+ * @see java.text.MessageFormat
+ */
+ public String getDisplayName(Locale inLocale) {
+     ResourceBundle bundle = ResourceBundle.getBundle(
+         RB_LOCALE_ELEMENTS, inLocale);
+
+     // First choice: an explicit localized name stored under the key
+     // RB_DISPLAY_NAME_PREFIX + ID.
+     try {
+         return bundle.getString(RB_DISPLAY_NAME_PREFIX + ID);
+     } catch (MissingResourceException ignored) {
+         // No explicit name for this ID; synthesize one from the pattern.
+     }
+
+     // Second choice: build a name from the localized MessageFormat
+     // pattern and the pieces of the ID.
+     try {
+         // Construct the formatter first; if getString() fails
+         // we'll exit the try block
+         MessageFormat format = new MessageFormat(
+             bundle.getString(RB_DISPLAY_NAME_PATTERN));
+         // Construct the argument array: the number of name parts,
+         // followed by the ID split at its first '-' (or the whole ID).
+         int i = ID.indexOf('-');
+         Object[] args = (i < 0)
+             ? new Object[] { new Integer(1), ID }
+             : new Object[] { new Integer(2), ID.substring(0, i),
+                              ID.substring(i+1) };
+         // Format it using the pattern in the resource
+         return format.format(args);
+     } catch (MissingResourceException e) {
+         // We should not reach this point unless there is something
+         // wrong with the build or the RB_DISPLAY_NAME_PATTERN has
+         // been deleted from the root RB_LOCALE_ELEMENTS resource.
+         // Say so explicitly instead of throwing a bare exception.
+         throw new RuntimeException(
+             "Cannot build a display name for transliterator \"" + ID +
+             "\": resource key \"" + RB_DISPLAY_NAME_PATTERN +
+             "\" not found in " + RB_LOCALE_ELEMENTS +
+             " (" + e.getMessage() + ")");
+     }
+ }
+
+ /**
+ * Returns the filter used by this transliterator, or null
+ * if this transliterator uses no filter.
+ *
+ * @return the current value of the {@code filter} field; may be null
+ */
+ public UnicodeFilter getFilter() {
+ return filter;
+ }
+
+ /**
+ * Changes the filter used by this transliterator. If the filter
+ * is set to null then no filtering will occur.
+ *
+ * Callers must take care if a transliterator is in use by
+ * multiple threads. The filter should not be changed by one
+ * thread while another thread may be transliterating.
+ *
+ * @param filter the new filter, or null to disable filtering
+ */
+ public void setFilter(UnicodeFilter filter) {
+ this.filter = filter;
+ }
+
+ /**
+ * Returns this transliterator's inverse. See the class
+ * documentation for details. This implementation swaps the two
+ * halves of the ID around its first '-' and looks up the
+ * transliterator registered under the swapped ID. That is, if
+ * {@code getID()} returns "A-B", the result is whatever is found
+ * for "B-A", or {@code null} if nothing is found or the ID
+ * contains no '-'.
+ *
+ * This method does not take filtering into account. The
+ * returned transliterator will have no filter.
+ *
+ * Subclasses with knowledge of their inverse may wish to
+ * override this method.
+ *
+ * @return a transliterator that is an inverse, not necessarily
+ * exact, of this transliterator, or {@code null} if no such
+ * transliterator is registered.
+ * @see #registerInstance
+ */
+ public Transliterator getInverse() {
+     int dash = ID.indexOf('-');
+     if (dash < 0) {
+         // A single-part ID has nothing to swap.
+         return null;
+     }
+     String source = ID.substring(0, dash);
+     String target = ID.substring(dash + 1);
+     return internalGetInstance(target + '-' + source);
+ }
+
+ /**
+ * Looks up a {@code Transliterator} by its ID.
+ * The ID must be either a system transliterator ID or an ID registered
+ * using {@code registerInstance()}.
+ *
+ * @param ID a valid ID, as enumerated by {@code getAvailableIDs()}
+ * @return a {@code Transliterator} object with the given ID; never null
+ * @exception IllegalArgumentException if the given ID is invalid.
+ * @see #registerInstance
+ * @see #getAvailableIDs
+ * @see #getID
+ */
+ public static Transliterator getInstance(String ID) {
+     Transliterator result = internalGetInstance(ID);
+     if (result == null) {
+         throw new IllegalArgumentException("Unsupported transliterator: "
+             + ID);
+     }
+     return result;
+ }
+
+ /**
+ * Returns a transliterator object given its ID. Unlike getInstance(),
+ * this method returns null if it cannot make use of the given ID.
+ *
+ * The cache entry for the ID decides what happens: a cached
+ * {@code RuleBasedTransliterator.Data} is wrapped in a new
+ * transliterator; a {@code Class} is instantiated through its no-argument
+ * constructor; anything else (including no entry at all) triggers an
+ * attempt to load and parse rules from a resource bundle whose name is
+ * derived from the ID.
+ *
+ * @param ID the transliterator ID to look up
+ * @return a transliterator for the ID, or null if none could be created
+ */
+ private static Transliterator internalGetInstance(String ID) {
+ Object obj = cache.get(ID);
+ RuleBasedTransliterator.Data data = null;
+
+ if (obj instanceof RuleBasedTransliterator.Data) {
+ data = (RuleBasedTransliterator.Data) obj;
+ // Fall through to construct transliterator from cached Data object.
+ } else if (obj instanceof Class) {
+ try {
+ return (Transliterator) ((Class) obj).newInstance();
+ } catch (InstantiationException e) {
+ // Ignored: data stays null, so the method returns null below.
+ } catch (IllegalAccessException e2) {}
+ } else {
+ synchronized (cache) {
+ // A reverse placeholder means the rules are parsed in the
+ // reverse direction from the bundle named for the opposite ID.
+ boolean isReverse = (obj == REVERSE_RULE_BASED_PLACEHOLDER);
+ String resourceName = RB_RULE_BASED_PREFIX;
+ int i = ID.indexOf('-');
+ if (i < 0) {
+ resourceName += ID;
+ } else {
+ // Drop the '-' and, for reverse lookups, swap the two halves.
+ String IDLeft = ID.substring(0, i);
+ String IDRight = ID.substring(i+1);
+ resourceName += isReverse ? (IDRight + IDLeft)
+ : (IDLeft + IDRight);
+ }
+ try {
+ ResourceBundle resource = ResourceBundle.getBundle(resourceName);
+
+ data = RuleBasedTransliterator.parse(resource.getString(RB_RULE),
+ isReverse
+ ? RuleBasedTransliterator.REVERSE
+ : RuleBasedTransliterator.FORWARD);
+
+ // Replace whatever was cached under this ID with the parsed data
+ // so later lookups take the first branch.
+ cache.put(ID, data);
+ // Fall through to construct transliterator from Data object.
+ // A missing bundle or rule leaves data null; the method then returns null.
+ } catch (MissingResourceException e) {}
+ }
+ }
+
+ if (data != null) {
+ return new RuleBasedTransliterator(ID, data, null);
+ }
+
+ return null;
+ }
+
+ /**
+ * Registers a subclass of {@code Transliterator} with the
+ * system. This subclass must have a public constructor taking no
+ * arguments. When that constructor is called, the resulting
+ * object must return the {@code ID} passed to this method if
+ * its {@code getID()} method is called.
+ *
+ * The class object is stored in the cache under the given ID,
+ * replacing any entry already stored there.
+ *
+ * @param ID the result of {@code getID()} for this
+ * transliterator
+ * @param transClass a subclass of {@code Transliterator}
+ * @see #registerInstance
+ * @see #unregister
+ */
+ public static void registerClass(String ID, Class transClass) {
+ cache.put(ID, transClass);
+ }
+
+ /**
+ * Unregisters a transliterator or class. This may be either
+ * a system transliterator or a user transliterator or class.
+ *
+ * @param ID the ID of the transliterator or class
+ * @return the {@code Object} that was registered with
+ * {@code ID}, or {@code null} if none was
+ * @see #registerInstance
+ * @see #registerClass
+ */
+ public static Object unregister(String ID) {
+ // Returns whatever the cache held for this ID.
+ return cache.remove(ID);
+ }
+
+ /**
+ * Returns an enumeration over the programmatic names of registered
+ * {@code Transliterator} objects. This includes both system
+ * transliterators and user transliterators registered using
+ * {@code registerInstance()}. The enumerated names may be
+ * passed to {@code getInstance()}.
+ *
+ * @return An {@code Enumeration} over {@code String} objects
+ * @see #getInstance
+ * @see #registerInstance
+ */
+ public static final Enumeration getAvailableIDs() {
+ // The IDs are exactly the keys of the registration cache.
+ return cache.keys();
+ }
+
+ static {
+ ResourceBundle bundle = ResourceBundle.getBundle(RB_LOCALE_ELEMENTS);
+
+ try {
+ String[] ruleBasedIDs = bundle.getStringArray(RB_RULE_BASED_IDS);
+
+ cache = new Hashtable();
+
+ for (int i=0; iUnicodeFilter defines a protocol for selecting a
+ * subset of the full range (U+0000 to U+FFFF) of Unicode characters.
+ * Currently, filters are used in conjunction with classes like {@link
+ * Transliterator} to only process selected characters through a
+ * transformation.
+ *
+ * {@link UnicodeFilterLogic}
+ */
+
+public interface UnicodeFilter {
+
+ /**
+ * Returns {@code true} for characters that are in the selected
+ * subset. In other words, if a character is to be
+ * filtered, then {@code isIn()} returns
+ * {@code false}.
+ *
+ * @param c the character to test
+ * @return {@code true} if {@code c} is in the selected subset
+ */
+ public boolean isIn(char c);
+}
diff --git a/icu4j/src/com/ibm/text/UnicodeFilterLogic.java b/icu4j/src/com/ibm/text/UnicodeFilterLogic.java
new file mode 100755
index 00000000000..f9e6ec1c609
--- /dev/null
+++ b/icu4j/src/com/ibm/text/UnicodeFilterLogic.java
@@ -0,0 +1,112 @@
+package com.ibm.text;
+
+/**
+ * {@code UnicodeFilterLogic} provides logical operators on
+ * {@link UnicodeFilter} objects. This class cannot be instantiated;
+ * it consists only of static methods. The static methods return
+ * filter objects that perform logical inversion (not),
+ * intersection (and), or union (or) of the given
+ * filter objects.
+ */
+public final class UnicodeFilterLogic {
+
+ /**
+ * Returns a UnicodeFilter that implements the inverse of
+ * the given filter.
+ */
+ public static UnicodeFilter not(final UnicodeFilter f) {
+ return new UnicodeFilter() {
+ public boolean isIn(char c) {
+ return !f.isIn(c);
+ }
+ };
+ }
+
+ /**
+ * Builds a filter that is the short-circuit AND of two filters:
+ * when {@code f.isIn()} is false, {@code g.isIn()} is not
+ * called and the result is false.
+ *
+ * If one argument is null the other argument is returned as is.
+ * Either {@code f} or {@code g} must be non-null.
+ *
+ * @param f the filter evaluated first, or null
+ * @param g the filter evaluated second, or null
+ * @return the combined filter, or the single non-null argument
+ */
+ public static UnicodeFilter and(final UnicodeFilter f,
+                                 final UnicodeFilter g) {
+     if (f == null || g == null) {
+         // Nothing to combine: hand back whichever side is present.
+         return (f == null) ? g : f;
+     }
+     return new UnicodeFilter() {
+         public boolean isIn(char c) {
+             if (!f.isIn(c)) {
+                 return false;
+             }
+             return g.isIn(c);
+         }
+     };
+ }
+
+ /**
+ * Returns a UnicodeFilter that implements a short
+ * circuit AND of the result of the given filters. That is, if
+ * f[i].isIn() is false, then
+ * f[j].isIn() is not called, where j > i, and
+ * isIn() returns false.
+ */
+ public static UnicodeFilter and(final UnicodeFilter[] f) {
+ return new UnicodeFilter() {
+ public boolean isIn(char c) {
+ for (int i=0; iUnicodeFilter that implements a short
+ * circuit OR of the result of the two given filters. That is, if
+ * f.isIn() is true, then g.isIn() is
+ * not called, and isIn() returns true.
+ *
+ * Either f or g must be non-null.
+ */
+ public static UnicodeFilter or(final UnicodeFilter f,
+                                final UnicodeFilter g) {
+     if (f == null || g == null) {
+         // Nothing to combine: hand back whichever side is present.
+         return (f == null) ? g : f;
+     }
+     return new UnicodeFilter() {
+         public boolean isIn(char c) {
+             // Short circuit: g is consulted only when f rejects c.
+             if (f.isIn(c)) {
+                 return true;
+             }
+             return g.isIn(c);
+         }
+     };
+ }
+
+ /**
+ * Returns a UnicodeFilter that implements a short
+ * circuit OR of the result of the given filters. That is, if
+ * f[i].isIn() is true, then
+ * f[j].isIn() is not called, where j > i, and
+ * isIn() returns true.
+ */
+ public static UnicodeFilter or(final UnicodeFilter[] f) {
+ return new UnicodeFilter() {
+ public boolean isIn(char c) {
+ for (int i=0; icharacter classes used in regular expressions.
+ * Such classes specify a subset of the set of all Unicode characters,
+ * which in this implementation is the characters from U+0000 to
+ * U+FFFF, ignoring surrogates.
+ *
+ * This class supports two APIs. The first is modeled after Java 2's
+ * {@code java.util.Set} interface, although this class does not
+ * implement that interface. All methods of {@code Set} are
+ * supported, with the modification that they take a character range
+ * or single character instead of an {@code Object}, and they
+ * take a {@code UnicodeSet} instead of a {@code Collection}.
+ *
+ * The second API is the
+ * {@code applyPattern()}/{@code toPattern()} API from the
+ * {@code java.text.Format}-derived classes. Unlike the
+ * methods that add characters, add categories, and control the logic
+ * of the set, the method {@code applyPattern()} sets all
+ * attributes of a {@code UnicodeSet} at once, based on a
+ * string pattern.
+ *
+ * In addition, the set complement operation is supported through
+ * the {@code complement()} method.
+ *
+ * Pattern syntax
+ *
+ * Patterns are accepted by the constructors and the
+ * {@code applyPattern()} methods and returned by the
+ * {@code toPattern()} method. These patterns follow a syntax
+ * similar to that employed by version 8 regular expression character
+ * classes:
+ *
+ *
+ *
+ *
+ *   pattern      := ('[' '^'? item* ']') | ('[:' '^'? category ':]')
+ *   item         := char | (char '-' char) | pattern-expr
+ *   pattern-expr := pattern | pattern-expr pattern |
+ *                   pattern-expr op pattern
+ *   op           := '&' | '-'
+ *   special      := '[' | ']' | '-'
+ *   char         := any character that is not special
+ *                   | ('\u005C' any character)
+ *                   | ('\u005Cu' hex hex hex hex)
+ *   hex          := any character for which Character.digit(c, 16)
+ *                   returns a non-negative result
+ *   category     := 'M' | 'N' | 'Z' | 'C' | 'L' | 'P' |
+ *                   'S' | 'Mn' | 'Mc' | 'Me' | 'Nd' | 'Nl' | 'No' | 'Zs' | 'Zl' |
+ *                   'Zp' | 'Cc' | 'Cf' | 'Cs' | 'Co' | 'Cn' | 'Lu' | 'Ll' | 'Lt'
+ *                   | 'Lm' | 'Lo' | 'Pc' | 'Pd' | 'Ps' | 'Pe' | 'Po' | 'Sm' |
+ *                   'Sc' | 'Sk' | 'So'
+ *
+ *
+ *
+ *
+ *
+ * Legend:
+ *   a := b    a may be replaced by b
+ *   a?        zero or one instance of a
+ *   a*        one or more instances of a
+ *   a | b     either a or b
+ *   'a'       the literal string between the quotes
+ *
+ *
+ *
+ *
+ * Patterns specify individual characters, ranges of characters, and
+ * Unicode character categories. When elements are concatenated, they
+ * specify their union. To complement a set, place a '^' immediately
+ * after the opening '[' or '[:'. In any other location, '^' has no
+ * special meaning.
+ *
+ * Ranges are indicated by placing a '-' between two
+ * characters, as in "a-z". This specifies the range of all
+ * characters from the left to the right, in Unicode order. If the
+ * left and right characters are the same, then the range consists of
+ * just that character. If the left character is greater than the
+ * right character it is a syntax error. If a '-' occurs as the first
+ * character after the opening '[' or '[^', or if it occurs as the
+ * last character before the closing ']', then it is taken as a
+ * literal. Thus "[a\u005C-b]", "[-ab]", and "[ab-]" all indicate the same
+ * set of three characters, 'a', 'b', and '-'.
+ *
+ * Sets may be intersected using the '&' operator or the asymmetric
+ * set difference may be taken using the '-' operator, for example,
+ * "[[:L:]&[\u005Cu0000-\u005Cu0FFF]]" indicates the set of all Unicode letters
+ * with values less than 4096. Operators ('&' and '-') have equal
+ * precedence and bind left-to-right. Thus
+ * "[[:L:]-[a-z]-[\u005Cu0100-\u005Cu01FF]]" is equivalent to
+ * "[[[:L:]-[a-z]]-[\u005Cu0100-\u005Cu01FF]]". This only really matters for
+ * difference; intersection is commutative.
+ *
+ *
+ * Examples:
+ *   [a]              The set containing 'a'
+ *   [a-z]            The set containing 'a' through 'z' and all letters
+ *                    in between, in Unicode order
+ *   [^a-z]           The set containing all characters but 'a' through
+ *                    'z', that is, U+0000 through 'a'-1 and 'z'+1
+ *                    through U+FFFF
+ *   [[pat1][pat2]]   The union of sets specified by pat1 and pat2
+ *   [[pat1]&[pat2]]  The intersection of sets specified by pat1 and pat2
+ *   [[pat1]-[pat2]]  The asymmetric difference of sets specified by
+ *                    pat1 and pat2
+ *   [:Lu:]           The set of characters belonging to the given
+ *                    Unicode category, as defined by
+ *                    Character.getType(); in this case, Unicode
+ *                    uppercase letters
+ *   [:L:]            The set of characters belonging to all Unicode
+ *                    categories starting with 'L', that is,
+ *                    [[:Lu:][:Ll:][:Lt:][:Lm:][:Lo:]].
+ *
+ * Character categories.
+ *
+ * Character categories are specified using the POSIX-like syntax
+ * '[:Lu:]'. The complement of a category is specified by inserting
+ * '^' after the opening '[:'. The following category names are
+ * recognized. Actual determination of category data uses
+ * {@code Character.getType()}, so it reflects the underlying
+ * implementation used by {@code Character}. As of Java 2 and
+ * JDK 1.1.8, this is Unicode 2.1.2.
+ *
+ *
+ * Normative
+ * Mn = Mark, Non-Spacing
+ * Mc = Mark, Spacing Combining
+ * Me = Mark, Enclosing
+ *
+ * Nd = Number, Decimal Digit
+ * Nl = Number, Letter
+ * No = Number, Other
+ *
+ * Zs = Separator, Space
+ * Zl = Separator, Line
+ * Zp = Separator, Paragraph
+ *
+ * Cc = Other, Control
+ * Cf = Other, Format
+ * Cs = Other, Surrogate
+ * Co = Other, Private Use
+ * Cn = Other, Not Assigned
+ *
+ * Informative
+ * Lu = Letter, Uppercase
+ * Ll = Letter, Lowercase
+ * Lt = Letter, Titlecase
+ * Lm = Letter, Modifier
+ * Lo = Letter, Other
+ *
+ * Pc = Punctuation, Connector
+ * Pd = Punctuation, Dash
+ * Ps = Punctuation, Open
+ * Pe = Punctuation, Close
+ * *Pi = Punctuation, Initial quote
+ * *Pf = Punctuation, Final quote
+ * Po = Punctuation, Other
+ *
+ * Sm = Symbol, Math
+ * Sc = Symbol, Currency
+ * Sk = Symbol, Modifier
+ * So = Symbol, Other
+ *
+ * *Unsupported by Java (and hence unsupported by UnicodeSet).
+ *
+ * @author Alan Liu
+ * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $ */
+public class UnicodeSet {
+ /**
+ * The internal representation is a StringBuffer of even length.
+ * Each pair of characters represents a range that is included in
+ * the set. A single character c is represented as cc. Thus, the
+ * ranges in the set are (a,b), a and b inclusive, where a =
+ * pairs.charAt(i) and b = pairs.charAt(i+1) for all even i, 0 <=
+ * i <= pairs.length()-2. Pairs are always stored in ascending
+ * Unicode order. Pairs are always stored in shortest form. For
+ * example, if the pair "hh", representing the single character
+ * 'h', is added to the pairs list "agik", representing the ranges
+ * 'a'-'g' and 'i'-'k', the result is "ak", not "aghhik".
+ *
+ * This representation format was originally used in Richard
+ * Gillam's CharSet class.
+ */
+ private StringBuffer pairs;
+
+ // Two-letter category abbreviations packed into one string, one entry
+ // per category index (entry k occupies characters 2*k and 2*k+1).
+ // Presumably k is the value returned by Character.getType() -- the
+ // lookup code is not visible here; confirm before relying on it.
+ private static final String CATEGORY_NAMES =
+ // 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2
+ //0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 8 9 0 1 2 3 4 5 6 7 8
+ "CnLuLlLtLmLoMnMeMcNdNlNoZsZlZpCcCf--CoCsPdPsPePcPoSmScSkSo";
+
+ // Index of the "--" placeholder entry in CATEGORY_NAMES; the
+ // UnicodeSet(int) constructor rejects this category value.
+ private static final int UNSUPPORTED_CATEGORY = 17;
+
+ // Number of two-letter entries in CATEGORY_NAMES; valid category
+ // values are 0 .. CATEGORY_COUNT-1.
+ private static final int CATEGORY_COUNT = 29;
+
+ /**
+ * A cache mapping character category integers, as returned by
+ * Character.getType(), to pairs strings. Entries are initially
+ * null and are created on demand.
+ */
+ private static final String[] CATEGORY_PAIRS_CACHE =
+ new String[CATEGORY_COUNT];
+
+ //----------------------------------------------------------------
+ // Debugging and testing
+ //----------------------------------------------------------------
+
+ /**
+ * Return the representation of this set as a list of character
+ * ranges. Ranges are listed in ascending Unicode order. For
+ * example, the set [a-zA-M3] is represented as "33AMaz".
+ *
+ * @return a snapshot of the internal pairs list as a String
+ */
+ public String getPairs() {
+ return pairs.toString();
+ }
+
+ //----------------------------------------------------------------
+ // Public API
+ //----------------------------------------------------------------
+
+ /**
+ * Constructs an empty set.
+ */
+ public UnicodeSet() {
+ // An empty pairs list represents the empty set.
+ pairs = new StringBuffer();
+ }
+
+ /**
+ * Constructs a set from the given pattern. See the class description
+ * for the syntax of the pattern language. Spaces in the pattern are
+ * not ignored.
+ * @param pattern a string specifying what characters are in the set
+ * @exception IllegalArgumentException if the pattern contains
+ * a syntax error.
+ */
+ public UnicodeSet(String pattern) {
+ applyPattern(pattern, false);
+ }
+
+ /**
+ * Constructs a set from the given pattern, optionally ignoring
+ * white space. See the class description for the syntax of the
+ * pattern language.
+ * @param pattern a string specifying what characters are in the set
+ * @param ignoreSpaces if {@code true}, all spaces in the
+ * pattern are ignored, except those preceded by '\u005C'. Spaces are
+ * those characters for which {@code Character.isSpaceChar()}
+ * is {@code true}.
+ * @exception IllegalArgumentException if the pattern
+ * contains a syntax error.
+ */
+ public UnicodeSet(String pattern, boolean ignoreSpaces) {
+ applyPattern(pattern, ignoreSpaces);
+ }
+
+ /**
+ * Constructs a set holding the characters of one Unicode character
+ * category.
+ * @param category an integer indicating the character category as
+ * returned by {@code Character.getType()}.
+ * @exception IllegalArgumentException if the given
+ * category is invalid, that is, negative, not less than the number
+ * of known categories, or equal to the unsupported category value.
+ */
+ public UnicodeSet(int category) {
+     boolean inRange = category >= 0 && category < CATEGORY_COUNT;
+     if (!inRange || category == UNSUPPORTED_CATEGORY) {
+         throw new IllegalArgumentException("Invalid category");
+     }
+     pairs = new StringBuffer(getCategoryPairs(category));
+ }
+
+ /**
+ * Modifies this set to represent the set specified by the given
+ * pattern. See the class description for the syntax of the
+ * pattern language. Spaces in the pattern are not ignored.
+ * @param pattern a string specifying what characters are in the set
+ * @exception IllegalArgumentException if the pattern
+ * contains a syntax error.
+ */
+ public final void applyPattern(String pattern) {
+ applyPattern(pattern, false);
+ }
+
+ /**
+ * Modifies this set to represent the set specified by the given
+ * pattern, optionally ignoring white space. See the class
+ * description for the syntax of the pattern language.
+ * @param pattern a string specifying what characters are in the set
+ * @param ignoreSpaces if {@code true}, all spaces in the
+ * pattern are ignored. Spaces are those characters for which
+ * {@code Character.isSpaceChar()} is {@code true}.
+ * Characters preceded by '\\' are escaped, losing any special
+ * meaning they otherwise have. Spaces may be included by
+ * escaping them.
+ * @exception IllegalArgumentException
if the pattern
+ * contains a syntax error.
+ */
+ public void applyPattern(String pattern, boolean ignoreSpaces) {
+ ParsePosition pos = new ParsePosition(0);
+
+ // To ignore spaces, create a new pattern without spaces. We
+ // have to process all '\' escapes. If '\' is encountered,
+ // insert it and the following character (if any -- let parse
+ // deal with any syntax errors) in the pattern. This allows
+ // escaped spaces.
+ if (ignoreSpaces) {
+ StringBuffer pat = new StringBuffer();
+ for (int i=0; in, where 0 <=
n <= 65536
.
+ *
+ * @return the number of elements in this set (its cardinality).
+ */
+ public int size() {
+ int n = 0;
+ for (int i=0; itrue if this set contains no elements.
+ *
+ * @return true if this set contains no elements.
+ */
+ public boolean isEmpty() {
+ // No stored ranges means no elements.
+ return pairs.length() == 0;
+ }
+
+ /**
+ * Returns true if this set contains the specified range
+ * of chars.
+ *
+ * @return true if this set contains the specified range
+ * of chars.
+ */
+ public boolean contains(char first, char last) {
+ // Set i to the end of the smallest range such that its end
+ // point >= last, or pairs.length() if no such range exists.
+ int i = 1;
+ while (ipairs.charAt(i)) i+=2;
+ return i=pairs.charAt(i-1);
+ }
+
+ /**
+ * Returns {@code true} if this set contains the specified char.
+ *
+ * @param c the character to look for
+ * @return {@code true} if this set contains the specified char.
+ */
+ public boolean contains(char c) {
+ // A single character is the range c..c.
+ return contains(c, c);
+ }
+
+ /**
+ * Adds every character of the given inclusive range to this set.
+ * Characters that are already present are unaffected. If
+ * {@code first > last} the range is empty and the set is left
+ * unchanged.
+ *
+ * @param first first character, inclusive, of range to be added
+ * to this set.
+ * @param last last character, inclusive, of range to be added
+ * to this set.
+ */
+ public void add(char first, char last) {
+     if (first > last) {
+         return;
+     }
+     addPair(pairs, first, last);
+ }
+
+ /**
+ * Adds the specified character to this set if it is not already
+ * present. If this set already contains the specified character,
+ * the call leaves this set unchanged.
+ *
+ * @param c the character to add
+ */
+ public final void add(char c) {
+ // A single character is the range c..c.
+ add(c, c);
+ }
+
+ /**
+ * Removes every character of the given inclusive range from this
+ * set. The set will not contain any character of the range once
+ * the call returns. If {@code first > last} the range is empty and
+ * the set is left unchanged.
+ *
+ * @param first first character, inclusive, of range to be removed
+ * from this set.
+ * @param last last character, inclusive, of range to be removed
+ * from this set.
+ */
+ public void remove(char first, char last) {
+     if (first > last) {
+         return;
+     }
+     removePair(pairs, first, last);
+ }
+
+ /**
+ * Removes the specified character from this set if it is present.
+ * The set will not contain the specified character once the call
+ * returns.
+ *
+ * @param c the character to remove
+ */
+ public final void remove(char c) {
+ // A single character is the range c..c.
+ remove(c, c);
+ }
+
+ /**
+ * Returns true if the specified set is a subset
+ * of this set.
+ *
+ * @param c set to be checked for containment in this set.
+ * @return true if this set contains all of the elements of the
+ * specified set.
+ */
+ public boolean containsAll(UnicodeSet c) {
+ // The specified set is a subset if all of its pairs are contained
+ // in this set.
+ int i = 1;
+ for (int j=0; j= last, or pairs.length() if no such range
+ // exists.
+ while (ipairs.charAt(i)) i+=2;
+ if (i>pairs.length() || c.pairs.charAt(j) < pairs.charAt(i-1)) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /**
+ * Adds all of the elements in the specified set to this set if
+ * they're not already present. This operation effectively
+ * modifies this set so that its value is the union of the two
+ * sets. The behavior of this operation is unspecified if the specified
+ * collection is modified while the operation is in progress.
+ *
+ * @param c set whose elements are to be added to this set.
+ * @see #add(char, char)
+ */
+ public void addAll(UnicodeSet c) {
+ doUnion(pairs, c.pairs.toString());
+ }
+
+ /**
+ * Retains only the elements in this set that are contained in the
+ * specified set. In other words, removes from this set all of
+ * its elements that are not contained in the specified set. This
+ * operation effectively modifies this set so that its value is
+ * the intersection of the two sets.
+ *
+ * @param c set that defines which elements this set will retain.
+ */
+ public void retainAll(UnicodeSet c) {
+ doIntersection(pairs, c.pairs.toString());
+ }
+
+ /**
+ * Removes from this set all of its elements that are contained in the
+ * specified set. This operation effectively modifies this
+ * set so that its value is the asymmetric set difference of
+ * the two sets.
+ *
+ * @param c set that defines which elements will be removed from
+ * this set.
+ */
+ public void removeAll(UnicodeSet c) {
+ doDifference(pairs, c.pairs.toString());
+ }
+
+ /**
+ * Inverts this set. This operation modifies this set so that
+ * its value is its complement. This is equivalent to the pseudo code:
+ * {@code this = new UnicodeSet("[\u0000-\uFFFF]").removeAll(this)}.
+ */
+ public void complement() {
+ doComplement(pairs);
+ }
+
+ /**
+ * Removes all of the elements from this set. This set will be
+ * empty after this call returns.
+ */
+ public void clear() {
+ // Truncating the pairs list to zero length leaves no ranges.
+ pairs.setLength(0);
+ }
+
+ /**
+ * Compares the specified object with this set for equality. Returns
+ * true if the specified object is also a set, the two sets
+ * have the same size, and every member of the specified set is
+ * contained in this set (or equivalently, every member of this set is
+ * contained in the specified set).
+ *
+ * @param o Object to be compared for equality with this set.
+ * @return true if the specified Object is equal to this set.
+ */
+ public boolean equals(Object o) {
+ return o instanceof UnicodeSet &&
+ pairs.equals(((UnicodeSet)o).pairs);
+ }
+
+ /**
+ * Returns the hash code value for this set.
+ *
+ * @return the hash code value for this set.
+ * @see Object#hashCode()
+ */
+ public int hashCode() {
+ return pairs.hashCode();
+ }
+
+ //----------------------------------------------------------------
+ // Implementation: Pattern parsing
+ //----------------------------------------------------------------
+
+ /**
+ * Parses the given pattern, starting at the given position. The
+ * character at pattern.charAt(pos.getIndex()) must be '[', or the
+ * parse fails. Parsing continues until the corresponding closing
+ * ']'. If a syntax error is encountered between the opening and
+ * closing brace, the parse fails. Upon return from a successful
+ * parse, the ParsePosition is updated to point to the character
+ * following the closing ']', and a StringBuffer containing a
+ * pairs list for the parsed pattern is returned. This method calls
+ * itself recursively to parse embedded subpatterns.
+ *
+ * @param pattern the string containing the pattern to be parsed.
+ * The portion of the string from pos.getIndex(), which must be a
+ * '[', to the corresponding closing ']', is parsed.
+ * @param pos upon entry, the position at which to being parsing.
+ * The character at pattern.charAt(pos.getIndex()) must be a '['.
+ * Upon return from a successful parse, pos.getIndex() is either
+ * the character after the closing ']' of the parsed pattern, or
+ * pattern.length() if the closing ']' is the last character of
+ * the pattern string.
+ * @return a StringBuffer containing a pairs list for the parsed
+ * substring of pattern
+ * @exception IllegalArgumentException if the parse fails.
+ */
+ private static StringBuffer parse(String pattern, ParsePosition pos) {
+
+ boolean invert = false;
+ StringBuffer pairsBuf = new StringBuffer();
+
+ /**
+ * Nodes: 0 - idle, waiting for '['
+ * 10 - like 11, but immediately after "[" or "[^"
+ * 11 - awaiting x, "]", "[...]", or "[:...:]"
+ * 21 - after x
+ * 23 - after x-
+ *
+ * The parsing state machine moves from node 0 through zero or more
+ * other nodes back to node 0, in a successful parse.
+ */
+ int node = 0;
+ char first = 0;
+ int i;
+
+ /**
+ * This loop iterates over the characters in the pattern. We
+ * start at the position specified by pos. We exit the loop
+ * when either a matching closing ']' is seen, or we read all
+ * characters of the pattern.
+ */
+ for (i=pos.getIndex(); i= pattern.length()) {
+ throw new IllegalArgumentException("Invalid \\u escape");
+ }
+ c = '\u0000';
+ for (int j=(++i)+4; i "aq". addPair("ampz", 'n',
+ * 'o') => "az".
+ */
+ private static void addPair(StringBuffer pairs, char c, char d) {
+ char a = 0;
+ char b = 0;
+ for (int i=0; i "ak".
+ * removePair("ampz", 'l', 'q') => "akrz".
+ */
+ private static void removePair(StringBuffer pairs, char c, char d) {
+ // Iterate over pairs until we find a pair that overlaps
+ // with the given range.
+ for (int i=0; i= a.
+ // rangeEdited is set to true if we have modified the
+ // range a-b (the range at i) in place.
+ boolean rangeEdited = false;
+ if (c > a) {
+ // If c is after a and before b, then we have overlap
+ // of this sort: a--c==b--d or a--c==d--b, where a-b
+ // and c-d are the ranges of interest. We need to
+ // add the range a,c-1.
+ pairs.setCharAt(i+1, (char)(c-1));
+ // i is already a
+ rangeEdited = true;
+ }
+ if (d < b) {
+ // If d is after a and before b, we overlap like this:
+ // c--a==d--b or a--c==d--b, where a-b is the range at
+ // i and c-d is the range being removed. We need to
+ // add the range d+1,b.
+ if (rangeEdited) {
+ pairs.insert(i+2, new char[] { (char)(d+1), b });
+ i += 2;
+ } else {
+ pairs.setCharAt(i, (char)(d+1));
+ // i+1 is already b
+ rangeEdited = true;
+ }
+ }
+ if (!rangeEdited) {
+ // If we didn't add any ranges, that means the entire
+ // range a-b must be deleted, since we have
+ // c--a==b--d.
+ stringBufferDelete(pairs, i, i+2);
+ i -= 2;
+ }
+ }
+ }
+
+ //----------------------------------------------------------------
+ // Implementation: Fundamental operators
+ //----------------------------------------------------------------
+
+ /**
+ * Changes the pairs list to represent the complement of the set it
+ * currently represents. The pairs list will be normalized (in
+ * order and in shortest possible form) if the original pairs list
+ * was normalized.
+ */
+ private static void doComplement(StringBuffer pairs) {
+ if (pairs.length() == 0) {
+ pairs.append('\u0000').append('\uffff');
+ return;
+ }
+
+ // Change each end to a start and each start to an end of the
+ // gaps between the ranges. That is, 3-7 9-12 becomes x-2 8-8
+ // 13-x, where 'x' represents a range that must now be fixed
+ // up.
+ for (int i=0; i 0 && c1.charAt(i - 1) > ub)
+ ub = c1.charAt(i - 1);
+
+ // now advance j to the first character that is greater
+ // that "ub" plus one
+ while (j < c2.length() && c2.charAt(j) <= ub + 1)
+ ++j;
+
+ // if j points to the endpoint of a range, update "ub"
+ // to that character, or if j points to the start of
+ // a range and the endpoint of the preceding range is
+ // greater than "ub", update "up" to _that_ character
+ if (j % 2 == 1)
+ ub = c2.charAt(j);
+ else if (j > 0 && c2.charAt(j - 1) > ub)
+ ub = c2.charAt(j - 1);
+ }
+ // when we finally fall out of this loop, we will have stitched
+ // together a series of ranges that overlap or touch, i and j
+ // will both point to starting points of ranges, and "ub" will
+ // be the endpoint of the range we're working on. Write "ub"
+ // to the result
+ result.append(ub);
+
+ // loop back around to create the next range in the result
+ }
+
+ // we fall out to here when we've exhausted all the characters in
+ // one of the operands. We can append all of the remaining characters
+ // in the other operand without doing any extra work.
+ if (i < c1.length())
+ result.append(c1.substring(i));
+ if (j < c2.length())
+ result.append(c2.substring(j));
+
+ pairs.setLength(0);
+ pairs.append(result.toString());
+ }
+
+ /**
+  * Replaces the set described by <code>pairs</code> with the
+  * asymmetric difference of the two sets, that is, with everything
+  * in the first set that is not also in the second.  Only
+  * <code>pairs</code> is modified.
+  *
+  * @param pairs pairs list of the first set; receives the result
+  * @param pairs2 pairs list of the set to be removed
+  */
+ private static void doDifference(StringBuffer pairs, String pairs2) {
+     // A - B is computed as A intersected with the complement of B.
+     StringBuffer complement = new StringBuffer(pairs2);
+     doComplement(complement);
+     String excluded = complement.toString();
+     doIntersection(pairs, excluded);
+ }
+
+ /**
+ * Given two pairs lists, changes the first in place to represent
+ * the intersection of the two sets.
+ *
+ * This implementation format was stolen from Richard Gillam's
+ * CharSet class.
+ */
+ private static void doIntersection(StringBuffer pairs, String c2) {
+ StringBuffer result = new StringBuffer();
+ String c1 = pairs.toString();
+
+ int i = 0;
+ int j = 0;
+ int oldI;
+ int oldJ;
+
+ // iterate until we've exhausted one of the operands
+ while (i < c1.length() && j < c2.length()) {
+
+ // advance j until it points to a character that is larger than
+ // the one i points to. If this is the beginning of a one-
+ // character range, advance j to point to the end
+ if (i < c1.length() && i % 2 == 0) {
+ while (j < c2.length() && c2.charAt(j) < c1.charAt(i))
+ ++j;
+ if (j < c2.length() && j % 2 == 0 && c2.charAt(j) == c1.charAt(i))
+ ++j;
+ }
+
+ // if j points to the endpoint of a range, save the current
+ // value of i, then advance i until it reaches a character
+ // which is larger than the character pointed at
+ // by j. All of the characters we've advanced over (except
+ // the one currently pointed to by i) are added to the result
+ oldI = i;
+ while (j % 2 == 1 && i < c1.length() && c1.charAt(i) <= c2.charAt(j))
+ ++i;
+ result.append(c1.substring(oldI, i));
+
+ // if i points to the endpoint of a range, save the current
+ // value of j, then advance j until it reaches a character
+ // which is larger than the character pointed at
+ // by i. All of the characters we've advanced over (except
+ // the one currently pointed to by i) are added to the result
+ oldJ = j;
+ while (i % 2 == 1 && j < c2.length() && c2.charAt(j) <= c1.charAt(i))
+ ++j;
+ result.append(c2.substring(oldJ, j));
+
+ // advance i until it points to a character larger than j
+ // If it points at the beginning of a one-character range,
+ // advance it to the end of that range
+ if (j < c2.length() && j % 2 == 0) {
+ while (i < c1.length() && c1.charAt(i) < c2.charAt(j))
+ ++i;
+ if (i < c1.length() && i % 2 == 0 && c2.charAt(j) == c1.charAt(i))
+ ++i;
+ }
+ }
+
+ pairs.setLength(0);
+ pairs.append(result.toString());
+ }
+
+ //----------------------------------------------------------------
+ // Implementation: Generation of pairs for Unicode categories
+ //----------------------------------------------------------------
+
+ /**
+ * Returns a pairs string for the given category, given its name.
+ * The category name must be either a two-letter name, such as
+ * "Lu", or a one letter name, such as "L". One-letter names
+ * indicate the logical union of all two-letter names that start
+ * with that letter. Case is significant. If the name starts
+ * with the character '^' then the complement of the given
+ * character set is returned.
+ *
+ * Although individual categories such as "Lu" are cached, we do
+ * not currently cache single-letter categories such as "L" or
+ * complements such as "^Lu" or "^L". It would be easy to cache
+ * these as well in a hashtable should the need arise.
+ */
+ private static String getCategoryPairs(String catName) {
+ boolean invert = (catName.length() > 1 &&
+ catName.charAt(0) == '^');
+ if (invert) {
+ catName = catName.substring(1);
+ }
+
+ StringBuffer cat = null;
+
+ // if we have two characters, search the category map for that
+ // code and either construct and return a UnicodeSet from the
+ // data in the category map or throw an exception
+ if (catName.length() == 2) {
+ int i = CATEGORY_NAMES.indexOf(catName);
+ if (i>=0 && i%2==0) {
+ i /= 2;
+ if (i != UNSUPPORTED_CATEGORY) {
+ String pairs = getCategoryPairs(i);
+ if (!invert) {
+ return pairs;
+ }
+ cat = new StringBuffer(pairs);
+ }
+ }
+ } else if (catName.length() == 1) {
+ // if we have one character, search the category map for
+ // codes beginning with that letter, and union together
+ // all of the matching sets that we find (or throw an
+ // exception if there are no matches)
+ for (int i=0; i= 0) {
+ pairs.append((char)first).append((char)last);
+ }
+ first = last = i;
+ }
+ }
+ }
+ if (first >= 0) {
+ pairs.append((char)first).append((char)last);
+ }
+ CATEGORY_PAIRS_CACHE[cat] = pairs.toString();
+ }
+ return CATEGORY_PAIRS_CACHE[cat];
+ }
+
+ //----------------------------------------------------------------
+ // Implementation: Utility methods
+ //----------------------------------------------------------------
+
+ /**
+ * Returns the character after the given position, or '\uFFFF' if
+ * there is none.
+
+ */
+ private static final char charAfter(String str, int i) {
+ return ((++i) < str.length()) ? str.charAt(i) : '\uFFFF';
+ }
+
+ /**
+  * Removes the characters at indices start through limit-1 from a
+  * StringBuffer.  StringBuffer has no such method in JDK 1.1; in
+  * Java 2 this is simply <code>buf.delete(start, limit)</code>.
+  *
+  * @param buf the buffer to modify in place
+  * @param start inclusive start of range
+  * @param limit exclusive end of range
+  */
+ private static void stringBufferDelete(StringBuffer buf,
+                                        int start, int limit) {
+     int length = buf.length();
+     if (length <= limit) {
+         // Nothing follows the deleted range: truncating is enough.
+         buf.setLength(start);
+         return;
+     }
+     // Save what follows the range, truncate, then put it back.
+     char[] tail = new char[length - limit];
+     buf.getChars(limit, length, tail, 0);
+     buf.setLength(start);
+     buf.append(tail);
+ }
+}
diff --git a/icu4j/src/com/ibm/text/UnicodeToHexTransliterator.java b/icu4j/src/com/ibm/text/UnicodeToHexTransliterator.java
new file mode 100755
index 00000000000..1e688f65fa9
--- /dev/null
+++ b/icu4j/src/com/ibm/text/UnicodeToHexTransliterator.java
@@ -0,0 +1,172 @@
+package com.ibm.text;
+import java.util.*;
+
+/**
+ * A transliterator that converts from Unicode characters to
+ * hexadecimal Unicode escape sequences.  It outputs a
+ * prefix specified in the constructor and optionally converts the hex
+ * digits to uppercase.
+ *
+ * <p>Copyright &copy; IBM Corporation 1999.  All rights reserved.
+ *
+ * @author Alan Liu
+ * @version $RCSfile: UnicodeToHexTransliterator.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
+ */
+public class UnicodeToHexTransliterator extends Transliterator {
+
+     /**
+      * Package accessible ID for this transliterator.
+      */
+     static String _ID = "Unicode-Hex";
+
+     /** String written in front of the four hex digits of each escape. */
+     private String prefix;
+
+     /** True when the hex digits are written in uppercase. */
+     private boolean uppercase;
+
+     private static final String COPYRIGHT =
+         "\u00A9 IBM Corporation 1999. All rights reserved.";
+
+     /**
+      * Constructs a transliterator.
+      * @param prefix the string that will precede the four hex
+      * digits of every escape sequence
+      * @param uppercase if true, the four hex digits will be
+      * converted to uppercase; otherwise they will be lowercase
+      * @param filter passed to the superclass constructor; may be null.
+      * Characters for which the filter returned by getFilter() answers
+      * false from isIn() are left unchanged.
+      */
+     public UnicodeToHexTransliterator(String prefix, boolean uppercase,
+                                       UnicodeFilter filter) {
+         super(_ID, filter);
+         this.prefix = prefix;
+         this.uppercase = uppercase;
+     }
+
+     /**
+      * Constructs a transliterator with the default prefix "\u"
+      * that outputs uppercase hex digits and has no filter.
+      */
+     public UnicodeToHexTransliterator() {
+         this("\\u", true, null);
+     }
+
+     /**
+      * Returns the string that precedes the four hex digits.
+      * @return prefix string
+      */
+     public String getPrefix() {
+         return prefix;
+     }
+
+     /**
+      * Sets the string that precedes the four hex digits.
+      *
+      * <p>Callers must take care if a transliterator is in use by
+      * multiple threads.  The prefix should not be changed by one
+      * thread while another thread may be transliterating.
+      * @param prefix prefix string
+      */
+     public void setPrefix(String prefix) {
+         this.prefix = prefix;
+     }
+
+     /**
+      * Returns true if this transliterator outputs uppercase hex digits.
+      */
+     public boolean isUppercase() {
+         return uppercase;
+     }
+
+     /**
+      * Sets if this transliterator outputs uppercase hex digits.
+      *
+      * <p>Callers must take care if a transliterator is in use by
+      * multiple threads.  The uppercase mode should not be changed by
+      * one thread while another thread may be transliterating.
+      * @param outputUppercase if true, then this transliterator
+      * outputs uppercase hex digits.
+      */
+     public void setUppercase(boolean outputUppercase) {
+         uppercase = outputUppercase;
+     }
+
+     /**
+      * Transliterates a segment of a string.  <code>Transliterator</code>
+      * API.
+      * @param text the string to be transliterated
+      * @param start the beginning index, inclusive; <code>0 &lt;= start
+      * &lt;= limit</code>.
+      * @param limit the ending index, exclusive; <code>start &lt;= limit
+      * &lt;= text.length()</code>.
+      * @return the new limit index
+      */
+     public int transliterate(Replaceable text, int start, int limit) {
+         // Same layout the keyboard entry point reads through the
+         // CURSOR and LIMIT index constants: { start, limit, cursor }.
+         int[] offsets = { start, limit, start };
+         handleKeyboardTransliterate(text, offsets);
+         return offsets[LIMIT];
+     }
+
+     /**
+      * Implements {@link Transliterator#handleKeyboardTransliterate}.
+      * Replaces every character between offsets[CURSOR] and
+      * offsets[LIMIT] that the filter accepts with its hexadecimal
+      * escape.  For example, '@' becomes "U+0040", assuming the prefix
+      * is "U+".  On return offsets[CURSOR] and offsets[LIMIT] reflect
+      * the lengthened text.
+      */
+     protected void handleKeyboardTransliterate(Replaceable text,
+                                                int[] offsets) {
+         int cursor = offsets[CURSOR];
+         int limit = offsets[LIMIT];
+
+         UnicodeFilter filter = getFilter();
+
+         while (cursor < limit) {
+             char c = text.charAt(cursor);
+             if (filter != null && !filter.isIn(c)) {
+                 // Filtered out: leave the character as it is.
+                 ++cursor;
+                 continue;
+             }
+             String hex = hex(c);
+             text.replace(cursor, cursor+1, hex);
+             int len = hex.length();
+             cursor += len; // step over the escape just written
+             limit += len - 1; // one character grew into len characters
+         }
+
+         offsets[LIMIT] = limit;
+         offsets[CURSOR] = cursor;
+     }
+
+     /**
+      * Return the length of the longest context required by this transliterator.
+      * This is <em>preceding</em> context.
+      * @return maximum number of preceding context characters this
+      * transliterator needs to examine; always zero here
+      */
+     protected int getMaximumContextLength() {
+         return 0;
+     }
+
+     /**
+      * Form escape sequence: the prefix followed by the character's code
+      * as exactly four hex digits, zero-padded on the left.
+      */
+     private final String hex(char c) {
+         StringBuffer buf = new StringBuffer();
+         buf.append(prefix);
+         // Integer.toHexString() drops leading zeros; add them back.
+         if (c < 0x1000) {
+             buf.append('0');
+             if (c < 0x100) {
+                 buf.append('0');
+                 if (c < 0x10) {
+                     buf.append('0');
+                 }
+             }
+         }
+         String h = Integer.toHexString(c);
+         buf.append(uppercase ? h.toUpperCase() : h);
+         return buf.toString();
+     }
+}
diff --git a/icu4j/src/com/ibm/text/components/AppletFrame.java b/icu4j/src/com/ibm/text/components/AppletFrame.java
new file mode 100755
index 00000000000..cf6cc399ddd
--- /dev/null
+++ b/icu4j/src/com/ibm/text/components/AppletFrame.java
@@ -0,0 +1,126 @@
+package com.ibm.text.components;
+import java.applet.*;
+import java.net.URL;
+import java.util.Enumeration;
+import java.awt.*;
+import java.awt.event.*;
+
+/**
+ * <p>A Frame that runs an Applet within itself, making it possible
+ * for an applet to run as an application.  Usage:
+ *
+ * <pre>
+ * public class MyApplet extends Applet {
+ *     public static void main(String args[]) {
+ *         MyApplet applet = new MyApplet();
+ *         new AppletFrame("My Applet Running As An App", applet, 640, 480);
+ *     }
+ *     ...
+ * }
+ * </pre>
+ *
+ * <p>The frame acts as both the stub and the context of the applet.
+ * Most of those callbacks are minimal stand-ins: there is no code
+ * base or document base, and no audio clips or images are provided.
+ *
+ * <p>Copyright &copy; IBM Corporation 1999.  All rights reserved.
+ *
+ * @author Alan Liu
+ * @version $RCSfile: AppletFrame.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
+ */
+public class AppletFrame extends Frame implements AppletStub, AppletContext {
+
+     /** The applet hosted by this frame. */
+     Applet applet;
+
+     private static final String COPYRIGHT =
+         "\u00A9 IBM Corporation 1999. All rights reserved.";
+
+     /**
+      * Construct a Frame running the given Applet with the default size
+      * of 640 by 480.
+      * When the Frame is closed, the applet's stop() method is called,
+      * the Frame is dispose()d of, and System.exit(0) is called.
+      *
+      * @param name the Frame title
+      * @param applet the applet to be run
+      */
+     public AppletFrame(String name, Applet applet) {
+         this(name, applet, 640, 480);
+     }
+
+     /**
+      * Construct a Frame running the given Applet with the given size.
+      * When the Frame is closed, the applet's stop() method is called,
+      * the Frame is dispose()d of, and System.exit(0) is called.
+      *
+      * @param name the Frame title
+      * @param applet the applet to be run
+      * @param width width of the Frame
+      * @param height height of the Frame
+      */
+     public AppletFrame(String name, Applet applet, int width, int height) {
+         super(name);
+         this.applet = applet;
+         applet.setStub(this);
+
+         // setSize/setVisible/add(Component, Object) are the JDK 1.1
+         // replacements for the deprecated resize/show/add(String, Component).
+         setSize(width, height);
+         add(applet, BorderLayout.CENTER);
+         setVisible(true);
+         addWindowListener(new WindowAdapter() {
+             public void windowClosing(WindowEvent e) {
+                 AppletFrame.this.applet.stop();
+                 dispose();
+                 System.exit(0);
+             }
+         });
+
+         applet.init();
+         applet.start();
+     }
+
+     // AppletStub API
+
+     /** Resizes the whole frame to the size the applet asks for. */
+     public void appletResize(int width,
+                              int height) {
+         setSize(width, height);
+     }
+
+     /** Returns this frame, which doubles as the applet context. */
+     public AppletContext getAppletContext() {
+         return this;
+     }
+
+     /** Not supported; always returns null. */
+     public URL getCodeBase() {
+         return null;
+     }
+
+     /** Not supported; always returns null. */
+     public URL getDocumentBase() {
+         return null;
+     }
+
+     /** Returns the same placeholder string for every parameter name. */
+     public String getParameter(String name) {
+         return "PARAMETER";
+     }
+
+     /** Always reports the applet as active. */
+     public boolean isActive() {
+         return true;
+     }
+
+     // AppletContext API
+
+     /** Returns the hosted applet regardless of the name asked for. */
+     public Applet getApplet(String name) {
+         return applet;
+     }
+
+     /** Not supported; always returns null. */
+     public Enumeration getApplets() {
+         return null;
+     }
+
+     /** Not supported; always returns null. */
+     public AudioClip getAudioClip(URL url) {
+         return null;
+     }
+
+     /** Not supported; always returns null. */
+     public Image getImage(URL url) {
+         return null;
+     }
+
+     /** Ignored. */
+     public void showDocument(URL url) {}
+
+     /** Ignored. */
+     public void showDocument(URL url, String target) {}
+
+     /** Prints the status text to standard output. */
+     public void showStatus(String status) {
+         System.out.println(status);
+     }
+}
diff --git a/icu4j/src/com/ibm/text/components/DumbTextComponent.java b/icu4j/src/com/ibm/text/components/DumbTextComponent.java
new file mode 100755
index 00000000000..a400b9a76f1
--- /dev/null
+++ b/icu4j/src/com/ibm/text/components/DumbTextComponent.java
@@ -0,0 +1,708 @@
+package com.ibm.text.components;
+import java.awt.*;
+import java.awt.event.*;
+import java.text.*;
+import java.awt.datatransfer.*;
+
+// LIU: Changed from final to non-final
+public class DumbTextComponent extends Canvas
+ implements KeyListener, MouseListener, MouseMotionListener, FocusListener
+ {
+ private transient static final String copyright =
+ "Copyright \u00A9 1998, Mark Davis. All Rights Reserved.";
+ private transient static boolean DEBUG = false;
+
+ private String contents = "";
+ private Selection selection = new Selection();
+ private boolean editable = true;
+
+ private transient Selection tempSelection = new Selection();
+ private transient boolean focus;
+ private transient BreakIterator lineBreaker = BreakIterator.getLineInstance();
+ private transient BreakIterator wordBreaker = BreakIterator.getWordInstance();
+ private transient BreakIterator charBreaker = BreakIterator.getCharacterInstance();
+ private transient int lineAscent;
+ private transient int lineHeight;
+ private transient int lineLeading;
+ private transient int lastHeight = 10;
+ private transient int lastWidth = 50;
+ private static final int MAX_LINES = 200; // LIU: Use symbolic name
+ private transient int[] lineStarts = new int[MAX_LINES]; // LIU
+ private transient int lineCount = 1;
+
+ private transient boolean valid = false;
+ private transient FontMetrics fm;
+ private transient boolean redoLines = true;
+ private transient boolean doubleClick = false;
+ private transient TextListener textListener;
+ private transient ActionListener selectionListener;
+ private transient Image cacheImage;
+ private transient Dimension mySize;
+ private transient int xInset = 5;
+ private transient int yInset = 5;
+ private transient Point startPoint = new Point();
+ private transient Point endPoint = new Point();
+ private transient Point caretPoint = new Point();
+ private transient static String clipBoard;
+
+ private static final char CR = '\015'; // LIU
+
+ // ============================================
+
+ /**
+  * Creates a text component that listens to its own mouse,
+  * mouse-motion, key and focus events and uses the text cursor.
+  */
+ public DumbTextComponent() {
+     addMouseListener(this);
+     addMouseMotionListener(this);
+     addKeyListener(this);
+     addFocusListener(this);
+
+     Cursor textCursor = Cursor.getPredefinedCursor(Cursor.TEXT_CURSOR);
+     setCursor(textCursor);
+ }
+
+// ================ Events ====================
+
+ public boolean isFocusTraversable() { return true; }
+
+ public void addActionListener(ActionListener l) {
+ selectionListener = AWTEventMulticaster.add(selectionListener, l);
+ }
+
+ public void removeActionListener(ActionListener l) {
+ selectionListener = AWTEventMulticaster.remove(selectionListener, l);
+ }
+
+ public void addTextListener(TextListener l) {
+ textListener = AWTEventMulticaster.add(textListener, l);
+ }
+
+ public void removeTextListener(TextListener l) {
+ textListener = AWTEventMulticaster.remove(textListener, l);
+ }
+
+ private transient boolean pressed;
+
+ /**
+  * Starts a selection at the mouse position and takes the focus.  If
+  * a press is already being tracked, the event only extends the
+  * current selection.
+  */
+ public void mousePressed(MouseEvent e) {
+     if (DEBUG) System.out.println("mousePressed");
+     if (pressed) {
+         // Press seen without an intervening release: treat as a drag.
+         select(e,false);
+         return;
+     }
+     doubleClick = e.getClickCount() > 1;
+     requestFocus();
+     select(e, true);
+     pressed = true;
+ }
+
+ /** Extends the current selection to the mouse position. */
+ public void mouseDragged(MouseEvent e) {
+     if (DEBUG) {
+         System.out.println("mouseDragged");
+     }
+     select(e, false);
+ }
+
+ /** Ends the press that mousePressed started tracking. */
+ public void mouseReleased(MouseEvent e) {
+     if (DEBUG) {
+         System.out.println("mouseReleased");
+     }
+     pressed = false;
+ }
+
+ /** Does nothing; extending the selection on entry is disabled. */
+ public void mouseEntered(MouseEvent e) {
+     // was: if (pressed) select(e, false);
+ }
+
+ /** Does nothing; extending the selection on exit is disabled. */
+ public void mouseExited(MouseEvent e) {
+     // was: if (pressed) select(e, false);
+ }
+
+ public void mouseClicked(MouseEvent e) {}
+ public void mouseMoved(MouseEvent e) {}
+
+
+ /** Records that the component has the focus and schedules a redraw. */
+ public void focusGained(FocusEvent e) {
+     if (DEBUG) {
+         System.out.println("focusGained");
+     }
+     focus = true;
+     valid = false; // the cached image shows the unfocused look
+     repaint(16);
+ }
+ /** Records that the component lost the focus and schedules a redraw. */
+ public void focusLost(FocusEvent e) {
+     if (DEBUG) {
+         System.out.println("focusLost");
+     }
+     focus = false;
+     valid = false; // the cached image shows the focused look
+     repaint(16);
+ }
+
+ public void select(MouseEvent e, boolean first) {
+ point2Offset(e.getPoint(), tempSelection);
+ if (first) {
+ if ((e.getModifiers() & InputEvent.SHIFT_MASK) == 0) {
+ tempSelection.anchor = tempSelection.caret;
+ }
+ }
+ // fix words
+ if (doubleClick) {
+ tempSelection.expand(wordBreaker);
+ }
+ select(tempSelection);
+ }
+
+ /**
+  * Handles editing shortcuts and caret movement:
+  * Ctrl-Q converts between hex digits and characters (see fixHex),
+  * Ctrl-V pastes, Ctrl-C copies, Ctrl-X cuts, Ctrl-A selects all,
+  * the arrow keys move the caret (Shift extends the selection, Ctrl
+  * with left/right moves by words), and Delete removes the selection
+  * or the character after the caret.  Changes to the text are made
+  * only while the component is editable.  The clipboard is a static
+  * string shared by all instances of this class.
+  */
+ public void keyPressed(KeyEvent e) {
+     int code = e.getKeyCode();
+     if (DEBUG) System.out.println("keyPressed "
+         + hex((char)code) + ", " + hex((char)e.getModifiers()));
+     boolean shift = (e.getModifiers() & KeyEvent.SHIFT_MASK) != 0;
+     boolean ctrl = (e.getModifiers() & KeyEvent.CTRL_MASK) != 0;
+     switch (code) {
+     case KeyEvent.VK_Q:
+         if (!ctrl || !editable) break;
+         fixHex();
+         break;
+     case KeyEvent.VK_V:
+         if (!ctrl || !editable) break;
+         // Nothing has been copied or cut yet: there is nothing to
+         // paste, and passing null on would insert the text "null".
+         if (clipBoard == null) break;
+         insertText(clipBoard);
+         break;
+     case KeyEvent.VK_C:
+         if (!ctrl) break;
+         clipBoard = contents.substring(selection.getStart(), selection.getEnd());
+         break;
+     case KeyEvent.VK_X:
+         if (!ctrl) break;
+         clipBoard = contents.substring(selection.getStart(), selection.getEnd());
+         // A read-only component still copies, but must not delete.
+         if (!editable) break;
+         insertText("");
+         break;
+     case KeyEvent.VK_A:
+         if (!ctrl) break;
+         // select(Selection) pins the oversized anchor to the text length.
+         select(Integer.MAX_VALUE, 0, false);
+         break;
+     case KeyEvent.VK_RIGHT:
+         tempSelection.set(selection);
+         tempSelection.nextBound(ctrl ? wordBreaker : charBreaker, +1, shift);
+         select(tempSelection);
+         break;
+     case KeyEvent.VK_LEFT:
+         tempSelection.set(selection);
+         tempSelection.nextBound(ctrl ? wordBreaker : charBreaker, -1, shift);
+         select(tempSelection);
+         break;
+     case KeyEvent.VK_UP: // LIU: Add support for up arrow
+         tempSelection.set(selection);
+         tempSelection.caret = lineDelta(tempSelection.caret, -1);
+         if (!shift) {
+             tempSelection.anchor = tempSelection.caret;
+         }
+         select(tempSelection);
+         break;
+     case KeyEvent.VK_DOWN: // LIU: Add support for down arrow
+         tempSelection.set(selection);
+         tempSelection.caret = lineDelta(tempSelection.caret, +1);
+         if (!shift) {
+             tempSelection.anchor = tempSelection.caret;
+         }
+         select(tempSelection);
+         break;
+     case KeyEvent.VK_DELETE: { // LIU: Add delete key support
+         if (!editable) break;
+         if (contents.length() == 0) break;
+         int start = selection.getStart();
+         int end = selection.getEnd();
+         if (start == end) {
+             // No selection: delete the character after the caret.
+             ++end;
+             if (end > contents.length()) {
+                 getToolkit().beep();
+                 return;
+             }
+         }
+         replaceRange("", start, end);
+         break;
+     }
+     }
+ }
+
+ /**
+  * LIU: Given an offset into contents, moves up or down by lines,
+  * according to lineStarts[], keeping the same distance from the
+  * start of the line where the target line is long enough.
+  * @param off the offset into contents
+  * @param delta how many lines to move up (&lt; 0) or down (&gt; 0)
+  * @return the new offset into contents
+  */
+ private int lineDelta(int off, int delta) {
+     int fromLine = findLine(off, false);
+     int column = off - lineStarts[fromLine];
+     int toLine = fromLine + delta;
+     if (toLine < 0) {
+         // Moving above the first line lands on the very first position.
+         toLine = 0;
+         column = 0;
+     } else if (toLine >= lineCount) {
+         // Moving below the last line lands on the end of the text.
+         return contents.length();
+     }
+     int result = lineStarts[toLine] + column;
+     int nextLineStart = lineStarts[toLine + 1];
+     // Keep the result inside the target line.
+     return (result >= nextLineStart) ? nextLineStart - 1 : result;
+ }
+
+ /** Only logs the event when DEBUG is on. */
+ public void keyReleased(KeyEvent e) {
+     int code = e.getKeyCode();
+     if (DEBUG) {
+         System.out.println("keyReleased "
+             + hex((char)code) + ", " + hex((char)e.getModifiers()));
+     }
+ }
+
+ /**
+  * Handles typed characters.  Nothing happens while Ctrl is held, for
+  * an undefined character, or while the component is not editable.
+  * Backspace deletes the selection or the character before the caret;
+  * any other character is passed to handleKeyTyped.
+  */
+ public void keyTyped(KeyEvent e) {
+     char ch = e.getKeyChar();
+     if (DEBUG) System.out.println("keyTyped "
+         + hex((char)ch) + ", " + hex((char)e.getModifiers()));
+     boolean ctrlDown = (e.getModifiers() & KeyEvent.CTRL_MASK) != 0;
+     if (ctrlDown || ch == KeyEvent.CHAR_UNDEFINED || !editable) {
+         return;
+     }
+     if (ch != KeyEvent.VK_BACK_SPACE) {
+         // LIU: Dispatch to subclass API
+         handleKeyTyped(e);
+         return;
+     }
+     if (contents.length() == 0) {
+         return;
+     }
+     int start = selection.getStart();
+     int end = selection.getEnd();
+     if (start == end) {
+         // No selection: delete the character before the caret.
+         --start;
+         if (start < 0) {
+             getToolkit().beep(); // LIU: Add audio feedback of NOP
+             return;
+         }
+     }
+     replaceRange("", start, end);
+ }
+
+ /**
+  * LIU: Subclass API for handling of key typing.  This implementation
+  * inserts the typed character in place of the selection.
+  */
+ protected void handleKeyTyped(KeyEvent e) {
+     String typed = String.valueOf(e.getKeyChar());
+     insertText(typed);
+ }
+
+// ===================== Control ======================
+
+ /** Turns keyboard editing of the text on or off. */
+ public synchronized void setEditable(boolean b) {
+     this.editable = b;
+ }
+
+ /** Tells whether the text can be edited from the keyboard. */
+ public boolean isEditable() {
+     return this.editable;
+ }
+
+ public void select(Selection newSelection) {
+ newSelection.pin(contents);
+ if (!selection.equals(newSelection)) {
+ selection.set(newSelection);
+ if (selectionListener != null) {
+ selectionListener.actionPerformed(
+ new ActionEvent(this, ActionEvent.ACTION_PERFORMED,
+ "Selection Changed", 0));
+ }
+ repaint(10);
+ valid = false;
+ }
+ }
+
+ /**
+  * Selects a range; same as the three-argument form with
+  * clickAfter false.
+  */
+ public void select(int start, int end) {
+     final boolean clickAfter = false;
+     select(start, end, clickAfter);
+ }
+
+ public void select(int start, int end, boolean clickAfter) {
+ tempSelection.set(start, end, clickAfter);
+ select(tempSelection);
+ }
+
+ public int getSelectionStart() {
+ return selection.getStart();
+ }
+
+ public int getSelectionEnd() {
+ return selection.getEnd();
+ }
+
+ /** Moves and resizes the component and marks the line layout as stale. */
+ public void setBounds(int x, int y, int w, int h) {
+     super.setBounds(x, y, w, h);
+     // A different width can change where lines wrap.
+     redoLines = true;
+ }
+
+ public Dimension getPreferredSize() {
+ return new Dimension(lastWidth,lastHeight);
+ }
+
+ public Dimension getMaximumSize() {
+ return new Dimension(lastWidth,lastHeight);
+ }
+
+ public Dimension getMinimumSize() {
+ return new Dimension(lastHeight,lastHeight);
+ }
+
+ /**
+  * Replaces the whole text and then pins the current selection to
+  * the new text.
+  */
+ public void setText(String text) {
+     setText2(text);
+     tempSelection.set(selection);
+     tempSelection.pin(contents);
+     select(tempSelection);
+ }
+
+ public void setText2(String text) {
+ contents = text;
+ charBreaker.setText(text);
+ wordBreaker.setText(text);
+ lineBreaker.setText(text);
+ redoLines = true;
+ if (textListener != null)
+ textListener.textValueChanged(
+ new TextEvent(this, TextEvent.TEXT_VALUE_CHANGED));
+ repaint(16);
+ }
+
+ public void insertText(String text) {
+ replaceRange(text, selection.getStart(), selection.getEnd());
+ }
+
+ public void replaceRange(String s, int start, int end) {
+ setText2(contents.substring(0,start) + s
+ + contents.substring(end));
+ select(tempSelection.set(selection).
+ fixAfterReplace(start, end, s.length()));
+ }
+
+ /** Returns the complete text of the component. */
+ public String getText() {
+     return this.contents;
+ }
+
+ /** Changes the font, marks the line layout as stale and schedules a redraw. */
+ public void setFont(Font font) {
+     super.setFont(font);
+     // Different metrics can change where lines wrap.
+     redoLines = true;
+     repaint(16);
+ }
+
+ // ================== Graphics ======================
+
+ /** Goes straight to paint(), which redraws the whole area itself. */
+ public void update(Graphics g) {
+     if (DEBUG) {
+         System.out.println("update");
+     }
+     paint(g);
+ }
+
+ public void paint(Graphics g) {
+ mySize = getSize();
+ if (cacheImage == null
+ || cacheImage.getHeight(this) != mySize.height
+ || cacheImage.getWidth(this) != mySize.width) {
+ cacheImage = createImage(mySize.width, mySize.height);
+ valid = false;
+ }
+ if (!valid || redoLines) {
+ if (DEBUG) System.out.println("painting");
+ paint2(cacheImage.getGraphics());
+ valid = true;
+ }
+ //getToolkit().sync();
+ if (DEBUG) System.out.println("copying");
+ g.drawImage(cacheImage,
+ 0, 0, mySize.width, mySize.height,
+ 0, 0, mySize.width, mySize.height,
+ this);
+ }
+
+ public void paint2(Graphics g) {
+ g.clearRect(0, 0, mySize.width, mySize.height);
+ if (DEBUG) System.out.println("print");
+ if (focus) g.setColor(Color.black);
+ else g.setColor(Color.gray);
+ g.drawRect(0,0,mySize.width-1,mySize.height-1);
+ g.setClip(1,1,
+ mySize.width-2,mySize.height-2);
+ g.setColor(Color.black);
+ g.setFont(getFont());
+ fm = g.getFontMetrics();
+ lineAscent = fm.getAscent();
+ lineLeading = fm.getLeading();
+ lineHeight = lineAscent + fm.getDescent() + lineLeading;
+ int y = yInset + lineAscent;
+ String lastSubstring = "";
+ if (redoLines) fixLineStarts(mySize.width-xInset-xInset);
+ for (int i = 0; i < lineCount; y += lineHeight, ++i) {
+ // LIU: Don't display terminating ^M characters
+ int lim = lineStarts[i+1];
+ if (lim > 0 && contents.length() > 0 &&
+ contents.charAt(lim-1) == CR) --lim;
+ lastSubstring = contents.substring(lineStarts[i],lim);
+ g.drawString(lastSubstring, xInset, y);
+ }
+ drawSelection(g, lastSubstring);
+ lastHeight = y + yInset - lineHeight + yInset;
+ lastWidth = mySize.width-xInset-xInset;
+ }
+
+ /** Draws a rectangle: filled with the focus, outlined without. */
+ void paintRect(Graphics g, int x, int y, int w, int h) {
+     if (!focus) {
+         g.drawRect(x, y, w-1, h-1);
+         return;
+     }
+     g.fillRect(x, y, w, h);
+ }
+
+ public void drawSelection(Graphics g, String lastSubstring) {
+ g.setXORMode(Color.black);
+ if (selection.isCaret()) {
+ offset2Point(selection.caret, selection.clickAfter, caretPoint);
+ } else {
+ if (focus) g.setColor(Color.blue);
+ else g.setColor(Color.yellow);
+ offset2Point(selection.getStart(), true, startPoint);
+ offset2Point(selection.getEnd(), false, endPoint);
+ if (selection.getStart() == selection.caret)
+ caretPoint.setLocation(startPoint);
+ else caretPoint.setLocation(endPoint);
+ if (startPoint.y == endPoint.y) {
+ paintRect(g, startPoint.x, startPoint.y,
+ Math.max(1,endPoint.x-startPoint.x), lineHeight);
+ } else {
+ paintRect(g, startPoint.x, startPoint.y,
+ (mySize.width-xInset)-startPoint.x, lineHeight);
+ if (startPoint.y + lineHeight < endPoint.y)
+ paintRect(g, xInset, startPoint.y + lineHeight,
+ (mySize.width-xInset)-xInset, endPoint.y - startPoint.y - lineHeight);
+ paintRect(g, xInset, endPoint.y, endPoint.x-xInset, lineHeight);
+ }
+ }
+ if (focus || selection.isCaret()) {
+ if (focus) g.setColor(Color.green);
+ else g.setColor(Color.red);
+ int line = caretPoint.x - (selection.clickAfter ? 0 : 1);
+ g.fillRect(line, caretPoint.y, 1, lineHeight);
+ int w = lineHeight/12 + 1;
+ int braces = line - (selection.clickAfter ? -1 : w);
+ g.fillRect(braces, caretPoint.y, w, 1);
+ g.fillRect(braces, caretPoint.y + lineHeight - 1, w, 1);
+ }
+ }
+
+ public Point offset2Point(int off, boolean start, Point p) {
+ int line = findLine(off, start);
+ int width = 0;
+ try {
+ width = fm.stringWidth(
+ contents.substring(lineStarts[line], off));
+ } catch (Exception e) {
+ System.out.println(e);
+ }
+ p.x = width + xInset;
+ if (p.x > mySize.width - xInset)
+ p.x = mySize.width - xInset;
+ p.y = lineHeight * line + yInset;
+ return p;
+ }
+
+ /**
+  * Finds the index of the line that contains a text offset.
+  *
+  * @param off the offset into contents
+  * @param start if true, an offset that lies exactly on a line
+  * boundary is counted as belonging to the following line
+  * @return index into lineStarts of the line found
+  */
+ private int findLine(int off, boolean start) {
+     // if it is start, then go to the next line!
+     int probe = start ? off + 1 : off;
+     for (int next = 1; next < lineCount; ++next) {
+         // LIU: This was <= ; changed to < to make caret after
+         // final CR in line appear at START of next line.
+         if (probe < lineStarts[next]) {
+             return next - 1;
+         }
+     }
+     // LIU: Check for special case; after CR at end of the last line
+     boolean afterFinalCR = probe == lineStarts[lineCount]
+         && probe > 0 && contents.length() > 0
+         && contents.charAt(probe-1) == CR;
+     return afterFinalCR ? lineCount : lineCount-1;
+ }
+
+ /**
+  * Converts a pixel position into the nearest text offset.
+  * Offsets on any line will go from start,true to end,false
+  * excluding start,false and end,true.
+  *
+  * Relies on the font metrics and line layout cached by paint2
+  * (fm, lineHeight, lineStarts, lineCount).
+  *
+  * @param p the position to look up
+  * @param o receives the result in its caret and clickAfter fields
+  * @return o
+  */
+ public Selection point2Offset(Point p, Selection o) {
+ // Above the first line: the very start of the text.
+ if (p.y < yInset) {
+ o.caret = 0;
+ o.clickAfter = true;
+ return o;
+ }
+ int line = (p.y - yInset)/lineHeight;
+ // Below the last line: the very end of the text.
+ if (line >= lineCount) {
+ o.caret = contents.length();
+ o.clickAfter = false;
+ return o;
+ }
+ int target = p.x - xInset;
+ // Left of the text: the start of the line.
+ if (target <= 0) {
+ o.caret = lineStarts[line];
+ o.clickAfter = true;
+ return o;
+ }
+ int lowGuess = lineStarts[line];
+ int lowWidth = 0;
+ int highGuess = lineStarts[line+1];
+ int highWidth = fm.stringWidth(contents.substring(lineStarts[line],highGuess));
+ // Right of the text: the end of the line.
+ if (target >= highWidth) {
+ o.caret = lineStarts[line+1];
+ o.clickAfter = false;
+ return o;
+ }
+ // Binary search for the last offset whose measured width from the
+ // start of the line does not exceed the target.
+ while (lowGuess < highGuess - 1) {
+ int guess = (lowGuess + highGuess)/2;
+ int width = fm.stringWidth(contents.substring(lineStarts[line],guess));
+ if (width <= target) {
+ lowGuess = guess;
+ lowWidth = width;
+ if (width == target) break;
+ } else {
+ highGuess = guess;
+ highWidth = width;
+ }
+ }
+ // at end, either lowWidth < target < width(low+1), or lowWidth = target
+ // following() leaves the iterator on highBound, so previous() then
+ // returns the boundary just before it; the two calls must stay in
+ // this order.
+ int highBound = charBreaker.following(lowGuess);
+ int lowBound = charBreaker.previous();
+ // we are now at character boundaries
+ if (lowBound != lowGuess)
+ lowWidth = fm.stringWidth(contents.substring(lineStarts[line],lowBound));
+ if (highBound != highGuess)
+ highWidth = fm.stringWidth(contents.substring(lineStarts[line],highBound));
+ // we now have the right widths
+ // Pick whichever boundary is closer to the target.
+ if (target - lowWidth < highWidth - target) {
+ o.caret = lowBound;
+ o.clickAfter = true;
+ } else {
+ o.caret = highBound;
+ o.clickAfter = false;
+ }
+ // we now have the closest!
+ return o;
+ }
+
+ /**
+  * Recomputes the line layout: fills lineStarts[] with the offset at
+  * which each display line begins, followed by the offset just past
+  * the last line, and sets lineCount.  At most MAX_LINES - 1 lines
+  * are laid out; text beyond that is not covered.  Uses the font
+  * metrics cached in fm and clears the redoLines flag.
+  *
+  * @param width the width available for text, in pixels
+  */
+ private void fixLineStarts(int width) {
+     lineCount = 1;
+     lineStarts[0] = 0;
+     if (contents.length() == 0) {
+         lineStarts[1] = 0;
+         // The layout of empty text is complete too; leaving the flag
+         // set would force a full redraw on every paint.
+         redoLines = false;
+         return;
+     }
+     int end = 0;
+     // LIU: Add check for MAX_LINES
+     for (int start = 0; start < contents.length() && lineCount < MAX_LINES;
+          start = end) {
+         end = nextLine(fm, start, width);
+         lineStarts[lineCount++] = end;
+         if (end == start) { // LIU: Assertion
+             throw new RuntimeException("nextLine broken");
+         }
+     }
+     // lineCount was advanced once per stored end offset, starting
+     // from 1, so it is one too high here.
+     --lineCount;
+     redoLines = false;
+ }
+
+ /**
+  * Finds where the display line that begins at <code>start</code>
+  * ends.  The line runs up to and including the next line separator
+  * (U+000A through U+000D, U+2028 or U+2029; a CR LF pair counts as
+  * one separator) or to the end of the text.  If that is wider than
+  * <code>width</code>, the line is cut at the last fitting line-break
+  * position, failing that at a word boundary, failing that at a
+  * character boundary.
+  * LIU: Enhanced to wrap long lines.  Bug with return of start fixed.
+  *
+  * @param fm metrics used to measure the text
+  * @param start offset into contents where the line begins
+  * @param width the width available, in pixels
+  * @return offset into contents where the next line begins
+  */
+ public int nextLine(FontMetrics fm, int start, int width) {
+     final int textLength = contents.length();
+     int len = textLength;
+     for (int i = start; i < textLength; ++i) {
+         // check for line separator
+         char ch = (contents.charAt(i));
+         if (ch >= 0x000A && ch <= 0x000D || ch == 0x2028 || ch == 0x2029) {
+             len = i + 1;
+             // crlf: compare against the length of the text, not against
+             // len, which has just been set to i + 1
+             if (ch == 0x000D && len < textLength
+                 && contents.charAt(len) == 0x000A) {
+                 ++len; // grab extra char
+             }
+             break;
+         }
+     }
+     String subject = contents.substring(start,len);
+     if (visibleWidth(fm, subject) <= width)
+         return len;
+
+     // LIU: Remainder of this method rewritten to accomodate lines
+     // longer than the component width by first trying to break
+     // into lines; then words; finally chars.
+     int n = findFittingBreak(fm, subject, width, lineBreaker);
+     if (n == 0) {
+         n = findFittingBreak(fm, subject, width, wordBreaker);
+     }
+     if (n == 0) {
+         n = findFittingBreak(fm, subject, width, charBreaker);
+     }
+     // If not even one character fits, take the whole line.
+     return n > 0 ? start + n : len;
+ }
+
+ /**
+  * LIU: Finds the longest substring that fits a given width
+  * composed of subunits returned by a BreakIterator.  If the smallest
+  * subunit is too long, returns 0.
+  *
+  * The iterator passed in is not modified: the search runs on a
+  * clone.  The callers pass the component's shared iterators, whose
+  * text must remain the full contents for selection handling.
+  *
+  * @param fm metrics to use
+  * @param line the string to be fit into width
+  * @param width line.substring(0, result) must be &lt;= width
+  * @param breaker the BreakIterator that will be used to find subunits
+  * @return maximum characters, at boundaries returned by breaker,
+  * that fit into width, or zero on failure
+  */
+ private int findFittingBreak(FontMetrics fm, String line, int width,
+                              BreakIterator breaker) {
+     BreakIterator scanner = (BreakIterator) breaker.clone();
+     scanner.setText(line);
+     int last = scanner.first();
+     int end = scanner.next();
+     while (end != BreakIterator.DONE &&
+            visibleWidth(fm, line.substring(0, end)) <= width) {
+         last = end;
+         end = scanner.next();
+     }
+     return last;
+ }
+
+ /**
+  * Measures a string without its trailing spaces and line separators
+  * (U+000A through U+000D, U+2028, U+2029).
+  *
+  * @param fm metrics used to measure the text
+  * @param s the string to measure
+  * @return the width of s up to its last visible character, or 0 if
+  * it has none
+  */
+ public int visibleWidth(FontMetrics fm, String s) {
+     int end = s.length();
+     while (end > 0) {
+         char ch = s.charAt(end - 1);
+         boolean invisible = ch == ' '
+             || (ch >= 0x000A && ch <= 0x000D)
+             || ch == 0x2028 || ch == 0x2029;
+         if (!invisible) {
+             return fm.stringWidth(s.substring(0, end));
+         }
+         --end;
+     }
+     return 0;
+ }
+
+// =============== Utility ====================
+
+ /**
+  * Converts between hex digits and characters at the end of the
+  * selection (invoked for Ctrl-Q by keyPressed).
+  *
+  * Up to eight characters before the selection end are read
+  * backwards as hexadecimal digits.  Depending on the value read:
+  * eight digits forming a high and a low surrogate are replaced by
+  * those two characters; a value from 0x10000 to 0x10FFFF is replaced
+  * by the surrogate pair that encodes it; otherwise, if at least four
+  * digits were read, the last four are replaced by the single
+  * character they name.  With fewer than four digits the conversion
+  * runs the other way: the last character (or surrogate pair) is
+  * replaced by its four-digit hex code(s).
+  */
+ private void fixHex() {
+ if (selection.getEnd() == 0) return;
+ int store = 0;
+ int places = 1;
+ int count = 0;
+ int min = Math.min(8,selection.getEnd());
+ // Read digits right to left; stop at the first non-hex character.
+ for (int i = 0; i < min; ++i) {
+ char ch = contents.charAt(selection.getEnd()-1-i);
+ int value = Character.getNumericValue(ch);
+ if (value < 0 || value > 15) break;
+ store += places * value;
+ ++count;
+ places *= 16;
+ }
+ String add = "";
+ int bottom = store & 0xFFFF;
+ // With eight digits store may have wrapped to a negative int.  The
+ // two bounds below are negative int literals as well, so this signed
+ // comparison still selects the bit patterns D8000000 to DBFFFFFF.
+ if (store >= 0xD8000000 && store < 0xDC000000
+ && bottom >= 0xDC00 && bottom < 0xE000) { // surrogates
+ add = "" + (char)(store >> 16) + (char)bottom;
+ } else if (store > 0xFFFF && store <= 0x10FFFF) {
+ store -= 0x10000;
+ add = "" + (char)(((store >> 10) & 0x3FF) + 0xD800)
+ + (char)((store & 0x3FF) + 0xDC00);
+
+ } else if (count >= 4) {
+ // Only the last four digits are consumed.
+ count = 4;
+ add = ""+(char)(store & 0xFFFF);
+ } else {
+ // Too few digits: turn the preceding character into hex instead.
+ count = 1;
+ char ch = contents.charAt(selection.getEnd()-1);
+ add = hex(ch);
+ if (ch >= 0xDC00 && ch <= 0xDFFF && selection.getEnd() > 1) {
+ ch = contents.charAt(selection.getEnd()-2);
+ if (ch >= 0xD800 && ch <= 0xDBFF) {
+ count = 2;
+ add = hex(ch) + add;
+ }
+ }
+ }
+ // count is the number of characters before the selection end that
+ // are being replaced.
+ replaceRange(add, selection.getEnd()-count, selection.getEnd());
+ }
+
+ /**
+  * Formats a character code as exactly four uppercase hexadecimal
+  * digits, zero-padded on the left.
+  */
+ public static String hex(char ch) {
+     String digits = Integer.toString(ch,16).toUpperCase();
+     StringBuffer padded = new StringBuffer(4);
+     for (int n = digits.length(); n < 4; ++n) {
+         padded.append('0');
+     }
+     padded.append(digits);
+     return padded.toString();
+ }
+}
diff --git a/icu4j/src/com/ibm/text/components/Selection.java b/icu4j/src/com/ibm/text/components/Selection.java
new file mode 100755
index 00000000000..985b36f3521
--- /dev/null
+++ b/icu4j/src/com/ibm/text/components/Selection.java
@@ -0,0 +1,155 @@
+package com.ibm.text.components;
+import java.text.*;
+
+/**
+ * A mutable selection within a text buffer, held as two offsets.
+ * {@link #anchor} is left alone while a selection is being extended and
+ * {@link #caret} is the end that moves, so either may be the smaller one;
+ * use {@link #getStart()} and {@link #getEnd()} for the ordered bounds.
+ * Mutators return <code>this</code> so that calls can be chained.
+ */
+public final class Selection {
+
+    /** The end of the selection that stays put while the selection is extended. */
+    public int anchor;
+
+    /** The end of the selection that moves; equal to {@link #anchor} for a bare caret. */
+    public int caret;
+
+    /**
+     * Extra caret state carried along with the two offsets.
+     * NOTE(review): presumably records which side of the offset the caret
+     * attaches to -- confirm against the component that draws the caret.
+     */
+    public boolean clickAfter;
+
+    /** Returns the smaller of the two offsets. */
+    public int getStart() {
+        return anchor < caret ? anchor : caret;
+    }
+
+    /** Returns the larger of the two offsets. */
+    public int getEnd() {
+        return anchor > caret ? anchor : caret;
+    }
+
+    /** Returns true if the selection is empty, i.e. both offsets coincide. */
+    public boolean isCaret() {
+        return anchor == caret;
+    }
+
+    /**
+     * Copies the state of another selection into this one.
+     * @param other the selection to copy from
+     * @return this selection
+     */
+    public Selection set(Selection other) {
+        anchor = other.anchor;
+        caret = other.caret;
+        clickAfter = other.clickAfter;
+        return this;
+    }
+
+    /**
+     * Sets all three fields at once.
+     * @return this selection
+     */
+    public Selection set(int anchor, int caret, boolean clickAfter) {
+        this.anchor = anchor;
+        this.caret = caret;
+        this.clickAfter = clickAfter;
+        return this;
+    }
+
+    /**
+     * Two selections are equal when anchor, caret and clickAfter all match.
+     * Any other object, including null, is simply unequal (the previous
+     * implementation threw on such arguments).
+     */
+    public boolean equals(Object other) {
+        if (this == other) {
+            return true;
+        }
+        if (!(other instanceof Selection)) {
+            return false;
+        }
+        Selection that = (Selection) other;
+        return anchor == that.anchor
+            && caret == that.caret
+            && clickAfter == that.clickAfter;
+    }
+
+    /**
+     * Hash code consistent with {@link #equals(Object)}. The object is
+     * mutable, so the value changes whenever a field changes.
+     */
+    public int hashCode() {
+        int hash = 31 * anchor + caret;
+        return 31 * hash + (clickAfter ? 1 : 0);
+    }
+
+    /** Returns true if this selection starts before the other one ends. */
+    public boolean isLessThan(Selection other) {
+        return getStart() < other.getEnd();
+    }
+
+    /**
+     * Clamps both offsets into the range 0..text.length(). When the caret
+     * has to be clamped, clickAfter is set to true at the end of the text
+     * and to false at the start.
+     * @param text the text the selection refers to
+     * @return this selection
+     */
+    public Selection pin(String text) {
+        int length = text.length();
+        if (anchor > length) {
+            anchor = length;
+        } else if (anchor < 0) {
+            anchor = 0;
+        }
+        if (caret > length) {
+            caret = length;
+            clickAfter = true;
+        } else if (caret < 0) {
+            caret = 0;
+            clickAfter = false;
+        }
+        return this;
+    }
+
+    /**
+     * Exchanges the complete state of this selection with another one.
+     * @param after the selection to trade state with; it is modified too
+     * @return this selection
+     */
+    public Selection swap(Selection after) {
+        int temp = anchor;
+        anchor = after.anchor;
+        after.anchor = temp;
+        temp = caret;
+        caret = after.caret;
+        after.caret = temp;
+        boolean b = clickAfter;
+        clickAfter = after.clickAfter;
+        after.clickAfter = b;
+        return this;
+    }
+
+    /**
+     * Adjusts the offsets after the text range start..end has been
+     * replaced by <code>len</code> characters. An offset before
+     * <code>start</code> is untouched; an offset inside the replaced range
+     * moves to the end of the new text; an offset at or after
+     * <code>end</code> is shifted by the change in length.
+     * @return this selection
+     */
+    public Selection fixAfterReplace(int start, int end, int len) {
+        if (anchor >= start) {
+            if (anchor < end) {
+                anchor = end;
+            }
+            anchor = start + len + anchor - end;
+        }
+        if (caret >= start) {
+            if (caret < end) {
+                caret = end;
+            }
+            caret = start + len + caret - end;
+        }
+        return this;
+    }
+
+    // Mac & Windows considerably different
+    // Mac: end++. If start!=end, start=end
+    //  SHIFT: move end right
+    //  CTL: no different
+    // Windows:
+    //  UNSHIFTED: if start!=end, start = end, else start=end=end+1;
+    //      anchor = tip = start
+    //  SHIFT: tip++
+    //  CTL: if start!=end, start = end = nextbound(end-1),
+    //      else start=end=nextbound(end)
+    //      anchor = tip = start
+    //  CTL/SHIFT: tip = nextbound(tip)
+
+    /**
+     * Moves the caret to the neighbouring boundary in the given direction.
+     * @param breaker supplies the boundaries; its text must already be set
+     * @param direction positive to move forward, otherwise backward
+     * @param extend if true the anchor is kept, so the selection grows or
+     * shrinks; if false the selection collapses onto the new caret
+     * @return this selection
+     */
+    public Selection nextBound(BreakIterator breaker,
+                               int direction, boolean extend) {
+        if (!extend && anchor != caret) {
+            // Collapsing a non-empty selection: step back first so that the
+            // search below does not skip a boundary.
+            caret -= direction;
+        }
+        caret = next(caret, breaker, direction, true);
+        if (!extend) {
+            anchor = caret;
+        }
+        clickAfter = false;
+        return this;
+    }
+
+    /**
+     * Expands both ends outward to boundaries of the given breaker; an
+     * end that is already on a boundary stays where it is.
+     * @param breaker supplies the boundaries; its text must already be set
+     */
+    public void expand(BreakIterator breaker) {
+        if (anchor <= caret) {
+            anchor = next(anchor, breaker, -1, false);
+            caret = next(caret, breaker, 1, false);
+        } else {
+            anchor = next(anchor, breaker, 1, false);
+            caret = next(caret, breaker, -1, false);
+        }
+    }
+
+    /**
+     * Finds a boundary next to a position.
+     * @param position the offset to start from
+     * @param breaker supplies the boundaries; its text must already be set
+     * @param direction positive to search forward, otherwise backward
+     * @param different false - move to next boundary, unless on one;
+     * true - move to next boundary, even if on one
+     * @return the boundary found. If the breaker rejects the lookup the
+     * original <code>position</code> is returned unchanged.
+     */
+    public static int next(int position, BreakIterator breaker,
+                           int direction, boolean different) {
+        // Backing up one step lets a search that starts on a boundary find
+        // that same boundary again.
+        int probe = different ? position : position - direction;
+        try {
+            if (direction > 0) {
+                return breaker.following(probe);
+            }
+            breaker.following(probe - 1);
+            return breaker.previous();
+        } catch (Exception ignored) {
+            // Best effort, as before: the breaker refused the offset (for
+            // instance because the probe fell outside the text). Hand back
+            // the caller's own position rather than the shifted probe,
+            // which could be -1 or one past the end.
+            return position;
+        }
+    }
+}
+
diff --git a/icu4j/src/com/ibm/text/components/TransliteratingTextComponent.java b/icu4j/src/com/ibm/text/components/TransliteratingTextComponent.java
new file mode 100755
index 00000000000..02bcd5996a5
--- /dev/null
+++ b/icu4j/src/com/ibm/text/components/TransliteratingTextComponent.java
@@ -0,0 +1,191 @@
+package com.ibm.text.components;
+
+import java.awt.*;
+import java.awt.event.*;
+import java.text.*;
+import java.awt.datatransfer.*;
+import com.ibm.text.*;
+
+/**
+ * A subclass of {@link DumbTextComponent} that passes key events through
+ * a {@link com.ibm.text.Transliterator}.
+ *
+ * <p>Copyright &copy; IBM Corporation 1999. All rights reserved.
+ *
+ * @author Alan Liu
+ * @version $RCSfile: TransliteratingTextComponent.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
+ */
+public class TransliteratingTextComponent extends DumbTextComponent {
+
+    // Set to true to print a trace of every transliterated keystroke.
+    private static final boolean DEBUG = false;
+
+    // The transliterator applied to typed characters, or null for none.
+    private Transliterator translit = null;
+
+    // Index into getText() where the start of transliteration is.
+    // As we commit text during keyboardTransliteration, we advance
+    // this.
+    private int start = 0;
+
+    // Index into getText() where the cursor is; cursor >= start
+    private int cursor = 0;
+
+    private static final String COPYRIGHT =
+        "\u00A9 IBM Corporation 1999. All rights reserved.";
+
+    /**
+     * Constructor. Registers a listener that restarts transliteration
+     * whenever the selection changes.
+     */
+    public TransliteratingTextComponent() {
+        super();
+        addActionListener(new ActionListener() {
+            public void actionPerformed(ActionEvent e) {
+                // We get an ActionEvent only when the selection changes
+                resetTransliterationStart();
+            }
+        });
+    }
+
+    /**
+     * {@link DumbTextComponent} API. Framework method that is called
+     * when a <code>KeyEvent</code> is received. This implementation
+     * runs the new character through the current
+     * <code>Transliterator</code>, if one is set, and inserts the
+     * transliterated text into the buffer. Without a transliterator the
+     * event is passed on to the superclass unchanged.
+     *
+     * @param e the key-typed event
+     */
+    protected void handleKeyTyped(KeyEvent e) {
+        char ch = e.getKeyChar();
+
+        if (translit == null) {
+            super.handleKeyTyped(e);
+            return;
+        }
+
+        // ------------------------------------------------------------
+        // The following case motivates the two lines that recompute
+        // start and cursor below.
+
+        //      "     "
+        // a b c q r|s t u m m
+        // 0 1 2 3 4 5 6 7 8 9
+        //       0 1 2
+
+        // start 3, cursor 5, sel 6 -> { 0, 3, 2 }
+        // : new int[] { 0, sel - start, cursor - start };
+
+        // sz>99|9
+
+        //      "     {   "
+        // a b c q r 9 9|9 t u m m
+        // 0 1 2 3 4 5 6 7 8 9 a b
+        //       0 1 2 3 4
+
+        // { 3, 5, 4 } -> start 6, cursor 7, sel 8
+        // : start += index[0];
+        // : cursor = start + index[2] - index[0];
+        // ------------------------------------------------------------
+
+        // Need to save start because calls to replaceRange will update
+        // start and cursor.
+        int saveStart = start;
+
+        // Only the text between the transliteration start and the
+        // selection start is handed to the transliterator.
+        ReplaceableString buf = new ReplaceableString();
+        buf.getStringBuffer().append(getText().substring(start,
+                                                         getSelectionStart()));
+
+        // Offsets relative to buf, i.e. zero-based.
+        int[] index = new int[] { 0, getSelectionStart() - start,
+                                  cursor - start};
+
+        StringBuffer log = null;
+        if (DEBUG) {
+            log = new StringBuffer();
+            log.append("start " + start + ", cursor " + cursor);
+            log.append(", sel " + getSelectionStart());
+            log.append(", {" + index[0] + ", " + index[1] + ", " + index[2] + "}, ");
+            log.append('"' + buf.toString() + "\" + '" + ch + "' -> \"");
+        }
+
+        translit.keyboardTransliterate(buf, index, ch);
+        replaceRange(buf.toString(), start, getSelectionEnd());
+        // At this point start has been changed by the callback to
+        // resetTransliterationStart() via replaceRange() -- so use our
+        // local copy, saveStart.
+
+        // The START index is zero-based. On entry to keyboardTransliterate(),
+        // it was zero. We can therefore just add it to our original
+        // getText()-based index value of start (in saveStart) to get
+        // the new getText()-based start.
+        start = saveStart + index[Transliterator.START];
+
+        // Make the cursor getText()-based. The CURSOR index is zero-based.
+        cursor = start + index[Transliterator.CURSOR]
+                       - index[Transliterator.START];
+
+        if (DEBUG) {
+            String out = buf.toString();
+            log.append(out.substring(0, index[Transliterator.START])).
+                append('{').
+                append(out.substring(index[Transliterator.START],
+                                     index[Transliterator.CURSOR])).
+                append('|').
+                append(out.substring(index[Transliterator.CURSOR])).
+                append('"');
+            log.append(", {" + index[0] + ", " + index[1] + ", " + index[2] + "}, ");
+            log.append("start " + start + ", cursor " + cursor);
+            log.append(", sel " + getSelectionStart());
+            System.out.println(escape(log.toString()));
+        }
+    }
+
+    /**
+     * Set the {@link com.ibm.text.Transliterator} and direction to
+     * use to process incoming <code>KeyEvent</code>s.
+     * @param t the {@link com.ibm.text.Transliterator} to use
+     */
+    public void setTransliterator(Transliterator t) {
+        if (translit != t) { // [sic] pointer compare ok; singletons
+            resetTransliterationStart();
+        }
+        translit = t;
+    }
+
+    /**
+     * Reset the start point at which transliteration begins. This
+     * needs to be done when the user moves the cursor or when the
+     * current {@link com.ibm.text.Transliterator} is changed.
+     */
+    private void resetTransliterationStart() {
+        start = getSelectionStart();
+        cursor = start;
+    }
+
+    /**
+     * Escape non-ASCII characters as Unicode.
+     * JUST FOR DEBUGGING OUTPUT.
+     * Characters from space through U+007F are copied as they are, except
+     * that a backslash is doubled; every other character becomes a
+     * backslash, 'u' and four lower-case hexadecimal digits.
+     *
+     * @param s the string to escape
+     * @return the escaped string
+     */
+    public static final String escape(String s) {
+        StringBuffer buf = new StringBuffer();
+        for (int i=0; i<s.length(); ++i) {
+            char c = s.charAt(i);
+            if (c >= ' ' && c <= 0x007F) {
+                if (c == '\\') {
+                    buf.append("\\\\"); // That is, "\\"
+                } else {
+                    buf.append(c);
+                }
+            } else {
+                buf.append("\\u");
+                // Left-pad to four digits; toHexString drops leading zeros.
+                if (c < 0x1000) {
+                    buf.append('0');
+                    if (c < 0x100) {
+                        buf.append('0');
+                        if (c < 0x10) {
+                            buf.append('0');
+                        }
+                    }
+                }
+                buf.append(Integer.toHexString(c));
+            }
+        }
+        return buf.toString();
+    }
+}
diff --git a/icu4j/src/com/ibm/text/resources/TransliterationRule$KeyboardEscape$Latin1.java b/icu4j/src/com/ibm/text/resources/TransliterationRule$KeyboardEscape$Latin1.java
new file mode 100755
index 00000000000..fa9a89b2d60
--- /dev/null
+++ b/icu4j/src/com/ibm/text/resources/TransliterationRule$KeyboardEscape$Latin1.java
@@ -0,0 +1,132 @@
+package com.ibm.text.resources;
+
+import java.util.ListResourceBundle;
+
+/**
+ * Resource bundle holding the rule text of a keyboard transliterator for
+ * the Latin-1 block. Each rule turns a short ASCII key sequence followed
+ * by the {esc} character into one Latin-1 character.
+ */
+public class TransliterationRuleKeyboardEscapeLatin1 extends ListResourceBundle {
+    /**
+     * Overrides ListResourceBundle.
+     * @return a "Description" entry and a "Rule" entry; the rule text is
+     * one rule or variable definition per line
+     */
+    public Object[][] getContents() {
+        return new Object[][] {
+            { "Description",
+              "Keyboard transliterator for Latin-1 block" },
+
+            { "Rule",
+              // Variable definitions, referenced below as {name}
+              "esc=''\n"
+              + "grave=`\n"
+              + "acute=''\n"
+              + "hat=^\n"
+              + "tilde=~\n"
+              + "umlaut=:\n"
+              + "ring=.\n"
+              + "cedilla=,\n"
+              + "slash=/\n"
+              + "super=^\n"
+
+              // Make keyboard entry of {esc} possible
+              // and of backslash
+              + "'\\'{esc}>{esc}\n"
+              + "'\\\\'>'\\'\n"
+
+              // Long keys
+              + "cur{esc}>\u00A4\n"
+              + "sec{esc}>\u00A7\n"
+              + "not{esc}>\u00AC\n"
+              + "mul{esc}>\u00D7\n"
+              + "div{esc}>\u00F7\n"
+
+              + " {esc}>\u00A0\n" // non-breaking space
+              + "!{esc}>\u00A1\n" // inverted exclamation
+              + "c/{esc}>\u00A2\n" // cent sign
+              + "lb{esc}>\u00A3\n" // pound sign
+              + "'|'{esc}>\u00A6\n" // broken vertical bar
+              + ":{esc}>\u00A8\n" // umlaut
+              + "{super}a{esc}>\u00AA\n" // feminine ordinal
+              + "'<<'{esc}>\u00AB\n"
+              + "r{esc}>\u00AE\n"
+              + "--{esc}>\u00AF\n"
+              + "-{esc}>\u00AD\n"
+              + "+-{esc}>\u00B1\n"
+              + "{super}2{esc}>\u00B2\n"
+              + "{super}3{esc}>\u00B3\n"
+              + "{acute}{esc}>\u00B4\n"
+              + "m{esc}>\u00B5\n"
+              + "para{esc}>\u00B6\n"
+              + "dot{esc}>\u00B7\n"
+              + "{cedilla}{esc}>\u00B8\n"
+              + "{super}1{esc}>\u00B9\n"
+              + "{super}o{esc}>\u00BA\n" // masculine ordinal
+              + "'>>'{esc}>\u00BB\n"
+              + "1/4{esc}>\u00BC\n"
+              + "1/2{esc}>\u00BD\n"
+              + "3/4{esc}>\u00BE\n"
+              + "?{esc}>\u00BF\n"
+              + "A{grave}{esc}>\u00C0\n"
+              + "A{acute}{esc}>\u00C1\n"
+              + "A{hat}{esc}>\u00C2\n"
+              + "A{tilde}{esc}>\u00C3\n"
+              + "A{umlaut}{esc}>\u00C4\n"
+              + "A{ring}{esc}>\u00C5\n"
+              + "AE{esc}>\u00C6\n"
+              + "C{cedilla}{esc}>\u00C7\n"
+              + "E{grave}{esc}>\u00C8\n"
+              + "E{acute}{esc}>\u00C9\n"
+              + "E{hat}{esc}>\u00CA\n"
+              + "E{umlaut}{esc}>\u00CB\n"
+              + "I{grave}{esc}>\u00CC\n"
+              + "I{acute}{esc}>\u00CD\n"
+              + "I{hat}{esc}>\u00CE\n"
+              + "I{umlaut}{esc}>\u00CF\n"
+              + "D-{esc}>\u00D0\n"
+              + "N{tilde}{esc}>\u00D1\n"
+              + "O{grave}{esc}>\u00D2\n"
+              + "O{acute}{esc}>\u00D3\n"
+              + "O{hat}{esc}>\u00D4\n"
+              + "O{tilde}{esc}>\u00D5\n"
+              + "O{umlaut}{esc}>\u00D6\n"
+              + "O{slash}{esc}>\u00D8\n"
+              + "U{grave}{esc}>\u00D9\n"
+              + "U{acute}{esc}>\u00DA\n"
+              + "U{hat}{esc}>\u00DB\n"
+              + "U{umlaut}{esc}>\u00DC\n"
+              + "Y{acute}{esc}>\u00DD\n"
+              + "TH{esc}>\u00DE\n"
+              + "ss{esc}>\u00DF\n"
+              + "a{grave}{esc}>\u00E0\n"
+              + "a{acute}{esc}>\u00E1\n"
+              + "a{hat}{esc}>\u00E2\n"
+              + "a{tilde}{esc}>\u00E3\n"
+              + "a{umlaut}{esc}>\u00E4\n"
+              + "a{ring}{esc}>\u00E5\n"
+              + "ae{esc}>\u00E6\n"
+              + "c{cedilla}{esc}>\u00E7\n"
+              + "c{esc}>\u00A9\n" // copyright - after c{cedilla}
+              + "e{grave}{esc}>\u00E8\n"
+              + "e{acute}{esc}>\u00E9\n"
+              + "e{hat}{esc}>\u00EA\n"
+              + "e{umlaut}{esc}>\u00EB\n"
+              + "i{grave}{esc}>\u00EC\n"
+              + "i{acute}{esc}>\u00ED\n"
+              + "i{hat}{esc}>\u00EE\n"
+              + "i{umlaut}{esc}>\u00EF\n"
+              + "d-{esc}>\u00F0\n"
+              + "n{tilde}{esc}>\u00F1\n"
+              + "o{grave}{esc}>\u00F2\n"
+              + "o{acute}{esc}>\u00F3\n"
+              + "o{hat}{esc}>\u00F4\n"
+              + "o{tilde}{esc}>\u00F5\n"
+              + "o{umlaut}{esc}>\u00F6\n"
+              + "o{slash}{esc}>\u00F8\n"
+              + "o{esc}>\u00B0\n"
+              + "u{grave}{esc}>\u00F9\n"
+              + "u{acute}{esc}>\u00FA\n"
+              + "u{hat}{esc}>\u00FB\n"
+              + "u{umlaut}{esc}>\u00FC\n"
+              + "y{acute}{esc}>\u00FD\n"
+              + "y{esc}>\u00A5\n" // yen sign
+              + "th{esc}>\u00FE\n"
+              // y with diaeresis. This rule used to repeat the key
+              // "ss{esc}", which is already taken by U+00DF above, so
+              // U+00FF could never be typed.
+              + "y{umlaut}{esc}>\u00FF\n"
+            }
+        };
+    }
+}
diff --git a/icu4j/src/com/ibm/text/resources/TransliterationRule$Latin$Arabic.java b/icu4j/src/com/ibm/text/resources/TransliterationRule$Latin$Arabic.java
new file mode 100755
index 00000000000..bb96443d051
--- /dev/null
+++ b/icu4j/src/com/ibm/text/resources/TransliterationRule$Latin$Arabic.java
@@ -0,0 +1,243 @@
+package com.ibm.text.resources;
+
+import java.util.ListResourceBundle;
+
+public class TransliterationRuleLatinArabic extends ListResourceBundle { // resource bundle carrying transliteration rule text
+ /**
+ * Overrides ListResourceBundle: supplies a "HasInverse" entry and the "Rule" text.
+ */
+ public Object[][] getContents() {
+ return new Object[][] {
+ { "HasInverse", "1" },
+
+ { "Rule",
+ // To Do: finish adding shadda, add sokoon
+
+ "alefmadda=\u0622\n"+ // name=character definitions; the rules below refer to them as {name}
+ "alefuhamza=\u0623\n"+
+ "wauuhamza=\u0624\n"+
+ "alefhamza=\u0625\n"+
+ "yehuhamza=\u0626\n"+
+ "alef=\u0627\n"+
+ "beh=\u0628\n"+
+ "tehmarbuta=\u0629\n"+
+ "teh=\u062A\n"+
+ "theh=\u062B\n"+
+ "geem=\u062C\n"+
+ "hah=\u062D\n"+
+ "kha=\u062E\n"+
+ "dal=\u062F\n"+
+ "dhal=\u0630\n"+
+ "reh=\u0631\n"+
+ "zain=\u0632\n"+
+ "seen=\u0633\n"+
+ "sheen=\u0634\n"+
+ "sad=\u0635\n"+
+ "dad=\u0636\n"+
+ "tah=\u0637\n"+
+ "zah=\u0638\n"+
+ "ein=\u0639\n"+
+ "ghein=\u063A\n"+
+ "feh=\u0641\n"+
+ "qaaf=\u0642\n"+
+ "kaf=\u0643\n"+
+ "lam=\u0644\n"+
+ "meem=\u0645\n"+
+ "noon=\u0646\n"+
+ "heh=\u0647\n"+
+ "wau=\u0648\n"+
+ "yehmaqsura=\u0649\n"+
+ "yeh=\u064A\n"+
+ "peh=\u06A4\n"+
+
+ "hamza=\u0621\n"+
+ "fathatein=\u064B\n"+
+ "dammatein=\u064C\n"+
+ "kasratein=\u064D\n"+
+ "fatha=\u064E\n"+
+ "damma=\u064F\n"+
+ "kasra=\u0650\n"+
+ "shadda=\u0651\n"+
+ "sokoon=\u0652\n"+
+
+ // convert English to Arabic
+ "Arabic>"+
+ "\u062a\u062a\u0645\u062a\u0639\u0020"+
+ "\u0627\u0644\u0644\u063a\u0629\u0020"+
+ "\u0627\u0644\u0639\u0631\u0628\u0628\u064a\u0629\u0020"+
+ "\u0628\u0628\u0646\u0638\u0645\u0020"+
+ "\u0643\u062a\u0627\u0628\u0628\u064a\u0629\u0020"+
+ "\u062c\u0645\u064a\u0644\u0629\n"+
+
+ "ai>{alefmadda}\n"+
+ "ae>{alefuhamza}\n"+
+ "ao>{alefhamza}\n"+
+ "aa>{alef}\n"+
+ "an>{fathatein}\n"+
+ "a>{fatha}\n"+
+ "b>{beh}\n"+
+ "c>{kaf}\n"+
+ "{dhal}]dh>{shadda}\n"+
+ "dh>{dhal}\n"+
+ "{dad}]dd>{shadda}\n"+
+ "dd>{dad}\n"+
+ "{dal}]d>{shadda}\n"+
+ "d>{dal}\n"+
+ "e>{ein}\n"+
+ "f>{feh}\n"+
+ "gh>{ghein}\n"+
+ "g>{geem}\n"+
+ "hh>{hah}\n"+
+ "h>{heh}\n"+
+ "ii>{kasratein}\n"+
+ "i>{kasra}\n"+
+ "j>{geem}\n"+
+ "kh>{kha}\n"+
+ "k>{kaf}\n"+
+ "l>{lam}\n"+
+ "m>{meem}\n"+
+ "n>{noon}\n"+
+ "o>{hamza}\n"+
+ "p>{peh}\n"+
+ "q>{qaaf}\n"+
+ "r>{reh}\n"+
+ "sh>{sheen}\n"+
+ "ss>{sad}\n"+
+ "s>{seen}\n"+
+ "th>{theh}\n"+
+ "tm>{tehmarbuta}\n"+
+ "tt>{tah}\n"+
+ "t>{teh}\n"+
+ "uu>{dammatein}\n"+
+ "u>{damma}\n"+
+ "v>{beh}\n"+
+ "we>{wauuhamza}\n"+
+ "w>{wau}\n"+
+ "x>{kaf}{shadda}{seen}\n"+
+ "ye>{yehuhamza}\n"+
+ "ym>{yehmaqsura}\n"+
+ "y>{yeh}\n"+
+ "zz>{zah}\n"+
+ "z>{zain}\n"+
+
+ "0>\u0660\n"+ // Arabic digit 0
+ "1>\u0661\n"+ // Arabic digit 1
+ "2>\u0662\n"+ // Arabic digit 2
+ "3>\u0663\n"+ // Arabic digit 3
+ "4>\u0664\n"+ // Arabic digit 4
+ "5>\u0665\n"+ // Arabic digit 5
+ "6>\u0666\n"+ // Arabic digit 6
+ "7>\u0667\n"+ // Arabic digit 7
+ "8>\u0668\n"+ // Arabic digit 8
+ "9>\u0669\n"+ // Arabic digit 9
+ "%>\u066A\n"+ // Arabic %
+ ".>\u066B\n"+ // Arabic decimal separator
+ ",>\u066C\n"+ // Arabic thousands separator
+ "*>\u066D\n"+ // Arabic five-pointed star
+
+ "`0>0\n"+ // Escaped forms of the above
+ "`1>1\n"+
+ "`2>2\n"+
+ "`3>3\n"+
+ "`4>4\n"+
+ "`5>5\n"+
+ "`6>6\n"+
+ "`7>7\n"+
+ "`8>8\n"+
+ "`9>9\n"+
+ "`%>%\n"+
+ "`.>.\n"+
+ "`,>,\n"+
+ "`*>*\n"+
+ "``>`\n"+
+
+ "''>\n"+
+
+ // now Arabic to English -- NOTE(review): the next rule line looks damaged: it starts like an Arabic-to-Latin rule but continues with Cyrillic text, and the {cy...}, {lower} and single-letter names such as {a} used from here on have no definition in this string. Text appears to be missing; verify against the original rule files.
+
+ "''ai\u041f\u0420\u0410\u0412\u0414\u0410\u00D1\u0020\u0411\u044d\u043b\u0430\u0440\u0443\u0441\u043a\u0430\u044f\u002c\u0020\u043a\u044b\u0440\u0433\u044b\u0437\u002c\u0020\u041c\u043e\u043b\u0434\u043e\u0432\u044d\u043d\u044f\u0441\u043a\u044d\u002e\n" +
+
+ //special equivs for ay, oy, ...
+ "Y{a}{i}>{cyYa}{cyY}\n" +
+ "Y{e}{i}>{cyYe}{cyY}\n" +
+ "Y{i}{i}>{cyYi}{cyY}\n" +
+ "Y{o}{i}>{cyYo}{cyY}\n" +
+ "Y{u}{i}>{cyYu}{cyY}\n" +
+ "A{i}>{cyA}{cyY}\n" +
+ "E{i}>{cyE}{cyY}\n" +
+ //skip II, since it is the soft sign
+ "O{i}>{cyO}{cyY}\n" +
+ "U{i}>{cyU}{cyY}\n" +
+
+ "A>{cyA}\n" +
+ "B>{cyBe}\n" +
+ "C{h}>{cyChe}\n" +
+ "C[{iey}>{cySe}\n" +
+ "C>{cyKe}\n" +
+ "D>{cyDe}\n" +
+ "E>{cyE}\n" +
+ "F>{cyFe}\n" +
+ "G>{cyGe}\n" +
+ "H>{cyHard}\n" +
+ "I{i}>{cySoft}\n" +
+ "I>{cyI}\n" +
+ "J>{cyDe}{cyZhe}\n" +
+ "K{h}>{cyKhe}\n" +
+ "K>{cyKe}\n" +
+ "L>{cyLe}\n" +
+ "M>{cyMe}\n" +
+ "N>{cyNe}\n" +
+ "O>{cyO}\n" +
+ "P>{cyPe}\n" +
+ "Q{u}>{cyKe}{cyVe}\n" +
+ "R>{cyRe}\n" +
+ "S{h}{t}{c}{h}>{cyShche}\n" +
+ "S{h}{c}{h}>{cyShche}\n" +
+ "S{h}>{cyShe}\n" +
+ "S>{cySe}\n" +
+ "T{c}{h}>{cyChe}\n" +
+ "T{h}>{cyZe}\n" +
+ "T{s}>{cyTse}\n" +
+ "T>{cyTe}\n" +
+ "U>{cyU}\n" +
+ "V>{cyVe}\n" +
+ "W{h}>{cyVe}\n" +
+ "W>{cyVe}\n" +
+ "X>{cyKe}{cySe}\n" +
+ "Y{e}>{cyYe}\n" +
+ "Y{o}>{cyYo}\n" +
+ "Y{u}>{cyYu}\n" +
+ "Y{a}>{cyYa}\n" +
+ "Y{i}>{cyYi}\n" +
+ "Y>{cyY}\n" +
+ "Z{h}>{cyZhe}\n" +
+ "Z>{cyZe}\n" +
+ "X>{cyKe}{cySe}\n" + // repeats the X rule given a few lines above
+
+ //lower case: doesn''t solve join bug
+ "y{a}{i}>{cyya}{cyy}\n" +
+ "y{e}{i}>{cyye}{cyy}\n" +
+ "y{i}{i}>{cyyi}{cyy}\n" +
+ "y{o}{i}>{cyyo}{cyy}\n" +
+ "y{u}{i}>{cyyu}{cyy}\n" +
+ "a{i}>{cya}{cyy}\n" +
+ "e{i}>{cye}{cyy}\n" +
+ //skip ii, since it is the soft sign
+ "o{i}>{cyo}{cyy}\n" +
+ "u{i}>{cyu}{cyy}\n" +
+
+ "a>{cya}\n" +
+ "b>{cybe}\n" +
+ "c{h}>{cyche}\n" +
+ "c[{iey}>{cyse}\n" +
+ "c>{cyke}\n" +
+ "d>{cyde}\n" +
+ "e>{cye}\n" +
+ "f>{cyfe}\n" +
+ "g>{cyge}\n" +
+ "h>{cyhard}\n" +
+ "i{i}>{cysoft}\n" +
+ "i>{cyi}\n" +
+ "j>{cyde}{cyzhe}\n" +
+ "k{h}>{cykhe}\n" +
+ "k>{cyke}\n" +
+ "l>{cyle}\n" +
+ "m>{cyme}\n" +
+ "n>{cyne}\n" +
+ "o>{cyo}\n" +
+ "p>{cype}\n" +
+ "q{u}>{cyke}{cyve}\n" +
+ "r>{cyre}\n" +
+ "s{h}{t}{c}{h}>{cyshche}\n" +
+ "s{h}{c}{h}>{cyshche}\n" +
+ "s{h}>{cyshe}\n" +
+ "s>{cyse}\n" +
+ "t{c}{h}>{cyche}\n" +
+ "t{h}>{cyze}\n" +
+ "t{s}>{cytse}\n" +
+ "t>{cyte}\n" +
+ "u>{cyu}\n" +
+ "v>{cyve}\n" +
+ "w{h}>{cyve}\n" +
+ "w>{cyve}\n" +
+ "x>{cyke}{cyse}\n" +
+ "y{e}>{cyye}\n" +
+ "y{o}>{cyyo}\n" +
+ "y{u}>{cyyu}\n" +
+ "y{a}>{cyya}\n" +
+ "y{i}>{cyyi}\n" +
+ "y>{cyy}\n" +
+ "z{h}>{cyzhe}\n" +
+ "z>{cyze}\n" +
+ "x>{cyke}{cyse}\n" + // repeats the x rule given a few lines above
+
+ //generally the last rule
+ "''>\n" +
+
+ //now Russian to English
+
+ "Y''<{cyY}[{cyA}\n" +
+ "Y''<{cyY}[{cyE}\n" +
+ "Y''<{cyY}[{cyI}\n" +
+ "Y''<{cyY}[{cyO}\n" +
+ "Y''<{cyY}[{cyU}\n" +
+ "Y''<{cyY}[{cya}\n" +
+ "Y''<{cyY}[{cye}\n" +
+ "Y''<{cyY}[{cyi}\n" +
+ "Y''<{cyY}[{cyo}\n" +
+ "Y''<{cyY}[{cyu}\n" +
+ "A<{cyA}\n" +
+ "B<{cyBe}\n" +
+ "J<{cyDe}{cyZhe}\n" +
+ "J<{cyDe}{cyzhe}\n" +
+ "D<{cyDe}\n" +
+ "V<{cyVe}\n" +
+ "G<{cyGe}\n" +
+ "Zh<{cyZhe}[{lower}\n" +
+ "ZH<{cyZhe}\n" +
+ "Z''<{cyZe}[{cyHard}\n" +
+ "Z''<{cyZe}[{cyhard}\n" +
+ "Z<{cyZe}\n" +
+ "Ye<{cyYe}[{lower}\n" +
+ "YE<{cyYe}\n" +
+ "Yo<{cyYo}[{lower}\n" +
+ "YO<{cyYo}\n" +
+ "Yu<{cyYu}[{lower}\n" +
+ "YU<{cyYu}\n" +
+ "Ya<{cyYa}[{lower}\n" +
+ "YA<{cyYa}\n" +
+ "Yi<{cyYi}[{lower}\n" +
+ "YI<{cyYi}\n" +
+ "Y<{cyY}\n" +
+ "Kh<{cyKhe}[{lower}\n" +
+ "KH<{cyKhe}\n" +
+ "K''<{cyKe}[{cyHard}\n" +
+ "K''<{cyKe}[{cyhard}\n" +
+ "X<{cyKe}{cySe}\n" +
+ "X<{cyKe}{cyse}\n" +
+ "K<{cyKe}\n" +
+ "L<{cyLe}\n" +
+ "M<{cyMe}\n" +
+ "N<{cyNe}\n" +
+ "O<{cyO}\n" +
+ "P<{cyPe}\n" +
+
+ "R<{cyRe}\n" +
+ "Shch<{cyShche}[{lower}\n" +
+ "SHCH<{cyShche}\n" +
+ "Sh''<{cyShe}[{cyche}\n" +
+ "SH''<{cyShe}[{cyChe}\n" +
+ "Sh<{cyShe}[{lower}\n" +
+ "SH<{cyShe}\n" +
+ "S''<{cySe}[{cyHard}\n" +
+ "S''<{cySe}[{cyhard}\n" +
+ "S<{cySe}\n" +
+ "Ts<{cyTse}[{lower}\n" +
+ "TS<{cyTse}\n" +
+ "T''<{cyTe}[{cySe}\n" +
+ "T''<{cyTe}[{cyse}\n" +
+ "T''<{cyTe}[{cyHard}\n" +
+ "T''<{cyTe}[{cyhard}\n" +
+ "T<{cyTe}\n" +
+ "U<{cyU}\n" +
+ "F<{cyFe}\n" +
+ "Ch<{cyChe}[{lower}\n" +
+ "CH<{cyChe}\n" +
+ "H<{cyHard}\n" +
+ "I''<{cyI}[{cyI}\n" +
+ "I''<{cyI}[{cyi}\n" +
+ "I<{cyI}\n" +
+ "Ii<{cySoft}[{lower}\n" +
+ "II<{cySoft}\n" +
+ "E<{cyE}\n" +
+
+ //lowercase
+ "y''<{cyy}[{cya}\n" +
+ "y''<{cyy}[{cye}\n" +
+ "y''<{cyy}[{cyi}\n" +
+ "y''<{cyy}[{cyo}\n" +
+ "y''<{cyy}[{cyu}\n" +
+ "y''<{cyy}[{cyA}\n" +
+ "y''<{cyy}[{cyE}\n" +
+ "y''<{cyy}[{cyI}\n" +
+ "y''<{cyy}[{cyO}\n" +
+ "y''<{cyy}[{cyU}\n" +
+ "a<{cya}\n" +
+ "b<{cybe}\n" +
+ "j<{cyde}{cyzhe}\n" +
+ "j<{cyde}{cyZhe}\n" +
+ "d<{cyde}\n" +
+ "v<{cyve}\n" +
+ "g<{cyge}\n" +
+ "zh<{cyzhe}\n" +
+ "z''<{cyze}[{cyhard}\n" +
+ "z''<{cyze}[{cyHard}\n" +
+ "z<{cyze}\n" +
+ "ye<{cyye}\n" +
+ "yo<{cyyo}\n" +
+ "yu<{cyyu}\n" +
+ "ya<{cyya}\n" +
+ "yi<{cyyi}\n" +
+ "y<{cyy}\n" +
+ "kh<{cykhe}\n" +
+ "k''<{cyke}[{cyhard}\n" +
+ "k''<{cyke}[{cyHard}\n" +
+ "x<{cyke}{cyse}\n" +
+ "x<{cyke}{cySe}\n" +
+ "k<{cyke}\n" +
+ "l<{cyle}\n" +
+ "m<{cyme}\n" +
+ "n<{cyne}\n" +
+ "o<{cyo}\n" +
+ "p<{cype}\n" +
+
+ "r<{cyre}\n" +
+ "shch<{cyshche}\n" +
+ "sh''<{cyshe}[{cyche}\n" +
+ "sh''<{cyshe}[{cyChe}\n" +
+ "sh<{cyshe}\n" +
+ "s''<{cyse}[{cyhard}\n" +
+ "s''<{cyse}[{cyHard}\n" +
+ "s<{cyse}\n" +
+ "ts<{cytse}\n" +
+ "t''<{cyte}[{cyse}\n" +
+ "t''<{cyte}[{cySe}\n" +
+ "t''<{cyte}[{cyhard}\n" +
+ "t''<{cyte}[{cyHard}\n" +
+ "t<{cyte}\n" +
+ "u<{cyu}\n" +
+ "f<{cyfe}\n" +
+ "ch<{cyche}\n" +
+ "h<{cyhard}\n" +
+ "i''<{cyi}[{cyI}\n" +
+ "i''<{cyi}[{cyi}\n" +
+ "i<{cyi}\n" +
+ "ii<{cysoft}\n" +
+ "e<{cye}\n" +
+
+ //generally the last rule
+ "''>\n"
+ //the end
+ }
+ };
+ }
+}
diff --git a/icu4j/src/com/ibm/text/resources/TransliterationRule$Latin$Devanagari.java b/icu4j/src/com/ibm/text/resources/TransliterationRule$Latin$Devanagari.java
new file mode 100755
index 00000000000..d359adde14a
--- /dev/null
+++ b/icu4j/src/com/ibm/text/resources/TransliterationRule$Latin$Devanagari.java
@@ -0,0 +1,412 @@
+package com.ibm.text.resources;
+
+import java.util.ListResourceBundle;
+
+public class TransliterationRuleLatinDevanagari extends ListResourceBundle {
+ /**
+ * Overrides ListResourceBundle
+ */
+ public Object[][] getContents() {
+ return new Object[][] {
+ { "Description",
+ "Latin to Devanagari" },
+
+ { "Rule",
+ //#####################################################################
+ // Keyboard Transliteration Table
+ //#####################################################################
+ // Conversions should be:
+ // 1. complete
+ // * convert every sequence of Latin letters (a to z plus apostrophe)
+ // to a sequence of Native letters
+ // * convert every sequence of Native letters to Latin letters
+ // 2. reversable
+ // * any string of Native converted to Latin and back should be the same
+ // * this is not true for English converted to Native & back, e.g.:
+ // k -> {kaf} -> k
+ // c -> {kaf} -> k
+ //#####################################################################
+ // Sequences of Latin letters may convert to a single Native letter.
+ // When this is the case, an apostrophe can be used to indicate separate
+ // letters.$
+ // E.g. sh -> {shin}
+ // s'h -> {sin}{heh}
+ // ss -> {sad}
+ // s's -> {sin}{shadda}
+ //#####################################################################
+ // To Do:
+ // finish adding shadda, add sokoon, fix uppercase
+ // make two transliteration tables: one with vowels, one without
+ //#####################################################################
+ // Modifications
+ // Devanagari Transliterator: broken up with consonsants/vowels
+ //#####################################################################
+ // Unicode character name definitions
+ //#####################################################################
+
+ //consonants
+ "candrabindu=\u0901\n"
+ + "bindu=\u0902\n"
+ + "visarga=\u0903\n"
+
+ // w represents the stand-alone form
+ + "wa=\u0905\n"
+ + "waa=\u0906\n"
+ + "wi=\u0907\n"
+ + "wii=\u0908\n"
+ + "wu=\u0909\n"
+ + "wuu=\u090A\n"
+ + "wr=\u090B\n"
+ + "wl=\u090C\n"
+ + "we=\u090F\n"
+ + "wai=\u0910\n"
+ + "wo=\u0913\n"
+ + "wau=\u0914\n"
+
+ + "ka=\u0915\n"
+ + "kha=\u0916\n"
+ + "ga=\u0917\n"
+ + "gha=\u0918\n"
+ + "nga=\u0919\n"
+
+ + "ca=\u091A\n"
+ + "cha=\u091B\n"
+ + "ja=\u091C\n"
+ + "jha=\u091D\n"
+ + "nya=\u091E\n"
+
+ + "tta=\u091F\n"
+ + "ttha=\u0920\n"
+ + "dda=\u0921\n"
+ + "ddha=\u0922\n"
+ + "nna=\u0923\n"
+
+ + "ta=\u0924\n"
+ + "tha=\u0925\n"
+ + "da=\u0926\n"
+ + "dha=\u0927\n"
+ + "na=\u0928\n"
+
+ + "pa=\u092A\n"
+ + "pha=\u092B\n"
+ + "ba=\u092C\n"
+ + "bha=\u092D\n"
+ + "ma=\u092E\n"
+
+ + "ya=\u092F\n"
+ + "ra=\u0930\n"
+ + "rra=\u0931\n"
+ + "la=\u0933\n"
+ + "va=\u0935\n"
+
+ + "sha=\u0936\n"
+ + "ssa=\u0937\n"
+ + "sa=\u0938\n"
+ + "ha=\u0939\n"
+
+ // represents the dependent form
+ + "aa=\u093E\n"
+ + "i=\u093F\n"
+ + "ii=\u0940\n"
+ + "u=\u0941\n"
+ + "uu=\u0942\n"
+ + "rh=\u0943\n"
+ + "lh=\u0944\n"
+ + "e=\u0947\n"
+ + "ai=\u0948\n"
+ + "o=\u094B\n"
+ + "au=\u094C\n"
+
+ + "virama=\u094D\n"
+
+ + "wrr=\u0960\n"
+ + "rrh=\u0962\n"
+
+ + "danda=\u0964\n"
+ + "doubleDanda=\u0965\n"
+ + "depVowelAbove=[\u093E-\u0940\u0945-\u094C]\n"
+ + "depVowelBelow=[\u0941-\u0944]\n"
+ + "endThing=[{danda}{doubleDanda}\u0000-\u08FF\u0980-\uFFFF]\n"
+
+ + "&=[{virama}{aa}{ai}{au}{ii}{i}{uu}{u}{rrh}{rh}{lh}{e}{o}]\n"
+ + "%=[bcdfghjklmnpqrstvwxyz]\n"
+
+ //#####################################################################
+ // convert from Latin letters to Native letters
+ //#####################################################################
+ //Hindi>\u092d\u093e\u0930\u0924--\u0020\u0926\u0947\u0936\u0020\u092c\u0928\u094d\u0927\u0941\u002e
+
+ // special forms with no good conversion
+
+ + "mm>{bindu}\n"
+ + "x>{visarga}\n"
+
+ // convert to independent forms at start of word or syllable:
+ // e.g. keai -> {ka}{e}{wai}; k'ai -> {ka}{wai}; (ai) -> ({wai})
+ // Moved up [LIU]
+
+ + "aa>{waa}\n"
+ + "ai>{wai}\n"
+ + "au>{wau}\n"
+ + "ii>{wii}\n"
+ + "i>{wi}\n"
+ + "uu>{wuu}\n"
+ + "u>{wu}\n"
+ + "rrh>{wrr}\n"
+ + "rh>{wr}\n"
+ + "lh>{wl}\n"
+ + "e>{we}\n"
+ + "o>{wo}\n"
+ + "a>{wa}\n"
+
+ // normal consonants
+
+ + "kh>{kha}|{virama}\n"
+ + "k>{ka}|{virama}\n"
+ + "q>{ka}|{virama}\n"
+ + "gh>{gha}|{virama}\n"
+ + "g>{ga}|{virama}\n"
+ + "ng>{nga}|{virama}\n"
+ + "ch>{cha}|{virama}\n"
+ + "c>{ca}|{virama}\n"
+ + "jh>{jha}|{virama}\n"
+ + "j>{ja}|{virama}\n"
+ + "ny>{nya}|{virama}\n"
+ + "tth>{ttha}|{virama}\n"
+ + "tt>{tta}|{virama}\n"
+ + "ddh>{ddha}|{virama}\n"
+ + "dd>{dda}|{virama}\n"
+ + "nn>{nna}|{virama}\n"
+ + "th>{tha}|{virama}\n"
+ + "t>{ta}|{virama}\n"
+ + "dh>{dha}|{virama}\n"
+ + "d>{da}|{virama}\n"
+ + "n>{na}|{virama}\n"
+ + "ph>{pha}|{virama}\n"
+ + "p>{pa}|{virama}\n"
+ + "bh>{bha}|{virama}\n"
+ + "b>{ba}|{virama}\n"
+ + "m>{ma}|{virama}\n"
+ + "y>{ya}|{virama}\n"
+ + "r>{ra}|{virama}\n"
+ + "l>{la}|{virama}\n"
+ + "v>{va}|{virama}\n"
+ + "f>{va}|{virama}\n"
+ + "w>{va}|{virama}\n"
+ + "sh>{sha}|{virama}\n"
+ + "ss>{ssa}|{virama}\n"
+ + "s>{sa}|{virama}\n"
+ + "z>{sa}|{virama}\n"
+ + "h>{ha}|{virama}\n"
+
+ + ".>{danda}\n"
+ + "{danda}.>{doubleDanda}\n"
+ + "{depVowelAbove}]~>{bindu}\n"
+ + "{depVowelBelow}]~>{candrabindu}\n"
+
+ // convert to dependent forms after consonant with no vowel:
+ // e.g. kai -> {ka}{virama}ai -> {ka}{ai}
+
+ + "{virama}aa>{aa}\n"
+ + "{virama}ai>{ai}\n"
+ + "{virama}au>{au}\n"
+ + "{virama}ii>{ii}\n"
+ + "{virama}i>{i}\n"
+ + "{virama}uu>{uu}\n"
+ + "{virama}u>{u}\n"
+ + "{virama}rrh>{rrh}\n"
+ + "{virama}rh>{rh}\n"
+ + "{virama}lh>{lh}\n"
+ + "{virama}e>{e}\n"
+ + "{virama}o>{o}\n"
+ + "{virama}a>\n"
+
+ // otherwise convert independent forms when separated by ': k'ai -> {ka}{virama}{wai}
+
+ + "{virama}''aa>{waa}\n"
+ + "{virama}''ai>{wai}\n"
+ + "{virama}''au>{wau}\n"
+ + "{virama}''ii>{wii}\n"
+ + "{virama}''i>{wi}\n"
+ + "{virama}''uu>{wuu}\n"
+ + "{virama}''u>{wu}\n"
+ + "{virama}''rrh>{wrr}\n"
+ + "{virama}''rh>{wr}\n"
+ + "{virama}''lh>{wl}\n"
+ + "{virama}''e>{we}\n"
+ + "{virama}''o>{wo}\n"
+ + "{virama}''a>{wa}\n"
+
+ + "{virama}[{endThing}>\n"
+
+ // convert any left-over apostrophes used for separation
+
+ + "''>\n"
+
+ //#####################################################################
+ // convert from Native letters to Latin letters
+ //#####################################################################
+
+ // special forms with no good conversion
+
+ + "mm<{bindu}\n"
+ + "x<{visarga}\n"
+
+ // normal consonants
+
+ + "kh<{kha}[&\n"
+ + "kha<{kha}\n"
+ + "k''<{ka}{virama}[{ha}\n"
+ + "k<{ka}[&\n"
+ + "ka<{ka}\n"
+ + "gh<{gha}[&\n"
+ + "gha<{gha}\n"
+ + "g''<{ga}{virama}[{ha}\n"
+ + "g<{ga}[&\n"
+ + "ga<{ga}\n"
+ + "ng<{nga}[&\n"
+ + "nga<{nga}\n"
+ + "ch<{cha}[&\n"
+ + "cha<{cha}\n"
+ + "c''<{ca}{virama}[{ha}\n"
+ + "c<{ca}[&\n"
+ + "ca<{ca}\n"
+ + "jh<{jha}[&\n"
+ + "jha<{jha}\n"
+ + "j''<{ja}{virama}[{ha}\n"
+ + "j<{ja}[&\n"
+ + "ja<{ja}\n"
+ + "ny<{nya}[&\n"
+ + "nya<{nya}\n"
+ + "tth<{ttha}[&\n"
+ + "ttha<{ttha}\n"
+ + "tt''<{tta}{virama}[{ha}\n"
+ + "tt<{tta}[&\n"
+ + "tta<{tta}\n"
+ + "ddh<{ddha}[&\n"
+ + "ddha<{ddha}\n"
+ + "dd''<{dda}[&{ha}\n"
+ + "dd<{dda}[&\n"
+ + "dda<{dda}\n"
+ + "dh<{dha}[&\n"
+ + "dha<{dha}\n"
+ + "d''<{da}{virama}[{ha}\n"
+ + "d''<{da}{virama}[{ddha}\n"
+ + "d''<{da}{virama}[{dda}\n"
+ + "d''<{da}{virama}[{dha}\n"
+ + "d''<{da}{virama}[{da}\n"
+ + "d<{da}[&\n"
+ + "da<{da}\n"
+ + "th<{tha}[&\n"
+ + "tha<{tha}\n"
+ + "t''<{ta}{virama}[{ha}\n"
+ + "t''<{ta}{virama}[{ttha}\n"
+ + "t''<{ta}{virama}[{tta}\n"
+ + "t''<{ta}{virama}[{tha}\n"
+ + "t''<{ta}{virama}[{ta}\n"
+ + "t<{ta}[&\n"
+ + "ta<{ta}\n"
+ + "n''<{na}{virama}[{ga}\n"
+ + "n''<{na}{virama}[{ya}\n"
+ + "n<{na}[&\n"
+ + "na<{na}\n"
+ + "ph<{pha}[&\n"
+ + "pha<{pha}\n"
+ + "p''<{pa}{virama}[{ha}\n"
+ + "p<{pa}[&\n"
+ + "pa<{pa}\n"
+ + "bh<{bha}[&\n"
+ + "bha<{bha}\n"
+ + "b''<{ba}{virama}[{ha}\n"
+ + "b<{ba}[&\n"
+ + "ba<{ba}\n"
+ + "m''<{ma}{virama}[{ma}\n"
+ + "m''<{ma}{virama}[{bindu}\n"
+ + "m<{ma}[&\n"
+ + "ma<{ma}\n"
+ + "y<{ya}[&\n"
+ + "ya<{ya}\n"
+ + "r''<{ra}{virama}[{ha}\n"
+ + "r<{ra}[&\n"
+ + "ra<{ra}\n"
+ + "l''<{la}{virama}[{ha}\n"
+ + "l<{la}[&\n"
+ + "la<{la}\n"
+ + "v<{va}[&\n"
+ + "va<{va}\n"
+ + "sh<{sha}[&\n"
+ + "sha<{sha}\n"
+ + "ss<{ssa}[&\n"
+ + "ssa<{ssa}\n"
+ + "s''<{sa}{virama}[{ha}\n"
+ + "s''<{sa}{virama}[{sha}\n"
+ + "s''<{sa}{virama}[{ssa}\n"
+ + "s''<{sa}{virama}[{sa}\n"
+ + "s<{sa}[&\n"
+ + "sa<{sa}\n"
+ + "h<{ha}[&\n"
+ + "ha<{ha}\n"
+
+ // dependent vowels (should never occur except following consonants)
+
+ + "aa<{aa}\n"
+ + "ai<{ai}\n"
+ + "au<{au}\n"
+ + "ii<{ii}\n"
+ + "i<{i}\n"
+ + "uu<{uu}\n"
+ + "u<{u}\n"
+ + "rrh<{rrh}\n"
+ + "rh<{rh}\n"
+ + "lh<{lh}\n"
+ + "e<{e}\n"
+ + "o<{o}\n"
+
+ // independent vowels (when following consonants)
+
+ + "''aa\u039c\u0397\u039d\u0399\u039d\u0020\u0391\u0395\u0399\u0394\u0395\u002c\u0020\u0398\u0395\u0391\u002c\u0020--\u0397\u039b\u0397\u0399\u0391\u0394\u0395\u03a9\u0020\u0391\u03a7\u0399\u039b\u0397\u039f\u03a3\n"
+
+ + "AV`>{grAl}{grAcUp}\n"
+ + "EV`>{grEp}{grAcUp}\n"
+ + "AV>{grAl}{grUp}\n"
+ + "EV>{grEp}{grUp}\n"
+ + "NG>{grGa}{grGa}\n"
+ + "NK>{grGa}{grKa}\n"
+ + "NX>{grGa}{grKs}\n"
+ + "NCH>{grGa}{grKh}\n"
+
+ //+ "final = [ .;]\n" // Syntax error, unused anyway - Liu
+
+ + "A`>{grAcAl}\n"
+ + "EE`>{grAcEt}\n"
+ + "E`>{grAcEp}\n"
+ + "I`>{grAcIo}\n"
+ + "U`>{grAcUp}\n"
+ + "OO`>{grAcOme}\n"
+ + "O`>{grAcOm}\n"
+ + "''I>{grDiIo}\n"
+ + "''U>{grDiUp}\n"
+ + "A>{grAl}\n"
+ + "B>{grBe}\n"
+ + "C[I>{grSi}\n"
+ + "C[E>{grSi}\n"
+ + "C[Y>{grSi}\n"
+ + "CH>{grKh}\n"
+ + "C>{grKa}\n"
+ + "D>{grDe}\n"
+ + "EE>{grEt}\n"
+ + "E>{grEp}\n"
+ + "F>{grPh}\n"
+ + "G>{grGa}\n"
+ + "H>{grKh}\n"
+ + "I>{grIo}\n"
+ + "J>{grIo}\n"
+ + "KS>{grKs}\n"
+ + "KH>{grKh}\n"
+ + "K>{grKa}\n"
+ + "L>{grLa}\n"
+ + "M>{grMu}\n"
+ + "N>{grNu}\n"
+ + "OO>{grOme}\n"
+ + "O>{grOm}\n"
+ + "PS>{grPs}\n"
+ + "PH>{grPh}\n"
+ + "P>{grPi}\n"
+ + "Q>{grKa}\n"
+ + "R>{grRh}\n"
+ + "S>{grSi}\n"
+ + "TH>{grTh}\n"
+ + "T>{grTa}\n"
+ + "W>{grUp}{grUp}\n"
+ + "U>{grUp}\n"
+ + "V>{grUp}\n"
+ + "X>{grKs}\n"
+ + "Y>{grUp}\n"
+ + "Z>{grZe}\n"
+
+ //now Native to Roman
+
+ + "AV<{grAl}{grUp}\n"
+ + "EV<{grEp}{grUp}\n"
+ + "AV`<{grAl}{grAcUp}\n"
+ + "EV`<{grEp}{grAcUp}\n"
+ + "N''<{grNu}[{grGa}\n"
+ + "NG<{grGa}{grGa}\n"
+ + "N''<{grNu}[{grKa}\n"
+ + "NK<{grGa}{grKa}\n"
+ + "N''<{grNu}[{grKs}\n"
+ + "NX<{grGa}{grKs}\n"
+ + "N''<{grNu}[{grKh}\n"
+ + "NCH<{grGa}{grKh}\n"
+
+ + "A<{grAl}\n"
+ + "B<{grBe}\n"
+ + "G<{grGa}\n"
+ + "D<{grDe}\n"
+ + "E''<{grEp}[{grEp}\n"
+ + "E''<{grEp}[{grEt}\n"
+ + "E''<{grEp}[{grAcEp}\n"
+ + "E''<{grEp}[{grAcEt}\n"
+ + "E<{grEp}\n"
+ + "Z<{grZe}\n"
+ + "EE<{grEt}\n"
+ + "TH<{grTh}\n"
+ + "I<{grIo}\n"
+ + "K<{grKa}\n"
+ + "L<{grLa}\n"
+ + "M<{grMu}\n"
+ + "N<{grNu}\n"
+ + "X<{grKs}\n"
+ + "O''<{grOm}[{grOm}\n"
+ + "O''<{grOm}[{grOme}\n"
+ + "O''<{grOm}[{grAcOm}\n"
+ + "O''<{grOm}[{grAcOme}\n"
+ + "O<{grOm}\n"
+ + "P''<{grPi}[{grSi}\n"
+ + "P''<{grPi}[{grfinal}\n"
+ + "P<{grPi}\n"
+ + "R<{grRh}\n"
+ + "S<{grSi}\n"
+ + "T<{grTa}\n"
+ + "W<{grUp}{grUp}\n"
+
+ + "V<{grUp}[{grAcAl}\n"
+ + "V<{grUp}[{grAcEp}\n"
+ + "V<{grUp}[{grAcEt}\n"
+ + "V<{grUp}[{grAcIo}\n"
+ + "V<{grUp}[{grAcOm}\n"
+ + "V<{grUp}[{grAcUp}\n"
+ + "V<{grUp}[{grAcOme}\n"
+
+ + "V<{grUp}[{grAl}\n"
+ + "V<{grUp}[{grEp}\n"
+ + "V<{grUp}[{grEt}\n"
+ + "V<{grUp}[{grIo}\n"
+ + "V<{grUp}[{grOm}\n"
+ //{grUp}[{grUp}{gral}{gracup}\n"
+ + "ev`>{grep}{gracup}\n"
+ + "av>{gral}{grup}\n"
+ + "ev>{grep}{grup}\n"
+ + "ng>{grga}{grga}\n"
+ + "nk>{grga}{grka}\n"
+ + "nx>{grga}{grks}\n"
+ + "nch>{grga}{grkh}\n"
+
+ + "a`>{gracal}\n"
+ + "ee`>{gracet}\n"
+ + "e`>{gracep}\n"
+ + "i`>{gracio}\n"
+ + "u`>{gracup}\n"
+ + "oo`>{gracome}\n"
+ + "o`>{gracom}\n"
+ + "''i>{grdiio}\n"
+ + "''u>{grdiup}\n"
+ + "a>{gral}\n"
+ + "b>{grbe}\n"
+ + "c[i>{grsi}\n"
+ + "c[e>{grsi}\n"
+ + "c[y>{grsi}\n"
+ + "ch>{grkh}\n"
+ + "c>{grka}\n"
+ + "d>{grde}\n"
+ + "ee>{gret}\n"
+ + "e>{grep}\n"
+ + "f>{grph}\n"
+ + "g>{grga}\n"
+ + "h>{grkh}\n"
+ + "i>{grio}\n"
+ + "j>{grio}\n"
+ + "ks>{grks}\n"
+ + "kh>{grkh}\n"
+ + "k>{grka}\n"
+ + "l>{grla}\n"
+ + "m>{grmu}\n"
+ + "n>{grnu}\n"
+ + "oo>{grome}\n"
+ + "o>{grom}\n"
+ + "ps>{grps}\n"
+ + "ph>{grph}\n"
+ + "p>{grpi}\n"
+ + "q>{grka}\n"
+ + "r>{grrh}\n"
+ + "s>|{grfinal}\n"
+ + "{grfinal}[{letter}>{grsi}\n"
+ + "th>{grth}\n"
+ + "t>{grta}\n"
+ + "w>{grup}{grup}\n"
+ + "u>{grup}\n"
+ + "v>{grup}\n"
+ + "x>{grks}\n"
+ + "y>{grup}\n"
+ + "z>{grze}\n"
+
+
+ //forms
+ + "''>\n"
+ //now native to roman
+
+ + "av<{gral}{grup}\n"
+ + "ev<{grep}{grup}\n"
+ + "av`<{gral}{gracup}\n"
+ + "ev`<{grep}{gracup}\n"
+ + "n''<{grnu}[{grga}\n"
+ + "ng<{grga}{grga}\n"
+ + "n''<{grnu}[{grka}\n"
+ + "nk<{grga}{grka}\n"
+ + "n''<{grnu}[{grks}\n"
+ + "nx<{grga}{grks}\n"
+ + "n''<{grnu}[{grkh}\n"
+ + "nch<{grga}{grkh}\n"
+
+ + "a<{gral}\n"
+ + "b<{grbe}\n"
+ + "g<{grga}\n"
+ + "d<{grde}\n"
+ + "e''<{grep}[{grep}\n"
+ + "e''<{grep}[{gret}\n"
+ + "e''<{grep}[{gracep}\n"
+ + "e''<{grep}[{gracet}\n"
+ + "e<{grep}\n"
+ + "z<{grze}\n"
+ + "ee<{gret}\n"
+ + "th<{grth}\n"
+ + "i<{grio}\n"
+ + "k<{grka}\n"
+ + "l<{grla}\n"
+ + "m<{grmu}\n"
+ + "n<{grnu}\n"
+ + "x<{grks}\n"
+ + "o''<{grom}[{grom}\n"
+ + "o''<{grom}[{grome}\n"
+ + "o''<{grom}[{gracom}\n"
+ + "o''<{grom}[{gracome}\n"
+ + "o<{grom}\n"
+ + "p''<{grpi}[{grsi}\n"
+ + "p''<{grpi}[{grfinal}\n"
+ + "p<{grpi}\n"
+ + "r<{grrh}\n"
+ + "s<{grsi}\n"
+ + "s<{grfinal}\n"
+ + "t<{grta}\n"
+ + "w<{grup}{grup}\n"
+
+ + "v<{grup}[{gracal}\n"
+ + "v<{grup}[{gracep}\n"
+ + "v<{grup}[{gracet}\n"
+ + "v<{grup}[{gracio}\n"
+ + "v<{grup}[{gracom}\n"
+ + "v<{grup}[{gracup}\n"
+ + "v<{grup}[{gracome}\n"
+
+ + "v<{grup}[{gral}\n"
+ + "v<{grup}[{grep}\n"
+ + "v<{grup}[{gret}\n"
+ + "v<{grup}[{grio}\n"
+ + "v<{grup}[{grom}\n"
+ //{grup}[{grup}{POINT_SHEVA}
+ //?>{POINT_HATAF_SEGOL}
+ //?>{POINT_HATAF_PATAH}
+ //?>{POINT_HATAF_QAMATS}
+ //?>{POINT_HIRIQ}
+ //?>{POINT_TSERE}
+ //?>{POINT_SEGOL}
+ //?>{POINT_PATAH}
+ //?>{POINT_QAMATS}
+ //?>{POINT_HOLAM}
+ //?>{POINT_QUBUTS}
+ //?>{POINT_DAGESH_OR_MAPIQ}
+ //?>{POINT_METEG}
+ //?>{PUNCTUATION_MAQAF}
+ //?>{POINT_RAFE}
+ //?>{PUNCTUATION_PASEQ}
+ //?>{POINT_SHIN_DOT}
+ //?>{POINT_SIN_DOT}
+ //?>{PUNCTUATION_SOF_PASUQ}
+
+ + "a>{ALEF}\n"
+ + "A>{ALEF}\n"
+
+ + "b>{BET}\n"
+ + "B>{BET}\n"
+
+ + "c[{softvowel}>{SAMEKH}\n"
+ + "C[{softvowel}>{SAMEKH}\n"
+ + "c[{letter}>{KAF}\n"
+ + "C[{letter}>{KAF}\n"
+ + "c>{FINAL_KAF}\n"
+ + "C>{FINAL_KAF}\n"
+
+ + "d>{DALET}\n"
+ + "D>{DALET}\n"
+
+ + "e>{AYIN}\n"
+ + "E>{AYIN}\n"
+
+ + "f[{letter}>{PE}\n"
+ + "f>{FINAL_PE}\n"
+ + "F[{letter}>{PE}\n"
+ + "F>{FINAL_PE}\n"
+
+ + "g>{GIMEL}\n"
+ + "G>{GIMEL}\n"
+
+ + "h>{HE}\n"
+ + "H>{HE}\n"
+
+ + "i>{YOD}\n"
+ + "I>{YOD}\n"
+
+ + "j>{DALET}{SHIN}\n"
+ + "J>{DALET}{SHIN}\n"
+
+ + "kH>{HET}\n"
+ + "kh>{HET}\n"
+ + "KH>{HET}\n"
+ + "Kh>{HET}\n"
+ + "k[{letter}>{KAF}\n"
+ + "K[{letter}>{KAF}\n"
+ + "k>{FINAL_KAF}\n"
+ + "K>{FINAL_KAF}\n"
+
+ + "l>{LAMED}\n"
+ + "L>{LAMED}\n"
+
+ + "m[{letter}>{MEM}\n"
+ + "m>{FINAL_MEM}\n"
+ + "M[{letter}>{MEM}\n"
+ + "M>{FINAL_MEM}\n"
+
+ + "n[{letter}>{NUN}\n"
+ + "n>{FINAL_NUN}\n"
+ + "N[{letter}>{NUN}\n"
+ + "N>{FINAL_NUN}\n"
+
+ + "o>{VAV}\n"
+ + "O>{VAV}\n"
+
+ + "p[{letter}>{PE}\n"
+ + "p>{FINAL_PE}\n"
+ + "P[{letter}>{PE}\n"
+ + "P>{FINAL_PE}\n"
+
+ + "q>{QOF}\n"
+ + "Q>{QOF}\n"
+
+ + "r>{RESH}\n"
+ + "R>{RESH}\n"
+
+ + "sH>{SHIN}\n"
+ + "sh>{SHIN}\n"
+ + "SH>{SHIN}\n"
+ + "Sh>{SHIN}\n"
+ + "s>{SAMEKH}\n"
+ + "S>{SAMEKH}\n"
+
+ + "th>{TAV}\n"
+ + "tH>{TAV}\n"
+ + "TH>{TAV}\n"
+ + "Th>{TAV}\n"
+ + "tS[{letter}>{TSADI}\n"
+ + "ts[{letter}>{TSADI}\n"
+ + "Ts[{letter}>{TSADI}\n"
+ + "TS[{letter}>{TSADI}\n"
+ + "tS>{FINAL_TSADI}\n"
+ + "ts>{FINAL_TSADI}\n"
+ + "Ts>{FINAL_TSADI}\n"
+ + "TS>{FINAL_TSADI}\n"
+ + "t>{TET}\n"
+ + "T>{TET}\n"
+
+ + "u>{VAV}\n"
+ + "U>{VAV}\n"
+
+ + "v>{VAV}\n"
+ + "V>{VAV}\n"
+
+ + "w>{VAV}\n"
+ + "W>{VAV}\n"
+
+ + "x>{KAF}{SAMEKH}\n"
+ + "X>{KAF}{SAMEKH}\n"
+
+ + "y>{YOD}\n"
+ + "Y>{YOD}\n"
+
+ + "z>{ZAYIN}\n"
+ + "Z>{ZAYIN}\n"
+
+ //#?>{YIDDISH_DOUBLE_VAV}
+ //?>{YIDDISH_VAV_YOD}
+ //?>{YIDDISH_DOUBLE_YOD}
+ //?>{PUNCTUATION_GERESH}
+ //?>{PUNCTUATION_GERSHAYIM}
+
+ + "''>\n"
+
+ //{POINT_SHEVA}>@
+ //{POINT_HATAF_SEGOL}>@
+ //{POINT_HATAF_PATAH}>@
+ //{POINT_HATAF_QAMATS}>@
+ //{POINT_HIRIQ}>@
+ //{POINT_TSERE}>@
+ //{POINT_SEGOL}>@
+ //{POINT_PATAH}>@
+ //{POINT_QAMATS}>@
+ //{POINT_HOLAM}>@
+ //{POINT_QUBUTS}>@
+ //{POINT_DAGESH_OR_MAPIQ}>@
+ //{POINT_METEG}>@
+ //{PUNCTUATION_MAQAF}>@
+ //{POINT_RAFE}>@
+ //{PUNCTUATION_PASEQ}>@
+ //{POINT_SHIN_DOT}>@
+ //{POINT_SIN_DOT}>@
+ //{PUNCTUATION_SOF_PASUQ}>@
+
+ + "a<{ALEF}\n"
+ + "e<{AYIN}\n"
+ + "b<{BET}\n"
+ + "d<{DALET}\n"
+ + "k<{FINAL_KAF}\n"
+ + "m<{FINAL_MEM}\n"
+ + "n<{FINAL_NUN}\n"
+ + "p<{FINAL_PE}\n"
+ + "ts<{FINAL_TSADI}\n"
+ + "g<{GIMEL}\n"
+ + "kh<{HET}\n"
+ + "h<{HE}\n"
+ + "k''<{KAF}[{HE}\n"
+ + "k<{KAF}\n"
+ + "l<{LAMED}\n"
+ + "m<{MEM}\n"
+ + "n<{NUN}\n"
+ + "p<{PE}\n"
+ + "q<{QOF}\n"
+ + "r<{RESH}\n"
+ + "s''<{SAMEKH}[{HE}\n"
+ + "s<{SAMEKH}\n"
+ + "sh<{SHIN}\n"
+ + "th<{TAV}\n"
+ + "t''<{TET}[{HE}\n"
+ + "t''<{TET}[{HE}\n"
+ + "t''<{TET}[{SAMEKH}\n"
+ + "t''<{TET}[{SHIN}\n"
+ + "t<{TET}\n"
+ + "ts<{TSADI}\n"
+ + "v<{VAV}[{vowellike}\n"
+ + "u<{VAV}\n"
+ + "y<{YOD}\n"
+ + "z<{ZAYIN}\n"
+
+ //{YIDDISH_DOUBLE_VAV}>@
+ //{YIDDISH_VAV_YOD}>@
+ //{YIDDISH_DOUBLE_YOD}>@
+ //{PUNCTUATION_GERESH}>@
+ //{PUNCTUATION_GERSHAYIM}>@
+
+ + "<''\n"
+ }
+ };
+ }
+}
diff --git a/icu4j/src/com/ibm/text/resources/TransliterationRule$Latin$Kana.java b/icu4j/src/com/ibm/text/resources/TransliterationRule$Latin$Kana.java
new file mode 100755
index 00000000000..47b6e2a3de2
--- /dev/null
+++ b/icu4j/src/com/ibm/text/resources/TransliterationRule$Latin$Kana.java
@@ -0,0 +1,883 @@
+package com.ibm.text.resources;
+
+import java.util.ListResourceBundle;
+
+/**
+ * Rewritten April 1999 to implement Hepburn (Hebon-shiki)
+ * transliteration. Reference: CJKV Information Processing, Lunde,
+ * 1999, pp. 30-35.
+ * @author Alan Liu
+ */
+public class TransliterationRuleLatinKana extends ListResourceBundle {
+ /**
+ * Overrides ListResourceBundle. Returns two entries: "Description" (a one-line summary) and "Rule" (variable definitions followed by rules, one per newline-terminated line, concatenated into a single string).
+ */
+ public Object[][] getContents() {
+ return new Object[][] {
+ { "Description",
+ "Lowercase Latin to Hiragana; Uppercase Latin to Katakana" },
+
+ { "Rule",
+
+ //------------------------------------------------------------
+ // Variables
+ //------------------------------------------------------------
+
+ // Hiragana. These are named according to the
+ // regularized Nippon romanization (the naming system
+ // used by Unicode). Thus \u3062 is called "di", not
+ // "ji". "x_" is the small form of "_", e.g. "xa" is
+ // small "a".
+
+ "xa=\u3041\n"
+ + "a=\u3042\n"
+ + "xi=\u3043\n"
+ + "i=\u3044\n"
+ + "xu=\u3045\n"
+ + "u=\u3046\n"
+ + "xe=\u3047\n"
+ + "e=\u3048\n"
+ + "xo=\u3049\n"
+ + "o=\u304A\n"
+
+ + "ka=\u304B\n"
+ + "ga=\u304C\n"
+ + "ki=\u304D\n"
+ + "gi=\u304E\n"
+ + "ku=\u304F\n"
+ + "gu=\u3050\n"
+ + "ke=\u3051\n"
+ + "ge=\u3052\n"
+ + "ko=\u3053\n"
+ + "go=\u3054\n"
+
+ + "sa=\u3055\n"
+ + "za=\u3056\n"
+ + "si=\u3057\n"
+ + "zi=\u3058\n"
+ + "su=\u3059\n"
+ + "zu=\u305A\n"
+ + "se=\u305B\n"
+ + "ze=\u305C\n"
+ + "so=\u305D\n"
+ + "zo=\u305E\n"
+
+ + "ta=\u305F\n"
+ + "da=\u3060\n"
+ + "ti=\u3061\n"
+ + "di=\u3062\n"
+ + "xtu=\u3063\n"
+ + "tu=\u3064\n"
+ + "du=\u3065\n"
+ + "te=\u3066\n"
+ + "de=\u3067\n"
+ + "to=\u3068\n"
+ + "do=\u3069\n"
+
+ + "na=\u306A\n"
+ + "ni=\u306B\n"
+ + "nu=\u306C\n"
+ + "ne=\u306D\n"
+ + "no=\u306E\n"
+
+ + "ha=\u306F\n"
+ + "ba=\u3070\n"
+ + "pa=\u3071\n"
+ + "hi=\u3072\n"
+ + "bi=\u3073\n"
+ + "pi=\u3074\n"
+ + "hu=\u3075\n"
+ + "bu=\u3076\n"
+ + "pu=\u3077\n"
+ + "he=\u3078\n"
+ + "be=\u3079\n"
+ + "pe=\u307A\n"
+ + "ho=\u307B\n"
+ + "bo=\u307C\n"
+ + "po=\u307D\n"
+
+ + "ma=\u307E\n"
+ + "mi=\u307F\n"
+ + "mu=\u3080\n"
+ + "me=\u3081\n"
+ + "mo=\u3082\n"
+
+ + "xya=\u3083\n"
+ + "ya=\u3084\n"
+ + "xyu=\u3085\n"
+ + "yu=\u3086\n"
+ + "xyo=\u3087\n"
+ + "yo=\u3088\n"
+
+ + "ra=\u3089\n"
+ + "ri=\u308A\n"
+ + "ru=\u308B\n"
+ + "re=\u308C\n"
+ + "ro=\u308D\n"
+
+ + "xwa=\u308E\n"
+ + "wa=\u308F\n"
+ + "wi=\u3090\n"
+ + "we=\u3091\n"
+ + "wo=\u3092\n"
+
+ + "n=\u3093\n"
+ + "vu=\u3094\n"
+
+ // Katakana. "X_" is the small form of "_", e.g. "XA"
+ // is small "A".
+ // NOTE(review): no rule in this string has an uppercase Latin source; of these variables only {VA}, {VI}, {VE}, {VO}, {XKA} and {XKE} are referenced by rules - confirm against the description.
+ + "XA=\u30A1\n"
+ + "A=\u30A2\n"
+ + "XI=\u30A3\n"
+ + "I=\u30A4\n"
+ + "XU=\u30A5\n"
+ + "U=\u30A6\n"
+ + "XE=\u30A7\n"
+ + "E=\u30A8\n"
+ + "XO=\u30A9\n"
+ + "O=\u30AA\n"
+
+ + "KA=\u30AB\n"
+ + "GA=\u30AC\n"
+ + "KI=\u30AD\n"
+ + "GI=\u30AE\n"
+ + "KU=\u30AF\n"
+ + "GU=\u30B0\n"
+ + "KE=\u30B1\n"
+ + "GE=\u30B2\n"
+ + "KO=\u30B3\n"
+ + "GO=\u30B4\n"
+
+ + "SA=\u30B5\n"
+ + "ZA=\u30B6\n"
+ + "SI=\u30B7\n"
+ + "ZI=\u30B8\n"
+ + "SU=\u30B9\n"
+ + "ZU=\u30BA\n"
+ + "SE=\u30BB\n"
+ + "ZE=\u30BC\n"
+ + "SO=\u30BD\n"
+ + "ZO=\u30BE\n"
+
+ + "TA=\u30BF\n"
+ + "DA=\u30C0\n"
+ + "TI=\u30C1\n"
+ + "DI=\u30C2\n"
+ + "XTU=\u30C3\n"
+ + "TU=\u30C4\n"
+ + "DU=\u30C5\n"
+ + "TE=\u30C6\n"
+ + "DE=\u30C7\n"
+ + "TO=\u30C8\n"
+ + "DO=\u30C9\n"
+
+ + "NA=\u30CA\n"
+ + "NI=\u30CB\n"
+ + "NU=\u30CC\n"
+ + "NE=\u30CD\n"
+ + "NO=\u30CE\n"
+
+ + "HA=\u30CF\n"
+ + "BA=\u30D0\n"
+ + "PA=\u30D1\n"
+ + "HI=\u30D2\n"
+ + "BI=\u30D3\n"
+ + "PI=\u30D4\n"
+ + "HU=\u30D5\n"
+ + "BU=\u30D6\n"
+ + "PU=\u30D7\n"
+ + "HE=\u30D8\n"
+ + "BE=\u30D9\n"
+ + "PE=\u30DA\n"
+ + "HO=\u30DB\n"
+ + "BO=\u30DC\n"
+ + "PO=\u30DD\n"
+
+ + "MA=\u30DE\n"
+ + "MI=\u30DF\n"
+ + "MU=\u30E0\n"
+ + "ME=\u30E1\n"
+ + "MO=\u30E2\n"
+
+ + "XYA=\u30E3\n"
+ + "YA=\u30E4\n"
+ + "XYU=\u30E5\n"
+ + "YU=\u30E6\n"
+ + "XYO=\u30E7\n"
+ + "YO=\u30E8\n"
+
+ + "RA=\u30E9\n"
+ + "RI=\u30EA\n"
+ + "RU=\u30EB\n"
+ + "RE=\u30EC\n"
+ + "RO=\u30ED\n"
+
+ + "XWA=\u30EE\n"
+ + "WA=\u30EF\n"
+ + "WI=\u30F0\n"
+ + "WE=\u30F1\n"
+ + "WO=\u30F2\n"
+
+ + "N=\u30F3\n"
+ + "VU=\u30F4\n"
+
+ + "XKA=\u30F5\n"
+ + "XKE=\u30F6\n"
+
+ + "VA=\u30F7\n"
+ + "VI=\u30F8\n"
+ + "VE=\u30F9\n"
+ + "VO=\u30FA\n"
+
+ + "DOT=\u30FB\n" // Middle dot
+ + "LONG=\u30FC\n" // Prolonged sound mark
+
+ // Categories and programmatic variables
+
+ + "vowel=[aiueo]\n"
+ + "small=\uE000\n"
+ + "hvr=\uE001\n"
+ + "hv=[{xya}{xi}{xyu}{xe}{xyo}]\n"
+
+ //------------------------------------------------------------
+ // Rules
+ //------------------------------------------------------------
+ /* Commented-out rule listing in raw rule syntax; nothing up to the closing marker is part of the "Rule" string.
+// Hepburn equivalents
+
+shi>|si
+ji>|zi
+chi>|ti
+// ji>|di // By default we use the ji-zi mapping
+tsu>|tu
+fu>|hu
+
+sh[{vowel}>|sy
+ja>|zya
+// ji = zi
+ju>|zyu
+je>|zye
+jo>|zyo
+cha>|tya
+// chi = ti
+chu>|tyu
+che>|tye
+cho>|tyo
+// j[{vowel} = dy{vowel}, but we use zy{vowel} by default
+
+// Historically, m preceded b, p, or m; now n is used
+// in all cases
+m[b>n
+m[p>n
+m[m>n
+
+// Compatibility
+
+// 'f' group
+fa>{fu}{xa}
+fi>{fu}{xi}
+// fu = hu
+fe>{fu}{xe}
+fo>{fu}{xo}
+
+// 'jy' group; these will not round-trip, except for "jyi"
+// See also the 'j' group.
+jya>|zya
+jyi>{zi}{xyi}
+jyu>|zyu
+jye>|zye
+jyo>|zyo
+
+// Nippon romanized forms
+
+a>{a}
+i>{i}
+u>{u}
+e>{e}
+o>{o}
+ka>{ka}
+ki>{ki}
+ku>{ku}
+ke>{ke}
+ko>{ko}
+ga>{ga}
+gi>{gi}
+gu>{gu}
+ge>{ge}
+go>{go}
+sa>{sa}
+si>{si}
+su>{su}
+se>{se}
+so>{so}
+za>{za}
+zi>{zi}
+zu>{zu}
+ze>{ze}
+zo>{zo}
+ta>{ta}
+ti>{ti}
+tu>{tu}
+te>{te}
+to>{to}
+da>{da}
+di>{di}
+du>{du}
+de>{de}
+do>{do}
+na>{na}
+ni>{ni}
+nu>{nu}
+ne>{ne}
+no>{no}
+ha>{ha}
+hi>{hi}
+hu>{hu}
+he>{he}
+ho>{ho}
+ba>{ba}
+bi>{bi}
+bu>{bu}
+be>{be}
+bo>{bo}
+pa>{pa}
+pi>{pi}
+pu>{pu}
+pe>{pe}
+po>{po}
+ma>{ma}
+mi>{mi}
+mu>{mu}
+me>{me}
+mo>{mo}
+ya>{ya}
+yu>{yu}
+yo>{yo}
+ra>{ra}
+ri>{ri}
+ru>{ru}
+re>{re}
+ro>{ro}
+wa>{wa}
+wi>{wi}
+// No "wu"
+we>{we}
+wo>{wo} // Reverse {wo} to "o", not "wo"
+n''>{n}
+n>{n}
+
+// Palatized Nippon romanized syllables
+
+ky[{vowel}>{ki}|{small}
+gy[{vowel}>{gi}|{small}
+sy[{vowel}>{si}|{small}
+zy[{vowel}>{zi}|{small}
+ty[{vowel}>{ti}|{small}
+dy[{vowel}>{di}|{small}
+ny[{vowel}>{ni}|{small}
+my[{vowel}>{mi}|{small}
+hy[{vowel}>{hi}|{small}
+by[{vowel}>{bi}|{small}
+py[{vowel}>{pi}|{small}
+ry[{vowel}>{ri}|{small}
+
+// Doubled consonants
+
+c[c>{xtu}
+k[k>{xtu}
+g[g>{xtu}
+s[s>{xtu}
+z[z>{xtu}
+j[j>{xtu}
+t[t>{xtu}
+d[d>{xtu}
+h[h>{xtu}
+f[f>{xtu}
+p[p>{xtu}
+b[b>{xtu}
+m[m>{xtu}
+y[y>{xtu}
+r[r>{xtu}
+w[w>{xtu}
+ */
+ // Active rules start here: Latin-to-kana ('>') rules, grouped by leading letter.
+ + "a>{a}\n"
+
+ + "ba>{ba}\n"
+ + "bi>{bi}\n"
+ + "bu>{bu}\n"
+ + "be>{be}\n"
+ + "bo>{bo}\n"
+ + "by[{vowel}>{bi}|{small}\n"
+ + "b[b>{xtu}\n"
+
+ + "da>{da}\n"
+ + "di>{di}\n"
+ + "du>{du}\n"
+ + "de>{de}\n"
+ + "do>{do}\n"
+ + "dy[{vowel}>{di}|{small}\n"
+ + "dh[{vowel}>{de}|{small}\n"
+ + "d[d>{xtu}\n"
+
+ + "e>{e}\n"
+
+ + "fa>{hu}{xa}\n"
+ + "fi>{hu}{xi}\n"
+ + "fe>{hu}{xe}\n"
+ + "fo>{hu}{xo}\n"
+ + "fya>{hu}{xya}\n"
+ + "fyu>{hu}{xyu}\n"
+ + "fyo>{hu}{xyo}\n"
+ + "f[f>{xtu}\n"
+
+ + "ga>{ga}\n"
+ + "gi>{gi}\n"
+ + "gu>{gu}\n"
+ + "ge>{ge}\n"
+ + "go>{go}\n"
+ + "gy[{vowel}>{gi}|{small}\n"
+ + "gwa>{gu}{xwa}\n"
+ + "gwi>{gu}{xi}\n"
+ + "gwu>{gu}{xu}\n"
+ + "gwe>{gu}{xe}\n"
+ + "gwo>{gu}{xo}\n"
+ + "g[g>{xtu}\n"
+
+ + "ha>{ha}\n"
+ + "hi>{hi}\n"
+ + "hu>{hu}\n"
+ + "he>{he}\n"
+ + "ho>{ho}\n"
+ + "hy[{vowel}>{hi}|{small}\n"
+ + "h[h>{xtu}\n"
+
+ + "i>{i}\n"
+
+ + "ka>{ka}\n"
+ + "ki>{ki}\n"
+ + "ku>{ku}\n"
+ + "ke>{ke}\n"
+ + "ko>{ko}\n"
+ + "kwa>{ku}{xwa}\n"
+ + "kwi>{ku}{xi}\n"
+ + "kwu>{ku}{xu}\n"
+ + "kwe>{ku}{xe}\n"
+ + "kwo>{ku}{xo}\n"
+ + "ky[{vowel}>{ki}|{small}\n"
+ + "k[k>{xtu}\n"
+
+ + "ma>{ma}\n"
+ + "mi>{mi}\n"
+ + "mu>{mu}\n"
+ + "me>{me}\n"
+ + "mo>{mo}\n"
+ + "my[{vowel}>{mi}|{small}\n"
+ + "m[b>{n}\n"
+ + "m[f>{n}\n"
+ + "m[m>{n}\n"
+ + "m[p>{n}\n"
+ + "m[v>{n}\n"
+ + "m''>{n}\n"
+
+ + "na>{na}\n"
+ + "ni>{ni}\n"
+ + "nu>{nu}\n"
+ + "ne>{ne}\n"
+ + "no>{no}\n"
+ + "ny[{vowel}>{ni}|{small}\n"
+ + "nn>{n}\n"
+ + "n''>{n}\n"
+ + "n>{n}\n"
+
+ + "o>{o}\n"
+
+ + "pa>{pa}\n"
+ + "pi>{pi}\n"
+ + "pu>{pu}\n"
+ + "pe>{pe}\n"
+ + "po>{po}\n"
+ + "py[{vowel}>{pi}|{small}\n"
+ + "p[p>{xtu}\n"
+
+ + "qa>{ku}{xa}\n"
+ + "qi>{ku}{xi}\n"
+ + "qu>{ku}{xu}\n"
+ + "qe>{ku}{xe}\n"
+ + "qo>{ku}{xo}\n"
+ + "qy[{vowel}>{ku}|{small}\n"
+ + "q[q>{xtu}\n"
+
+ + "ra>{ra}\n"
+ + "ri>{ri}\n"
+ + "ru>{ru}\n"
+ + "re>{re}\n"
+ + "ro>{ro}\n"
+ + "ry[{vowel}>{ri}|{small}\n"
+ + "r[r>{xtu}\n"
+
+ + "sa>{sa}\n"
+ + "si>{si}\n"
+ + "su>{su}\n"
+ + "se>{se}\n"
+ + "so>{so}\n"
+ + "sy[{vowel}>{si}|{small}\n"
+ + "s[sh>{xtu}\n"
+ + "s[s>{xtu}\n"
+
+ + "ta>{ta}\n"
+ + "ti>{ti}\n"
+ + "tu>{tu}\n"
+ + "te>{te}\n"
+ + "to>{to}\n"
+ + "th[{vowel}>{te}|{small}\n"
+ + "tsa>{tu}{xa}\n"
+ + "tsi>{tu}{xi}\n"
+ + "tse>{tu}{xe}\n"
+ + "tso>{tu}{xo}\n"
+ + "ty[{vowel}>{ti}|{small}\n"
+ + "t[ts>{xtu}\n"
+ + "t[ch>{xtu}\n"
+ + "t[t>{xtu}\n"
+
+ + "u>{u}\n"
+
+ + "va>{VA}\n"
+ + "vi>{VI}\n"
+ + "vu>{vu}\n"
+ + "ve>{VE}\n"
+ + "vo>{VO}\n"
+ + "vy[{vowel}>{VI}|{small}\n"
+ + "v[v>{xtu}\n"
+
+ + "wa>{wa}\n"
+ + "wi>{wi}\n"
+ + "we>{we}\n"
+ + "wo>{wo}\n"
+ + "w[w>{xtu}\n"
+
+ + "ya>{ya}\n"
+ + "yu>{yu}\n"
+ + "ye>{i}{xe}\n"
+ + "yo>{yo}\n"
+ + "y[y>{xtu}\n"
+
+ + "za>{za}\n"
+ + "zi>{zi}\n"
+ + "zu>{zu}\n"
+ + "ze>{ze}\n"
+ + "zo>{zo}\n"
+ + "zy[{vowel}>{zi}|{small}\n"
+ + "z[z>{xtu}\n"
+
+ + "xa>{xa}\n"
+ + "xi>{xi}\n"
+ + "xu>{xu}\n"
+ + "xe>{xe}\n"
+ + "xo>{xo}\n"
+ + "xka>{XKA}\n"
+ + "xke>{XKE}\n"
+ + "xtu>{xtu}\n"
+ + "xwa>{xwa}\n"
+ + "xya>{xya}\n"
+ + "xyu>{xyu}\n"
+ + "xyo>{xyo}\n"
+
+ // optional mappings
+ + "wu>{u}\n"
+
+ + "ca>{ka}\n"
+ + "ci>{si}\n"
+ + "cu>{ku}\n"
+ + "ce>{se}\n"
+ + "co>{ko}\n"
+ + "cha>{ti}{xya}\n"
+ + "chi>{ti}\n"
+ + "chu>{ti}{xyu}\n"
+ + "che>{ti}{xe}\n"
+ + "cho>{ti}{xyo}\n"
+ + "cy[{vowel}>{ti}|{small}\n"
+ + "c[k>{xtu}\n"
+ + "c[c>{xtu}\n"
+
+ + "fu>{hu}\n"
+
+ + "ja>{zi}{xya}\n"
+ + "ji>{zi}\n"
+ + "ju>{zi}{xyu}\n"
+ + "je>{zi}{xe}\n"
+ + "jo>{zi}{xyo}\n"
+ + "jy[{vowel}>{zi}|{small}\n"
+ + "j[j>{xtu}\n"
+
+ + "la>{ra}\n"
+ + "li>{ri}\n"
+ + "lu>{ru}\n"
+ + "le>{re}\n"
+ + "lo>{ro}\n"
+ + "ly[{vowel}>{ri}|{small}\n"
+ + "l[l>{xtu}\n"
+
+ + "sha>{si}{xya}\n"
+ + "shi>{si}\n"
+ + "shu>{si}{xyu}\n"
+ + "she>{si}{xe}\n"
+ + "sho>{si}{xyo}\n"
+
+ + "tsu>{tu}\n"
+
+ + "yi>{i}\n"
+
+ + "xtsu>{xtu}\n"
+ + "xyi>{xi}\n"
+ + "xye>{xe}\n"
+
+
+
+
+
+
+
+ // After a {small} marker (emitted by the "..y[{vowel}" rules above), map a/u/o to small ya/yu/yo and i/e to small i/e
+ + "{small}a>{xya}\n"
+ + "{small}i>{xi}\n"
+ + "{small}u>{xyu}\n"
+ + "{small}e>{xe}\n"
+ + "{small}o>{xyo}\n"
+
+
+
+ // Kana-to-Latin ('<') rules start here; a rule matching {hv} emits the {hvr} marker consumed by the final rules.
+ + "gy|{hvr}<{gi}[{hv}\n"
+ + "gwa<{gu}{xwa}\n"
+ + "gwi<{gu}{xi}\n"
+ + "gwu<{gu}{xu}\n"
+ + "gwe<{gu}{xe}\n"
+ + "gwo<{gu}{xo}\n"
+ + "ga<{ga}\n"
+ + "gi<{gi}\n"
+ + "gu<{gu}\n"
+ + "ge<{ge}\n"
+ + "go<{go}\n"
+
+ + "ky|{hvr}<{ki}[{hv}\n"
+ + "kwa<{ku}{xwa}\n"
+ + "kwi<{ku}{xi}\n"
+ + "kwu<{ku}{xu}\n"
+ + "kwe<{ku}{xe}\n"
+ + "kwo<{ku}{xo}\n"
+ + "qa<{ku}{xa}\n"
+ + "qya<{ku}{xya}\n"
+ + "qyu<{ku}{xyu}\n"
+ + "qyo<{ku}{xyo}\n"
+ + "ka<{ka}\n"
+ + "ki<{ki}\n"
+ + "ku<{ku}\n"
+ + "ke<{ke}\n"
+ + "ko<{ko}\n"
+
+ + "j|{hvr}<{zi}[{hv}\n" // Hepburn
+ + "za<{za}\n"
+ + "ji<{zi}\n" // Hepburn
+ + "zu<{zu}\n"
+ + "ze<{ze}\n"
+ + "zo<{zo}\n"
+
+ + "sh|{hvr}<{si}[{hv}\n" // Hepburn
+ + "sa<{sa}\n"
+ + "shi<{si}\n"
+ + "su<{su}\n"
+ + "se<{se}\n"
+ + "so<{so}\n"
+
+ + "j|{hvr}<{di}[{hv}\n" // Hepburn
+ + "dh|{hvr}<{de}[{hv}\n"
+ + "da<{da}\n"
+ + "ji<{di}\n" // Hepburn
+ + "de<{de}\n"
+ + "do<{do}\n"
+ + "zu<{du}\n" // Hepburn
+
+ + "ch|{hvr}<{ti}[{hv}\n" // Hepburn
+ + "tsa<{tu}{xa}\n"
+ + "tsi<{tu}{xi}\n"
+ + "tse<{tu}{xe}\n"
+ + "tso<{tu}{xo}\n"
+ + "th|{hvr}<{te}[{hv}\n"
+ + "ta<{ta}\n"
+ + "chi<{ti}\n" // Hepburn
+ + "tsu<{tu}\n" // Hepburn
+ + "te<{te}\n"
+ + "to<{to}\n"
+
+ + "ny|{hvr}<{ni}[{hv}\n"
+ + "na<{na}\n"
+ + "ni<{ni}\n"
+ + "nu<{nu}\n"
+ + "ne<{ne}\n"
+ + "no<{no}\n"
+
+ + "by|{hvr}<{bi}[{hv}\n"
+ + "ba<{ba}\n"
+ + "bi<{bi}\n"
+ + "bu<{bu}\n"
+ + "be<{be}\n"
+ + "bo<{bo}\n"
+
+ + "py|{hvr}<{pi}[{hv}\n"
+ + "pa<{pa}\n"
+ + "pi<{pi}\n"
+ + "pu<{pu}\n"
+ + "pe<{pe}\n"
+ + "po<{po}\n"
+
+ + "hy|{hvr}<{hi}[{hv}\n"
+ + "fa<{hu}{xa}\n"
+ + "fi<{hu}{xi}\n"
+ + "fe<{hu}{xe}\n"
+ + "fo<{hu}{xo}\n"
+ + "fya<{hu}{xya}\n"
+ + "fyu<{hu}{xyu}\n"
+ + "fyo<{hu}{xyo}\n"
+ + "ha<{ha}\n"
+ + "hi<{hi}\n"
+ + "fu<{hu}\n" // Hepburn
+ + "he<{he}\n"
+ + "ho<{ho}\n"
+
+ + "my|{hvr}<{mi}[{hv}\n"
+ + "ma<{ma}\n"
+ + "mi<{mi}\n"
+ + "mu<{mu}\n"
+ + "me<{me}\n"
+ + "mo<{mo}\n"
+
+ + "ya<{ya}\n"
+ + "yu<{yu}\n"
+ + "ye<{i}{xe}\n"
+ + "yo<{yo}\n"
+ + "xya<{xya}\n"
+ + "xyu<{xyu}\n"
+ + "xyo<{xyo}\n"
+
+ + "ry|{hvr}<{ri}[{hv}\n"
+ + "ra<{ra}\n"
+ + "ri<{ri}\n"
+ + "ru<{ru}\n"
+ + "re<{re}\n"
+ + "ro<{ro}\n"
+
+ + "wa<{wa}\n"
+ + "wi<{wi}\n"
+ + "we<{we}\n"
+ + "wo<{wo}\n"
+
+ + "vu<{vu}\n"
+ + "vy|{hvr}<{VI}[{hv}\n"
+ + "v<{xtu}[{vu}\n"
+
+ + "xa<{xa}\n"
+ + "xi<{xi}\n"
+ + "xu<{xu}\n"
+ + "xe<{xe}\n"
+ + "xo<{xo}\n"
+ // {n} followed by a vowel kana, an n-row kana, ya/yu/yo or another {n} is written n'' instead of plain n
+ + "n''<{n}[{a}\n"
+ + "n''<{n}[{i}\n"
+ + "n''<{n}[{u}\n"
+ + "n''<{n}[{e}\n"
+ + "n''<{n}[{o}\n"
+ + "n''<{n}[{na}\n"
+ + "n''<{n}[{ni}\n"
+ + "n''<{n}[{nu}\n"
+ + "n''<{n}[{ne}\n"
+ + "n''<{n}[{no}\n"
+ + "n''<{n}[{ya}\n"
+ + "n''<{n}[{yu}\n"
+ + "n''<{n}[{yo}\n"
+ + "n''<{n}[{n}\n"
+ + "n<{n}\n"
+
+ // {xtu} followed by a listed kana is written as that kana's leading consonant; any other {xtu} falls through to "xtu"
+ + "g<{xtu}[{ga}\n"
+ + "g<{xtu}[{gi}\n"
+ + "g<{xtu}[{gu}\n"
+ + "g<{xtu}[{ge}\n"
+ + "g<{xtu}[{go}\n"
+ + "k<{xtu}[{ka}\n"
+ + "k<{xtu}[{ki}\n"
+ + "k<{xtu}[{ku}\n"
+ + "k<{xtu}[{ke}\n"
+ + "k<{xtu}[{ko}\n"
+
+ + "z<{xtu}[{za}\n"
+ + "z<{xtu}[{zi}\n"
+ + "z<{xtu}[{zu}\n"
+ + "z<{xtu}[{ze}\n"
+ + "z<{xtu}[{zo}\n"
+ + "s<{xtu}[{sa}\n"
+ + "s<{xtu}[{si}\n"
+ + "s<{xtu}[{su}\n"
+ + "s<{xtu}[{se}\n"
+ + "s<{xtu}[{so}\n"
+
+ + "d<{xtu}[{da}\n"
+ + "d<{xtu}[{di}\n"
+ + "d<{xtu}[{du}\n"
+ + "d<{xtu}[{de}\n"
+ + "d<{xtu}[{do}\n"
+ + "t<{xtu}[{ta}\n"
+ + "t<{xtu}[{ti}\n"
+ + "t<{xtu}[{tu}\n"
+ + "t<{xtu}[{te}\n"
+ + "t<{xtu}[{to}\n"
+
+
+ + "b<{xtu}[{ba}\n"
+ + "b<{xtu}[{bi}\n"
+ + "b<{xtu}[{bu}\n"
+ + "b<{xtu}[{be}\n"
+ + "b<{xtu}[{bo}\n"
+ + "p<{xtu}[{pa}\n"
+ + "p<{xtu}[{pi}\n"
+ + "p<{xtu}[{pu}\n"
+ + "p<{xtu}[{pe}\n"
+ + "p<{xtu}[{po}\n"
+ + "h<{xtu}[{ha}\n"
+ + "h<{xtu}[{hi}\n"
+ + "h<{xtu}[{hu}\n"
+ + "h<{xtu}[{he}\n"
+ + "h<{xtu}[{ho}\n"
+
+
+ + "r<{xtu}[{ra}\n"
+ + "r<{xtu}[{ri}\n"
+ + "r<{xtu}[{ru}\n"
+ + "r<{xtu}[{re}\n"
+ + "r<{xtu}[{ro}\n"
+
+ + "w<{xtu}[{wa}\n"
+ + "xtu<{xtu}\n"
+
+ + "a<{a}\n"
+ + "i<{i}\n"
+ + "u<{u}\n"
+ + "e<{e}\n"
+ + "o<{o}\n"
+
+
+
+ // After an {hvr} marker, convert small ya/i/yu/e/yo back to the plain vowels a/i/u/e/o
+ + "a<{hvr}{xya}\n"
+ + "i<{hvr}{xi}\n"
+ + "u<{hvr}{xyu}\n"
+ + "e<{hvr}{xe}\n"
+ + "o<{hvr}{xyo}\n"
+ }
+ };
+ }
+}
+
+
+
diff --git a/icu4j/src/com/ibm/text/resources/TransliterationRule$StraightQuotes$CurlyQuotes.java b/icu4j/src/com/ibm/text/resources/TransliterationRule$StraightQuotes$CurlyQuotes.java
new file mode 100755
index 00000000000..409d0a1e29b
--- /dev/null
+++ b/icu4j/src/com/ibm/text/resources/TransliterationRule$StraightQuotes$CurlyQuotes.java
@@ -0,0 +1,87 @@
+package com.ibm.text.resources;
+
+import java.util.ListResourceBundle;
+
+public class TransliterationRuleStraightQuotesCurlyQuotes extends ListResourceBundle {
+ /**
+ * Overrides ListResourceBundle. Returns two entries: "Description" (a one-line summary) and "Rule" (variable definitions followed by rules, one per newline-terminated line, concatenated into a single string).
+ */
+ public Object[][] getContents() {
+ return new Object[][] {
+ { "Description",
+ "Use left and right double quotes" },
+
+ { "Rule",
+ // Rewritten using character codes [LIU]
+ "white=[[:Zs:][:Zl:][:Zp:]]\n"
+ + "black=[^[:Zs:][:Zl:][:Zp:]]\n"
+ + "open=[[:Ps:]]\n"
+ + "dquote=\"\n"
+ // NOTE(review): lAng, ldAng, lBrk and lBrc are defined here but no rule in this string references them.
+ + "lAng=\u3008\n"
+ + "ldAng=\u300A\n"
+ + "lBrk='['\n"
+ + "lBrc='{'\n"
+
+ + "lquote=\u2018\n"
+ + "rquote=\u2019\n"
+ + "ldquote=\u201C\n"
+ + "rdquote=\u201D\n"
+
+ + "ldguill=\u00AB\n"
+ + "rdguill=\u00BB\n"
+ + "lguill=\u2039\n"
+ + "rguill=\u203A\n"
+
+ + "mdash=\u2014\n"
+
+ //#######################################
+ // Conversions from input
+ //#######################################
+
+ // join single quotes: a curly single quote plus a straight or matching curly single quote becomes the curly double quote
+ + "{lquote}''>{ldquote}\n"
+ + "{lquote}{lquote}>{ldquote}\n"
+ + "{rquote}''>{rdquote}\n"
+ + "{rquote}{rquote}>{rdquote}\n"
+
+ // smart single quotes: left quote in {white} or {open} context, right quote in {black} context, left quote otherwise (presumably the part before ']' is preceding context - TODO confirm)
+ + "{white}]''>{lquote}\n"
+ + "{open}]''>{lquote}\n"
+ + "{black}]''>{rquote}\n"
+ + "''>{lquote}\n"
+
+ // smart doubles: same pattern as the single-quote rules, applied to {dquote}
+ + "{white}]{dquote}>{ldquote}\n"
+ + "{open}]{dquote}>{ldquote}\n"
+ + "{black}]{dquote}>{rdquote}\n"
+ + "{dquote}>{ldquote}\n"
+
+ // join single guillemets
+ + "{rguill}{rguill}>{rdguill}\n"
+ + "'>>'>{rdguill}\n"
+ + "{lguill}{lguill}>{ldguill}\n"
+ + "'<<'>{ldguill}\n"
+
+ // prevent double spaces: a space in the context of another space is replaced by nothing
+ + " ] >\n"
+
+ // join hyphens into dash
+ + "-->{mdash}\n"
+
+ //#######################################
+ // Conversions back to input
+ //#######################################
+
+ //smart quotes
+ + "''<{lquote}\n"
+ + "''<{rquote}\n"
+ + "{dquote}<{ldquote}\n"
+ + "{dquote}<{rdquote}\n"
+
+ //hyphens
+ + "--<{mdash}\n"
+ }
+ };
+ }
+}