diff --git a/icu4j/src/com/ibm/demo/translit/Demo.java b/icu4j/src/com/ibm/demo/translit/Demo.java new file mode 100755 index 00000000000..d02953d5036 --- /dev/null +++ b/icu4j/src/com/ibm/demo/translit/Demo.java @@ -0,0 +1,253 @@ +import java.applet.*; +import java.awt.*; +import java.awt.event.*; +import java.util.*; +import com.ibm.text.components.*; +import com.ibm.text.*; + +/** + * A frame that allows the user to experiment with keyboard + * transliteration. This class has a main() method so it can be run + * as an application. The frame contains an editable text component + * and uses keyboard transliteration to process keyboard events. + * + *

Copyright (c) IBM Corporation 1999. All rights reserved. + * + * @author Alan Liu + * @version $RCSfile: Demo.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $ + */ +public class Demo extends Frame { + + static final boolean DEBUG = false; + + Transliterator translit = null; + + boolean compound = false; + Transliterator[] compoundTranslit = new Transliterator[MAX_COMPOUND]; + static final int MAX_COMPOUND = 128; + int compoundCount = 0; + + TransliteratingTextComponent text = null; + + Menu translitMenu; + CheckboxMenuItem translitItem; + CheckboxMenuItem noTranslitItem; + + static final String NO_TRANSLITERATOR = "None"; + + private static final String COPYRIGHT = + "\u00A9 IBM Corporation 1999. All rights reserved."; + + public static void main(String[] args) { + Frame f = new Demo(600, 200); + f.addWindowListener(new WindowAdapter() { + public void windowClosing(WindowEvent e) { + System.exit(0); + } + }); + f.setVisible(true); + } + + public Demo(int width, int height) { + super("Transliteration Demo"); + + initMenus(); + + addWindowListener(new WindowAdapter() { + public void windowClosing(WindowEvent e) { + handleClose(); + } + }); + + text = new TransliteratingTextComponent(); + Font font = new Font("serif", Font.PLAIN, 48); + text.setFont(font); + text.setSize(width, height); + text.setVisible(true); + text.setText("\u03B1\u05D0\u3042\u4E80"); + add(text); + + setSize(width, height); + } + + private void initMenus() { + MenuBar mbar; + Menu menu; + MenuItem mitem; + CheckboxMenuItem citem; + + setMenuBar(mbar = new MenuBar()); + mbar.add(menu = new Menu("File")); + menu.add(mitem = new MenuItem("Quit")); + mitem.addActionListener(new ActionListener() { + public void actionPerformed(ActionEvent e) { + handleClose(); + } + }); + + final ItemListener setTransliteratorListener = new ItemListener() { + public void itemStateChanged(ItemEvent e) { + CheckboxMenuItem item = (CheckboxMenuItem) e.getSource(); + if (e.getStateChange() == ItemEvent.DESELECTED) { 
+ // Don't let the current transliterator be deselected. + // Just reselect it. + item.setState(true); + } else if (compound) { + // Adding an item to a compound transliterator + handleAddToCompound(item.getLabel()); + } else if (item != translitItem) { + // Deselect previous choice. Don't need to call + // setState(true) on new choice. + translitItem.setState(false); + translitItem = item; + handleSetTransliterator(item.getLabel()); + } + } + }; + + translit = null; + mbar.add(translitMenu = new Menu("Transliterator")); + translitMenu.add(translitItem = noTranslitItem = + new CheckboxMenuItem(NO_TRANSLITERATOR, true)); + noTranslitItem.addItemListener(new ItemListener() { + public void itemStateChanged(ItemEvent e) { + // Can't uncheck None -- any action here sets None to true + setNoTransliterator(); + } + }); + + translitMenu.addSeparator(); + + translitMenu.add(citem = new CheckboxMenuItem("Compound")); + citem.addItemListener(new ItemListener() { + public void itemStateChanged(ItemEvent e) { + CheckboxMenuItem item = (CheckboxMenuItem) e.getSource(); + if (e.getStateChange() == ItemEvent.DESELECTED) { + // If compound gets deselected, then select NONE + setNoTransliterator(); + } else if (!compound) { + // Switching from non-compound to compound + translitItem.setState(false); + translitItem = item; + translit = null; + compound = true; + compoundCount = 0; + for (int i=0; i 0) { + v.setElementAt(b, i); + v.setElementAt(a, j); + a = b; + } + } + } + return v; + } + + private void setNoTransliterator() { + translitItem = noTranslitItem; + noTranslitItem.setState(true); + handleSetTransliterator(noTranslitItem.getLabel()); + compound = false; + for (int i=0; i. + */ + private static Transliterator decodeTranslitItem(String name) { + return (name.equals(NO_TRANSLITERATOR)) + ? 
null : Transliterator.getInstance(name); + } + + private void handleBatchTransliterate() { + if (translit == null) { + return; + } + + int start = text.getSelectionStart(); + int end = text.getSelectionEnd(); + ReplaceableString s = + new ReplaceableString(text.getText().substring(start, end)); + + StringBuffer log = null; + if (DEBUG) { + log = new StringBuffer(); + log.append('"' + s.toString() + "\" (start " + start + + ", end " + end + ") -> \""); + } + + translit.transliterate(s); + String str = s.toString(); + + if (DEBUG) { + log.append(str + "\""); + System.out.println("Batch " + translit.getID() + ": " + log.toString()); + } + + text.replaceRange(str, start, end); + text.select(start, start + str.length()); + } + + private void handleClose() { + dispose(); + } +} diff --git a/icu4j/src/com/ibm/demo/translit/DemoApplet.java b/icu4j/src/com/ibm/demo/translit/DemoApplet.java new file mode 100755 index 00000000000..21b256ebc26 --- /dev/null +++ b/icu4j/src/com/ibm/demo/translit/DemoApplet.java @@ -0,0 +1,62 @@ + +import java.awt.*; +import java.awt.event.*; +import java.applet.*; +import com.ibm.text.components.AppletFrame; + +/** + * A simple Applet that shows a button. When pressed, the button + * shows the DemoAppletFrame. This Applet is meant to be embedded + * in a web page. + * + *

Copyright (c) IBM Corporation 1999. All rights reserved. + * + * @author Alan Liu + * @version $RCSfile: DemoApplet.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $ + */ +public class DemoApplet extends Applet { + + Demo frame = null; + + private static final String COPYRIGHT = + "\u00A9 IBM Corporation 1999. All rights reserved."; + + public static void main(String args[]) { + final DemoApplet applet = new DemoApplet(); + new AppletFrame("Transliteration Demo", applet, 640, 480); + } + + public void init() { + + Button button = new Button("Transliteration Demo"); + button.addActionListener(new ActionListener() { + public void actionPerformed(ActionEvent e) { + if (frame == null) { + frame = new Demo(600, 200); + frame.addWindowListener(new WindowAdapter() { + public void windowClosing(WindowEvent we) { + frame = null; + } + }); + } + frame.setVisible(true); + frame.toFront(); + } + }); + + add(button); + + Dimension size = button.getPreferredSize(); + size.width += 10; + size.height += 10; + + resize(size); + } + + public void stop() { + if (frame != null) { + frame.dispose(); + } + frame = null; + } +} diff --git a/icu4j/src/com/ibm/demo/translit/demo.bat b/icu4j/src/com/ibm/demo/translit/demo.bat new file mode 100755 index 00000000000..88f63e3446f --- /dev/null +++ b/icu4j/src/com/ibm/demo/translit/demo.bat @@ -0,0 +1,7 @@ +REM For best results, run the demo as an applet inside of Netscape +REM with Bitstream Cyberbit installed. 
+ +REM setup your JDK 1.1.x path and classpath here: +call JDK11 +set CLASSPATH=../translit.jar;%CLASSPATH% +javaw Demo diff --git a/icu4j/src/com/ibm/demo/translit/demo.html b/icu4j/src/com/ibm/demo/translit/demo.html new file mode 100755 index 00000000000..6327daf6504 --- /dev/null +++ b/icu4j/src/com/ibm/demo/translit/demo.html @@ -0,0 +1,8 @@ + + +Transliteration Demo + + + + + diff --git a/icu4j/src/com/ibm/icu/dev/demo/translit/Demo.java b/icu4j/src/com/ibm/icu/dev/demo/translit/Demo.java new file mode 100755 index 00000000000..d02953d5036 --- /dev/null +++ b/icu4j/src/com/ibm/icu/dev/demo/translit/Demo.java @@ -0,0 +1,253 @@ +import java.applet.*; +import java.awt.*; +import java.awt.event.*; +import java.util.*; +import com.ibm.text.components.*; +import com.ibm.text.*; + +/** + * A frame that allows the user to experiment with keyboard + * transliteration. This class has a main() method so it can be run + * as an application. The frame contains an editable text component + * and uses keyboard transliteration to process keyboard events. + * + *

Copyright (c) IBM Corporation 1999. All rights reserved. + * + * @author Alan Liu + * @version $RCSfile: Demo.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $ + */ +public class Demo extends Frame { + + static final boolean DEBUG = false; + + Transliterator translit = null; + + boolean compound = false; + Transliterator[] compoundTranslit = new Transliterator[MAX_COMPOUND]; + static final int MAX_COMPOUND = 128; + int compoundCount = 0; + + TransliteratingTextComponent text = null; + + Menu translitMenu; + CheckboxMenuItem translitItem; + CheckboxMenuItem noTranslitItem; + + static final String NO_TRANSLITERATOR = "None"; + + private static final String COPYRIGHT = + "\u00A9 IBM Corporation 1999. All rights reserved."; + + public static void main(String[] args) { + Frame f = new Demo(600, 200); + f.addWindowListener(new WindowAdapter() { + public void windowClosing(WindowEvent e) { + System.exit(0); + } + }); + f.setVisible(true); + } + + public Demo(int width, int height) { + super("Transliteration Demo"); + + initMenus(); + + addWindowListener(new WindowAdapter() { + public void windowClosing(WindowEvent e) { + handleClose(); + } + }); + + text = new TransliteratingTextComponent(); + Font font = new Font("serif", Font.PLAIN, 48); + text.setFont(font); + text.setSize(width, height); + text.setVisible(true); + text.setText("\u03B1\u05D0\u3042\u4E80"); + add(text); + + setSize(width, height); + } + + private void initMenus() { + MenuBar mbar; + Menu menu; + MenuItem mitem; + CheckboxMenuItem citem; + + setMenuBar(mbar = new MenuBar()); + mbar.add(menu = new Menu("File")); + menu.add(mitem = new MenuItem("Quit")); + mitem.addActionListener(new ActionListener() { + public void actionPerformed(ActionEvent e) { + handleClose(); + } + }); + + final ItemListener setTransliteratorListener = new ItemListener() { + public void itemStateChanged(ItemEvent e) { + CheckboxMenuItem item = (CheckboxMenuItem) e.getSource(); + if (e.getStateChange() == ItemEvent.DESELECTED) { 
+ // Don't let the current transliterator be deselected. + // Just reselect it. + item.setState(true); + } else if (compound) { + // Adding an item to a compound transliterator + handleAddToCompound(item.getLabel()); + } else if (item != translitItem) { + // Deselect previous choice. Don't need to call + // setState(true) on new choice. + translitItem.setState(false); + translitItem = item; + handleSetTransliterator(item.getLabel()); + } + } + }; + + translit = null; + mbar.add(translitMenu = new Menu("Transliterator")); + translitMenu.add(translitItem = noTranslitItem = + new CheckboxMenuItem(NO_TRANSLITERATOR, true)); + noTranslitItem.addItemListener(new ItemListener() { + public void itemStateChanged(ItemEvent e) { + // Can't uncheck None -- any action here sets None to true + setNoTransliterator(); + } + }); + + translitMenu.addSeparator(); + + translitMenu.add(citem = new CheckboxMenuItem("Compound")); + citem.addItemListener(new ItemListener() { + public void itemStateChanged(ItemEvent e) { + CheckboxMenuItem item = (CheckboxMenuItem) e.getSource(); + if (e.getStateChange() == ItemEvent.DESELECTED) { + // If compound gets deselected, then select NONE + setNoTransliterator(); + } else if (!compound) { + // Switching from non-compound to compound + translitItem.setState(false); + translitItem = item; + translit = null; + compound = true; + compoundCount = 0; + for (int i=0; i 0) { + v.setElementAt(b, i); + v.setElementAt(a, j); + a = b; + } + } + } + return v; + } + + private void setNoTransliterator() { + translitItem = noTranslitItem; + noTranslitItem.setState(true); + handleSetTransliterator(noTranslitItem.getLabel()); + compound = false; + for (int i=0; i. + */ + private static Transliterator decodeTranslitItem(String name) { + return (name.equals(NO_TRANSLITERATOR)) + ? 
null : Transliterator.getInstance(name); + } + + private void handleBatchTransliterate() { + if (translit == null) { + return; + } + + int start = text.getSelectionStart(); + int end = text.getSelectionEnd(); + ReplaceableString s = + new ReplaceableString(text.getText().substring(start, end)); + + StringBuffer log = null; + if (DEBUG) { + log = new StringBuffer(); + log.append('"' + s.toString() + "\" (start " + start + + ", end " + end + ") -> \""); + } + + translit.transliterate(s); + String str = s.toString(); + + if (DEBUG) { + log.append(str + "\""); + System.out.println("Batch " + translit.getID() + ": " + log.toString()); + } + + text.replaceRange(str, start, end); + text.select(start, start + str.length()); + } + + private void handleClose() { + dispose(); + } +} diff --git a/icu4j/src/com/ibm/icu/dev/demo/translit/DemoApplet.java b/icu4j/src/com/ibm/icu/dev/demo/translit/DemoApplet.java new file mode 100755 index 00000000000..21b256ebc26 --- /dev/null +++ b/icu4j/src/com/ibm/icu/dev/demo/translit/DemoApplet.java @@ -0,0 +1,62 @@ + +import java.awt.*; +import java.awt.event.*; +import java.applet.*; +import com.ibm.text.components.AppletFrame; + +/** + * A simple Applet that shows a button. When pressed, the button + * shows the DemoAppletFrame. This Applet is meant to be embedded + * in a web page. + * + *

Copyright (c) IBM Corporation 1999. All rights reserved. + * + * @author Alan Liu + * @version $RCSfile: DemoApplet.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $ + */ +public class DemoApplet extends Applet { + + Demo frame = null; + + private static final String COPYRIGHT = + "\u00A9 IBM Corporation 1999. All rights reserved."; + + public static void main(String args[]) { + final DemoApplet applet = new DemoApplet(); + new AppletFrame("Transliteration Demo", applet, 640, 480); + } + + public void init() { + + Button button = new Button("Transliteration Demo"); + button.addActionListener(new ActionListener() { + public void actionPerformed(ActionEvent e) { + if (frame == null) { + frame = new Demo(600, 200); + frame.addWindowListener(new WindowAdapter() { + public void windowClosing(WindowEvent we) { + frame = null; + } + }); + } + frame.setVisible(true); + frame.toFront(); + } + }); + + add(button); + + Dimension size = button.getPreferredSize(); + size.width += 10; + size.height += 10; + + resize(size); + } + + public void stop() { + if (frame != null) { + frame.dispose(); + } + frame = null; + } +} diff --git a/icu4j/src/com/ibm/icu/dev/demo/translit/demo.bat b/icu4j/src/com/ibm/icu/dev/demo/translit/demo.bat new file mode 100755 index 00000000000..88f63e3446f --- /dev/null +++ b/icu4j/src/com/ibm/icu/dev/demo/translit/demo.bat @@ -0,0 +1,7 @@ +REM For best results, run the demo as an applet inside of Netscape +REM with Bitstream Cyberbit installed. 
+ +REM setup your JDK 1.1.x path and classpath here: +call JDK11 +set CLASSPATH=../translit.jar;%CLASSPATH% +javaw Demo diff --git a/icu4j/src/com/ibm/icu/dev/demo/translit/demo.html b/icu4j/src/com/ibm/icu/dev/demo/translit/demo.html new file mode 100755 index 00000000000..6327daf6504 --- /dev/null +++ b/icu4j/src/com/ibm/icu/dev/demo/translit/demo.html @@ -0,0 +1,8 @@ + + +Transliteration Demo + + + + + diff --git a/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java b/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java new file mode 100755 index 00000000000..96433f64a26 --- /dev/null +++ b/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java @@ -0,0 +1,763 @@ +import com.ibm.text.*; +import java.text.*; +import java.util.*; + +/** + * @test + * @summary General test of Transliterator + */ +public class TransliteratorTest extends IntlTest { + + public static void main(String[] args) throws Exception { + new TransliteratorTest().run(args); + } + + /** + * A CommonPoint legacy round-trip test for the Kana transliterator. 
+ */ +// public void TestKanaRoundTrip() { +// Transliterator t = Transliterator.getInstance("Kana"); +// StringTokenizer tok = new StringTokenizer(KANA_RT_DATA); +// while (tok.hasMoreTokens()) { +// String str = tok.nextToken(); +// ReplaceableString tmp = new ReplaceableString(str); +// t.transliterate(tmp, Transliterator.FORWARD); +// +// str = tmp.toString(); +// tmp = new ReplaceableString(str); +// t.transliterate(tmp, Transliterator.REVERSE); +// t.transliterate(tmp, Transliterator.FORWARD); +// if (!tmp.toString().equals(str)) { +// tmp = new ReplaceableString(str); +// t.transliterate(tmp, Transliterator.REVERSE); +// String a = tmp.toString(); +// t.transliterate(tmp, Transliterator.FORWARD); +// errln("FAIL: " + escape(str) + " -> " + +// escape(a) + " -> " + escape(tmp.toString())); +// } +// } +// } + + public void TestInstantiation() { + long ms = System.currentTimeMillis(); + String ID; + for (Enumeration e = Transliterator.getAvailableIDs(); e.hasMoreElements(); ) { + ID = (String) e.nextElement(); + try { + Transliterator t = Transliterator.getInstance(ID); + // We should get a new instance if we try again + Transliterator t2 = Transliterator.getInstance(ID); + if (t != t2) { + logln(ID + ":" + t); + } else { + errln("FAIL: " + ID + " returned identical instances"); + } + } catch (IllegalArgumentException ex) { + errln("FAIL: " + ID); + throw ex; + } + } + + // Now test the failure path + try { + ID = ""; + Transliterator t = Transliterator.getInstance(ID); + errln("FAIL: " + ID + " returned " + t); + } catch (IllegalArgumentException ex) { + logln("OK: Bogus ID handled properly"); + } + + ms = System.currentTimeMillis() - ms; + logln("Elapsed time: " + ms + " ms"); + } + + public void TestSimpleRules() { + /* Example: rules 1. ab>x|y + * 2. 
yc>z + * + * []|eabcd start - no match, copy e to tranlated buffer + * [e]|abcd match rule 1 - copy output & adjust cursor + * [ex|y]cd match rule 2 - copy output & adjust cursor + * [exz]|d no match, copy d to transliterated buffer + * [exzd]| done + */ + expect("ab>x|y\n" + + "yc>z", + "eabcd", "exzd"); + + /* Another set of rules: + * 1. ab>x|yzacw + * 2. za>q + * 3. qc>r + * 4. cw>n + * + * []|ab Rule 1 + * [x|yzacw] No match + * [xy|zacw] Rule 2 + * [xyq|cw] Rule 4 + * [xyqn]| Done + */ + expect("ab>x|yzacw\n" + + "za>q\n" + + "qc>r\n" + + "cw>n", + "ab", "xyqn"); + + /* Test categories + */ + Transliterator t = new RuleBasedTransliterator("", + "dummy=\uE100\n" + + "vowel=[aeiouAEIOU]\n" + + "lu=[:Lu:]\n" + + "{vowel}[{lu}>!\n" + + "{vowel}>&\n" + + "!]{lu}>^\n" + + "{lu}>*\n" + + "a>ERROR"); + expect(t, "abcdefgABCDEFGU", "&bcd&fg!^**!^*&"); + } + + // Restore this test if/when it's been deciphered. In general, + // tests that depend on a specific tranliterator are subject + // to the same fragility as tests that depend on resource data. + +// public void TestKana() { +// String DATA[] = { +// "a", "\u3042", +// "A", "\u30A2", +// "aA", "\u3042\u30A2", +// "aaaa", "\u3042\u3042\u3042\u3042", +// "akasata", "\u3042\u304B\u3055\u305F", +// }; +// +// Transliterator t = Transliterator.getInstance("Latin-Kana"); +// Transliterator rt = Transliterator.getInstance("Kana-Latin"); +// for (int i=0; izyx\n" + + "ab>yz\n" + + "bc>zx\n" + + "ca>xy\n" + + "a>x\n" + + "b>y\n" + + "c>z\n" + + + "abc", RULES); + Transliterator rev = new RuleBasedTransliterator("", RULES, + RuleBasedTransliterator.REVERSE, null); + for (int i=0; i", + "psch>Y\n" + +"ps>y\n" + +"ch>x\n" + +"a>A\n"); + String DATA[] = { + // insertion, buffer + "a", "A", + "p", "Ap", + "s", "Aps", + "c", "Apsc", + "a", "AycA", + "psch", "AycAY", + null, "AycAY", // null means finishKeyboardTransliteration + }; + + keyboardAux(t, DATA); + } + + /** + * Basic test of keyboard with cursor. 
+ */ + public void TestKeyboard2() { + Transliterator t = new RuleBasedTransliterator("", + "ych>Y\n" + +"ps>|y\n" + +"ch>x\n" + +"a>A\n"); + String DATA[] = { + // insertion, buffer + "a", "A", + "p", "Ap", + "s", "Ay", + "c", "Ayc", + "a", "AycA", + "p", "AycAp", + "s", "AycAy", + "c", "AycAyc", + "h", "AycAY", + null, "AycAY", // null means finishKeyboardTransliteration + }; + + keyboardAux(t, DATA); + } + + /** + * Test keyboard transliteration with back-replacement. + */ + public void TestKeyboard3() { + // We want th>z but t>y. Furthermore, during keyboard + // transliteration we want t>y then yh>z if t, then h are + // typed. + String RULES = + "t>|y\n" + + "yh>z\n" + + ""; + + String[] DATA = { + // Column 1: characters to add to buffer (as if typed) + // Column 2: expected appearance of buffer after + // keyboard xliteration. + "a", "a", + "b", "ab", + "t", "aby", + "c", "abyc", + "t", "abycy", + "h", "abycz", + null, "abycz", // null means finishKeyboardTransliteration + }; + + Transliterator t = new RuleBasedTransliterator("", RULES); + keyboardAux(t, DATA); + } + + private void keyboardAux(Transliterator t, String[] DATA) { + int[] index = {0, 0, 0}; + ReplaceableString s = new ReplaceableString(); + for (int i=0; i "); + t.keyboardTransliterate(s, index, DATA[i]); + } else { + log = new StringBuffer(s.toString() + " => "); + t.finishKeyboardTransliteration(s, index); + } + String str = s.toString(); + // Show the start index '{' and the cursor '|' + log.append(str.substring(0, index[Transliterator.START])). + append('{'). + append(str.substring(index[Transliterator.START], + index[Transliterator.CURSOR])). + append('|'). 
+ append(str.substring(index[Transliterator.CURSOR])); + if (str.equals(DATA[i+1])) { + logln(log.toString()); + } else { + errln("FAIL: " + log.toString() + ", expected " + DATA[i+1]); + } + } + } + + public void TestArabic() { + String DATA[] = { + "Arabic", "\u062a\u062a\u0645\u062a\u0639\u0020"+ + "\u0627\u0644\u0644\u063a\u0629\u0020"+ + "\u0627\u0644\u0639\u0631\u0628\u0628\u064a\u0629\u0020"+ + "\u0628\u0628\u0646\u0638\u0645\u0020"+ + "\u0643\u062a\u0627\u0628\u0628\u064a\u0629\u0020"+ + "\u062c\u0645\u064a\u0644\u0629", + }; + + Transliterator t = Transliterator.getInstance("Latin-Arabic"); + for (int i=0; i", trans); + + expect(t, "aaaaa", "aaaaa"); + } + + /** + * Compose the hex transliterators forward and reverse. + */ + public void TestCompoundHex() { + Transliterator a = Transliterator.getInstance("Unicode-Hex"); + Transliterator b = Transliterator.getInstance("Hex-Unicode"); + Transliterator[] trans = { a, b }; + Transliterator ab = new CompoundTransliterator("ab", trans); + String s = "abcde"; + expect(ab, s, s); + + trans = new Transliterator[] { b, a }; + Transliterator ba = new CompoundTransliterator("ba", trans); + ReplaceableString str = new ReplaceableString(s); + a.transliterate(str); + expect(ba, str.toString(), str.toString()); + } + + /** + * Do some basic tests of filtering. 
+ */ + public void TestFiltering() { + Transliterator hex = Transliterator.getInstance("Unicode-Hex"); + hex.setFilter(new UnicodeFilter() { + public boolean isIn(char c) { + return c != 'c'; + } + }); + String s = "abcde"; + String out = hex.transliterate(s); + String exp = "\\u0061\\u0062c\\u0064\\u0065"; + if (out.equals(exp)) { + logln("Ok: \"" + exp + "\""); + } else { + logln("FAIL: \"" + out + "\", wanted \"" + exp + "\""); + } + } + + //====================================================================== + // Support methods + //====================================================================== + + void expect(String rules, String source, String expectedResult) { + expect(new RuleBasedTransliterator("", rules), source, expectedResult); + } + + void expect(Transliterator t, String source, String expectedResult, + Transliterator reverseTransliterator) { + expect(t, source, expectedResult); + if (reverseTransliterator != null) { + expect(reverseTransliterator, expectedResult, source); + } + } + + void expect(Transliterator t, String source, String expectedResult) { + String result = t.transliterate(source); + expectAux(t.getID() + ":String", source, result, expectedResult); + + ReplaceableString rsource = new ReplaceableString(source); + t.transliterate(rsource); + result = rsource.toString(); + expectAux(t.getID() + ":Replaceable", source, result, expectedResult); + + // Test keyboard (incremental) transliteration -- this result + // must be the same after we finalize (see below). + rsource.getStringBuffer().setLength(0); + int[] index = { 0, 0, 0 }; + StringBuffer log = new StringBuffer(); + + for (int i=0; i "); + t.keyboardTransliterate(rsource, index, + String.valueOf(source.charAt(i))); + // Append the string buffer with a vertical bar '|' where + // the committed index is. + String s = rsource.toString(); + log.append(s.substring(0, index[Transliterator.CURSOR])). + append('|'). 
+ append(s.substring(index[Transliterator.CURSOR])); + } + + // As a final step in keyboard transliteration, we must call + // transliterate to finish off any pending partial matches that + // were waiting for more input. + t.finishKeyboardTransliteration(rsource, index); + result = rsource.toString(); + log.append(" => ").append(rsource.toString()); + + expectAux(t.getID() + ":Keyboard", log.toString(), + result.equals(expectedResult), + expectedResult); + } + + void expectAux(String tag, String source, + String result, String expectedResult) { + expectAux(tag, source + " -> " + result, + result.equals(expectedResult), + expectedResult); + } + + void expectAux(String tag, String summary, boolean pass, + String expectedResult) { + if (pass) { + logln("("+tag+") " + escape(summary)); + } else { + errln("FAIL: ("+tag+") " + + escape(summary) + + ", expected " + escape(expectedResult)); + } + } + + /** + * Escape non-ASCII characters as Unicode. + */ + public static final String escape(String s) { + StringBuffer buf = new StringBuffer(); + for (int i=0; i= ' ' && c <= 0x007F) { + buf.append(c); + } else { + buf.append("\\u"); + if (c < 0x1000) { + buf.append('0'); + if (c < 0x100) { + buf.append('0'); + if (c < 0x10) { + buf.append('0'); + } + } + } + buf.append(Integer.toHexString(c)); + } + } + return buf.toString(); + } + + /* + static final String KANA_RT_DATA = +"a "+ + +"ba bi bu be bo "+ +"bya byi byu bye byo "+ +"bba "+ + +"da di du de do "+ +"dya dyi dyu dye dyo "+ +"dha dhi dhu dhe dho "+ +"dda "+ + +"e "+ + +"fa fi fe fo "+ +"fya fyu fyo "+ +"ffa "+ + +"ga gi gu ge go "+ +"gya gyi gyu gye gyo "+ +"gwa gwi gwu gwe gwo "+ +"gga "+ + +"ha hi hu he ho "+ +"hya hyi hyu hye hyo "+ +"hha "+ + +"i "+ + +"ka ki ku ke ko "+ +"kwa kwi kwu kwe kwo "+ +"kya kyi kyu kye kyo "+ +"kka "+ + +"ma mi mu me mo "+ +"mya myi myu mye myo "+ +"mba mfa mma mpa mva "+ +"m'' "+ + +"na ni nu ne no "+ +"nya nyi nyu nye nyo "+ +"nn n'' n "+ + +"o "+ + +"pa pi pu pe po "+ +"pya pyi pyu 
pye pyo "+ +"ppa "+ + +"qa qi qu qe qo "+ +"qya qyi qyu qye qyo "+ +"qqa "+ + +"ra ri ru re ro "+ +"rya ryi ryu rye ryo "+ +"rra "+ + +"sa si su se so "+ +"sya syi syu sye syo "+ +"ssya ssa "+ + +"ta ti tu te to "+ +"tha thi thu the tho "+ +"tsa tsi tse tso "+ +"tya tyi tyu tye tyo "+ +"ttsa "+ +"tta "+ + +"u "+ + +"va vi vu ve vo "+ +"vya vyi vyu vye vyo "+ +"vva "+ + +"wa wi we wo "+ +"wwa "+ + +"ya yu ye yo "+ +"yya "+ + +"za zi zu ze zo "+ +"zya zyi zyu zye zyo "+ +"zza "+ + +"xa xi xu xe xo "+ +"xka xke "+ +"xtu "+ +"xwa "+ +"xya xyu xyo "+ + + "akka akki akku akke akko "+ + "akkya akkyu akkyo "+ + + "atta atti attu atte atto "+ + "attya attyu attyo "+ + "adda addi addu adde addo "+ + + "atcha atchi atchu atche atcho "+ + + "assa assi assu asse asso "+ + "assya assyu assyo "+ + + "ahha ahhi ahhu ahhe ahho "+ + "appa appi appu appe appo "+ + + "an "+ + "ana ani anu ane ano "+ + "anna anni annu anne anno "+ + "an'a an'i an'u an'e an'o "+ + + "annna annni annnu annne annno "+ + "an'na an'ni an'nu an'ne an'no "+ + + "anka anki anku anke anko "+ + "anga angi angu ange ango "+ + + "ansa ansi ansu anse anso "+ + "anza anzi anzu anze anzo "+ + "anzya anzyu anzyo "+ + + "anta anti antu ante anto "+ + "antya antyu antyo "+ + "anda andi andu ande ando "+ + + "ancha anchi anchu anche ancho "+ + "anja anji anju anje anjo "+ + "antsa antsu antso "+ + + "anpa anpi anpu anpe anpo "+ + "ampa ampi ampu ampe ampo "+ + + "anba anbi anbu anbe anbo "+ + "amba ambi ambu ambe ambo "+ + + "anma anmi anmu anme anmo "+ + "amma ammi ammu amme ammo "+ + + "anwa anwi anwu anwe anwo "+ + + "anha anhi anhu anhe anho "+ + + "anya anyi anyu anye anyo "+ + "annya annyi annyu annye annyo "+ + "an'ya an'yi an'yu an'ye an'yo "+ + + "kkk "+ + "ggg "+ + "sss "+ + "zzz "+ + "ttt "+ + "ddd "+ + "nnn "+ + "hhh "+ + "bbb "+ + "ppp "+ + "mmm "+ + "yyy "+ + "rrr "+ + "www "; +*/ + + /*+ + + "A I U E O "+ + "XA XI XU XE XO "+ + + "KA KI KU KE KO "+ + "KYA KYI KYU KYE KYO "+ + "KWA KWI KWU KWE KWO "+ + "QA 
QI QU QE QO "+ + "QYA QYI QYU QYE QYO "+ + "XKA XKE "+ + + "GA GI GU GE GO "+ + "GYA GYI GYU GYE GYO "+ + "GWA GWI GWU GWE GWO "+ + + "SA SI SU SE SO "+ + "SHA SHI SHU SHE SHO "+ + "SYA SYI SYU SYE SYO "+ + + "ZA ZI ZU ZE ZO "+ + "ZYA ZYI ZYU ZYE ZYO "+ + "JA JI JU JE JO "+ + "JYA JYU JYO "+ + + "TA TI TU TE TO "+ + "XTU XTSU "+ + "TYA TYU TYO "+ + "CYA CYU CYO "+ + "CHA CHI CHU CHE CHO "+ + "TSA TSI TSU TSE TSO "+ + "DA DI DU DE DO "+ + "DYA DYU DYO "+ + "THA THI THU THE THO "+ + "DHA DHI DHU DHE DHO "+ + + "NA NI NU NE NO "+ + "NYA NYU NYO "+ + + "HA HI HU HE HO "+ + "HYA HYU HYO "+ + "FA FI FU FE FO "+ + "FYA FYU FYO "+ + "BA BI BU BE BO "+ + "BYA BYU BYO "+ + "PA PI PU PE PO "+ + "PYA PYU PYO "+ + + "MA MI MU ME MO "+ + "MYA MYU MYO "+ + "YA YI YU YE YO "+ + "XYA XYI XYU XYE XYO "+ + + "RA RI RU RE RO "+ + "LA LI LU LE LO "+ + "RYA RYI RYU RYE RYO "+ + "LYA LYI LYU LYE LYO "+ + + "WA WI WU WE WO "+ + "VA VI VU VE VO "+ + "VYA VYU VYO "+ + + "CYA CYI CYU CYE CYO "+ + + "NN "+ + "N' "+ + "N "+ + + "AKKA AKKI AKKU AKKE AKKO "+ + "AKKYA AKKYU AKKYO "+ + + "ATTA ATTI ATTU ATTE ATTO "+ + "ATTYA ATTYU ATTYO "+ + "ADDA ADDI ADDU ADDE ADDO "+ + + "ATCHA ATCHI ATCHU ATCHE ATCHO "+ + + "ASSA ASSI ASSU ASSE ASSO "+ + "ASSYA ASSYU ASSYO "+ + + "AHHA AHHI AHHU AHHE AHHO "+ + "APPA APPI APPU APPE APPO "+ + + "AN "+ + "ANA ANI ANU ANE ANO "+ + "ANNA ANNI ANNU ANNE ANNO "+ + "AN'A AN'I AN'U AN'E AN'O "+ + + "ANNNA ANNNI ANNNU ANNNE ANNNO "+ + "AN'NA AN'NI AN'NU AN'NE AN'NO "+ + + "ANKA ANKI ANKU ANKE ANKO "+ + "ANGA ANGI ANGU ANGE ANGO "+ + + "ANSA ANSI ANSU ANSE ANSO "+ + "ANZA ANZI ANZU ANZE ANZO "+ + "ANZYA ANZYU ANZYO "+ + + "ANTA ANTI ANTU ANTE ANTO "+ + "ANTYA ANTYU ANTYO "+ + "ANDA ANDI ANDU ANDE ANDO "+ + + "ANCHA ANCHI ANCHU ANCHE ANCHO "+ + "ANJA ANJI ANJU ANJE ANJO "+ + "ANTSA ANTSU ANTSO "+ + + "ANPA ANPI ANPU ANPE ANPO "+ + "AMPA AMPI AMPU AMPE AMPO "+ + + "ANBA ANBI ANBU ANBE ANBO "+ + "AMBA AMBI AMBU AMBE AMBO "+ + + "ANMA ANMI ANMU ANME ANMO "+ + "AMMA AMMI AMMU 
AMME AMMO "+ + + "ANWA ANWI ANWU ANWE ANWO "+ + + "ANHA ANHI ANHU ANHE ANHO "+ + + "ANYA ANYI ANYU ANYE ANYO "+ + "ANNYA ANNYI ANNYU ANNYE ANNYO "+ + "AN'YA AN'YI AN'YU AN'YE AN'YO "+ + + "KKK "+ + "GGG "+ + "SSS "+ + "ZZZ "+ + "TTT "+ + "DDD "+ + "NNN "+ + "HHH "+ + "BBB "+ + "PPP "+ + "MMM "+ + "YYY "+ + "RRR "+ + "WWW";*/ +} diff --git a/icu4j/src/com/ibm/icu/dev/test/translit/UnicodeSetTest.java b/icu4j/src/com/ibm/icu/dev/test/translit/UnicodeSetTest.java new file mode 100755 index 00000000000..8417faf4b44 --- /dev/null +++ b/icu4j/src/com/ibm/icu/dev/test/translit/UnicodeSetTest.java @@ -0,0 +1,118 @@ +import com.ibm.text.*; +import java.text.*; +import java.util.*; + +/** + * @test + * @summary General test of UnicodeSet + */ +public class UnicodeSetTest extends IntlTest { + + public static void main(String[] args) throws Exception { + new UnicodeSetTest().run(args); + } + + public void TestPatterns() { + UnicodeSet set = new UnicodeSet(); + expectPattern(set, "[[a-m]&[d-z]&[k-y]]", "km"); + expectPattern(set, "[[a-z]-[m-y]-[d-r]]", "aczz"); + expectPattern(set, "[a\\-z]", "--aazz"); + expectPattern(set, "[-az]", "--aazz"); + expectPattern(set, "[az-]", "--aazz"); + expectPattern(set, "[[[a-z]-[aeiou]i]]", "bdfnptvz"); + + // Throw in a test of complement + set.complement(); + String exp = '\u0000' + "aeeoouu" + (char)('z'+1) + '\uFFFF'; + expectPairs(set, exp); + } + + public void TestAddRemove() { + UnicodeSet set = new UnicodeSet(); + set.add('a', 'z'); + expectPairs(set, "az"); + set.remove('m', 'p'); + expectPairs(set, "alqz"); + set.remove('e', 'g'); + expectPairs(set, "adhlqz"); + set.remove('d', 'i'); + expectPairs(set, "acjlqz"); + set.remove('c', 'r'); + expectPairs(set, "absz"); + set.add('f', 'q'); + expectPairs(set, "abfqsz"); + set.remove('a', 'g'); + expectPairs(set, "hqsz"); + set.remove('a', 'z'); + expectPairs(set, ""); + + // Try removing an entire set from another set + expectPattern(set, "[c-x]", "cx"); + UnicodeSet set2 = new 
UnicodeSet(); + expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz"); + set.removeAll(set2); + expectPairs(set, "deluxx"); + + // Try adding an entire set to another set + expectPattern(set, "[jackiemclean]", "aacceein"); + expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort"); + set.addAll(set2); + expectPairs(set, "aacehort"); + + // Test commutativity + expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort"); + expectPattern(set2, "[jackiemclean]", "aacceein"); + set.addAll(set2); + expectPairs(set, "aacehort"); + } + + void expectPattern(UnicodeSet set, + String pattern, + String expectedPairs) { + set.applyPattern(pattern); + if (!set.getPairs().equals(expectedPairs)) { + errln("FAIL: applyPattern(\"" + pattern + + "\") => pairs \"" + + escape(set.getPairs()) + "\", expected \"" + + escape(expectedPairs) + "\""); + } else { + logln("Ok: applyPattern(\"" + pattern + + "\") => pairs \"" + + escape(set.getPairs()) + "\""); + } + } + + void expectPairs(UnicodeSet set, String expectedPairs) { + if (!set.getPairs().equals(expectedPairs)) { + errln("FAIL: Expected pair list \"" + + escape(expectedPairs) + "\", got \"" + + escape(set.getPairs()) + "\""); + } + } + + /** + * Escape non-ASCII characters as Unicode. 
+ */ + static final String escape(String s) { + StringBuffer buf = new StringBuffer(); + for (int i=0; i= ' ' && c <= 0x007F) { + buf.append(c); + } else { + buf.append("\\u"); + if (c < 0x1000) { + buf.append('0'); + if (c < 0x100) { + buf.append('0'); + if (c < 0x10) { + buf.append('0'); + } + } + } + buf.append(Integer.toHexString(c)); + } + } + return buf.toString(); + } +} diff --git a/icu4j/src/com/ibm/icu/text/CompoundTransliterator.java b/icu4j/src/com/ibm/icu/text/CompoundTransliterator.java new file mode 100755 index 00000000000..c3582237d42 --- /dev/null +++ b/icu4j/src/com/ibm/icu/text/CompoundTransliterator.java @@ -0,0 +1,285 @@ +package com.ibm.text; + +import java.util.Enumeration; +import java.util.Vector; + +/** + * A transliterator that is composed of two or more other + * transliterator objects linked together. For example, if one + * transliterator transliterates from script A to script B, and + * another transliterates from script B to script C, the two may be + * combined to form a new transliterator from A to C. + * + *

Composed transliterators may not behave as expected. For + * example, inverses may not combine to form the identity + * transliterator. See the class documentation for {@link + * Transliterator} for details. + * + *

If a non-null UnicodeFilter is applied to a + * CompoundTransliterator, it has the effect of being + * logically ANDed with the filter of each transliterator in + * the chain. + * + *

Copyright © IBM Corporation 1999. All rights reserved. + * + * @author Alan Liu + * @version $RCSfile: CompoundTransliterator.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $ + */ +public class CompoundTransliterator extends Transliterator { + + private static final boolean DEBUG = false; + + private Transliterator[] trans; + + private static final String COPYRIGHT = + "\u00A9 IBM Corporation 1999. All rights reserved."; + + /** + * Constructs a new compound transliterator given an array of + * transliterators. The array of transliterators may be of any + * length, including zero or one, however, useful compound + * transliterators have at least two components. + * @param transliterators array of Transliterator + * objects + * @param filter the filter. Any character for which + * filter.isIn() returns false will not be + * altered by this transliterator. If filter is + * null then no filtering is applied. + */ + public CompoundTransliterator(String ID, Transliterator[] transliterators, + UnicodeFilter filter) { + super(ID, filter); + trans = new Transliterator[transliterators.length]; + System.arraycopy(transliterators, 0, trans, 0, trans.length); + } + + /** + * Constructs a new compound transliterator given an array of + * transliterators. The array of transliterators may be of any + * length, including zero or one, however, useful compound + * transliterators have at least two components. + * @param transliterators array of Transliterator + * objects + */ + public CompoundTransliterator(String ID, Transliterator[] transliterators) { + this(ID, transliterators, null); + } + + /** + * Returns the number of transliterators in this chain. + * @return number of transliterators in this chain. + */ + public int getCount() { + return trans.length; + } + + /** + * Returns the transliterator at the given index in this chain. 
+ * @param index index into chain, from 0 to getCount() - 1 + * @return transliterator at the given index + */ + public Transliterator getTransliterator(int index) { + return trans[index]; + } + + /** + * Transliterates a segment of a string. Transliterator API. + * @param text the string to be transliterated + * @param start the beginning index, inclusive; 0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= text.length(). + * @return the new limit index + */ + public int transliterate(Replaceable text, int start, int limit) { + for (int i=0; i abca/u + * S C L S C L gl=f->a + * + * 2. upup, changes "x" to "XX" + * + * 4 7 a 4 7 a + * abca/u => abcAA/u + * S CL S C + * L gl=a->b + * 3. u-h, changes Unicode to hex + * + * 4 7 a 4 7 a d 0 3 + * abcAA/u => abc/u0041/u0041/u + * S C L S C + * L gl=b->15 + * 4. return + * + * 4 7 a d 0 3 + * abc/u0041/u0041/u + * S C L + */ + + /** + * One more wrinkle. If there is a filter F for the compound + * transliterator as a whole, then we need to modify every + * non-null filter f in the chain to be f' = F & f. Then, + * when we're done, we restore the original filters. + * + * A possible future optimization is to change f to f' at + * construction time, but then if anyone else is using the + * transliterators in the chain outside of this context, they + * will get unexpected results. + */ + UnicodeFilter F = getFilter(); + UnicodeFilter[] f = null; + if (F != null) { + f = new UnicodeFilter[trans.length]; + for (int i=0; i \"")); + } + + trans[i].handleKeyboardTransliterate(text, index); + + if (DEBUG) { + System.out.println(escape( + substring(text, index[START], index[CURSOR]) + '|' + + substring(text, index[CURSOR], index[LIMIT]) + + '"')); + } + + // Adjust overall limit for insertions/deletions + globalLimit += index[LIMIT] - limit; + limit = index[CURSOR]; // Move limit to end of committed text + } + // Cursor is good where it is -- where the last + // transliterator left it. 
Limit needs to be put back + // where it was, modulo adjustments for deletions/insertions. + index[LIMIT] = globalLimit; + + } finally { + // Fixup the transliterator filters, if we had to modify them. + if (f != null) { + for (int i=0; ipreceding context. + * @return maximum number of preceding context characters this + * transliterator needs to examine + */ + protected int getMaximumContextLength() { + int max = 0; + for (int i=0; i max) { + max = len; + } + } + return max; + } + + /** + * DEBUG + * Returns a substring of a Replaceable. + */ + private static final String substring(Replaceable str, int start, int limit) { + StringBuffer buf = new StringBuffer(); + while (start < limit) { + buf.append(str.charAt(start++)); + } + return buf.toString(); + } + + /** + * DEBUG + * Escapes non-ASCII characters as Unicode. + */ + private static final String escape(String s) { + StringBuffer buf = new StringBuffer(); + for (int i=0; i= ' ' && c <= 0x007F) { + buf.append(c); + } else { + buf.append("\\u"); + if (c < 0x1000) { + buf.append('0'); + if (c < 0x100) { + buf.append('0'); + if (c < 0x10) { + buf.append('0'); + } + } + } + buf.append(Integer.toHexString(c)); + } + } + return buf.toString(); + } +} diff --git a/icu4j/src/com/ibm/icu/text/HexToUnicodeTransliterator.java b/icu4j/src/com/ibm/icu/text/HexToUnicodeTransliterator.java new file mode 100755 index 00000000000..18673e15fe7 --- /dev/null +++ b/icu4j/src/com/ibm/icu/text/HexToUnicodeTransliterator.java @@ -0,0 +1,130 @@ +package com.ibm.text; +import java.util.*; + +/** + * A transliterator that converts from hexadecimal Unicode + * escape sequences to the characters they represent. For example, "U+0040" + * and '\u0040'. It recognizes the + * prefixes "U+", "u+", "\U", and "\u". Hex values may be + * upper- or lowercase. + * + *

Copyright © IBM Corporation 1999. All rights reserved. + * + * @author Alan Liu + * @version $RCSfile: HexToUnicodeTransliterator.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $ + */ +public class HexToUnicodeTransliterator extends Transliterator { + private static final String COPYRIGHT = + "\u00A9 IBM Corporation 1999. All rights reserved."; + + /** + * Package accessible ID for this transliterator. + */ + static String _ID = "Hex-Unicode"; + + /** + * Constructs a transliterator. + */ + public HexToUnicodeTransliterator() { + super(_ID, null); + } + + /** + * Transliterates a segment of a string. Transliterator API. + * @param text the string to be transliterated + * @param start the beginning index, inclusive; 0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= text.length(). + * @return the new limit index + */ + public int transliterate(Replaceable text, int start, int limit) { + int[] offsets = { start, limit, start }; + handleKeyboardTransliterate(text, offsets); + return offsets[LIMIT]; + } + + /** + * Implements {@link Transliterator#handleKeyboardTransliterate}. + */ + protected void handleKeyboardTransliterate(Replaceable text, + int[] offsets) { + /** + * Performs transliteration changing Unicode hexadecimal + * escapes to characters. For example, "U+0040" -> '@'. A fixed + * set of prefixes is recognized: "\u", "\U", "u+", "U+". 
+ */ + int cursor = offsets[CURSOR]; + int limit = offsets[LIMIT]; + + int maxCursor = limit - 6; + loop: + while (cursor <= maxCursor) { + char c = filteredCharAt(text, cursor + 5); + int digit0 = Character.digit(c, 16); + if (digit0 < 0) { + if (c == '\\') { + cursor += 5; + } else if (c == 'U' || c == 'u' || c == '+') { + cursor += 4; + } else { + cursor += 6; + } + continue; + } + + int u = digit0; + + for (int i=4; i>=2; --i) { + c = filteredCharAt(text, cursor + i); + int digit = Character.digit(c, 16); + if (digit < 0) { + if (c == 'U' || c == 'u' || c == '+') { + cursor += i-1; + } else { + cursor += 6; + } + continue loop; + } + u |= digit << (4 * (5-i)); + } + + c = filteredCharAt(text, cursor); + char d = filteredCharAt(text, cursor + 1); + if (((c == 'U' || c == 'u') && d == '+') + || (c == '\\' && (d == 'U' || d == 'u'))) { + + // At this point, we have a match; replace cursor..cursor+5 + // with u. + text.replace(cursor, cursor+6, String.valueOf((char) u)); + limit -= 5; + maxCursor -= 5; + + ++cursor; + } else { + cursor += 6; + } + } + + offsets[LIMIT] = limit; + offsets[CURSOR] = cursor; + } + + private char filteredCharAt(Replaceable text, int i) { + char c; + UnicodeFilter filter = getFilter(); + return (filter == null) ? text.charAt(i) : + (filter.isIn(c = text.charAt(i)) ? c : '\uFFFF'); + } + + /** + * Return the length of the longest context required by this transliterator. + * This is preceding context. 
+ * @param direction either FORWARD or REVERSE + * @return maximum number of preceding context characters this + * transliterator needs to examine + */ + protected int getMaximumContextLength() { + return 0; + } +} diff --git a/icu4j/src/com/ibm/icu/text/Replaceable.java b/icu4j/src/com/ibm/icu/text/Replaceable.java new file mode 100755 index 00000000000..b4c8519689c --- /dev/null +++ b/icu4j/src/com/ibm/icu/text/Replaceable.java @@ -0,0 +1,77 @@ +package com.ibm.text; + +/** + * Replaceable is an interface that supports the + * operation of replacing a substring with another piece of text. + * Replaceable is needed in order to change a piece of + * text while retaining style attributes. For example, if the string + * "the bold font" has range (4, 8) replaced with "strong", + * then it becomes "the strong font". + * + *

Copyright © IBM Corporation 1999. All rights reserved. + * + * @author Alan Liu + * @version $RCSfile: Replaceable.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $ + */ +public interface Replaceable { + /** + * Return the number of characters in the text. + * @return number of characters in text + */ + int length(); + + /** + * Return the character at the given offset into the text. + * @param offset an integer between 0 and length()-1 + * inclusive + * @return character of text at given offset + */ + char charAt(int offset); + + /** + * Copies characters from this object into the destination + * character array. The first character to be copied is at index + * srcStart; the last character to be copied is at + * index srcLimit-1 (thus the total number of + * characters to be copied is srcLimit-srcStart). The + * characters are copied into the subarray of dst + * starting at index dstStart and ending at index + * dstStart + (srcLimit-srcStart) - 1. + * + * @param srcStart the beginning index to copy, inclusive; 0 + * <= start <= limit. + * @param srcLimit the ending index to copy, exclusive; + * start <= limit <= length(). + * @param dst the destination array. + * @param dstStart the start offset in the destination array. + */ + void getChars(int srcStart, int srcLimit, char dst[], int dstStart); + + /** + * Replace a substring of this object with the given text. + * @param start the beginning index, inclusive; 0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= length(). + * @param text the text to replace characters start + * to limit - 1 + */ + void replace(int start, int limit, String text); + + /** + * Replace a substring of this object with the given text. + * @param start the beginning index, inclusive; 0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= length(). 
+ * @param chars the text to replace characters start + * to limit - 1 + * @param charsStart the beginning index into chars, + * inclusive; 0 <= start <= limit. + * @param charsLen the number of characters of chars. + */ + void replace(int start, int limit, char[] chars, + int charsStart, int charsLen); + // Note: We use length rather than limit to conform to StringBuffer + // and System.arraycopy. +} diff --git a/icu4j/src/com/ibm/icu/text/ReplaceableString.java b/icu4j/src/com/ibm/icu/text/ReplaceableString.java new file mode 100755 index 00000000000..d6a7df06db5 --- /dev/null +++ b/icu4j/src/com/ibm/icu/text/ReplaceableString.java @@ -0,0 +1,159 @@ +package com.ibm.text; + +/** + * ReplaceableString is an adapter class that implements the + * Replaceable API around an ordinary StringBuffer. + * + *

Note: This class does not support attributes and is not + * intended for general use. Most clients will need to implement + * {@link Replaceable} in their text representation class. + * + *

Copyright © IBM Corporation 1999. All rights reserved. + * + * @see Replaceable + * @author Alan Liu + * @version $RCSfile: ReplaceableString.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $ + */ +public class ReplaceableString implements Replaceable { + private StringBuffer buf; + + private static final String COPYRIGHT = + "\u00A9 IBM Corporation 1999. All rights reserved."; + + /** + * Construct a new object with the given initial contents. + * @param str initial contents + */ + public ReplaceableString(String str) { + buf = new StringBuffer(str); + } + + /** + * Construct a new object using buf for internal + * storage. The contents of buf at the time of + * construction are used as the initial contents. Note! + * Modifications to buf will modify this object, and + * vice versa. + * @param buf object to be used as internal storage + */ + public ReplaceableString(StringBuffer buf) { + this.buf = buf; + } + + /** + * Construct a new empty object. + */ + public ReplaceableString() { + buf = new StringBuffer(); + } + + /** + * Return the contents of this object as a String. + * @return string contents of this object + */ + public String toString() { + return buf.toString(); + } + + /** + * Return the internal storage of this object. Note! Any + * changes made to the returned object affect this object's + * contents, and vice versa. + * @return internal buffer used by this object + */ + public StringBuffer getStringBuffer() { + return buf; + } + + /** + * Return the number of characters contained in this object. + * Replaceable API. + */ + public int length() { + return buf.length(); + } + + /** + * Return the character at the given position in this object. + * Replaceable API. + * @param offset offset into the contents, from 0 to + * length() - 1 + */ + public char charAt(int offset) { + return buf.charAt(offset); + } + + /** + * Copies characters from this object into the destination + * character array. 
The first character to be copied is at index + * srcStart; the last character to be copied is at + * index srcLimit-1 (thus the total number of + * characters to be copied is srcLimit-srcStart). The + * characters are copied into the subarray of dst + * starting at index dstStart and ending at index + * dstStart + (srcLimit-srcStart) - 1. + * + * @param srcStart the beginning index to copy, inclusive; 0 + * <= start <= limit. + * @param srcLimit the ending index to copy, exclusive; + * start <= limit <= length(). + * @param dst the destination array. + * @param dstStart the start offset in the destination array. + */ + public void getChars(int srcStart, int srcLimit, char dst[], int dstStart) { + buf.getChars(srcStart, srcLimit, dst, dstStart); + } + + /** + * Replace zero or more characters with new characters. + * Replaceable API. + * @param start the beginning index, inclusive; 0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= length(). + * @param text new text to replace characters start to + * limit - 1 + */ + public void replace(int start, int limit, String text) { + if (start == limit) { + buf.insert(start, text); + } else { + char[] tail = null; + if (limit < buf.length()) { + tail = new char[buf.length() - limit]; + buf.getChars(limit, buf.length(), tail, 0); + } + buf.setLength(start); + buf.append(text); + if (tail != null) { + buf.append(tail); + } + } + } + + /** + * Replace a substring of this object with the given text. + * @param start the beginning index, inclusive; 0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= length(). + * @param chars the text to replace characters start + * to limit - 1 + * @param charsStart the beginning index into chars, + * inclusive; 0 <= start <= limit. + * @param charsLen the number of characters of chars. 
+ */ + public void replace(int start, int limit, char[] chars, + int charsStart, int charsLen) { + char[] tail = null; + if (limit < buf.length()) { + tail = new char[buf.length() - limit]; + buf.getChars(limit, buf.length(), tail, 0); + } + buf.setLength(start); + buf.append(chars, charsStart, charsLen); + if (tail != null) { + buf.append(tail); + } + } +} diff --git a/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java b/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java new file mode 100755 index 00000000000..4a433e9479d --- /dev/null +++ b/icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java @@ -0,0 +1,1187 @@ +package com.ibm.text; + +import java.util.Hashtable; +import java.util.Vector; + +/** + * A transliterator that reads a set of rules in order to determine how to + * perform translations. Rules are stored in resource bundles indexed by name. + * Rules are separated by newline characters ('\n'); to include a literal + * newline, prefix it with a backslash ('\\\n'). Whitespace is significant. If + * the first character on a line is '#', the entire line is ignored as a + * comment. + * + *

Each set of rules consists of two groups, one forward, and one reverse. + * This is a convention that is not enforced; rules for one direction may be + * omitted, with the result that translations in that direction will not modify + * the source text. + * + *

Rule syntax + * + *

Rule statements take one of the following forms: + *

+ *
alefmadda=\u0622
+ * + *
Variable definition. The name on the left is + * assigned the character or expression on the right. Names may not + * contain any special characters (see list below). Duplicate names + * (including duplicates of simple variables or category names) + * cause an exception to be thrown. If the right hand side consists + * of one character, then the variable stands for that character. + * In this example, after this statement, instances of the left hand + * name surrounded by braces, "{alefmadda}", + * will be replaced by the Unicode character U+0622.
If the + * right hand side is longer than one character, then it is + * interpreted as a character category expression; see below for + * details. + * + *
softvowel=[eiyEIY]
+ * + *
Category definition. The name on the left is assigned + * to stand for a set of characters. The same rules for names of simple + * variables apply. After this statement, the left hand variable will be + * interpreted as indicating a set of characters in appropriate contexts. The + * pattern syntax defining sets of characters is defined by {@link UnicodeSet}. + * Examples of valid patterns are: + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
[abc] - The set containing the characters 'a', 'b', and 'c'.
[^abc] - The set of all characters except 'a', 'b', and 'c'.
[A-Z] - The set of all characters from 'A' to 'Z' in Unicode order.
[:Lu:] - The set of Unicode uppercase letters. See + * www.unicode.org + * for a complete list of categories and their two-letter codes.
[^a-z[:Lu:][:Ll:]] - The set of all characters except 'a' through 'z' and + * uppercase or lowercase letters.
+ * + * See {@link UnicodeSet} for more documentation and examples. + *
+ * + *
ai>{alefmadda}
+ * + *
Forward translation rule. This rule states that the + * string on the left will be changed to the string on the right when + * performing forward transliteration.
+ * + *
ai<{alefmadda}
+ * + *
Reverse translation rule. This rule states that the + * string on the right will be changed to the string on the left when + * performing reverse transliteration.
+ * + *
+ * + *

Forward and reverse translation rules consist of a match + * pattern and an output string. The match pattern consists + * of literal characters, optionally preceded by context, and optionally + * followed by context. Context characters, like literal pattern characters, + * must be matched in the text being transliterated. However, unlike literal + * pattern characters, they are not replaced by the output text. For example, + * the pattern "[abc]def" indicates the characters + * "def" must be preceded by "abc" for a successful + * match. If there is a successful match, "def" will be replaced, + * but not "abc". The initial '[' is optional, so + * "abc]def" is equivalent to "[abc]def". Another + * example is "123[456]" (or "123[456") in which the + * literal pattern "123" must be followed by "456". + * + *

The output string of a forward or reverse rule consists of characters to + * replace the literal pattern characters. If the output string contains the + * character '|', this is taken to indicate the location of the + * cursor after replacement. The cursor is the point in the text + * at which the next replacement, if any, will be applied. + * + *

Example + * + *

The following example rules illustrate many of the features of the rule + * language. + * + * + * + * + * + * + * + *
Rule 1. abc]def>x|y
Rule 2. xyz>r
Rule 3. yz>q
+ * + *

Applying these rules to the string "adefabcdefz" yields the + * following results: + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
|adefabcdefz - Initial state, no rules match. Advance cursor.
a|defabcdefz - Still no match. Rule 1 does not match because the preceding + * context is not present.
ad|efabcdefz - Still no match. Keep advancing until there is a match...
ade|fabcdefz - ...
adef|abcdefz - ...
adefa|bcdefz - ...
adefab|cdefz - ...
adefabc|defz - Rule 1 matches; replace "def" with "xy" + * and back up the cursor to before the 'y'.
adefabcx|yz - Although "xyz" is present, rule 2 does not match + * because the cursor is before the 'y', not before the + * 'x'. Rule 3 does match. Replace "yz" with + * "q".
adefabcxq| - The cursor is at the end; transliteration is complete.
+ * + *

The order of rules is significant. If multiple rules may match at some + * point, the first matching rule is applied. + * + *

Forward and reverse rules may have an empty output string. Otherwise, an + * empty left or right hand side of any statement is a syntax error. + * + *

Single quotes are used to quote the special characters + * =><{}[]|. To specify a single quote itself, inside or + * outside of quotes, use two single quotes in a row. For example, the rule + * "'>'>o''clock" changes the string ">" to + * the string "o'clock". + * + *

Notes + * + *

While a RuleBasedTransliterator is being built, it checks that the rules + * are added in proper order. For example, if the rule "a>x" is followed by the + * rule "ab>y", then adding the second rule will throw an exception. The reason is + * that the second rule can never be triggered, since the first rule always + * matches anything the second rule matches. In other words, the first rule masks + * the second rule. There is a cost of O(n^2) to make this check; in real-world + * tests it appears to approximately double build time. + * + *

One optimization that can be made is to add a pragma to the rule language, + * "#pragma order", that turns off ordering checking. This pragma can then be + * added to all of our resource-based rules (after we build these once and + * determine that there are no ordering errors). I haven't made this change yet + * in the interests of keeping the code from getting too byzantine. + * + *

Copyright © IBM Corporation 1999. All rights reserved. + * + * @author Alan Liu + * @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $ + */ +public class RuleBasedTransliterator extends Transliterator { + /** + * Direction constant passed to constructor to create a transliterator + * using the forward rules. + */ + public static final int FORWARD = 0; + + /** + * Direction constant passed to constructor to create a transliterator + * using the reverse rules. + */ + public static final int REVERSE = 1; + + private Data data; + + static final boolean DEBUG = false; + + private static final String COPYRIGHT = + "\u00A9 IBM Corporation 1999. All rights reserved."; + + /** + * Constructs a new transliterator from the given rules. + * @param rules rules, separated by '\n' + * @param direction either FORWARD or REVERSE. + * @exception IllegalArgumentException if rules are malformed + * or direction is invalid. + */ + public RuleBasedTransliterator(String ID, String rules, int direction, + UnicodeFilter filter) { + super(ID, filter); + if (direction != FORWARD && direction != REVERSE) { + throw new IllegalArgumentException("Invalid direction"); + } + data = parse(rules, direction); + } + + /** + * Constructs a new transliterator from the given rules in the + * FORWARD direction. + * @param rules rules, separated by '\n' + * @exception IllegalArgumentException if rules are malformed + * or direction is invalid. + */ + public RuleBasedTransliterator(String ID, String rules) { + this(ID, rules, FORWARD, null); + } + + RuleBasedTransliterator(String ID, Data data, UnicodeFilter filter) { + super(ID, filter); + this.data = data; + } + + static Data parse(String rules, int direction) { + return new Parser(rules, direction).getData(); + } + + /** + * Transliterates a segment of a string. Transliterator API. + * @param text the string to be transliterated + * @param start the beginning index, inclusive; 0 <= start + * <= limit. 
+ * @param limit the ending index, exclusive; start <= limit + * <= text.length(). + * @param result buffer to receive the transliterated text; previous + * contents are discarded + */ + public void transliterate(String text, int start, int limit, + StringBuffer result) { + /* In the following loop there is a virtual buffer consisting of the + * text transliterated so far followed by the untransliterated text. There is + * also a cursor, which may be in the already transliterated buffer or just + * before the untransliterated text. + * + * Example: rules 1. ab>x|y + * 2. yc>z + * + * []|eabcd start - no match, copy e to tranlated buffer + * [e]|abcd match rule 1 - copy output & adjust cursor + * [ex|y]cd match rule 2 - copy output & adjust cursor + * [exz]|d no match, copy d to transliterated buffer + * [exzd]| done + * + * cursor: an index into the virtual buffer, 0..result.length()-1. + * Matches take place at the cursor. If there is no match, the cursor + * is advanced, and one character is moved from the source text to the + * result buffer. + * + * start, limit: these designate the substring of the source text which + * has not been processed yet. The range of offsets is start..limit-1. + * At any moment the virtual buffer consists of result + + * text.substring(start, limit). + */ + int cursor = 0; + result.setLength(0); + while (start < limit || cursor < result.length()) { + TransliterationRule r = data.ruleSet.findMatch(text, start, limit, result, + cursor, data.setVariables, getFilter()); + if (DEBUG) { + StringBuffer buf = new StringBuffer( + result.toString() + '#' + text.substring(start, limit)); + buf.insert(cursor <= result.length() + ? cursor : (cursor + 1), + '|'); + System.err.print((r == null ? 
"nomatch:" : ("match:" + r + ", ")) + + buf); + } + + if (r == null) { + if (cursor == result.length()) { + result.append(text.charAt(start++)); + } + ++cursor; + } else { + // resultPad is length of result to right of cursor; >= 0 + int resultPad = result.length() - cursor; + char[] tail = null; + if (r.getKeyLength() > resultPad) { + start += r.getKeyLength() - resultPad; + } else if (r.getKeyLength() < resultPad) { + tail = new char[resultPad - r.getKeyLength()]; + result.getChars(cursor + r.getKeyLength(), result.length(), + tail, 0); + } + result.setLength(cursor); + result.append(r.getOutput()); + if (tail != null) { + result.append(tail); + } + cursor += r.getCursorPos(); + } + + if (DEBUG) { + StringBuffer buf = new StringBuffer( + result.toString() + '#' + text.substring(start, limit)); + buf.insert(cursor <= result.length() + ? cursor : (cursor + 1), + '|'); + System.err.println(" => " + buf); + } + } + } + + /** + * Transliterates a segment of a string. Transliterator API. + * @param text the string to be transliterated + * @param start the beginning index, inclusive; 0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= text.length(). + * @return The new limit index + */ + public int transliterate(Replaceable text, int start, int limit) { + /* When using Replaceable, the algorithm is simpler, since we don't have + * two separate buffers. We keep start and limit fixed the entire time, + * relative to the text -- limit may move numerically if text is + * inserted or removed. The cursor moves from start to limit, with + * replacements happening under it. + * + * Example: rules 1. ab>x|y + * 2. 
yc>z + * + * |eabcd start - no match, advance cursor + * e|abcd match rule 1 - change text & adjust cursor + * ex|ycd match rule 2 - change text & adjust cursor + * exz|d no match, advance cursor + * exzd| done + */ + int cursor = start; + while (cursor < limit) { + TransliterationRule r = data.ruleSet.findMatch(text, start, limit, + cursor, data.setVariables, getFilter()); + if (r == null) { + ++cursor; + } else { + text.replace(cursor, cursor + r.getKeyLength(), r.getOutput()); + limit += r.getOutput().length() - r.getKeyLength(); + cursor += r.getCursorPos(); + } + } + return limit; + } + + /** + * Implements {@link Transliterator#handleKeyboardTransliterate}. + */ + protected void handleKeyboardTransliterate(Replaceable text, + int[] index) { + int start = index[START]; + int limit = index[LIMIT]; + int cursor = index[CURSOR]; + + if (DEBUG) { + System.out.print("\"" + + escape(rsubstring(text, start, cursor)) + '|' + + escape(rsubstring(text, cursor, limit)) + "\""); + } + + boolean partial[] = new boolean[1]; + + while (cursor < limit) { + TransliterationRule r = data.ruleSet.findIncrementalMatch( + text, start, limit, cursor, data.setVariables, partial, getFilter()); + /* If we match a rule then apply it by replacing the key + * with the rule output and repositioning the cursor + * appropriately. If we get a partial match, then we + * can't do anything without more text; return with the + * cursor at the current position. If we get null, then + * there is no match at this position, and we can advance + * the cursor. 
+ */ + if (r == null) { + if (partial[0]) { + break; + } else { + ++cursor; + } + } else { + text.replace(cursor, cursor + r.getKeyLength(), r.getOutput()); + limit += r.getOutput().length() - r.getKeyLength(); + cursor += r.getCursorPos(); + } + } + + if (DEBUG) { + System.out.println(" -> \"" + + escape(rsubstring(text, start, cursor)) + '|' + + escape(rsubstring(text, cursor, cursor)) + '|' + + escape(rsubstring(text, cursor, limit)) + "\""); + } + + index[LIMIT] = limit; + index[CURSOR] = cursor; + } + + /** + * Returns the length of the longest context required by this transliterator. + * This is preceding context. + * @return Maximum number of preceding context characters this + * transliterator needs to examine + */ + protected int getMaximumContextLength() { + return data.ruleSet.getMaximumContextLength(); + } + + + /** + * FOR DEBUGGING: Return a substring of a Replaceable. + */ + private static String rsubstring(Replaceable r, int start, int limit) { + StringBuffer buf = new StringBuffer(); + while (start < limit) { + buf.append(r.charAt(start++)); + } + return buf.toString(); + } + + /** + * FOR DEBUGGING: Escape non-ASCII characters as Unicode. + */ + private static final String escape(String s) { + StringBuffer buf = new StringBuffer(); + for (int i=0; i= ' ' && c <= 0x007F) { + if (c == '\\') { + buf.append("\\\\"); // That is, "\\" + } else { + buf.append(c); + } + } else { + buf.append("\\u"); + if (c < 0x1000) { + buf.append('0'); + if (c < 0x100) { + buf.append('0'); + if (c < 0x10) { + buf.append('0'); + } + } + } + buf.append(Integer.toHexString(c)); + } + } + return buf.toString(); + } + + + + + + static class Data { + public Data() { + variableNames = new Hashtable(); + setVariables = new Hashtable(); + ruleSet = new TransliterationRuleSet(); + } + + /** + * Rule table. May be empty. + */ + public TransliterationRuleSet ruleSet; + + /** + * Map variable name (String) to variable (Character). 
A variable + * name may correspond to a single literal character, in which + * case the character is stored in this hash. It may also + * correspond to a UnicodeSet, in which case a character is + * again stored in this hash, but the character is a stand-in: it + * is a key for a secondary lookup in data.setVariables. The stand-in + * also represents the UnicodeSet in the stored rules. + */ + public Hashtable variableNames; + + /** + * Map category variable (Character) to set (UnicodeSet). + * Variables that correspond to a set of characters are mapped + * from variable name to a stand-in character in data.variableNames. + * The stand-in then serves as a key in this hash to look up the + * actual UnicodeSet object. In addition, the stand-in is + * stored in the rule text to represent the set of characters. + */ + public Hashtable setVariables; + } + + + + + + + private static class Parser { + private String rules; + + private int direction; + + private Data data; + + /** + * The next available stand-in for variables. This starts at some point in + * the private use area (discovered dynamically) and increments up toward + * variableLimit. At any point during parsing, available + * variables are variableNext..variableLimit-1. + */ + private char variableNext; + + /** + * The last available stand-in for variables. This is discovered + * dynamically. At any point during parsing, available variables are + * variableNext..variableLimit-1. 
+ */ + private char variableLimit; + + // Operators + private static final char VARIABLE_DEF_OP = '='; + private static final char FORWARD_RULE_OP = '>'; + private static final char REVERSE_RULE_OP = '<'; + + private static final String OPERATORS = "=><"; + + // Other special characters + private static final char QUOTE = '\''; + private static final char VARIABLE_REF_OPEN = '{'; + private static final char VARIABLE_REF_CLOSE = '}'; + private static final char CONTEXT_OPEN = '['; + private static final char CONTEXT_CLOSE = ']'; + private static final char CURSOR_POS = '|'; + private static final char RULE_COMMENT_CHAR = '#'; + + /** + * Specials must be quoted in rules to be used as literals. + * Specials may not occur in variable names. + */ + private static final String SPECIALS = "'{}[]|#" + OPERATORS; + + /** + * Specials that must be quoted in variable definitions. + */ + private static final String DEF_SPECIALS = "'{}"; + + /** + * @param rules list of rules, separated by newline characters + * @exception IllegalArgumentException if there is a syntax error in the + * rules + */ + public Parser(String rules, int direction) { + this.rules = rules; + this.direction = direction; + data = new Data(); + parseRules(); + } + + public Data getData() { + return data; + } + + /** + * Parse the given string as a sequence of rules, separated by newline + * characters ('\n'), and cause this object to implement those rules. Any + * previous rules are discarded. Typically this method is called exactly + * once, during construction. 
+ * @exception IllegalArgumentException if there is a syntax error in the + * rules + */ + private void parseRules() { + determineVariableRange(); + + int n = rules.length(); + int i = 0; + while (i0 && rules.charAt(limit-1) == '\\') { + limit = rules.indexOf('\n', limit+1); + } + + if (limit == -1) { + limit = n; + } + // Skip over empty lines and line starting with # + if (limit > i && rules.charAt(i) != RULE_COMMENT_CHAR) { + applyRule(i, limit); + } + i = limit + 1; + } + + data.ruleSet.freeze(); + } + + /** + * Parse the given substring as a rule, and append it to the rules currently + * represented in this object. + * @param start the beginning index, inclusive; 0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= rules.length(). + * @exception IllegalArgumentException if there is a syntax error in the + * rules + */ + private void applyRule(int start, int limit) { + /* General description of parsing: Initially, rules contain two types of + * quoted characters. First, there are variable references, such as + * "{alpha}". Second, there are quotes, such as "'<'" or "''". One of + * the first steps in parsing a rule is to resolve such quoted matter. + * Quotes are removed early, leaving unquoted literal matter. Variable + * references are resolved and replaced by single characters. In some + * instances these characters represent themselves; in others, they + * stand for categories of characters. Character categories are either + * predefined (e.g., "{Lu}"), or are defined by the user using a + * statement (e.g., "vowels:aeiouAEIOU"). + * + * Another early step in parsing is to split each rule into component + * pieces. These pieces are, for every rule, a left-hand side, a right- + * hand side, and an operator. The left- and right-hand sides may not + * be empty, except for the output patterns of forward and reverse + * rules. 
In addition to this partitioning, the match patterns of + * forward and reverse rules must be partitioned into antecontext, + * postcontext, and literal pattern, where the context portions may or + * may not be present. Finally, output patterns must have the cursor + * indicator '|' detected and removed, with its position recorded. + * + * Quote removal, variable resolution, and sub-pattern splitting must + * all happen at once. This is due chiefly to the quoting mechanism, + * which allows special characters to appear at arbitrary positions in + * the final unquoted text. (For this reason, alteration of the rule + * language is somewhat clumsy; it entails reassessment and revision of + * the parsing methods as a whole.) + * + * After this processing of rules is complete, the final end products + * are unquoted pieces of text of various types, and an integer cursor + * position, if one is specified. These processed raw materials are now + * easy to deal with; other classes such as UnicodeSet and + * TransliterationRule need know nothing of quoting or variables. 
+ */ + StringBuffer left = new StringBuffer(); + StringBuffer right = new StringBuffer(); + StringBuffer anteContext = new StringBuffer(); + StringBuffer postContext = new StringBuffer(); + int cursorPos[] = new int[1]; + + char operator = parseRule(start, limit, left, right, + anteContext, postContext, cursorPos); + + switch (operator) { + case VARIABLE_DEF_OP: + applyVariableDef(left.toString(), right.toString()); + break; + case FORWARD_RULE_OP: + if (direction == FORWARD) { + data.ruleSet.addRule(new TransliterationRule( + left.toString(), right.toString(), + anteContext.toString(), postContext.toString(), + cursorPos[0])); + } // otherwise ignore the rule; it's not the direction we want + break; + case REVERSE_RULE_OP: + if (direction == REVERSE) { + data.ruleSet.addRule(new TransliterationRule( + right.toString(), left.toString(), + anteContext.toString(), postContext.toString(), + cursorPos[0])); + } // otherwise ignore the rule; it's not the direction we want + break; + } + } + + /** + * Add a variable definition. + * @param name the name of the variable. It must not already be defined. + * @param pattern the value of the variable. It may be a single character + * or a pattern describing a character set. + * @exception IllegalArgumentException if there is a syntax error + */ + private final void applyVariableDef(String name, String pattern) { + validateVariableName(name); + if (data.variableNames.get(name) != null) { + throw new IllegalArgumentException("Duplicate variable definition: " + + name + '=' + pattern); + } +//! if (UnicodeSet.getCategoryID(name) >= 0) { +//! throw new IllegalArgumentException("Reserved variable name: " +//! + name); +//! 
} + if (pattern.length() < 1) { + throw new IllegalArgumentException("Variable definition missing: " + + name); + } + if (pattern.length() == 1) { + // Got a single character variable definition + data.variableNames.put(name, new Character(pattern.charAt(0))); + } else { + // Got more than one character; parse it as a category + if (variableNext >= variableLimit) { + throw new RuntimeException("Private use variables exhausted"); + } + Character c = new Character(variableNext++); + data.variableNames.put(name, c); + data.setVariables.put(c, new UnicodeSet(pattern)); + } + } + + /** + * Given a rule, parses it into three pieces: The left side, the right side, + * and the operator. Returns the operator. Quotes and variable references + * are resolved; the output text in all StringBuffer parameters + * is literal text. This method delegates to other parsing methods to + * handle the match pattern, output pattern, and other sub-patterns in the + * rule. + * @param start the beginning index, inclusive; 0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= rules.length(). + * @param left left side of rule is appended to this buffer + * with the quotes removed and variables resolved + * @param right right side of rule is appended to this buffer + * with the quotes removed and variables resolved + * @param anteContext the preceding context of the match pattern, + * if there is one, is appended to this buffer + * @param postContext the following context of the match pattern, + * if there is one, is appended to this buffer + * @param cursorPos if there is a cursor in the output pattern, its + * offset is stored in cursorPos[0] + * @return The operator character, one of the characters in OPERATORS. 
+ */ + private char parseRule(int start, int limit, + StringBuffer left, StringBuffer right, + StringBuffer anteContext, + StringBuffer postContext, + int[] cursorPos) { + if (false) { + System.err.println("Parsing " + rules.substring(start, limit)); + } + /* Parse the rule into three pieces -- left, operator, and right, + * parsing out quotes. The result is that left and right will have + * unquoted text. E.g., "gt<'>'" will have right = ">". Unquoted + * operators throw an exception. Two quotes inside or outside + * quotes indicates a quote literal. E.g., "o''clock" -> "o'clock". + */ + int i = quotedIndexOf(rules, start, limit, OPERATORS); + if (i < 0) { + throw new IllegalArgumentException( + "Syntax error: " + + rules.substring(start, limit)); + } + char c = rules.charAt(i); + switch (c) { + case FORWARD_RULE_OP: + if (i == start) { + throw new IllegalArgumentException( + "Empty left side: " + + rules.substring(start, limit)); + } + parseMatchPattern(start, i, left, anteContext, postContext); + if (i != (limit-1)) { + parseOutputPattern(i+1, limit, right, cursorPos); + } + break; + case REVERSE_RULE_OP: + if (i == (limit-1)) { + throw new IllegalArgumentException( + "Empty right side: " + + rules.substring(start, limit)); + } + if (i != start) { + parseOutputPattern(start, i, left, cursorPos); + } + parseMatchPattern(i+1, limit, right, anteContext, postContext); + break; + default: + if (i == start || i == (limit-1)) { + throw new IllegalArgumentException( + "Empty left or right side: " + + rules.substring(start, limit)); + } + parseSubPattern(start, i, left); + parseDefPattern(i+1, limit, right); + break; + } + return c; + } + + /** + * Parses the match pattern of a forward or reverse rule. Given the raw + * match pattern, return the match text and the context on both sides, if + * any. Resolves all quotes and variables. + * @param start the beginning index, inclusive; 0 <= start + * <= limit. 
+ * @param limit the ending index, exclusive; start <= limit + * <= rules.length(). + * @param text the key to be matched will be appended to this buffer + * @param anteContext the preceding context, if any, will be appended + * to this buffer. + * @param postContext the following context, if any, will be appended + * to this buffer. + */ + private void parseMatchPattern(int start, int limit, + StringBuffer text, + StringBuffer anteContext, + StringBuffer postContext) { + if (start >= limit) { + throw new IllegalArgumentException( + "Empty expression in rule: " + + rules.substring(start, limit)); + } + if (anteContext != null) { + // Ignore optional opening and closing context characters + if (rules.charAt(start) == CONTEXT_OPEN) { + ++start; + } + if (rules.charAt(limit-1) == CONTEXT_CLOSE) { + --limit; + } + // The four possibilities are: + // key + // anteContext]key + // anteContext]key[postContext + // key[postContext + int ante = quotedIndexOf(rules, start, limit, String.valueOf(CONTEXT_CLOSE)); + int post = quotedIndexOf(rules, start, limit, String.valueOf(CONTEXT_OPEN)); + if (ante >= 0 && post >= 0 && ante > post) { + throw new IllegalArgumentException( + "Syntax error in context specifier: " + + rules.substring(start, limit)); + } + if (ante >= 0) { + parseSubPattern(start, ante, anteContext); + start = ante+1; + } + if (post >= 0) { + parseSubPattern(post+1, limit, postContext); + limit = post; + } + } + parseSubPattern(start, limit, text); + } + + private final void parseSubPattern(int start, int limit, + StringBuffer text) { + parseSubPattern(start, limit, text, null, SPECIALS); + } + + /** + * Parse a variable definition sub pattern. This kind of sub + * pattern differs in the set of characters that are considered + * special. In particular, the '[' and ']' characters are not + * special, since these are used in UnicodeSet patterns. 
+ */ + private final void parseDefPattern(int start, int limit, + StringBuffer text) { + parseSubPattern(start, limit, text, null, DEF_SPECIALS); + } + + /** + * Parses the output pattern of a forward or reverse rule. Given the + * output pattern, return the output text and the position of the cursor, + * if any. Resolves all quotes and variables. + * @param rules the string to be parsed + * @param start the beginning index, inclusive; 0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= rules.length(). + * @param text the output text will be appended to this buffer + * @param cursorPos if this parameter is not null, then cursorPos[0] + * will be set to the cursor position, or -1 if there is none. If this + * parameter is null, then cursors will be disallowed. + */ + private final void parseOutputPattern(int start, int limit, + StringBuffer text, + int[] cursorPos) { + parseSubPattern(start, limit, text, cursorPos, SPECIALS); + } + + /** + * Parses a sub-pattern of a rule. Return the text and the position of the cursor, + * if any. Resolves all quotes and variables. + * @param rules the string to be parsed + * @param start the beginning index, inclusive; 0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= rules.length(). + * @param text the output text will be appended to this buffer + * @param cursorPos if this parameter is not null, then cursorPos[0] + * will be set to the cursor position, or -1 if there is none. If this + * parameter is null, then cursors will be disallowed. + * @param specials characters that must be quoted; typically either + * SPECIALS or DEF_SPECIALS. 
+ */ + private void parseSubPattern(int start, int limit, + StringBuffer text, + int[] cursorPos, + String specials) { + boolean inQuote = false; + + if (start >= limit) { + throw new IllegalArgumentException("Empty expression in rule"); + } + if (cursorPos != null) { + cursorPos[0] = -1; + } + for (int i=start; i= 0) { + throw new IllegalArgumentException("Multiple cursors: " + + rules.substring(start, limit)); + } + cursorPos[0] = text.length(); + } else if (specials.indexOf(c) >= 0) { + throw new IllegalArgumentException("Unquoted special character: " + + rules.substring(start, limit)); + } else { + text.append(c); + } + } + } + + private static void validateVariableName(String name) { + if (indexOf(name, SPECIALS) >= 0) { + throw new IllegalArgumentException( + "Special character in variable name: " + + name); + } + } + + /** + * Returns the single character value of the given variable name. Defined + * names are recognized. + * + * NO LONGER SUPPORTED: + * If a Unicode category name is given, a standard character variable + * in the range firstCategoryVariable to lastCategoryVariable is returned, + * with value firstCategoryVariable + n, where n is the category + * number. + * @exception IllegalArgumentException if the name is unknown. + */ + private Character getVariableDef(String name) { + Character ch = (Character) data.variableNames.get(name); +//! if (ch == null) { +//! int id = UnicodeSet.getCategoryID(name); +//! if (id >= 0) { +//! ch = new Character((char) (firstCategoryVariable + id)); +//! data.variableNames.put(name, ch); +//! data.setVariables.put(ch, new UnicodeSet(id)); +//! } +//! } + if (ch == null) { + throw new IllegalArgumentException("Undefined variable: " + + name); + } + return ch; + } + + /** + * Determines what part of the private use region of Unicode we can use for + * variable stand-ins. 
The correct way to do this is as follows: Parse each + * rule, and for forward and reverse rules, take the FROM expression, and + * make a hash of all characters used. The TO expression should be ignored. + * When done, everything not in the hash is available for use. In practice, + * this method may employ some other algorithm for improved speed. + */ + private final void determineVariableRange() { + Range r = new Range('\uE000', 0x1900); // Private use area + r = r.largestUnusedSubrange(rules); + + if (r == null) { + throw new RuntimeException( + "No private use characters available for variables"); + } + + variableNext = r.start; + variableLimit = (char) (r.start + r.length); + + if (variableNext >= variableLimit) { + throw new RuntimeException( + "Too few private use characters available for variables"); + } + } + + /** + * Returns the index of the first character in a set, ignoring quoted text. + * For example, in the string "abc'hide'h", the 'h' in "hide" will not be + * found by a search for "h". Unlike String.indexOf(), this method searches + * not for a single character, but for any character of the string + * setOfChars. + * @param text text to be searched + * @param start the beginning index, inclusive; 0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= text.length(). + * @param setOfChars string with one or more distinct characters + * @return Offset of the first character in setOfChars + * found, or -1 if not found. + * @see #indexOf + */ + private static int quotedIndexOf(String text, int start, int limit, + String setOfChars) { + for (int i=start; i= 0) { + return i; + } + } + return -1; + } + + /** + * Returns the index of the first character in a set. Unlike + * String.indexOf(), this method searches not for a single character, but + * for any character of the string setOfChars. + * @param text text to be searched + * @param start the beginning index, inclusive; 0 <= start + * <= limit. 
+ * @param limit the ending index, exclusive; start <= limit + * <= text.length(). + * @param setOfChars string with one or more distinct characters + * @return Offset of the first character in setOfChars + * found, or -1 if not found. + * @see #quotedIndexOf + */ + private static int indexOf(String text, int start, int limit, + String setOfChars) { + for (int i=start; i= 0) { + return i; + } + } + return -1; + } + + /** + * Returns the index of the first character in a set. Unlike + * String.indexOf(), this method searches not for a single character, but + * for any character of the string setOfChars. + * @param text text to be searched + * @param setOfChars string with one or more distinct characters + * @return Offset of the first character in setOfChars + * found, or -1 if not found. + * @see #quotedIndexOf + */ + private static int indexOf(String text, String setOfChars) { + return indexOf(text, 0, text.length(), setOfChars); + } + + + + /** + * A range of Unicode characters. Support the operations of testing for + * inclusion (does this range contain this character?) and splitting. + * Splitting involves breaking a range into two smaller ranges around a + * character inside the original range. The split character is not included + * in either range. If the split character is at either extreme end of the + * range, one of the split products is an empty range. + * + * This class is used internally to determine the largest available private + * use character range for variable stand-ins. + */ + private static class Range implements Cloneable { + char start; + int length; + + Range(char start, int length) { + this.start = start; + this.length = length; + } + + public Object clone() { + return new Range(start, length); + } + + boolean contains(char c) { + return c >= start && (c - start) < length; + } + + /** + * Assume that contains(c) is true. Split this range into two new + * ranges around the character c. 
Make this range one of the new ranges + * (modify it in place) and return the other new range. The character + * itself is not included in either range. If the split results in an + * empty range (that is, if c == start or c == start + length - 1) then + * return null. + */ + Range split(char c) { + if (c == start) { + ++start; + --length; + return null; + } else if (c - start == length - 1) { + --length; + return null; + } else { + ++c; + Range r = new Range(c, start + length - c); + length = --c - start; + return r; + } + } + + /** + * Finds the largest unused subrange by the given string. A + * subrange is unused by a string if the string contains no + * characters in that range. If the given string contains no + * characters in this range, then this range itself is + * returned. + */ + Range largestUnusedSubrange(String str) { + int n = str.length(); + + Vector v = new Vector(1); + v.addElement(clone()); + for (int i=0; i bestRange.length) { + bestRange = r; + } + } + + return bestRange; + } + } + } +} diff --git a/icu4j/src/com/ibm/icu/text/TransliterationRule.java b/icu4j/src/com/ibm/icu/text/TransliterationRule.java new file mode 100755 index 00000000000..383c77ed340 --- /dev/null +++ b/icu4j/src/com/ibm/icu/text/TransliterationRule.java @@ -0,0 +1,530 @@ +package com.ibm.text; + +import java.util.Dictionary; + +/** + * A transliteration rule used by + * RuleBasedTransliterator. + * TransliterationRule is an immutable object. + * + *

A rule consists of an input pattern and an output string. When + * the input pattern is matched, the output string is emitted. The + * input pattern consists of zero or more characters which are matched + * exactly (the key) and optional context. Context must match if it + * is specified. Context may be specified before the key, after the + * key, or both. The key, preceding context, and following context + * may contain variables. Variables represent a set of Unicode + * characters, such as the letters a through z. + * Variables are detected by looking up each character in a supplied + * variable list to see if it has been so defined. + * + *

Copyright © IBM Corporation 1999. All rights reserved. + * + * @author Alan Liu + * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $ + */ +class TransliterationRule { + /** + * Constant returned by getMatchDegree() indicating a mismatch + * between the text and this rule. One or more characters of the context or + * key do not match the text. + * @see #getMatchDegree + */ + public static final int MISMATCH = 0; + + /** + * Constant returned by getMatchDegree() indicating a partial + * match between the text and this rule. All characters of the text match + * the corresponding context or key, but more characters are required for a + * complete match. There are some key or context characters at the end of + * the pattern that remain unmatched because the text isn't long enough. + * @see #getMatchDegree + */ + public static final int PARTIAL_MATCH = 1; + + /** + * Constant returned by getMatchDegree() indicating a complete + * match between the text and this rule. The text matches all context and + * key characters. + * @see #getMatchDegree + */ + public static final int FULL_MATCH = 2; + + /** + * The string that must be matched. + */ + private String key; + + /** + * The string that is emitted if the key, anteContext, and postContext + * are matched. + */ + private String output; + + /** + * The string that must match before the key. Must not be the empty string. + * May be null; if null, then there is no matching requirement before the + * key. + */ + private String anteContext; + + /** + * The string that must match after the key. Must not be the empty string. + * May be null; if null, then there is no matching requirement after the + * key. + */ + private String postContext; + + /** + * The position of the cursor after emitting the output string, from 0 to + * output.length(). For most rules with no special cursor specification, + * the cursorPos is output.length(). 
+ */ + private int cursorPos; + + /** + * A string used to implement masks(). + */ + private String maskKey; + + private static final String COPYRIGHT = + "\u00A9 IBM Corporation 1999. All rights reserved."; + + /** + * Construct a new rule with the given key, output text, and other + * attributes. Zero, one, or two context strings may be specified. A + * cursor position may be specified for the output text. + * @param key the string to match + * @param output the string to produce when the key is seen + * @param anteContext if not null and not empty, then it must be matched + * before the key + * @param postContext if not null and not empty, then it must be matched + * after the key + * @param cursorPos a position for the cursor after the output + * is emitted. If less than zero, then the cursor is placed after the + * output; that is, -1 is equivalent to + * output.length(). If greater than + * output.length() then an exception is thrown. + * @exception IllegalArgumentException if the cursor position is out of + * range. + */ + public TransliterationRule(String key, String output, + String anteContext, String postContext, + int cursorPos) { + this.key = key; + this.output = output; + this.anteContext = (anteContext != null && anteContext.length() > 0) + ? anteContext : null; + this.postContext = (postContext != null && postContext.length() > 0) + ? postContext : null; + this.cursorPos = cursorPos < 0 ? output.length() : cursorPos; + if (this.cursorPos > output.length()) { + throw new IllegalArgumentException("Illegal cursor position"); + } + + /* The mask key is needed when we are adding individual rules to a rule + * set, for performance. Here are the numbers: Without mask key, 13.0 + * seconds. With mask key, 6.2 seconds. However, once the rules have + * been added to the set, then they can be discarded to free up space. + * This is what the freeze() method does. After freeze() has been + * called, the method masks() must NOT be called. 
+ */ + maskKey = key; + if (postContext != null) { + maskKey += postContext; + } + } + + /** + * Return the length of the key. Equivalent to getKey().length(). + * @return the length of the match key. + */ + public int getKeyLength() { + return key.length(); + } + + /** + * Return the key. + * @return the match key. + */ + public String getKey() { + return key; + } + + /** + * Return the output string. + * @return the output string. + */ + public String getOutput() { + return output; + } + + /** + * Return the position of the cursor within the output string. + * @return a value from 0 to getOutput().length(), inclusive. + */ + public int getCursorPos() { + return cursorPos; + } + + /** + * Return the preceding context length. This method is needed to + * support the Transliterator method + * getMaximumContextLength(). + */ + public int getAnteContextLength() { + return anteContext == null ? 0 : anteContext.length(); + } + + /** + * Return true if this rule masks another rule. If r1 masks r2 then + * r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks + * r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y". + * "[c]a>x" masks "[dc]a>y". + * + *

This method must not be called after freeze() is called. + */ + public boolean masks(TransliterationRule r2) { + /* There are three cases of masking. In each instance, rule1 + * masks rule2. + * + * 1. KEY mask: len(key1) < len(key2), key2 starts with key1. + * + * 2. PREFIX mask: key1 == key2, len(prefix1) < len(prefix2), + * prefix2 ends with prefix1, suffix2 starts with suffix1. + * + * 3. SUFFIX mask: key1 == key2, len(suffix1) < len(suffix2), + * prefix2 ends with prefix1, suffix2 starts with suffix1. + */ + + /* LIMITATION of the current mask algorithm: Some rule + * maskings are currently not detected. For example, + * "{Lu}]a>x" masks "A]a>y". To detect these sorts of masking, + * we need a subset operator on UnicodeSet objects, which we + * currently do not have. This can be added later. + */ + return ((maskKey.length() < r2.maskKey.length() && + r2.maskKey.startsWith(maskKey)) || + (r2.anteContext != null && maskKey.equals(r2.maskKey) && + ((anteContext == null) || + (anteContext.length() < r2.anteContext.length() && + r2.anteContext.endsWith(anteContext))))); + } + + /** + * Free up space. Once this method is called, masks() must NOT be called. + * If it is called, an exception will be thrown. + */ + public void freeze() { + maskKey = null; + } + + /** + * Return a string representation of this object. + * @return string representation of this object + */ + public String toString() { + return getClass().getName() + '[' + + escape((anteContext != null ? ("[" + anteContext + ']') : "") + + key + + (postContext != null ? ("[" + postContext + ']') : "") + + " -> " + + (cursorPos < output.length() + ? (output.substring(0, cursorPos) + '|' + output.substring(cursorPos)) + : output)) + + ']'; + } + + /** + * Return true if this rule matches the given text. The text being matched + * occupies a virtual buffer consisting of the contents of + * result concatenated to a substring of text. + * The substring is specified by start and limit. 
+ * The value of cursor is an index into this virtual buffer, + * from 0 to the length of the buffer. In terms of the parameters, + * cursor must be between 0 and result.length() + limit - + * start. + * @param text the untranslated text + * @param start the beginning index, inclusive; 0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= text.length(). + * @param result translated text so far + * @param cursor position at which to translate next, an offset into result. + * If greater than or equal to result.length(), represents offset start + + * cursor - result.length() into text. + * @param filter the filter. Any character for which + * filter.isIn() returns false will not be + * altered by this transliterator. If filter is + * null then no filtering is applied. + */ + public boolean matches(String text, int start, int limit, + StringBuffer result, int cursor, + Dictionary variables, + UnicodeFilter filter) { + return + (anteContext == null + || regionMatches(text, start, limit, result, + cursor - anteContext.length(), + anteContext, variables, filter)) && + regionMatches(text, start, limit, result, cursor, + key, variables, filter) && + (postContext == null + || regionMatches(text, start, limit, result, + cursor + key.length(), + postContext, variables, filter)); + } + + /** + * Return true if this rule matches the given text. + * @param text the text, both translated and untranslated + * @param start the beginning index, inclusive; 0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= text.length(). + * @param cursor position at which to translate next, representing offset + * into text. This value must be between start and + * limit. + * @param filter the filter. Any character for which + * filter.isIn() returns false will not be + * altered by this transliterator. If filter is + * null then no filtering is applied. 
+ */ + public boolean matches(Replaceable text, int start, int limit, + int cursor, Dictionary variables, + UnicodeFilter filter) { + return + (anteContext == null + || regionMatches(text, start, limit, cursor - anteContext.length(), + anteContext, variables, filter)) && + regionMatches(text, start, limit, cursor, + key, variables, filter) && + (postContext == null + || regionMatches(text, start, limit, cursor + key.length(), + postContext, variables, filter)); + } + + /** + * Return the degree of match between this rule and the given text. The + * degree of match may be mismatch, a partial match, or a full match. A + * mismatch means at least one character of the text does not match the + * context or key. A partial match means some context and key characters + * match, but the text is not long enough to match all of them. A full + * match means all context and key characters match. + * @param text the text, both translated and untranslated + * @param start the beginning index, inclusive; 0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= text.length(). + * @param cursor position at which to translate next, representing offset + * into text. This value must be between start and + * limit. + * @param filter the filter. Any character for which + * filter.isIn() returns false will not be + * altered by this transliterator. If filter is + * null then no filtering is applied. + * @return one of MISMATCH, PARTIAL_MATCH, or + * FULL_MATCH. 
+ * @see #MISMATCH + * @see #PARTIAL_MATCH + * @see #FULL_MATCH + */ + public int getMatchDegree(Replaceable text, int start, int limit, + int cursor, Dictionary variables, + UnicodeFilter filter) { + if (anteContext != null + && !regionMatches(text, start, limit, cursor - anteContext.length(), + anteContext, variables, filter)) { + return MISMATCH; + } + int len = getRegionMatchLength(text, start, limit, cursor, + key, variables, filter); + if (len < 0) { + return MISMATCH; + } + if (len < key.length()) { + return PARTIAL_MATCH; + } + if (postContext == null) { + return FULL_MATCH; + } + len = getRegionMatchLength(text, start, limit, + cursor + key.length(), + postContext, variables, filter); + return (len < 0) ? MISMATCH + : ((len == postContext.length()) ? FULL_MATCH + : PARTIAL_MATCH); + } + + /** + * Return true if a template matches the text. The entire length of the + * template is compared to the text at the cursor. As in + * matches(), the text being matched occupies a virtual buffer + * consisting of the contents of result concatenated to a + * substring of text. See matches() for details. + * @param text the untranslated text + * @param start the beginning index, inclusive; 0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= text.length(). + * @param result translated text so far + * @param cursor position at which to translate next, an offset into result. + * If greater than or equal to result.length(), represents offset start + + * cursor - result.length() into text. + * @param template the text to match against. All characters must match. + * @param variables a dictionary of variables mapping Character + * to UnicodeSet + * @param filter the filter. Any character for which + * filter.isIn() returns false will not be + * altered by this transliterator. If filter is + * null then no filtering is applied. 
+ * @return true if there is a match + */ + protected static boolean regionMatches(String text, int start, int limit, + StringBuffer result, int cursor, + String template, + Dictionary variables, + UnicodeFilter filter) { + int rlen = result.length(); + if (cursor < 0 + || (cursor + template.length()) > (rlen + limit - start)) { + return false; + } + for (int i=0; i0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= text.length(). + * @param cursor position at which to translate next, representing offset + * into text. This value must be between start and + * limit. + * @param template the text to match against. All characters must match. + * @param variables a dictionary of variables mapping Character + * to UnicodeSet + * @param filter the filter. Any character for which + * filter.isIn() returns false will not be + * altered by this transliterator. If filter is + * null then no filtering is applied. + * @return true if there is a match + */ + protected static boolean regionMatches(Replaceable text, int start, int limit, + int cursor, + String template, Dictionary variables, + UnicodeFilter filter) { + if (cursor < start + || (cursor + template.length()) > limit) { + return false; + } + for (int i=0; i0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= text.length(). + * @param cursor position at which to translate next, representing offset + * into text. This value must be between start and + * limit. + * @param template the text to match against. All characters must match. + * @param variables a dictionary of variables mapping Character + * to UnicodeSet + * @param filter the filter. Any character for which + * filter.isIn() returns false will not be + * altered by this transliterator. If filter is + * null then no filtering is applied. 
+ * @return -1 if there is a mismatch, 0 if the text is not long enough to + * match any characters, otherwise the number of characters of text that + * match this rule. + */ + protected static int getRegionMatchLength(Replaceable text, int start, + int limit, int cursor, + String template, + Dictionary variables, + UnicodeFilter filter) { + if (cursor < start) { + return -1; + } + int i; + for (i=0; iCharacter + * to UnicodeSet + * @param filter the filter. Any character for which + * filter.isIn() returns false will not be + * altered by this transliterator. If filter is + * null then no filtering is applied. + * @return true only if textChar is accepted by the filter and matches + * keyChar: by set membership when variables maps keyChar to a + * UnicodeSet, otherwise by equality + */ + protected static boolean charMatches(char keyChar, char textChar, + Dictionary variables, UnicodeFilter filter) { + UnicodeSet set = null; + /* The conditional must be parenthesized: '&&' binds tighter than '?:', so without the parentheses a character rejected by the filter fell through to set.contains() while set was still null. */ + return (filter == null || filter.isIn(textChar)) && + (((set = (UnicodeSet) variables.get(new Character(keyChar))) + == null) ? + keyChar == textChar : set.contains(textChar)); + } + + /** + * Escape non-ASCII characters as Unicode. + */ + public static final String escape(String s) { + StringBuffer buf = new StringBuffer(); + for (int i=0; i= ' ' && c <= 0x007F) { + buf.append(c); + } else { + buf.append("\\u"); + if (c < 0x1000) { + buf.append('0'); + if (c < 0x100) { + buf.append('0'); + if (c < 0x10) { + buf.append('0'); + } + } + } + buf.append(Integer.toHexString(c)); + } + } + return buf.toString(); + } +} diff --git a/icu4j/src/com/ibm/icu/text/TransliterationRuleSet.java b/icu4j/src/com/ibm/icu/text/TransliterationRuleSet.java new file mode 100755 index 00000000000..d57bf75464a --- /dev/null +++ b/icu4j/src/com/ibm/icu/text/TransliterationRuleSet.java @@ -0,0 +1,218 @@ +package com.ibm.text; + +import java.util.*; + +/** + * A set of rules for a RuleBasedTransliterator. This set encodes + * the transliteration in one direction from one set of characters or short + * strings to another. A RuleBasedTransliterator consists of up to + * two such sets, one for the forward direction, and one for the reverse.
+ * + *

A TransliterationRuleSet has one important operation, that of + * finding a matching rule at a given point in the text. This is accomplished + * by the findMatch() method. + * + *

Copyright © IBM Corporation 1999. All rights reserved. + * + * @author Alan Liu + * @version $RCSfile: TransliterationRuleSet.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $ + */ +class TransliterationRuleSet { + /* Note: There was an old implementation that indexed by first letter of + * key. Problem with this is that key may not have a meaningful first + * letter; e.g., {Lu}>*. One solution is to keep a separate vector of all + * rules whose intial key letter is a category variable. However, the + * problem is that they must be kept in order with respect to other rules. + * One solution -- add a sequence number to each rule. Do the usual + * first-letter lookup, and also a lookup from the spare bin with rules like + * {Lu}>*. Take the lower sequence number. This seems complex and not + * worth the trouble, but we may revisit this later. For documentation (or + * possible resurrection) the old code is included below, commented out + * with the remark "// OLD INDEXED IMPLEMENTATION". Under the old + * implementation, rules is a Hashtable, not a Vector. + */ + + /** + * Vector of rules, in the order added. + */ + private Vector rules; + + /** + * Length of the longest preceding context + */ + private int maxContextLength; + + private static final String COPYRIGHT = + "\u00A9 IBM Corporation 1999. All rights reserved."; + + /** + * Construct a new empty rule set. + */ + public TransliterationRuleSet() { + rules = new Vector(); + maxContextLength = 0; + } + + /** + * Return the maximum context length. + * @return the length of the longest preceding context. + */ + public int getMaximumContextLength() { + return maxContextLength; + } + + /** + * Add a rule to this set. Rules are added in order, and order is + * significant. + * + *

Once freeze() is called, this method must not be called. + * @param rule the rule to add + */ + public void addRule(TransliterationRule rule) { + + // Build time, no checking : 3562 ms + // Build time, with checking: 6234 ms + + for (int i=0; i maxContextLength) { + maxContextLength = len; + } + } + + /** + * Free up space. Once this method is called, addRule() must NOT + * be called again. + */ + public void freeze() { + for (int i=0; iresult concatenated to a substring of text. + * The substring is specified by start and limit. + * The value of cursor is an index into this virtual buffer, + * from 0 to the length of the buffer. In terms of the parameters, + * cursor must be between 0 and result.length() + limit - + * start. + * @param text the untranslated text + * @param start the beginning index, inclusive; 0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= text.length(). + * @param result tranlated text + * @param cursor position at which to translate next, an offset into result. + * If greater than or equal to result.length(), represents offset start + + * cursor - result.length() into text. + * @param variables a dictionary mapping variables to the sets they + * represent (maps Character to UnicodeSet) + * @param filter the filter. Any character for which + * filter.isIn() returns false will not be + * altered by this transliterator. If filter is + * null then no filtering is applied. + * @return the matching rule, or null if none found. 
+ */ + public TransliterationRule findMatch(String text, int start, int limit, + StringBuffer result, int cursor, + Dictionary variables, + UnicodeFilter filter) { + for (Enumeration e = rules.elements(); e.hasMoreElements(); ) { + TransliterationRule rule = (TransliterationRule) e.nextElement(); + if (rule.matches(text, start, limit, result, cursor, variables, filter)) { + return rule; + } + } + return null; + } + + /** + * Attempt to find a matching rule at the specified point in the text. + * @param text the text, both translated and untranslated + * @param start the beginning index, inclusive; 0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= text.length(). + * @param cursor position at which to translate next, representing offset + * into text. This value must be between start and + * limit. + * @param variables a dictionary mapping variables to the sets they + * represent (maps Character to UnicodeSet) + * @param filter the filter. Any character for which + * filter.isIn() returns false will not be + * altered by this transliterator. If filter is + * null then no filtering is applied. + * @return the matching rule, or null if none found. + */ + public TransliterationRule findMatch(Replaceable text, int start, int limit, + int cursor, + Dictionary variables, + UnicodeFilter filter) { + for (Enumeration e = rules.elements(); e.hasMoreElements(); ) { + TransliterationRule rule = (TransliterationRule) e.nextElement(); + if (rule.matches(text, start, limit, cursor, variables, filter)) { + return rule; + } + } + return null; + } + + /** + * Attempt to find a matching rule at the specified point in the text. + * Unlike findMatch(), this method does an incremental match. + * An incremental match requires that there be no partial matches that might + * pre-empt the full match that is found. If there are partial matches, + * then null is returned. 
A non-null result indicates that a full match has + * been found, and that it cannot be pre-empted by a partial match + * regardless of what additional text is added to the translation buffer. + * @param text the text, both translated and untranslated + * @param start the beginning index, inclusive; 0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= text.length(). + * @param cursor position at which to translate next, representing offset + * into text. This value must be between start and + * limit. + * @param variables a dictionary mapping variables to the sets they + * represent (maps Character to UnicodeSet) + * @param partial output parameter. partial[0] is set to + * true if a partial match is found, in which case null is + * returned; otherwise it is set to false. + * @param filter the filter. Any character for which + * filter.isIn() returns false will not be + * altered by this transliterator. If filter is + * null then no filtering is applied. + * @return the matching rule, or null if none found, or if the text buffer + * does not have enough text yet to unambiguously match a rule.
+ */ + public TransliterationRule findIncrementalMatch(Replaceable text, int start, + int limit, int cursor, + Dictionary variables, + boolean partial[], + UnicodeFilter filter) { + partial[0] = false; + for (Enumeration e = rules.elements(); e.hasMoreElements(); ) { + TransliterationRule rule = (TransliterationRule) e.nextElement(); + int match = rule.getMatchDegree(text, start, limit, cursor, + variables, filter); + switch (match) { + case TransliterationRule.FULL_MATCH: + return rule; + case TransliterationRule.PARTIAL_MATCH: + partial[0] = true; + return null; + } + } + return null; + } +} diff --git a/icu4j/src/com/ibm/icu/text/Transliterator.java b/icu4j/src/com/ibm/icu/text/Transliterator.java new file mode 100755 index 00000000000..83171a961e7 --- /dev/null +++ b/icu4j/src/com/ibm/icu/text/Transliterator.java @@ -0,0 +1,860 @@ +package com.ibm.text; + +import java.util.*; +import java.text.MessageFormat; + +/** + * Transliterator is an abstract class that + * transliterates text from one format to another. The most common + * kind of transliterator is a script, or alphabet, transliterator. + * For example, a Russian to Latin transliterator changes Russian text + * written in Cyrillic characters to phonetically equivalent Latin + * characters. It does not translate Russian to English! + * Transliteration, unlike translation, operates on characters, without + * reference to the meanings of words and sentences. + * + *

Although script conversion is its most common use, a + * transliterator can actually perform a more general class of tasks. + * In fact, Transliterator defines a very general API + * which specifies only that a segment of the input text is replaced + * by new text. The particulars of this conversion are determined + * entirely by subclasses of Transliterator. + * + *

Transliterators are stateless + * + *

Transliterator objects are stateless; they + * retain no information between calls to + * transliterate(). As a result, threads may share + * transliterators without synchronizing them. This might seem to + * limit the complexity of the transliteration operation. In + * practice, subclasses perform complex transliterations by delaying + * the replacement of text until it is known that no other + * replacements are possible. In other words, although the + * Transliterator objects are stateless, the source text + * itself embodies all the needed information, and delayed operation + * allows arbitrary complexity. + * + *

Batch transliteration + * + *

The simplest way to perform transliteration is all at once, on a + * string of existing text. This is referred to as batch + * transliteration. For example, given a string input + * and a transliterator t, the call + * + *

String result = t.transliterate(input); + *
+ * + * will transliterate it and return the result. Other methods allow + * the client to specify a substring to be transliterated and to use + * {@link Replaceable} objects instead of strings, in order to + * preserve out-of-band information (such as text styles). + * + *

Keyboard transliteration + * + *

Somewhat more involved is keyboard, or incremental + * transliteration. This is the transliteration of text that is + * arriving from some source (typically the user's keyboard) one + * character at a time, or in some other piecemeal fashion. + * + *

In keyboard transliteration, a Replaceable buffer + * stores the text. As text is inserted, as much as possible is + * transliterated on the fly. This means a GUI that displays the + * contents of the buffer may show text being modified as each new + * character arrives. + * + *

Consider the simple RuleBasedTransliterator: + * + *

+ * th>{theta}
+ * t>{tau} + *
+ * + * When the user types 't', nothing will happen, since the + * transliterator is waiting to see if the next character is 'h'. To + * remedy this, we introduce the notion of a cursor, marked by a '|' + * in the output string: + * + *
+ * t>|{tau}
+ * {tau}h>{theta} + *
+ * + * Now when the user types 't', tau appears, and if the next character + * is 'h', the tau changes to a theta. This is accomplished by + * maintaining a cursor position (independent of the insertion point, + * and invisible in the GUI) across calls to + * keyboardTransliterate(). Typically, the cursor will + * be coincident with the insertion point, but in a case like the one + * above, it will precede the insertion point. + * + *

Keyboard transliteration methods maintain a set of three indices + * that are updated with each call to + * keyboardTransliterate(), including the cursor, start, + * and limit. Since these indices are changed by the method, they are + * passed in an int[] array. The START index + * marks the beginning of the substring that the transliterator will + * look at. It is advanced as text becomes committed (but it is not + * the committed index; that's the CURSOR). The + * CURSOR index, described above, marks the point at + * which the transliterator last stopped, either because it reached + * the end, or because it required more characters to disambiguate + * between possible inputs. The CURSOR can also be + * explicitly set by rules in a RuleBasedTransliterator. + * Any characters before the CURSOR index are frozen; + * future keyboard transliteration calls within this input sequence + * will not change them. New text is inserted at the + * LIMIT index, which marks the end of the substring that + * the transliterator looks at. + * + *

Because keyboard transliteration assumes that more characters + * are to arrive, it is conservative in its operation. It only + * transliterates when it can do so unambiguously. Otherwise it waits + * for more characters to arrive. When the client code knows that no + * more characters are forthcoming, perhaps because the user has + * performed some input termination operation, then it should call + * finishKeyboardTransliteration() to complete any + * pending transliterations. + * + *

Inverses + * + *

Pairs of transliterators may be inverses of one another. For + * example, if transliterator A transliterates characters by + * incrementing their Unicode value (so "abc" -> "def"), and + * transliterator B decrements character values, then A + * is an inverse of B and vice versa. If we compose A + * with B in a compound transliterator, the result is the + * identity transliterator, that is, a transliterator that does not + * change its input text. + * + * The Transliterator method getInverse() + * returns a transliterator's inverse, if one exists, or + * null otherwise. However, the result of + * getInverse() usually will not be a true + * mathematical inverse. This is because true inverse transliterators + * are difficult to formulate. For example, consider two + * transliterators: AB, which transliterates the character 'A' + * to 'B', and BA, which transliterates 'B' to 'A'. It might + * seem that these are exact inverses, since + * + *

"A" x AB -> "B"
+ * "B" x BA -> "A"
+ * + * where 'x' represents transliteration. However, + * + *
"ABCD" x AB -> "BBCD"
+ * "BBCD" x BA -> "AACD"
+ * + * so AB composed with BA is not the + * identity. Nonetheless, BA may be usefully considered to be + * AB's inverse, and it is on this basis that + * AB.getInverse() could legitimately return + * BA. + * + *

IDs and display names + * + *

A transliterator is designated by a short identifier string or + * ID. IDs follow the format source-destination, + * where source describes the entity being replaced, and + * destination describes the entity replacing + * source. The entities may be the names of scripts, + * particular sequences of characters, or whatever else it is that the + * transliterator converts to or from. For example, a transliterator + * from Russian to Latin might be named "Russian-Latin". A + * transliterator from keyboard escape sequences to Latin-1 characters + * might be named "KeyboardEscape-Latin1". By convention, system + * entity names are in English, with the initial letters of words + * capitalized; user entity names may follow any format so long as + * they do not contain dashes. + * + *

In addition to programmatic IDs, transliterator objects have + * display names for presentation in user interfaces, returned by + * {@link #getDisplayName}. + * + *

Factory methods and registration + * + *

In general, client code should use the factory method + * getInstance() to obtain an instance of a + * transliterator given its ID. Valid IDs may be enumerated using + * getAvailableIDs(). Since transliterators are + * stateless, multiple calls to getInstance() with the + * same ID will return the same object. + * + *

In addition to the system transliterators registered at startup, + * user transliterators may be registered by calling + * registerInstance() at run time. To register a + * transliterator subclass without instantiating it (until it is + * needed), users may call registerClass(). + * + *

Subclassing + * + *

Subclasses must implement the abstract + * transliterate() method. They should also override the + * transliterate() method taking a String + * and StringBuffer if the performance of these methods + * can be improved over the performance obtained by the default + * implementations in this class. Subclasses must also implement + * handleKeyboardTransliterate(). + * + *

Copyright © IBM Corporation 1999. All rights reserved. + * + * @author Alan Liu + * @version $RCSfile: Transliterator.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $ + */ +public abstract class Transliterator { + /** + * In the keyboardTransliterate() + * index[] array, the beginning index, inclusive + * @see #keyboardTransliterate + */ + public static final int START = 0; + + /** + * In the keyboardTransliterate() + * index[] array, the ending index, exclusive + * @see #keyboardTransliterate + */ + public static final int LIMIT = 1; + + /** + * In the keyboardTransliterate() + * index[] array, the next character to be considered + * for transliteration + * @see #keyboardTransliterate + */ + public static final int CURSOR = 2; + + /** + * Programmatic name, e.g., "Latin-Arabic". + */ + private String ID; + + /** + * This transliterator's filter. Any character for which + * filter.isIn() returns false will not be + * altered by this transliterator. If filter is + * null then no filtering is applied. + */ + private UnicodeFilter filter; + + /** + * Dictionary of known transliterators. Keys are String + * names, values are one of the following: + * + *

+ */ + private static Hashtable cache; + + /** + * Internal object used to stand for instances of + * RuleBasedTransliterator that have not been + * constructed yet in the cache. When a + * getInstance() call retrieves this object, it is + * replaced by the actual RuleBasedTransliterator. + * This allows Transliterator to delay instantiation + * of such transliterators until they are needed. + */ + private static final Object RULE_BASED_PLACEHOLDER = new Object(); + + /** + * Internal object used to stand for instances of + * RuleBasedTransliterator that have not been + * constructed yet in the cache. These instances are + * constructed with an argument + * RuleBasedTransliterator.REVERSE. + */ + private static final Object REVERSE_RULE_BASED_PLACEHOLDER = new Object(); + + /** + * Prefix for resource bundle key for the display name for a + * transliterator. The ID is appended to this to form the key. + * The resource bundle value should be a String. + */ + private static final String RB_DISPLAY_NAME_PREFIX = "T:"; + + /** + * Resource bundle key for display name pattern. + * The resource bundle value should be a String forming a + * MessageFormat pattern, e.g.: + * "{0,choice,0#|1#{1} Transliterator|2#{1} to {2} Transliterator}". + */ + private static final String RB_DISPLAY_NAME_PATTERN = "TransliteratorNamePattern"; + + /** + * Resource bundle key for the list of RuleBasedTransliterator IDs. + * The resource bundle value should be a String[] with each element + * being a valid ID. The ID will be appended to RB_RULE_BASED_PREFIX + * to obtain the class name in which the RB_RULE key will be sought. + */ + private static final String RB_RULE_BASED_IDS = "RuleBasedTransliteratorIDs"; + + /** + * Resource bundle containing display name keys and the + * RB_RULE_BASED_IDS array. + * + *

If we ever integrate this with the Sun JDK, the resource bundle + * root will change to java.text.resources.LocaleElements + */ + private static final String RB_LOCALE_ELEMENTS = + "com.ibm.text.resources.LocaleElements"; + + /** + * Prefix for resource bundle containing RuleBasedTransliterator + * RB_RULE string. The ID is munged to remove the first '-' then appended + * to this String to obtain the class name. + */ + private static final String RB_RULE_BASED_PREFIX = + "com.ibm.text.resources.TransliterationRule"; + + /** + * Resource bundle key for the RuleBasedTransliterator rule. + */ + private static final String RB_RULE = "Rule"; + + private static final String COPYRIGHT = + "\u00A9 IBM Corporation 1999. All rights reserved."; + + /** + * Default constructor. + * @param ID the string identifier for this transliterator + * @param filter the filter. Any character for which + * filter.isIn() returns false will not be + * altered by this transliterator. If filter is + * null then no filtering is applied. + */ + protected Transliterator(String ID, UnicodeFilter filter) { + if (ID == null) { + throw new NullPointerException(); + } + this.ID = ID; + this.filter = filter; + } + + /** + * Transliterates the segment of a string that begins at the + * character at offset start and extends to the + * character at offset limit - 1, with optional + * filtering. A default implementaion is provided here; + * subclasses should provide a more efficient implementation if + * possible. + * @param text the string to be transliterated + * @param start the beginning index, inclusive; 0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= text.length(). 
+ * @param result buffer to receive the transliterated text; previous + * contents are discarded + */ + public void transliterate(String text, int start, int limit, + StringBuffer result) { + /* This is a default implementation that should be replaced by + * a more efficient subclass implementation if possible. + */ + result.setLength(0); + result.append(text.substring(start, limit)); + transliterate(new ReplaceableString(result), + 0, result.length()); + } + + /** + * Transliterates a segment of a string, with optional filtering. + * Subclasses must override this abstract method. + * + * @param text the string to be transliterated + * @param start the beginning index, inclusive; 0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= text.length(). + * @param filter the filter. Any character for which + * filter.isIn() returns false will not be + * altered by this transliterator. If filter is + * null then no filtering is applied. + * @return The new limit index. The text previously occupying [start, + * limit) has been transliterated, possibly to a string of a different + * length, at [start, new-limit), where + * new-limit is the return value. + */ + public abstract int transliterate(Replaceable text, int start, int limit); + + /** + * Transliterates an entire string. Convenience method. + * @param text the string to be transliterated + * @param result buffer to receive the transliterated text; previous + * contents are discarded + */ + public final void transliterate(String text, StringBuffer result) { + transliterate(text, 0, text.length(), result); + } + + /** + * Transliterate an entire string and returns the result. Convenience method. 
+ * + * @param text the string to be transliterated + * @return The transliterated text + */ + public final String transliterate(String text) { + StringBuffer result = new StringBuffer(); + transliterate(text, 0, text.length(), result); + return result.toString(); + } + + /** + * Transliterates an entire string in place. Convenience method. + * @param text the string to be transliterated + */ + public final void transliterate(Replaceable text) { + transliterate(text, 0, text.length()); + } + + /** + * Transliterates the portion of the text buffer that can be + * transliterated unambiguously after new text has been inserted, + * typically as a result of a keyboard event. The new text in + * insertion will be inserted into text + * at index[LIMIT], advancing + * index[LIMIT] by insertion.length(). + * Then the transliterator will try to transliterate characters of + * text between index[CURSOR] and + * index[LIMIT]. Characters before + * index[CURSOR] will not be changed. + * + *

Upon return, values in index[] will be updated. + * index[START] will be advanced to the first + * character that future calls to this method will read. + * index[CURSOR] and index[LIMIT] will + * be adjusted to delimit the range of text that future calls to + * this method may change. + * + *

Typical usage of this method begins with an initial call + * with index[START] and index[LIMIT] + * set to indicate the portion of text to be + * transliterated, and index[CURSOR] == index[START]. + * Thereafter, index[] can be used without + * modification in future calls, provided that all changes to + * text are made via this method. + * + *

This method assumes that future calls may be made that will + * insert new text into the buffer. As a result, it only performs + * unambiguous transliterations. After the last call to this + * method, there may be untransliterated text that is waiting for + * more input to resolve an ambiguity. In order to perform these + * pending transliterations, clients should call {@link + * #finishKeyboardTransliteration} after the last call to this + * method has been made. + * + * @param text the buffer holding transliterated and untransliterated text + * @param index an array of three integers. + * + *

+ * + * @param insertion text to be inserted and possibly + * transliterated into the translation buffer at + * index[LIMIT]. If null then no text + * is inserted. + * @see #START + * @see #LIMIT + * @see #CURSOR + * @see #handleKeyboardTransliterate + * @exception IllegalArgumentException if index[] + * is invalid + */ + public final void keyboardTransliterate(Replaceable text, int[] index, + String insertion) { + if (index.length < 3 || + index[START] < 0 || + index[LIMIT] > text.length() || + index[CURSOR] < index[START] || + index[CURSOR] > index[LIMIT]) { + throw new IllegalArgumentException("Invalid index array"); + } + + int originalStart = index[START]; + if (insertion != null) { + text.replace(index[LIMIT], index[LIMIT], insertion); + index[LIMIT] += insertion.length(); + } + + handleKeyboardTransliterate(text, index); + + index[START] = Math.max(index[CURSOR] - getMaximumContextLength(), + originalStart); + } + + /** + * Transliterates the portion of the text buffer that can be + * transliterated unambiguously after a new character has been + * inserted, typically as a result of a keyboard event. This is a + * convenience method; see {@link + * #keyboardTransliterate(Replaceable, int[], String)} for details. + * @param text the buffer holding transliterated and + * untransliterated text + * @param index an array of three integers. See {@link + * #keyboardTransliterate(Replaceable, int[], String)}. + * @param insertion text to be inserted and possibly + * transliterated into the translation buffer at + * index[LIMIT]. + * @see #keyboardTransliterate(Replaceable, int[], String) + */ + public final void keyboardTransliterate(Replaceable text, int[] index, + char insertion) { + keyboardTransliterate(text, index, String.valueOf(insertion)); + } + + /** + * Transliterates the portion of the text buffer that can be + * transliterated unambiguously. This is a convenience method; see + * {@link #keyboardTransliterate(Replaceable, int[], String)} for + * details.
+ * @param text the buffer holding transliterated and + * untransliterated text + * @param index an array of three integers. See {@link + * #keyboardTransliterate(Replaceable, int[], String)}. + * @see #keyboardTransliterate(Replaceable, int[], String) + */ + public final void keyboardTransliterate(Replaceable text, int[] index) { + keyboardTransliterate(text, index, null); + } + + /** + * Finishes any pending transliterations that were waiting for + * more characters. Clients should call this method as the last + * call after a sequence of one or more calls to + * keyboardTransliterate(). + * @param text the buffer holding transliterated and + * untransliterated text. + * @param index the array of indices previously passed to {@link + * #keyboardTransliterate} + */ + public final void finishKeyboardTransliteration(Replaceable text, + int[] index) { + transliterate(text, index[START], index[LIMIT]); + } + + /** + * Abstract method that concrete subclasses define to implement + * keyboard transliteration. This method should transliterate all + * characters between index[CURSOR] and + * index[LIMIT] that can be unambiguously + * transliterated, regardless of future insertions of text at + * index[LIMIT]. index[CURSOR] should + * be advanced past committed characters (those that will not + * change in future calls to this method). + * index[LIMIT] should be updated to reflect text + * replacements that shorten or lengthen the text between + * index[CURSOR] and index[LIMIT]. Upon + * return, neither index[CURSOR] nor + * index[LIMIT] should be less than the initial value + * of index[CURSOR]. index[START] + * should not be changed. + * + * @param text the buffer holding transliterated and + * untransliterated text + * @param index an array of three integers. See {@link + * #keyboardTransliterate(Replaceable, int[], String)}. 
+ * @see #keyboardTransliterate + */ + protected abstract void handleKeyboardTransliterate(Replaceable text, + int[] index); + + /** + * Returns the length of the longest context required by this transliterator. + * This is preceding context. The default implementation supplied + * by Transliterator returns zero; subclasses + * that use preceding context should override this method to return the + * correct value. For example, if a transliterator translates "ddd" (where + * d is any digit) to "555" when preceded by "(ddd)", then the preceding + * context length is 5, the length of "(ddd)". + * + * @return The maximum number of preceding context characters this + * transliterator needs to examine + */ + protected int getMaximumContextLength() { + return 0; + } + + /** + * Returns a programmatic identifier for this transliterator. + * If this identifier is passed to getInstance(), it + * will return this object, if it has been registered. + * @see #registerInstance + * @see #registerClass + * @see #getAvailableIDs + */ + public final String getID() { + return ID; + } + + /** + * Returns a name for this transliterator that is appropriate for + * display to the user in the default locale. See {@link + * #getDisplayName(Locale)} for details. + */ + public final String getDisplayName() { + return getDisplayName(Locale.getDefault()); + } + + /** + * Returns a name for this transliterator that is appropriate for + * display to the user in the given locale. This name is taken + * from the locale resource data in the standard manner of the + * java.text package. + * + *

If no localized names exist in the system resource bundles, + * a name is synthesized using a localized + * MessageFormat pattern from the resource data. The + * arguments to this pattern are an integer followed by one or two + * strings. The integer is the number of strings, either 1 or 2. + * The strings are formed by splitting the ID for this + * transliterator at the first '-'. If there is no '-', then the + * entire ID forms the only string. + * @param inLocale the Locale in which the display name should be + * localized. + * @see java.text.MessageFormat + */ + public String getDisplayName(Locale inLocale) { + ResourceBundle bundle = ResourceBundle.getBundle( + RB_LOCALE_ELEMENTS, inLocale); + + try { + return bundle.getString(RB_DISPLAY_NAME_PREFIX + ID); + } catch (MissingResourceException e) {} + + try { + // Construct the formatter first; if getString() fails + // we'll exit the try block + MessageFormat format = new MessageFormat( + bundle.getString(RB_DISPLAY_NAME_PATTERN)); + // Construct the argument array + int i = ID.indexOf('-'); + Object[] args = (i < 0) + ? new Object[] { new Integer(1), ID } + : new Object[] { new Integer(2), ID.substring(0, i), + ID.substring(i+1) }; + // Format it using the pattern in the resource + return format.format(args); + } catch (MissingResourceException e2) {} + + // We should not reach this point unless there is something + // wrong with the build or the RB_DISPLAY_NAME_PATTERN has + // been deleted from the root RB_LOCALE_ELEMENTS resource. + throw new RuntimeException(); + } + + /** + * Returns the filter used by this transliterator, or null + * if this transliterator uses no filter. + */ + public UnicodeFilter getFilter() { + return filter; + } + + /** + * Changes the filter used by this transliterator. If the filter + * is set to null then no filtering will occur. + * + *

Callers must take care if a transliterator is in use by + * multiple threads. The filter should not be changed by one + * thread while another thread may be transliterating. + */ + public void setFilter(UnicodeFilter filter) { + this.filter = filter; + } + + /** + * Returns this transliterator's inverse. See the class + * documentation for details. This implementation simply inverts + * the two entities in the ID and attempts to retrieve the + * resulting transliterator. That is, if getID() + * returns "A-B", then this method will return the result of + * getInstance("B-A"), or null if that + * call fails. + * + *

This method does not take filtering into account. The + * returned transliterator will have no filter. + * + *

Subclasses with knowledge of their inverse may wish to + * override this method. + * + * @return a transliterator that is an inverse, not necessarily + * exact, of this transliterator, or null if no such + * transliterator is registered. + * @see #registerInstance + */ + public Transliterator getInverse() { + int i = ID.indexOf('-'); + if (i >= 0) { + String inverseID = ID.substring(i+1) + '-' + ID.substring(0, i); + return internalGetInstance(inverseID); + } + return null; + } + + /** + * Returns a Transliterator object given its ID. + * The ID must be either a system transliterator ID or a ID registered + * using registerInstance(). + * + * @param ID a valid ID, as enumerated by getAvailableIDs() + * @return A Transliterator object with the given ID + * @exception IllegalArgumentException if the given ID is invalid. + * @see #registerInstance + * @see #getAvailableIDs + * @see #getID + */ + public static Transliterator getInstance(String ID) { + Transliterator t = internalGetInstance(ID); + if (t != null) { + return t; + } + throw new IllegalArgumentException("Unsupported transliterator: " + + ID); + } + + /** + * Returns a transliterator object given its ID. Unlike getInstance(), + * this method returns null if it cannot make use of the given ID. + */ + private static Transliterator internalGetInstance(String ID) { + Object obj = cache.get(ID); + RuleBasedTransliterator.Data data = null; + + if (obj instanceof RuleBasedTransliterator.Data) { + data = (RuleBasedTransliterator.Data) obj; + // Fall through to construct transliterator from cached Data object. 
+ } else if (obj instanceof Class) { + try { + return (Transliterator) ((Class) obj).newInstance(); + } catch (InstantiationException e) { + } catch (IllegalAccessException e2) {} + } else { + synchronized (cache) { + boolean isReverse = (obj == REVERSE_RULE_BASED_PLACEHOLDER); + String resourceName = RB_RULE_BASED_PREFIX; + int i = ID.indexOf('-'); + if (i < 0) { + resourceName += ID; + } else { + String IDLeft = ID.substring(0, i); + String IDRight = ID.substring(i+1); + resourceName += isReverse ? (IDRight + IDLeft) + : (IDLeft + IDRight); + } + try { + ResourceBundle resource = ResourceBundle.getBundle(resourceName); + + data = RuleBasedTransliterator.parse(resource.getString(RB_RULE), + isReverse + ? RuleBasedTransliterator.REVERSE + : RuleBasedTransliterator.FORWARD); + + cache.put(ID, data); + // Fall through to construct transliterator from Data object. + } catch (MissingResourceException e) {} + } + } + + if (data != null) { + return new RuleBasedTransliterator(ID, data, null); + } + + return null; + } + + /** + * Registers a subclass of Transliterator with the + * system. This subclass must have a public constructor taking no + * arguments. When that constructor is called, the resulting + * object must return the ID passed to this method if + * its getID() method is called. + * + * @param ID the result of getID() for this + * transliterator + * @param transClass a subclass of Transliterator + * @see #registerInstance + * @see #unregister + */ + public static void registerClass(String ID, Class transClass) { + cache.put(ID, transClass); + } + + /** + * Unregisters a transliterator or class. This may be either + * a system transliterator or a user transliterator or class. 
+ * + * @param ID the ID of the transliterator or class + * @return the Object that was registered with + * ID, or null if none was + * @see #registerInstance + * @see #registerClass + */ + public static Object unregister(String ID) { + return cache.remove(ID); + } + + /** + * Returns an enumeration over the programmatic names of registered + * Transliterator objects. This includes both system + * transliterators and user transliterators registered using + * registerInstance(). The enumerated names may be + * passed to getInstance(). + * + * @return An Enumeration over String objects + * @see #getInstance + * @see #registerInstance + */ + public static final Enumeration getAvailableIDs() { + return cache.keys(); + } + + static { + ResourceBundle bundle = ResourceBundle.getBundle(RB_LOCALE_ELEMENTS); + + try { + String[] ruleBasedIDs = bundle.getStringArray(RB_RULE_BASED_IDS); + + cache = new Hashtable(); + + for (int i=0; iUnicodeFilter defines a protocol for selecting a + * subset of the full range (U+0000 to U+FFFF) of Unicode characters. + * Currently, filters are used in conjunction with classes like {@link + * Transliterator} to only process selected characters through a + * transformation. + * + * {@link UnicodeFilterLogic} + */ + +public interface UnicodeFilter { + + /** + * Returns true for characters that are in the selected + * subset. In other words, if a character is to be + * filtered, then isIn() returns + * false. + */ + public boolean isIn(char c); +} diff --git a/icu4j/src/com/ibm/icu/text/UnicodeFilterLogic.java b/icu4j/src/com/ibm/icu/text/UnicodeFilterLogic.java new file mode 100755 index 00000000000..f9e6ec1c609 --- /dev/null +++ b/icu4j/src/com/ibm/icu/text/UnicodeFilterLogic.java @@ -0,0 +1,112 @@ +package com.ibm.text; + +/** + * UnicodeFilterLogic provides logical operators on + * {@link UnicodeFilter} objects. This class cannot be instantiated; + * it consists only of static methods. 
The static methods return + * filter objects that perform logical inversion (not), + * intersection (and), or union (or) of the given + * filter objects. + */ +public final class UnicodeFilterLogic { + + /** + * Returns a UnicodeFilter that implements the inverse of + * the given filter. + */ + public static UnicodeFilter not(final UnicodeFilter f) { + return new UnicodeFilter() { + public boolean isIn(char c) { + return !f.isIn(c); + } + }; + } + + /** + * Returns a UnicodeFilter that implements a short + * circuit AND of the result of the two given filters. That is, + * if f.isIn() is false, then g.isIn() + * is not called, and isIn() returns false. + * + *

Either f or g must be non-null. + */ + public static UnicodeFilter and(final UnicodeFilter f, + final UnicodeFilter g) { + if (f == null) { + return g; + } + if (g == null) { + return f; + } + return new UnicodeFilter() { + public boolean isIn(char c) { + return f.isIn(c) && g.isIn(c); + } + }; + } + + /** + * Returns a UnicodeFilter that implements a short + * circuit AND of the result of the given filters. That is, if + * f[i].isIn() is false, then + * f[j].isIn() is not called, where j > i, and + * isIn() returns false. + */ + public static UnicodeFilter and(final UnicodeFilter[] f) { + return new UnicodeFilter() { + public boolean isIn(char c) { + for (int i=0; iUnicodeFilter that implements a short + * circuit OR of the result of the two given filters. That is, if + * f.isIn() is true, then g.isIn() is + * not called, and isIn() returns true. + * + *

Either f or g must be non-null. + */ + public static UnicodeFilter or(final UnicodeFilter f, + final UnicodeFilter g) { + if (f == null) { + return g; + } + if (g == null) { + return f; + } + return new UnicodeFilter() { + public boolean isIn(char c) { + return f.isIn(c) || g.isIn(c); + } + }; + } + + /** + * Returns a UnicodeFilter that implements a short + * circuit OR of the result of the given filters. That is, if + * f[i].isIn() is false, then + * f[j].isIn() is not called, where j > i, and + * isIn() returns true. + */ + public static UnicodeFilter or(final UnicodeFilter[] f) { + return new UnicodeFilter() { + public boolean isIn(char c) { + for (int i=0; icharacter classes used in regular expressions. + * Such classes specify a subset of the set of all Unicode characters, + * which in this implementation is the characters from U+0000 to + * U+FFFF, ignoring surrogates. + * + *

This class supports two APIs. The first is modeled after Java 2's + * java.util.Set interface, although this class does not + * implement that interface. All methods of Set are + * supported, with the modification that they take a character range + * or single character instead of an Object, and they + * take a UnicodeSet instead of a Collection. + * + *

The second API is the + * applyPattern()/toPattern() API from the + * java.text.Format-derived classes. Unlike the + * methods that add characters, add categories, and control the logic + * of the set, the method applyPattern() sets all + * attributes of a UnicodeSet at once, based on a + * string pattern. + * + *

In addition, the set complement operation is supported through + * the complement() method. + * + *

Pattern syntax

+ * + * Patterns are accepted by the constructors and the + * applyPattern() methods and returned by the + * toPattern() method. These patterns follow a syntax + * similar to that employed by version 8 regular expression character + * classes: + * + *
+ * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
pattern :=  ('[' '^'? item* ']') | + * ('[:' '^'? category ':]')
item :=  char | (char '-' char) | pattern-expr
+ *
pattern-expr :=  pattern | pattern-expr pattern | + * pattern-expr op pattern
+ *
op :=  '&' | '-'
+ *
special :=  '[' | ']' | '-'
+ *
char :=  any character that is not special
+ * | ('\u005C'
any character)
+ * | ('\u005Cu' hex hex hex hex)
+ *
hex :=  any character for which + * Character.digit(c, 16) + * returns a non-negative result
category :=  'M' | 'N' | 'Z' | 'C' | 'L' | 'P' | + * 'S' | 'Mn' | 'Mc' | 'Me' | 'Nd' | 'Nl' | 'No' | 'Zs' | 'Zl' | + * 'Zp' | 'Cc' | 'Cf' | 'Cs' | 'Co' | 'Cn' | 'Lu' | 'Ll' | 'Lt' + * | 'Lm' | 'Lo' | 'Pc' | 'Pd' | 'Ps' | 'Pe' | 'Po' | 'Sm' | + * 'Sc' | 'Sk' | 'So'
+ *
+ * + * + * + * + *
Legend: + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
a := b  a may be replaced by b
a?zero or one instance of a
+ *
a*zero or more instances of a
+ *
a | beither a or b
+ *
'a'the literal string between the quotes
+ *
+ *
+ * + * Patterns specify individual characters, ranges of characters, and + * Unicode character categories. When elements are concatenated, they + * specify their union. To complement a set, place a '^' immediately + * after the opening '[' or '[:'. In any other location, '^' has no + * special meaning. + * + *

Ranges are indicated by placing a '-' between two + * characters, as in "a-z". This specifies the range of all + * characters from the left to the right, in Unicode order. If the + * left and right characters are the same, then the range consists of + * just that character. If the left character is greater than the + * right character it is a syntax error. If a '-' occurs as the first + * character after the opening '[' or '[^', or if it occurs as the + * last character before the closing ']', then it is taken as a + * literal. Thus "[a\u005C-b]", "[-ab]", and "[ab-]" all indicate the same + * set of three characters, 'a', 'b', and '-'. + * + *

Sets may be intersected using the '&' operator or the asymmetric + * set difference may be taken using the '-' operator, for example, + * "[[:L:]&[\u005Cu0000-\u005Cu0FFF]]" indicates the set of all Unicode letters + * with values less than 4096. Operators ('&' and '-') have equal + * precedence and bind left-to-right. Thus + * "[[:L:]-[a-z]-[\u005Cu0100-\u005Cu01FF]]" is equivalent to + * "[[[:L:]-[a-z]]-[\u005Cu0100-\u005Cu01FF]]". This only really matters for + * difference; intersection is commutative. + * + * + *
[a]The set containing 'a' + *
[a-z]The set containing 'a' + * through 'z' and all letters in between, in Unicode order + *
[^a-z]The set containing + * all characters but 'a' through 'z', + * that is, U+0000 through 'a'-1 and 'z'+1 through U+FFFF + *
[[pat1][pat2]] + * The union of sets specified by pat1 and pat2 + *
[[pat1]&[pat2]] + * The intersection of sets specified by pat1 and pat2 + *
[[pat1]-[pat2]] + * The asymmetric difference of sets specified by pat1 and + * pat2 + *
[:Lu:] + * The set of characters belonging to the given + * Unicode category, as defined by Character.getType(); in + * this case, Unicode uppercase letters + *
[:L:] + * The set of characters belonging to all Unicode categories + * starting with 'L', that is, [[:Lu:][:Ll:][:Lt:][:Lm:][:Lo:]]. + *
+ * + *

Character categories. + * + * Character categories are specified using the POSIX-like syntax + * '[:Lu:]'. The complement of a category is specified by inserting + * '^' after the opening '[:'. The following category names are + * recognized. Actual determination of category data uses + * Character.getType(), so it reflects the underlying + * implementation used by Character. As of Java 2 and + * JDK 1.1.8, this is Unicode 2.1.2. + * + *

+ * Normative
+ *     Mn = Mark, Non-Spacing
+ *     Mc = Mark, Spacing Combining
+ *     Me = Mark, Enclosing
+ * 
+ *     Nd = Number, Decimal Digit
+ *     Nl = Number, Letter
+ *     No = Number, Other
+ * 
+ *     Zs = Separator, Space
+ *     Zl = Separator, Line
+ *     Zp = Separator, Paragraph
+ * 
+ *     Cc = Other, Control
+ *     Cf = Other, Format
+ *     Cs = Other, Surrogate
+ *     Co = Other, Private Use
+ *     Cn = Other, Not Assigned
+ * 
+ * Informative
+ *     Lu = Letter, Uppercase
+ *     Ll = Letter, Lowercase
+ *     Lt = Letter, Titlecase
+ *     Lm = Letter, Modifier
+ *     Lo = Letter, Other
+ * 
+ *     Pc = Punctuation, Connector
+ *     Pd = Punctuation, Dash
+ *     Ps = Punctuation, Open
+ *     Pe = Punctuation, Close
+ *    *Pi = Punctuation, Initial quote
+ *    *Pf = Punctuation, Final quote
+ *     Po = Punctuation, Other
+ * 
+ *     Sm = Symbol, Math
+ *     Sc = Symbol, Currency
+ *     Sk = Symbol, Modifier
+ *     So = Symbol, Other
+ * 
+ * *Unsupported by Java (and hence unsupported by UnicodeSet). + * + * @author Alan Liu + * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $ */ +public class UnicodeSet { + /** + * The internal representation is a StringBuffer of even length. + * Each pair of characters represents a range that is included in + * the set. A single character c is represented as cc. Thus, the + * ranges in the set are (a,b), a and b inclusive, where a = + * pairs.charAt(i) and b = pairs.charAt(i+1) for all even i, 0 <= + * i <= pairs.length()-2. Pairs are always stored in ascending + * Unicode order. Pairs are always stored in shortest form. For + * example, if the pair "hh", representing the single character + * 'h', is added to the pairs list "agik", representing the ranges + * 'a'-'g' and 'i'-'k', the result is "ak", not "aghhik". + * + * This representation format was originally used in Richard + * Gillam's CharSet class. + */ + private StringBuffer pairs; + + private static final String CATEGORY_NAMES = + // 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 + //0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 8 9 0 1 2 3 4 5 6 7 8 + "CnLuLlLtLmLoMnMeMcNdNlNoZsZlZpCcCf--CoCsPdPsPePcPoSmScSkSo"; + + private static final int UNSUPPORTED_CATEGORY = 17; + + private static final int CATEGORY_COUNT = 29; + + /** + * A cache mapping character category integers, as returned by + * Character.getType(), to pairs strings. Entries are initially + * null and are created on demand. + */ + private static final String[] CATEGORY_PAIRS_CACHE = + new String[CATEGORY_COUNT]; + + //---------------------------------------------------------------- + // Debugging and testing + //---------------------------------------------------------------- + + /** + * Return the representation of this set as a list of character + * ranges. Ranges are listed in ascending Unicode order. For + * example, the set [a-zA-M3] is represented as "33AMaz". 
+ */ + public String getPairs() { + return pairs.toString(); + } + + //---------------------------------------------------------------- + // Public API + //---------------------------------------------------------------- + + /** + * Constructs an empty set. + */ + public UnicodeSet() { + pairs = new StringBuffer(); + } + + /** + * Constructs a set from the given pattern. See the class description + * for the syntax of the pattern language. + * @param pattern a string specifying what characters are in the set + * @exception IllegalArgumentException if the pattern contains + * a syntax error. + */ + public UnicodeSet(String pattern) { + applyPattern(pattern, false); + } + + /** + * Constructs a set from the given pattern, optionally ignoring + * white space. See the class description for the syntax of the + * pattern language. + * @param pattern a string specifying what characters are in the set + * @param ignoreSpaces if true, all spaces in the + * pattern are ignored, except those preceded by '\u005C'. Spaces are + * those characters for which Character.isSpaceChar() + * is true. + * @exception IllegalArgumentException if the pattern + * contains a syntax error. + */ + public UnicodeSet(String pattern, boolean ignoreSpaces) { + applyPattern(pattern, ignoreSpaces); + } + + /** + * Constructs a set from the given Unicode character category. + * @param category an integer indicating the character category as + * returned by Character.getType(). + * @exception IllegalArgumentException if the given + * category is invalid. + */ + public UnicodeSet(int category) { + if (category < 0 || category >= CATEGORY_COUNT || + category == UNSUPPORTED_CATEGORY) { + throw new IllegalArgumentException("Invalid category"); + } + pairs = new StringBuffer(getCategoryPairs(category)); + } + + /** + * Modifies this set to represent the set specified by the given + * pattern. See the class description for the syntax of the + * pattern language. 
+ * @param pattern a string specifying what characters are in the set + * @exception IllegalArgumentException if the pattern + * contains a syntax error. + */ + public final void applyPattern(String pattern) { + applyPattern(pattern, false); + } + + /** + * Modifies this set to represent the set specified by the given + * pattern, optionally ignoring white space. See the class + * description for the syntax of the pattern language. + * @param pattern a string specifying what characters are in the set + * @param ignoreSpaces if true, all spaces in the + * pattern are ignored. Spaces are those characters for which + * Character.isSpaceChar() is true. + * Characters preceded by '\\' are escaped, losing any special + * meaning they otherwise have. Spaces may be included by + * escaping them. + * @exception IllegalArgumentException if the pattern + * contains a syntax error. + */ + public void applyPattern(String pattern, boolean ignoreSpaces) { + ParsePosition pos = new ParsePosition(0); + + // To ignore spaces, create a new pattern without spaces. We + // have to process all '\' escapes. If '\' is encountered, + // insert it and the following character (if any -- let parse + // deal with any syntax errors) in the pattern. This allows + // escaped spaces. + if (ignoreSpaces) { + StringBuffer pat = new StringBuffer(); + for (int i=0; in, where 0 <= n <= 65536. + * + * @return the number of elements in this set (its cardinality). + */ + public int size() { + int n = 0; + for (int i=0; itrue if this set contains no elements. + * + * @return true if this set contains no elements. + */ + public boolean isEmpty() { + return pairs.length() == 0; + } + + /** + * Returns true if this set contains the specified range + * of chars. + * + * @return true if this set contains the specified range + * of chars. 
+ */ + public boolean contains(char first, char last) { + // Set i to the end of the smallest range such that its end + // point >= last, or pairs.length() if no such range exists. + int i = 1; + while (ipairs.charAt(i)) i+=2; + return i=pairs.charAt(i-1); + } + + /** + * Returns true if this set contains the specified char. + * + * @return true if this set contains the specified char. + */ + public boolean contains(char c) { + return contains(c, c); + } + + /** + * Adds the specified range to this set if it is not already + * present. If this set already contains the specified range, + * the call leaves this set unchanged. If last > first + * then an empty range is added, leaving the set unchanged. + * + * @param first first character, inclusive, of range to be added + * to this set. + * @param last last character, inclusive, of range to be added + * to this set. + */ + public void add(char first, char last) { + if (first <= last) { + addPair(pairs, first, last); + } + } + + /** + * Adds the specified character to this set if it is not already + * present. If this set already contains the specified character, + * the call leaves this set unchanged. + */ + public final void add(char c) { + add(c, c); + } + + /** + * Removes the specified range from this set if it is present. + * The set will not contain the specified range once the call + * returns. If last > first then an empty range is + * removed, leaving the set unchanged. + * + * @param first first character, inclusive, of range to be removed + * from this set. + * @param last last character, inclusive, of range to be removed + * from this set. + */ + public void remove(char first, char last) { + if (first <= last) { + removePair(pairs, first, last); + } + } + + /** + * Removes the specified character from this set if it is present. + * The set will not contain the specified range once the call + * returns. 
+ */ + public final void remove(char c) { + remove(c, c); + } + + /** + * Returns true if the specified set is a subset + * of this set. + * + * @param c set to be checked for containment in this set. + * @return true if this set contains all of the elements of the + * specified set. + */ + public boolean containsAll(UnicodeSet c) { + // The specified set is a subset if all of its pairs are contained + // in this set. + int i = 1; + for (int j=0; j= last, or pairs.length() if no such range + // exists. + while (ipairs.charAt(i)) i+=2; + if (i>pairs.length() || c.pairs.charAt(j) < pairs.charAt(i-1)) { + return false; + } + } + return true; + } + + /** + * Adds all of the elements in the specified set to this set if + * they're not already present. This operation effectively + * modifies this set so that its value is the union of the two + * sets. The behavior of this operation is unspecified if the specified + * collection is modified while the operation is in progress. + * + * @param c set whose elements are to be added to this set. + * @see #add(char, char) + */ + public void addAll(UnicodeSet c) { + doUnion(pairs, c.pairs.toString()); + } + + /** + * Retains only the elements in this set that are contained in the + * specified set. In other words, removes from this set all of + * its elements that are not contained in the specified set. This + * operation effectively modifies this set so that its value is + * the intersection of the two sets. + * + * @param c set that defines which elements this set will retain. + */ + public void retainAll(UnicodeSet c) { + doIntersection(pairs, c.pairs.toString()); + } + + /** + * Removes from this set all of its elements that are contained in the + * specified set. This operation effectively modifies this + * set so that its value is the asymmetric set difference of + * the two sets. + * + * @param c set that defines which elements will be removed from + * this set. 
+ */ + public void removeAll(UnicodeSet c) { + doDifference(pairs, c.pairs.toString()); + } + + /** + * Inverts this set. This operation modifies this set so that + * its value is its complement. This is equivalent to the pseudo code: + * this = new UnicodeSet("[\u0000-\uFFFF]").removeAll(this). + */ + public void complement() { + doComplement(pairs); + } + + /** + * Removes all of the elements from this set. This set will be + * empty after this call returns. + */ + public void clear() { + pairs.setLength(0); + } + + /** + * Compares the specified object with this set for equality. Returns + * true if the specified object is also a set, the two sets + * have the same size, and every member of the specified set is + * contained in this set (or equivalently, every member of this set is + * contained in the specified set). + * + * @param o Object to be compared for equality with this set. + * @return true if the specified Object is equal to this set. + */ + public boolean equals(Object o) { + return o instanceof UnicodeSet && + pairs.equals(((UnicodeSet)o).pairs); + } + + /** + * Returns the hash code value for this set. + * + * @return the hash code value for this set. + * @see Object#hashCode() + */ + public int hashCode() { + return pairs.hashCode(); + } + + //---------------------------------------------------------------- + // Implementation: Pattern parsing + //---------------------------------------------------------------- + + /** + * Parses the given pattern, starting at the given position. The + * character at pattern.charAt(pos.getIndex()) must be '[', or the + * parse fails. Parsing continues until the corresponding closing + * ']'. If a syntax error is encountered between the opening and + * closing brace, the parse fails. Upon return from a successful + * parse, the ParsePosition is updated to point to the character + * following the closing ']', and a StringBuffer containing a + * pairs list for the parsed pattern is returned. 
This method calls + * itself recursively to parse embedded subpatterns. + * + * @param pattern the string containing the pattern to be parsed. + * The portion of the string from pos.getIndex(), which must be a + * '[', to the corresponding closing ']', is parsed. + * @param pos upon entry, the position at which to being parsing. + * The character at pattern.charAt(pos.getIndex()) must be a '['. + * Upon return from a successful parse, pos.getIndex() is either + * the character after the closing ']' of the parsed pattern, or + * pattern.length() if the closing ']' is the last character of + * the pattern string. + * @return a StringBuffer containing a pairs list for the parsed + * substring of pattern + * @exception IllegalArgumentException if the parse fails. + */ + private static StringBuffer parse(String pattern, ParsePosition pos) { + + boolean invert = false; + StringBuffer pairsBuf = new StringBuffer(); + + /** + * Nodes: 0 - idle, waiting for '[' + * 10 - like 11, but immediately after "[" or "[^" + * 11 - awaiting x, "]", "[...]", or "[:...:]" + * 21 - after x + * 23 - after x- + * + * The parsing state machine moves from node 0 through zero or more + * other nodes back to node 0, in a successful parse. + */ + int node = 0; + char first = 0; + int i; + + /** + * This loop iterates over the characters in the pattern. We + * start at the position specified by pos. We exit the loop + * when either a matching closing ']' is seen, or we read all + * characters of the pattern. + */ + for (i=pos.getIndex(); i= pattern.length()) { + throw new IllegalArgumentException("Invalid \\u escape"); + } + c = '\u0000'; + for (int j=(++i)+4; i "aq". addPair("ampz", 'n', + * 'o') => "az". + */ + private static void addPair(StringBuffer pairs, char c, char d) { + char a = 0; + char b = 0; + for (int i=0; i "ak". + * removePair("ampz", 'l', 'q') => "akrz". 
+ */ + private static void removePair(StringBuffer pairs, char c, char d) { + // Iterate over pairs until we find a pair that overlaps + // with the given range. + for (int i=0; i= a. + // rangeEdited is set to true if we have modified the + // range a-b (the range at i) in place. + boolean rangeEdited = false; + if (c > a) { + // If c is after a and before b, then we have overlap + // of this sort: a--c==b--d or a--c==d--b, where a-b + // and c-d are the ranges of interest. We need to + // add the range a,c-1. + pairs.setCharAt(i+1, (char)(c-1)); + // i is already a + rangeEdited = true; + } + if (d < b) { + // If d is after a and before b, we overlap like this: + // c--a==d--b or a--c==d--b, where a-b is the range at + // i and c-d is the range being removed. We need to + // add the range d+1,b. + if (rangeEdited) { + pairs.insert(i+2, new char[] { (char)(d+1), b }); + i += 2; + } else { + pairs.setCharAt(i, (char)(d+1)); + // i+1 is already b + rangeEdited = true; + } + } + if (!rangeEdited) { + // If we didn't add any ranges, that means the entire + // range a-b must be deleted, since we have + // c--a==b--d. + stringBufferDelete(pairs, i, i+2); + i -= 2; + } + } + } + + //---------------------------------------------------------------- + // Implementation: Fundamental operators + //---------------------------------------------------------------- + + /** + * Changes the pairs list to represent the complement of the set it + * currently represents. The pairs list will be normalized (in + * order and in shortest possible form) if the original pairs list + * was normalized. + */ + private static void doComplement(StringBuffer pairs) { + if (pairs.length() == 0) { + pairs.append('\u0000').append('\uffff'); + return; + } + + // Change each end to a start and each start to an end of the + // gaps between the ranges. That is, 3-7 9-12 becomes x-2 8-8 + // 13-x, where 'x' represents a range that must now be fixed + // up. 
+ for (int i=0; i 0 && c1.charAt(i - 1) > ub) + ub = c1.charAt(i - 1); + + // now advance j to the first character that is greater + // that "ub" plus one + while (j < c2.length() && c2.charAt(j) <= ub + 1) + ++j; + + // if j points to the endpoint of a range, update "ub" + // to that character, or if j points to the start of + // a range and the endpoint of the preceding range is + // greater than "ub", update "up" to _that_ character + if (j % 2 == 1) + ub = c2.charAt(j); + else if (j > 0 && c2.charAt(j - 1) > ub) + ub = c2.charAt(j - 1); + } + // when we finally fall out of this loop, we will have stitched + // together a series of ranges that overlap or touch, i and j + // will both point to starting points of ranges, and "ub" will + // be the endpoint of the range we're working on. Write "ub" + // to the result + result.append(ub); + + // loop back around to create the next range in the result + } + + // we fall out to here when we've exhausted all the characters in + // one of the operands. We can append all of the remaining characters + // in the other operand without doing any extra work. + if (i < c1.length()) + result.append(c1.substring(i)); + if (j < c2.length()) + result.append(c2.substring(j)); + + pairs.setLength(0); + pairs.append(result.toString()); + } + + /** + * Given two pairs lists, changes the first in place to represent + * the asymmetric difference of the two sets. + */ + private static void doDifference(StringBuffer pairs, String pairs2) { + StringBuffer p2 = new StringBuffer(pairs2); + doComplement(p2); + doIntersection(pairs, p2.toString()); + } + + /** + * Given two pairs lists, changes the first in place to represent + * the intersection of the two sets. + * + * This implementation format was stolen from Richard Gillam's + * CharSet class. 
*/
    private static void doIntersection(StringBuffer pairs, String c2) {
        // pairs: first operand; its contents are replaced with the result
        //        at the end of this method.
        // c2:    second operand; only read.
        // Both operands are walked as pairs lists: the code below treats
        // even indices as range starts and odd indices as range ends.
        StringBuffer result = new StringBuffer();
        String c1 = pairs.toString();

        // i is the cursor into c1, j the cursor into c2.
        int i = 0;
        int j = 0;
        int oldI;
        int oldJ;

        // iterate until we've exhausted one of the operands
        while (i < c1.length() && j < c2.length()) {

            // advance j until it points to a character that is larger than
            // the one i points to.  If this is the beginning of a one-
            // character range, advance j to point to the end
            if (i < c1.length() && i % 2 == 0) {
                while (j < c2.length() && c2.charAt(j) < c1.charAt(i))
                    ++j;
                if (j < c2.length() && j % 2 == 0 && c2.charAt(j) == c1.charAt(i))
                    ++j;
            }

            // if j points to the endpoint of a range, save the current
            // value of i, then advance i until it reaches a character
            // which is larger than the character pointed at
            // by j.  All of the characters we've advanced over (except
            // the one currently pointed to by i) are added to the result
            oldI = i;
            while (j % 2 == 1 && i < c1.length() && c1.charAt(i) <= c2.charAt(j))
                ++i;
            result.append(c1.substring(oldI, i));

            // if i points to the endpoint of a range, save the current
            // value of j, then advance j until it reaches a character
            // which is larger than the character pointed at
            // by i.  All of the characters we've advanced over (except
            // the one currently pointed to by j) are added to the result
            oldJ = j;
            while (i % 2 == 1 && j < c2.length() && c2.charAt(j) <= c1.charAt(i))
                ++j;
            result.append(c2.substring(oldJ, j));

            // advance i until it points to a character larger than j
            // If it points at the beginning of a one-character range,
            // advance it to the end of that range
            if (j < c2.length() && j % 2 == 0) {
                while (i < c1.length() && c1.charAt(i) < c2.charAt(j))
                    ++i;
                if (i < c1.length() && i % 2 == 0 && c2.charAt(j) == c1.charAt(i))
                    ++i;
            }
        }

        // Overwrite the first operand in place with the computed list.
        pairs.setLength(0);
        pairs.append(result.toString());
    }

    //----------------------------------------------------------------
    // Implementation: Generation of pairs for Unicode categories
    //----------------------------------------------------------------

    /**
     * Returns a pairs string for the given category, given its name.
     * The category name must be either a two-letter name, such as
     * "Lu", or a one letter name, such as "L".  One-letter names
     * indicate the logical union of all two-letter names that start
     * with that letter.  Case is significant.  If the name starts
     * with the character '^' then the complement of the given
     * character set is returned.
     *
     * Although individual categories such as "Lu" are cached, we do
     * not currently cache single-letter categories such as "L" or
     * complements such as "^Lu" or "^L".  It would be easy to cache
     * these as well in a hashtable should the need arise.
+ */ + private static String getCategoryPairs(String catName) { + boolean invert = (catName.length() > 1 && + catName.charAt(0) == '^'); + if (invert) { + catName = catName.substring(1); + } + + StringBuffer cat = null; + + // if we have two characters, search the category map for that + // code and either construct and return a UnicodeSet from the + // data in the category map or throw an exception + if (catName.length() == 2) { + int i = CATEGORY_NAMES.indexOf(catName); + if (i>=0 && i%2==0) { + i /= 2; + if (i != UNSUPPORTED_CATEGORY) { + String pairs = getCategoryPairs(i); + if (!invert) { + return pairs; + } + cat = new StringBuffer(pairs); + } + } + } else if (catName.length() == 1) { + // if we have one character, search the category map for + // codes beginning with that letter, and union together + // all of the matching sets that we find (or throw an + // exception if there are no matches) + for (int i=0; i= 0) { + pairs.append((char)first).append((char)last); + } + first = last = i; + } + } + } + if (first >= 0) { + pairs.append((char)first).append((char)last); + } + CATEGORY_PAIRS_CACHE[cat] = pairs.toString(); + } + return CATEGORY_PAIRS_CACHE[cat]; + } + + //---------------------------------------------------------------- + // Implementation: Utility methods + //---------------------------------------------------------------- + + /** + * Returns the character after the given position, or '\uFFFF' if + * there is none. + + */ + private static final char charAfter(String str, int i) { + return ((++i) < str.length()) ? str.charAt(i) : '\uFFFF'; + } + + /** + * Deletes a range of character from a StringBuffer, from start to + * limit-1. This is not part of JDK 1.1 StringBuffer, but is + * present in Java 2. 
+ * @param start inclusive start of range + * @param limit exclusive end of range + */ + private static void stringBufferDelete(StringBuffer buf, + int start, int limit) { + // In Java 2 just use: + // buf.delete(start, limit); + char[] chars = null; + if (buf.length() > limit) { + chars = new char[buf.length() - limit]; + buf.getChars(limit, buf.length(), chars, 0); + } + buf.setLength(start); + if (chars != null) { + buf.append(chars); + } + } +} diff --git a/icu4j/src/com/ibm/icu/text/UnicodeToHexTransliterator.java b/icu4j/src/com/ibm/icu/text/UnicodeToHexTransliterator.java new file mode 100755 index 00000000000..1e688f65fa9 --- /dev/null +++ b/icu4j/src/com/ibm/icu/text/UnicodeToHexTransliterator.java @@ -0,0 +1,172 @@ +package com.ibm.text; +import java.util.*; + +/** + * A transliterator that converts from Unicode characters to + * hexadecimal Unicode escape sequences. It outputs a + * prefix specified in the constructor and optionally converts the hex + * digits to uppercase. + * + *

Copyright © IBM Corporation 1999. All rights reserved. + * + * @author Alan Liu + * @version $RCSfile: UnicodeToHexTransliterator.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $ + */ +public class UnicodeToHexTransliterator extends Transliterator { + + /** + * Package accessible ID for this transliterator. + */ + static String _ID = "Unicode-Hex"; + + private String prefix; + + private boolean uppercase; + + private static final String COPYRIGHT = + "\u00A9 IBM Corporation 1999. All rights reserved."; + + /** + * Constructs a transliterator. + * @param prefix the string that will precede the four hex + * digits for UNICODE_HEX transliterators. Ignored + * if direction is HEX_UNICODE. + * @param uppercase if true, the four hex digits will be + * converted to uppercase; otherwise they will be lowercase. + * Ignored if direction is HEX_UNICODE. + */ + public UnicodeToHexTransliterator(String prefix, boolean uppercase, + UnicodeFilter filter) { + super(_ID, filter); + this.prefix = prefix; + this.uppercase = uppercase; + } + + /** + * Constructs a transliterator with the default prefix "\u" + * that outputs uppercase hex digits. + */ + public UnicodeToHexTransliterator() { + this("\\u", true, null); + } + + /** + * Returns the string that precedes the four hex digits. + * @return prefix string + */ + public String getPrefix() { + return prefix; + } + + /** + * Sets the string that precedes the four hex digits. + * + *

Callers must take care if a transliterator is in use by + * multiple threads. The prefix should not be changed by one + * thread while another thread may be transliterating. + * @param prefix prefix string + */ + public void setPrefix(String prefix) { + this.prefix = prefix; + } + + /** + * Returns true if this transliterator outputs uppercase hex digits. + */ + public boolean isUppercase() { + return uppercase; + } + + /** + * Sets if this transliterator outputs uppercase hex digits. + * + *

Callers must take care if a transliterator is in use by + * multiple threads. The uppercase mode should not be changed by + * one thread while another thread may be transliterating. + * @param outputUppercase if true, then this transliterator + * outputs uppercase hex digits. + */ + public void setUppercase(boolean outputUppercase) { + uppercase = outputUppercase; + } + + /** + * Transliterates a segment of a string. Transliterator API. + * @param text the string to be transliterated + * @param start the beginning index, inclusive; 0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= text.length(). + * @return the new limit index + */ + public int transliterate(Replaceable text, int start, int limit) { + int[] offsets = { start, limit, start }; + handleKeyboardTransliterate(text, offsets); + return offsets[LIMIT]; + } + + /** + * Implements {@link Transliterator#handleKeyboardTransliterate}. + */ + protected void handleKeyboardTransliterate(Replaceable text, + int[] offsets) { + /** + * Performs transliteration changing all characters to + * Unicode hexadecimal escapes. For example, '@' -> "U+0040", + * assuming the prefix is "U+". + */ + int cursor = offsets[CURSOR]; + int limit = offsets[LIMIT]; + + UnicodeFilter filter = getFilter(); + + loop: + while (cursor < limit) { + char c = text.charAt(cursor); + if (filter != null && !filter.isIn(c)) { + ++cursor; + continue; + } + String hex = hex(c); + text.replace(cursor, cursor+1, hex); + int len = hex.length(); + cursor += len; // Advance cursor by 1 and adjust for new text + --len; + limit += len; + } + + offsets[LIMIT] = limit; + offsets[CURSOR] = cursor; + } + + /** + * Return the length of the longest context required by this transliterator. + * This is preceding context. 
+ * @param direction either FORWARD or REVERSE + * @return maximum number of preceding context characters this + * transliterator needs to examine + */ + protected int getMaximumContextLength() { + return 0; + } + + /** + * Form escape sequence. + */ + private final String hex(char c) { + StringBuffer buf = new StringBuffer(); + buf.append(prefix); + if (c < 0x1000) { + buf.append('0'); + if (c < 0x100) { + buf.append('0'); + if (c < 0x10) { + buf.append('0'); + } + } + } + String h = Integer.toHexString(c); + buf.append(uppercase ? h.toUpperCase() : h); + return buf.toString(); + } +} diff --git a/icu4j/src/com/ibm/test/translit/TransliteratorTest.java b/icu4j/src/com/ibm/test/translit/TransliteratorTest.java new file mode 100755 index 00000000000..96433f64a26 --- /dev/null +++ b/icu4j/src/com/ibm/test/translit/TransliteratorTest.java @@ -0,0 +1,763 @@ +import com.ibm.text.*; +import java.text.*; +import java.util.*; + +/** + * @test + * @summary General test of Transliterator + */ +public class TransliteratorTest extends IntlTest { + + public static void main(String[] args) throws Exception { + new TransliteratorTest().run(args); + } + + /** + * A CommonPoint legacy round-trip test for the Kana transliterator. 
+ */ +// public void TestKanaRoundTrip() { +// Transliterator t = Transliterator.getInstance("Kana"); +// StringTokenizer tok = new StringTokenizer(KANA_RT_DATA); +// while (tok.hasMoreTokens()) { +// String str = tok.nextToken(); +// ReplaceableString tmp = new ReplaceableString(str); +// t.transliterate(tmp, Transliterator.FORWARD); +// +// str = tmp.toString(); +// tmp = new ReplaceableString(str); +// t.transliterate(tmp, Transliterator.REVERSE); +// t.transliterate(tmp, Transliterator.FORWARD); +// if (!tmp.toString().equals(str)) { +// tmp = new ReplaceableString(str); +// t.transliterate(tmp, Transliterator.REVERSE); +// String a = tmp.toString(); +// t.transliterate(tmp, Transliterator.FORWARD); +// errln("FAIL: " + escape(str) + " -> " + +// escape(a) + " -> " + escape(tmp.toString())); +// } +// } +// } + + public void TestInstantiation() { + long ms = System.currentTimeMillis(); + String ID; + for (Enumeration e = Transliterator.getAvailableIDs(); e.hasMoreElements(); ) { + ID = (String) e.nextElement(); + try { + Transliterator t = Transliterator.getInstance(ID); + // We should get a new instance if we try again + Transliterator t2 = Transliterator.getInstance(ID); + if (t != t2) { + logln(ID + ":" + t); + } else { + errln("FAIL: " + ID + " returned identical instances"); + } + } catch (IllegalArgumentException ex) { + errln("FAIL: " + ID); + throw ex; + } + } + + // Now test the failure path + try { + ID = ""; + Transliterator t = Transliterator.getInstance(ID); + errln("FAIL: " + ID + " returned " + t); + } catch (IllegalArgumentException ex) { + logln("OK: Bogus ID handled properly"); + } + + ms = System.currentTimeMillis() - ms; + logln("Elapsed time: " + ms + " ms"); + } + + public void TestSimpleRules() { + /* Example: rules 1. ab>x|y + * 2. 
yc>z + * + * []|eabcd start - no match, copy e to tranlated buffer + * [e]|abcd match rule 1 - copy output & adjust cursor + * [ex|y]cd match rule 2 - copy output & adjust cursor + * [exz]|d no match, copy d to transliterated buffer + * [exzd]| done + */ + expect("ab>x|y\n" + + "yc>z", + "eabcd", "exzd"); + + /* Another set of rules: + * 1. ab>x|yzacw + * 2. za>q + * 3. qc>r + * 4. cw>n + * + * []|ab Rule 1 + * [x|yzacw] No match + * [xy|zacw] Rule 2 + * [xyq|cw] Rule 4 + * [xyqn]| Done + */ + expect("ab>x|yzacw\n" + + "za>q\n" + + "qc>r\n" + + "cw>n", + "ab", "xyqn"); + + /* Test categories + */ + Transliterator t = new RuleBasedTransliterator("", + "dummy=\uE100\n" + + "vowel=[aeiouAEIOU]\n" + + "lu=[:Lu:]\n" + + "{vowel}[{lu}>!\n" + + "{vowel}>&\n" + + "!]{lu}>^\n" + + "{lu}>*\n" + + "a>ERROR"); + expect(t, "abcdefgABCDEFGU", "&bcd&fg!^**!^*&"); + } + + // Restore this test if/when it's been deciphered. In general, + // tests that depend on a specific tranliterator are subject + // to the same fragility as tests that depend on resource data. + +// public void TestKana() { +// String DATA[] = { +// "a", "\u3042", +// "A", "\u30A2", +// "aA", "\u3042\u30A2", +// "aaaa", "\u3042\u3042\u3042\u3042", +// "akasata", "\u3042\u304B\u3055\u305F", +// }; +// +// Transliterator t = Transliterator.getInstance("Latin-Kana"); +// Transliterator rt = Transliterator.getInstance("Kana-Latin"); +// for (int i=0; izyx\n" + + "ab>yz\n" + + "bc>zx\n" + + "ca>xy\n" + + "a>x\n" + + "b>y\n" + + "c>z\n" + + + "abc", RULES); + Transliterator rev = new RuleBasedTransliterator("", RULES, + RuleBasedTransliterator.REVERSE, null); + for (int i=0; i", + "psch>Y\n" + +"ps>y\n" + +"ch>x\n" + +"a>A\n"); + String DATA[] = { + // insertion, buffer + "a", "A", + "p", "Ap", + "s", "Aps", + "c", "Apsc", + "a", "AycA", + "psch", "AycAY", + null, "AycAY", // null means finishKeyboardTransliteration + }; + + keyboardAux(t, DATA); + } + + /** + * Basic test of keyboard with cursor. 
+ */ + public void TestKeyboard2() { + Transliterator t = new RuleBasedTransliterator("", + "ych>Y\n" + +"ps>|y\n" + +"ch>x\n" + +"a>A\n"); + String DATA[] = { + // insertion, buffer + "a", "A", + "p", "Ap", + "s", "Ay", + "c", "Ayc", + "a", "AycA", + "p", "AycAp", + "s", "AycAy", + "c", "AycAyc", + "h", "AycAY", + null, "AycAY", // null means finishKeyboardTransliteration + }; + + keyboardAux(t, DATA); + } + + /** + * Test keyboard transliteration with back-replacement. + */ + public void TestKeyboard3() { + // We want th>z but t>y. Furthermore, during keyboard + // transliteration we want t>y then yh>z if t, then h are + // typed. + String RULES = + "t>|y\n" + + "yh>z\n" + + ""; + + String[] DATA = { + // Column 1: characters to add to buffer (as if typed) + // Column 2: expected appearance of buffer after + // keyboard xliteration. + "a", "a", + "b", "ab", + "t", "aby", + "c", "abyc", + "t", "abycy", + "h", "abycz", + null, "abycz", // null means finishKeyboardTransliteration + }; + + Transliterator t = new RuleBasedTransliterator("", RULES); + keyboardAux(t, DATA); + } + + private void keyboardAux(Transliterator t, String[] DATA) { + int[] index = {0, 0, 0}; + ReplaceableString s = new ReplaceableString(); + for (int i=0; i "); + t.keyboardTransliterate(s, index, DATA[i]); + } else { + log = new StringBuffer(s.toString() + " => "); + t.finishKeyboardTransliteration(s, index); + } + String str = s.toString(); + // Show the start index '{' and the cursor '|' + log.append(str.substring(0, index[Transliterator.START])). + append('{'). + append(str.substring(index[Transliterator.START], + index[Transliterator.CURSOR])). + append('|'). 
+ append(str.substring(index[Transliterator.CURSOR])); + if (str.equals(DATA[i+1])) { + logln(log.toString()); + } else { + errln("FAIL: " + log.toString() + ", expected " + DATA[i+1]); + } + } + } + + public void TestArabic() { + String DATA[] = { + "Arabic", "\u062a\u062a\u0645\u062a\u0639\u0020"+ + "\u0627\u0644\u0644\u063a\u0629\u0020"+ + "\u0627\u0644\u0639\u0631\u0628\u0628\u064a\u0629\u0020"+ + "\u0628\u0628\u0646\u0638\u0645\u0020"+ + "\u0643\u062a\u0627\u0628\u0628\u064a\u0629\u0020"+ + "\u062c\u0645\u064a\u0644\u0629", + }; + + Transliterator t = Transliterator.getInstance("Latin-Arabic"); + for (int i=0; i", trans); + + expect(t, "aaaaa", "aaaaa"); + } + + /** + * Compose the hex transliterators forward and reverse. + */ + public void TestCompoundHex() { + Transliterator a = Transliterator.getInstance("Unicode-Hex"); + Transliterator b = Transliterator.getInstance("Hex-Unicode"); + Transliterator[] trans = { a, b }; + Transliterator ab = new CompoundTransliterator("ab", trans); + String s = "abcde"; + expect(ab, s, s); + + trans = new Transliterator[] { b, a }; + Transliterator ba = new CompoundTransliterator("ba", trans); + ReplaceableString str = new ReplaceableString(s); + a.transliterate(str); + expect(ba, str.toString(), str.toString()); + } + + /** + * Do some basic tests of filtering. 
+ */ + public void TestFiltering() { + Transliterator hex = Transliterator.getInstance("Unicode-Hex"); + hex.setFilter(new UnicodeFilter() { + public boolean isIn(char c) { + return c != 'c'; + } + }); + String s = "abcde"; + String out = hex.transliterate(s); + String exp = "\\u0061\\u0062c\\u0064\\u0065"; + if (out.equals(exp)) { + logln("Ok: \"" + exp + "\""); + } else { + logln("FAIL: \"" + out + "\", wanted \"" + exp + "\""); + } + } + + //====================================================================== + // Support methods + //====================================================================== + + void expect(String rules, String source, String expectedResult) { + expect(new RuleBasedTransliterator("", rules), source, expectedResult); + } + + void expect(Transliterator t, String source, String expectedResult, + Transliterator reverseTransliterator) { + expect(t, source, expectedResult); + if (reverseTransliterator != null) { + expect(reverseTransliterator, expectedResult, source); + } + } + + void expect(Transliterator t, String source, String expectedResult) { + String result = t.transliterate(source); + expectAux(t.getID() + ":String", source, result, expectedResult); + + ReplaceableString rsource = new ReplaceableString(source); + t.transliterate(rsource); + result = rsource.toString(); + expectAux(t.getID() + ":Replaceable", source, result, expectedResult); + + // Test keyboard (incremental) transliteration -- this result + // must be the same after we finalize (see below). + rsource.getStringBuffer().setLength(0); + int[] index = { 0, 0, 0 }; + StringBuffer log = new StringBuffer(); + + for (int i=0; i "); + t.keyboardTransliterate(rsource, index, + String.valueOf(source.charAt(i))); + // Append the string buffer with a vertical bar '|' where + // the committed index is. + String s = rsource.toString(); + log.append(s.substring(0, index[Transliterator.CURSOR])). + append('|'). 
+ append(s.substring(index[Transliterator.CURSOR])); + } + + // As a final step in keyboard transliteration, we must call + // transliterate to finish off any pending partial matches that + // were waiting for more input. + t.finishKeyboardTransliteration(rsource, index); + result = rsource.toString(); + log.append(" => ").append(rsource.toString()); + + expectAux(t.getID() + ":Keyboard", log.toString(), + result.equals(expectedResult), + expectedResult); + } + + void expectAux(String tag, String source, + String result, String expectedResult) { + expectAux(tag, source + " -> " + result, + result.equals(expectedResult), + expectedResult); + } + + void expectAux(String tag, String summary, boolean pass, + String expectedResult) { + if (pass) { + logln("("+tag+") " + escape(summary)); + } else { + errln("FAIL: ("+tag+") " + + escape(summary) + + ", expected " + escape(expectedResult)); + } + } + + /** + * Escape non-ASCII characters as Unicode. + */ + public static final String escape(String s) { + StringBuffer buf = new StringBuffer(); + for (int i=0; i= ' ' && c <= 0x007F) { + buf.append(c); + } else { + buf.append("\\u"); + if (c < 0x1000) { + buf.append('0'); + if (c < 0x100) { + buf.append('0'); + if (c < 0x10) { + buf.append('0'); + } + } + } + buf.append(Integer.toHexString(c)); + } + } + return buf.toString(); + } + + /* + static final String KANA_RT_DATA = +"a "+ + +"ba bi bu be bo "+ +"bya byi byu bye byo "+ +"bba "+ + +"da di du de do "+ +"dya dyi dyu dye dyo "+ +"dha dhi dhu dhe dho "+ +"dda "+ + +"e "+ + +"fa fi fe fo "+ +"fya fyu fyo "+ +"ffa "+ + +"ga gi gu ge go "+ +"gya gyi gyu gye gyo "+ +"gwa gwi gwu gwe gwo "+ +"gga "+ + +"ha hi hu he ho "+ +"hya hyi hyu hye hyo "+ +"hha "+ + +"i "+ + +"ka ki ku ke ko "+ +"kwa kwi kwu kwe kwo "+ +"kya kyi kyu kye kyo "+ +"kka "+ + +"ma mi mu me mo "+ +"mya myi myu mye myo "+ +"mba mfa mma mpa mva "+ +"m'' "+ + +"na ni nu ne no "+ +"nya nyi nyu nye nyo "+ +"nn n'' n "+ + +"o "+ + +"pa pi pu pe po "+ +"pya pyi pyu 
pye pyo "+ +"ppa "+ + +"qa qi qu qe qo "+ +"qya qyi qyu qye qyo "+ +"qqa "+ + +"ra ri ru re ro "+ +"rya ryi ryu rye ryo "+ +"rra "+ + +"sa si su se so "+ +"sya syi syu sye syo "+ +"ssya ssa "+ + +"ta ti tu te to "+ +"tha thi thu the tho "+ +"tsa tsi tse tso "+ +"tya tyi tyu tye tyo "+ +"ttsa "+ +"tta "+ + +"u "+ + +"va vi vu ve vo "+ +"vya vyi vyu vye vyo "+ +"vva "+ + +"wa wi we wo "+ +"wwa "+ + +"ya yu ye yo "+ +"yya "+ + +"za zi zu ze zo "+ +"zya zyi zyu zye zyo "+ +"zza "+ + +"xa xi xu xe xo "+ +"xka xke "+ +"xtu "+ +"xwa "+ +"xya xyu xyo "+ + + "akka akki akku akke akko "+ + "akkya akkyu akkyo "+ + + "atta atti attu atte atto "+ + "attya attyu attyo "+ + "adda addi addu adde addo "+ + + "atcha atchi atchu atche atcho "+ + + "assa assi assu asse asso "+ + "assya assyu assyo "+ + + "ahha ahhi ahhu ahhe ahho "+ + "appa appi appu appe appo "+ + + "an "+ + "ana ani anu ane ano "+ + "anna anni annu anne anno "+ + "an'a an'i an'u an'e an'o "+ + + "annna annni annnu annne annno "+ + "an'na an'ni an'nu an'ne an'no "+ + + "anka anki anku anke anko "+ + "anga angi angu ange ango "+ + + "ansa ansi ansu anse anso "+ + "anza anzi anzu anze anzo "+ + "anzya anzyu anzyo "+ + + "anta anti antu ante anto "+ + "antya antyu antyo "+ + "anda andi andu ande ando "+ + + "ancha anchi anchu anche ancho "+ + "anja anji anju anje anjo "+ + "antsa antsu antso "+ + + "anpa anpi anpu anpe anpo "+ + "ampa ampi ampu ampe ampo "+ + + "anba anbi anbu anbe anbo "+ + "amba ambi ambu ambe ambo "+ + + "anma anmi anmu anme anmo "+ + "amma ammi ammu amme ammo "+ + + "anwa anwi anwu anwe anwo "+ + + "anha anhi anhu anhe anho "+ + + "anya anyi anyu anye anyo "+ + "annya annyi annyu annye annyo "+ + "an'ya an'yi an'yu an'ye an'yo "+ + + "kkk "+ + "ggg "+ + "sss "+ + "zzz "+ + "ttt "+ + "ddd "+ + "nnn "+ + "hhh "+ + "bbb "+ + "ppp "+ + "mmm "+ + "yyy "+ + "rrr "+ + "www "; +*/ + + /*+ + + "A I U E O "+ + "XA XI XU XE XO "+ + + "KA KI KU KE KO "+ + "KYA KYI KYU KYE KYO "+ + "KWA KWI KWU KWE KWO "+ + "QA 
QI QU QE QO "+ + "QYA QYI QYU QYE QYO "+ + "XKA XKE "+ + + "GA GI GU GE GO "+ + "GYA GYI GYU GYE GYO "+ + "GWA GWI GWU GWE GWO "+ + + "SA SI SU SE SO "+ + "SHA SHI SHU SHE SHO "+ + "SYA SYI SYU SYE SYO "+ + + "ZA ZI ZU ZE ZO "+ + "ZYA ZYI ZYU ZYE ZYO "+ + "JA JI JU JE JO "+ + "JYA JYU JYO "+ + + "TA TI TU TE TO "+ + "XTU XTSU "+ + "TYA TYU TYO "+ + "CYA CYU CYO "+ + "CHA CHI CHU CHE CHO "+ + "TSA TSI TSU TSE TSO "+ + "DA DI DU DE DO "+ + "DYA DYU DYO "+ + "THA THI THU THE THO "+ + "DHA DHI DHU DHE DHO "+ + + "NA NI NU NE NO "+ + "NYA NYU NYO "+ + + "HA HI HU HE HO "+ + "HYA HYU HYO "+ + "FA FI FU FE FO "+ + "FYA FYU FYO "+ + "BA BI BU BE BO "+ + "BYA BYU BYO "+ + "PA PI PU PE PO "+ + "PYA PYU PYO "+ + + "MA MI MU ME MO "+ + "MYA MYU MYO "+ + "YA YI YU YE YO "+ + "XYA XYI XYU XYE XYO "+ + + "RA RI RU RE RO "+ + "LA LI LU LE LO "+ + "RYA RYI RYU RYE RYO "+ + "LYA LYI LYU LYE LYO "+ + + "WA WI WU WE WO "+ + "VA VI VU VE VO "+ + "VYA VYU VYO "+ + + "CYA CYI CYU CYE CYO "+ + + "NN "+ + "N' "+ + "N "+ + + "AKKA AKKI AKKU AKKE AKKO "+ + "AKKYA AKKYU AKKYO "+ + + "ATTA ATTI ATTU ATTE ATTO "+ + "ATTYA ATTYU ATTYO "+ + "ADDA ADDI ADDU ADDE ADDO "+ + + "ATCHA ATCHI ATCHU ATCHE ATCHO "+ + + "ASSA ASSI ASSU ASSE ASSO "+ + "ASSYA ASSYU ASSYO "+ + + "AHHA AHHI AHHU AHHE AHHO "+ + "APPA APPI APPU APPE APPO "+ + + "AN "+ + "ANA ANI ANU ANE ANO "+ + "ANNA ANNI ANNU ANNE ANNO "+ + "AN'A AN'I AN'U AN'E AN'O "+ + + "ANNNA ANNNI ANNNU ANNNE ANNNO "+ + "AN'NA AN'NI AN'NU AN'NE AN'NO "+ + + "ANKA ANKI ANKU ANKE ANKO "+ + "ANGA ANGI ANGU ANGE ANGO "+ + + "ANSA ANSI ANSU ANSE ANSO "+ + "ANZA ANZI ANZU ANZE ANZO "+ + "ANZYA ANZYU ANZYO "+ + + "ANTA ANTI ANTU ANTE ANTO "+ + "ANTYA ANTYU ANTYO "+ + "ANDA ANDI ANDU ANDE ANDO "+ + + "ANCHA ANCHI ANCHU ANCHE ANCHO "+ + "ANJA ANJI ANJU ANJE ANJO "+ + "ANTSA ANTSU ANTSO "+ + + "ANPA ANPI ANPU ANPE ANPO "+ + "AMPA AMPI AMPU AMPE AMPO "+ + + "ANBA ANBI ANBU ANBE ANBO "+ + "AMBA AMBI AMBU AMBE AMBO "+ + + "ANMA ANMI ANMU ANME ANMO "+ + "AMMA AMMI AMMU 
AMME AMMO "+ + + "ANWA ANWI ANWU ANWE ANWO "+ + + "ANHA ANHI ANHU ANHE ANHO "+ + + "ANYA ANYI ANYU ANYE ANYO "+ + "ANNYA ANNYI ANNYU ANNYE ANNYO "+ + "AN'YA AN'YI AN'YU AN'YE AN'YO "+ + + "KKK "+ + "GGG "+ + "SSS "+ + "ZZZ "+ + "TTT "+ + "DDD "+ + "NNN "+ + "HHH "+ + "BBB "+ + "PPP "+ + "MMM "+ + "YYY "+ + "RRR "+ + "WWW";*/ +} diff --git a/icu4j/src/com/ibm/test/translit/UnicodeSetTest.java b/icu4j/src/com/ibm/test/translit/UnicodeSetTest.java new file mode 100755 index 00000000000..8417faf4b44 --- /dev/null +++ b/icu4j/src/com/ibm/test/translit/UnicodeSetTest.java @@ -0,0 +1,118 @@ +import com.ibm.text.*; +import java.text.*; +import java.util.*; + +/** + * @test + * @summary General test of UnicodeSet + */ +public class UnicodeSetTest extends IntlTest { + + public static void main(String[] args) throws Exception { + new UnicodeSetTest().run(args); + } + + public void TestPatterns() { + UnicodeSet set = new UnicodeSet(); + expectPattern(set, "[[a-m]&[d-z]&[k-y]]", "km"); + expectPattern(set, "[[a-z]-[m-y]-[d-r]]", "aczz"); + expectPattern(set, "[a\\-z]", "--aazz"); + expectPattern(set, "[-az]", "--aazz"); + expectPattern(set, "[az-]", "--aazz"); + expectPattern(set, "[[[a-z]-[aeiou]i]]", "bdfnptvz"); + + // Throw in a test of complement + set.complement(); + String exp = '\u0000' + "aeeoouu" + (char)('z'+1) + '\uFFFF'; + expectPairs(set, exp); + } + + public void TestAddRemove() { + UnicodeSet set = new UnicodeSet(); + set.add('a', 'z'); + expectPairs(set, "az"); + set.remove('m', 'p'); + expectPairs(set, "alqz"); + set.remove('e', 'g'); + expectPairs(set, "adhlqz"); + set.remove('d', 'i'); + expectPairs(set, "acjlqz"); + set.remove('c', 'r'); + expectPairs(set, "absz"); + set.add('f', 'q'); + expectPairs(set, "abfqsz"); + set.remove('a', 'g'); + expectPairs(set, "hqsz"); + set.remove('a', 'z'); + expectPairs(set, ""); + + // Try removing an entire set from another set + expectPattern(set, "[c-x]", "cx"); + UnicodeSet set2 = new UnicodeSet(); + 
expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz"); + set.removeAll(set2); + expectPairs(set, "deluxx"); + + // Try adding an entire set to another set + expectPattern(set, "[jackiemclean]", "aacceein"); + expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort"); + set.addAll(set2); + expectPairs(set, "aacehort"); + + // Test commutativity + expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort"); + expectPattern(set2, "[jackiemclean]", "aacceein"); + set.addAll(set2); + expectPairs(set, "aacehort"); + } + + void expectPattern(UnicodeSet set, + String pattern, + String expectedPairs) { + set.applyPattern(pattern); + if (!set.getPairs().equals(expectedPairs)) { + errln("FAIL: applyPattern(\"" + pattern + + "\") => pairs \"" + + escape(set.getPairs()) + "\", expected \"" + + escape(expectedPairs) + "\""); + } else { + logln("Ok: applyPattern(\"" + pattern + + "\") => pairs \"" + + escape(set.getPairs()) + "\""); + } + } + + void expectPairs(UnicodeSet set, String expectedPairs) { + if (!set.getPairs().equals(expectedPairs)) { + errln("FAIL: Expected pair list \"" + + escape(expectedPairs) + "\", got \"" + + escape(set.getPairs()) + "\""); + } + } + + /** + * Escape non-ASCII characters as Unicode. 
+ */ + static final String escape(String s) { + StringBuffer buf = new StringBuffer(); + for (int i=0; i= ' ' && c <= 0x007F) { + buf.append(c); + } else { + buf.append("\\u"); + if (c < 0x1000) { + buf.append('0'); + if (c < 0x100) { + buf.append('0'); + if (c < 0x10) { + buf.append('0'); + } + } + } + buf.append(Integer.toHexString(c)); + } + } + return buf.toString(); + } +} diff --git a/icu4j/src/com/ibm/text/CompoundTransliterator.java b/icu4j/src/com/ibm/text/CompoundTransliterator.java new file mode 100755 index 00000000000..c3582237d42 --- /dev/null +++ b/icu4j/src/com/ibm/text/CompoundTransliterator.java @@ -0,0 +1,285 @@ +package com.ibm.text; + +import java.util.Enumeration; +import java.util.Vector; + +/** + * A transliterator that is composed of two or more other + * transliterator objects linked together. For example, if one + * transliterator transliterates from script A to script B, and + * another transliterates from script B to script C, the two may be + * combined to form a new transliterator from A to C. + * + *

Composed transliterators may not behave as expected. For + * example, inverses may not combine to form the identity + * transliterator. See the class documentation for {@link + * Transliterator} for details. + * + *

If a non-null UnicodeFilter is applied to a + * CompoundTransliterator, it has the effect of being + * logically anded with the filter of each transliterator in + * the chain. + * + *

Copyright © IBM Corporation 1999. All rights reserved. + * + * @author Alan Liu + * @version $RCSfile: CompoundTransliterator.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $ + */ +public class CompoundTransliterator extends Transliterator { + + private static final boolean DEBUG = false; + + private Transliterator[] trans; + + private static final String COPYRIGHT = + "\u00A9 IBM Corporation 1999. All rights reserved."; + + /** + * Constructs a new compound transliterator given an array of + * transliterators. The array of transliterators may be of any + * length, including zero or one, however, useful compound + * transliterators have at least two components. + * @param transliterators array of Transliterator + * objects + * @param filter the filter. Any character for which + * filter.isIn() returns false will not be + * altered by this transliterator. If filter is + * null then no filtering is applied. + */ + public CompoundTransliterator(String ID, Transliterator[] transliterators, + UnicodeFilter filter) { + super(ID, filter); + trans = new Transliterator[transliterators.length]; + System.arraycopy(transliterators, 0, trans, 0, trans.length); + } + + /** + * Constructs a new compound transliterator given an array of + * transliterators. The array of transliterators may be of any + * length, including zero or one, however, useful compound + * transliterators have at least two components. + * @param transliterators array of Transliterator + * objects + */ + public CompoundTransliterator(String ID, Transliterator[] transliterators) { + this(ID, transliterators, null); + } + + /** + * Returns the number of transliterators in this chain. + * @return number of transliterators in this chain. + */ + public int getCount() { + return trans.length; + } + + /** + * Returns the transliterator at the given index in this chain. 
+ * @param index index into chain, from 0 to getCount() - 1 + * @return transliterator at the given index + */ + public Transliterator getTransliterator(int index) { + return trans[index]; + } + + /** + * Transliterates a segment of a string. Transliterator API. + * @param text the string to be transliterated + * @param start the beginning index, inclusive; 0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= text.length(). + * @return the new limit index + */ + public int transliterate(Replaceable text, int start, int limit) { + for (int i=0; i abca/u + * S C L S C L gl=f->a + * + * 2. upup, changes "x" to "XX" + * + * 4 7 a 4 7 a + * abca/u => abcAA/u + * S CL S C + * L gl=a->b + * 3. u-h, changes Unicode to hex + * + * 4 7 a 4 7 a d 0 3 + * abcAA/u => abc/u0041/u0041/u + * S C L S C + * L gl=b->15 + * 4. return + * + * 4 7 a d 0 3 + * abc/u0041/u0041/u + * S C L + */ + + /** + * One more wrinkle. If there is a filter F for the compound + * transliterator as a whole, then we need to modify every + * non-null filter f in the chain to be f' = F & f. Then, + * when we're done, we restore the original filters. + * + * A possible future optimization is to change f to f' at + * construction time, but then if anyone else is using the + * transliterators in the chain outside of this context, they + * will get unexpected results. + */ + UnicodeFilter F = getFilter(); + UnicodeFilter[] f = null; + if (F != null) { + f = new UnicodeFilter[trans.length]; + for (int i=0; i \"")); + } + + trans[i].handleKeyboardTransliterate(text, index); + + if (DEBUG) { + System.out.println(escape( + substring(text, index[START], index[CURSOR]) + '|' + + substring(text, index[CURSOR], index[LIMIT]) + + '"')); + } + + // Adjust overall limit for insertions/deletions + globalLimit += index[LIMIT] - limit; + limit = index[CURSOR]; // Move limit to end of committed text + } + // Cursor is good where it is -- where the last + // transliterator left it. 
Limit needs to be put back + // where it was, modulo adjustments for deletions/insertions. + index[LIMIT] = globalLimit; + + } finally { + // Fixup the transliterator filters, if we had to modify them. + if (f != null) { + for (int i=0; ipreceding context. + * @return maximum number of preceding context characters this + * transliterator needs to examine + */ + protected int getMaximumContextLength() { + int max = 0; + for (int i=0; i max) { + max = len; + } + } + return max; + } + + /** + * DEBUG + * Returns a substring of a Replaceable. + */ + private static final String substring(Replaceable str, int start, int limit) { + StringBuffer buf = new StringBuffer(); + while (start < limit) { + buf.append(str.charAt(start++)); + } + return buf.toString(); + } + + /** + * DEBUG + * Escapes non-ASCII characters as Unicode. + */ + private static final String escape(String s) { + StringBuffer buf = new StringBuffer(); + for (int i=0; i= ' ' && c <= 0x007F) { + buf.append(c); + } else { + buf.append("\\u"); + if (c < 0x1000) { + buf.append('0'); + if (c < 0x100) { + buf.append('0'); + if (c < 0x10) { + buf.append('0'); + } + } + } + buf.append(Integer.toHexString(c)); + } + } + return buf.toString(); + } +} diff --git a/icu4j/src/com/ibm/text/HexToUnicodeTransliterator.java b/icu4j/src/com/ibm/text/HexToUnicodeTransliterator.java new file mode 100755 index 00000000000..18673e15fe7 --- /dev/null +++ b/icu4j/src/com/ibm/text/HexToUnicodeTransliterator.java @@ -0,0 +1,130 @@ +package com.ibm.text; +import java.util.*; + +/** + * A transliterator that converts from hexadecimal Unicode + * escape sequences to the characters they represent. For example, "U+0040" + * and '\u0040'. It recognizes the + * prefixes "U+", "u+", "\U", and "\u". Hex values may be + * upper- or lowercase. + * + *

Copyright © IBM Corporation 1999. All rights reserved. + * + * @author Alan Liu + * @version $RCSfile: HexToUnicodeTransliterator.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $ + */ +public class HexToUnicodeTransliterator extends Transliterator { + private static final String COPYRIGHT = + "\u00A9 IBM Corporation 1999. All rights reserved."; + + /** + * Package accessible ID for this transliterator. + */ + static String _ID = "Hex-Unicode"; + + /** + * Constructs a transliterator. + */ + public HexToUnicodeTransliterator() { + super(_ID, null); + } + + /** + * Transliterates a segment of a string. Transliterator API. + * @param text the string to be transliterated + * @param start the beginning index, inclusive; 0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= text.length(). + * @return the new limit index + */ + public int transliterate(Replaceable text, int start, int limit) { + int[] offsets = { start, limit, start }; + handleKeyboardTransliterate(text, offsets); + return offsets[LIMIT]; + } + + /** + * Implements {@link Transliterator#handleKeyboardTransliterate}. + */ + protected void handleKeyboardTransliterate(Replaceable text, + int[] offsets) { + /** + * Performs transliteration changing Unicode hexadecimal + * escapes to characters. For example, "U+0040" -> '@'. A fixed + * set of prefixes is recognized: "\u", "\U", "u+", "U+". 
+ */ + int cursor = offsets[CURSOR]; + int limit = offsets[LIMIT]; + + int maxCursor = limit - 6; + loop: + while (cursor <= maxCursor) { + char c = filteredCharAt(text, cursor + 5); + int digit0 = Character.digit(c, 16); + if (digit0 < 0) { + if (c == '\\') { + cursor += 5; + } else if (c == 'U' || c == 'u' || c == '+') { + cursor += 4; + } else { + cursor += 6; + } + continue; + } + + int u = digit0; + + for (int i=4; i>=2; --i) { + c = filteredCharAt(text, cursor + i); + int digit = Character.digit(c, 16); + if (digit < 0) { + if (c == 'U' || c == 'u' || c == '+') { + cursor += i-1; + } else { + cursor += 6; + } + continue loop; + } + u |= digit << (4 * (5-i)); + } + + c = filteredCharAt(text, cursor); + char d = filteredCharAt(text, cursor + 1); + if (((c == 'U' || c == 'u') && d == '+') + || (c == '\\' && (d == 'U' || d == 'u'))) { + + // At this point, we have a match; replace cursor..cursor+5 + // with u. + text.replace(cursor, cursor+6, String.valueOf((char) u)); + limit -= 5; + maxCursor -= 5; + + ++cursor; + } else { + cursor += 6; + } + } + + offsets[LIMIT] = limit; + offsets[CURSOR] = cursor; + } + + private char filteredCharAt(Replaceable text, int i) { + char c; + UnicodeFilter filter = getFilter(); + return (filter == null) ? text.charAt(i) : + (filter.isIn(c = text.charAt(i)) ? c : '\uFFFF'); + } + + /** + * Return the length of the longest context required by this transliterator. + * This is preceding context. 
+ * @param direction either FORWARD or REVERSE + * @return maximum number of preceding context characters this + * transliterator needs to examine + */ + protected int getMaximumContextLength() { + return 0; + } +} diff --git a/icu4j/src/com/ibm/text/Replaceable.java b/icu4j/src/com/ibm/text/Replaceable.java new file mode 100755 index 00000000000..b4c8519689c --- /dev/null +++ b/icu4j/src/com/ibm/text/Replaceable.java @@ -0,0 +1,77 @@ +package com.ibm.text; + +/** + * Replaceable is an interface that supports the + * operation of replacing a substring with another piece of text. + * Replaceable is needed in order to change a piece of + * text while retaining style attributes. For example, if the string + * "the bold font" has range (4, 8) replaced with "strong", + * then it becomes "the strong font". + * + *

Copyright © IBM Corporation 1999. All rights reserved. + * + * @author Alan Liu + * @version $RCSfile: Replaceable.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $ + */ +public interface Replaceable { + /** + * Return the number of characters in the text. + * @return number of characters in text + */ + int length(); + + /** + * Return the character at the given offset into the text. + * @param offset an integer between 0 and length()-1 + * inclusive + * @return character of text at given offset + */ + char charAt(int offset); + + /** + * Copies characters from this object into the destination + * character array. The first character to be copied is at index + * srcStart; the last character to be copied is at + * index srcLimit-1 (thus the total number of + * characters to be copied is srcLimit-srcStart). The + * characters are copied into the subarray of dst + * starting at index dstStart and ending at index + * dstStart + (srcLimit-srcStart) - 1. + * + * @param srcStart the beginning index to copy, inclusive; 0 + * <= start <= limit. + * @param srcLimit the ending index to copy, exclusive; + * start <= limit <= length(). + * @param dst the destination array. + * @param dstStart the start offset in the destination array. + */ + void getChars(int srcStart, int srcLimit, char dst[], int dstStart); + + /** + * Replace a substring of this object with the given text. + * @param start the beginning index, inclusive; 0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= length(). + * @param text the text to replace characters start + * to limit - 1 + */ + void replace(int start, int limit, String text); + + /** + * Replace a substring of this object with the given text. + * @param start the beginning index, inclusive; 0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= length(). 
+ * @param chars the text to replace characters start + * to limit - 1 + * @param charsStart the beginning index into chars, + * inclusive; 0 <= start <= limit. + * @param charsLen the number of characters of chars. + */ + void replace(int start, int limit, char[] chars, + int charsStart, int charsLen); + // Note: We use length rather than limit to conform to StringBuffer + // and System.arraycopy. +} diff --git a/icu4j/src/com/ibm/text/ReplaceableString.java b/icu4j/src/com/ibm/text/ReplaceableString.java new file mode 100755 index 00000000000..d6a7df06db5 --- /dev/null +++ b/icu4j/src/com/ibm/text/ReplaceableString.java @@ -0,0 +1,159 @@ +package com.ibm.text; + +/** + * ReplaceableString is an adapter class that implements the + * Replaceable API around an ordinary StringBuffer. + * + *

Note: This class does not support attributes and is not + * intended for general use. Most clients will need to implement + * {@link Replaceable} in their text representation class. + * + *

Copyright © IBM Corporation 1999. All rights reserved. + * + * @see Replaceable + * @author Alan Liu + * @version $RCSfile: ReplaceableString.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $ + */ +public class ReplaceableString implements Replaceable { + private StringBuffer buf; + + private static final String COPYRIGHT = + "\u00A9 IBM Corporation 1999. All rights reserved."; + + /** + * Construct a new object with the given initial contents. + * @param str initial contents + */ + public ReplaceableString(String str) { + buf = new StringBuffer(str); + } + + /** + * Construct a new object using buf for internal + * storage. The contents of buf at the time of + * construction are used as the initial contents. Note! + * Modifications to buf will modify this object, and + * vice versa. + * @param buf object to be used as internal storage + */ + public ReplaceableString(StringBuffer buf) { + this.buf = buf; + } + + /** + * Construct a new empty object. + */ + public ReplaceableString() { + buf = new StringBuffer(); + } + + /** + * Return the contents of this object as a String. + * @return string contents of this object + */ + public String toString() { + return buf.toString(); + } + + /** + * Return the internal storage of this object. Note! Any + * changes made to the returned object affect this object's + * contents, and vice versa. + * @return internal buffer used by this object + */ + public StringBuffer getStringBuffer() { + return buf; + } + + /** + * Return the number of characters contained in this object. + * Replaceable API. + */ + public int length() { + return buf.length(); + } + + /** + * Return the character at the given position in this object. + * Replaceable API. + * @param offset offset into the contents, from 0 to + * length() - 1 + */ + public char charAt(int offset) { + return buf.charAt(offset); + } + + /** + * Copies characters from this object into the destination + * character array. 
The first character to be copied is at index + * srcStart; the last character to be copied is at + * index srcLimit-1 (thus the total number of + * characters to be copied is srcLimit-srcStart). The + * characters are copied into the subarray of dst + * starting at index dstStart and ending at index + * dstStart + (srcLimit-srcStart) - 1. + * + * @param srcStart the beginning index to copy, inclusive; 0 + * <= start <= limit. + * @param srcLimit the ending index to copy, exclusive; + * start <= limit <= length(). + * @param dst the destination array. + * @param dstStart the start offset in the destination array. + */ + public void getChars(int srcStart, int srcLimit, char dst[], int dstStart) { + buf.getChars(srcStart, srcLimit, dst, dstStart); + } + + /** + * Replace zero or more characters with new characters. + * Replaceable API. + * @param start the beginning index, inclusive; 0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= length(). + * @param text new text to replace characters start to + * limit - 1 + */ + public void replace(int start, int limit, String text) { + if (start == limit) { + buf.insert(start, text); + } else { + char[] tail = null; + if (limit < buf.length()) { + tail = new char[buf.length() - limit]; + buf.getChars(limit, buf.length(), tail, 0); + } + buf.setLength(start); + buf.append(text); + if (tail != null) { + buf.append(tail); + } + } + } + + /** + * Replace a substring of this object with the given text. + * @param start the beginning index, inclusive; 0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= length(). + * @param chars the text to replace characters start + * to limit - 1 + * @param charsStart the beginning index into chars, + * inclusive; 0 <= start <= limit. + * @param charsLen the number of characters of chars. 
+ */ + public void replace(int start, int limit, char[] chars, + int charsStart, int charsLen) { + char[] tail = null; + if (limit < buf.length()) { + tail = new char[buf.length() - limit]; + buf.getChars(limit, buf.length(), tail, 0); + } + buf.setLength(start); + buf.append(chars, charsStart, charsLen); + if (tail != null) { + buf.append(tail); + } + } +} diff --git a/icu4j/src/com/ibm/text/RuleBasedTransliterator.java b/icu4j/src/com/ibm/text/RuleBasedTransliterator.java new file mode 100755 index 00000000000..4a433e9479d --- /dev/null +++ b/icu4j/src/com/ibm/text/RuleBasedTransliterator.java @@ -0,0 +1,1187 @@ +package com.ibm.text; + +import java.util.Hashtable; +import java.util.Vector; + +/** + * A transliterator that reads a set of rules in order to determine how to + * perform translations. Rules are stored in resource bundles indexed by name. + * Rules are separated by newline characters ('\n'); to include a literal + * newline, prefix it with a backslash ('\\\n'). Whitespace is significant. If + * the first character on a line is '#', the entire line is ignored as a + * comment. + * + *

Each set of rules consists of two groups, one forward, and one reverse. + * This is a convention that is not enforced; rules for one direction may be + * omitted, with the result that translations in that direction will not modify + * the source text. + * + *

Rule syntax + * + *

Rule statements take one of the following forms: + *

+ *
alefmadda=\u0622
+ * + *
Variable definition. The name on the left is + * assigned the character or expression on the right. Names may not + * contain any special characters (see list below). Duplicate names + * (including duplicates of simple variables or category names) + * cause an exception to be thrown. If the right hand side consists + * of one character, then the variable stands for that character. + * In this example, after this statement, instances of the left hand + * name surrounded by braces, "{alefmadda}", + * will be replaced by the Unicode character U+0622.
If the + * right hand side is longer than one character, then it is + * interpreted as a character category expression; see below for + * details. + * + *
softvowel=[eiyEIY]
+ * + *
Category definition. The name on the left is assigned + * to stand for a set of characters. The same rules for names of simple + * variables apply. After this statement, the left hand variable will be + * interpreted as indicating a set of characters in appropriate contexts. The + * pattern syntax defining sets of characters is defined by {@link UnicodeSet}. + * Examples of valid patterns are: + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
[abc] -- The set containing the characters 'a', 'b', and 'c'.
[^abc] -- The set of all characters except 'a', 'b', and 'c'.
[A-Z] -- The set of all characters from 'A' to 'Z' in Unicode order.
[:Lu:] -- The set of Unicode uppercase letters. See + * www.unicode.org + * for a complete list of categories and their two-letter codes.
[^a-z[:Lu:][:Ll:]] -- The set of all characters except 'a' through 'z' and + * uppercase or lowercase letters.
+ * + * See {@link UnicodeSet} for more documentation and examples. + *
+ * + *
ai>{alefmadda}
+ * + *
Forward translation rule. This rule states that the + * string on the left will be changed to the string on the right when + * performing forward transliteration.
+ * + *
ai<{alefmadda}
+ * + *
Reverse translation rule. This rule states that the + * string on the right will be changed to the string on the left when + * performing reverse transliteration.
+ * + *
+ * + *

Forward and reverse translation rules consist of a match + * pattern and an output string. The match pattern consists + * of literal characters, optionally preceded by context, and optionally + * followed by context. Context characters, like literal pattern characters, + * must be matched in the text being transliterated. However, unlike literal + * pattern characters, they are not replaced by the output text. For example, + * the pattern "[abc]def" indicates the characters + * "def" must be preceded by "abc" for a successful + * match. If there is a successful match, "def" will be replaced, + * but not "abc". The initial '[' is optional, so + * "abc]def" is equivalent to "[abc]def". Another + * example is "123[456]" (or "123[456") in which the + * literal pattern "123" must be followed by "456". + * + *

The output string of a forward or reverse rule consists of characters to + * replace the literal pattern characters. If the output string contains the + * character '|', this is taken to indicate the location of the + * cursor after replacement. The cursor is the point in the text + * at which the next replacement, if any, will be applied. + * + *

Example + * + *

The following example rules illustrate many of the features of the rule + * language. + * + * + * + * + * + * + * + *
Rule 1. abc]def>x|y
Rule 2. xyz>r
Rule 3. yz>q
+ * + *

Applying these rules to the string "adefabcdefz" yields the + * following results: + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
|adefabcdefz -- Initial state, no rules match. Advance cursor.
a|defabcdefz -- Still no match. Rule 1 does not match because the preceding + * context is not present.
ad|efabcdefz -- Still no match. Keep advancing until there is a match...
ade|fabcdefz -- ...
adef|abcdefz -- ...
adefa|bcdefz -- ...
adefab|cdefz -- ...
adefabc|defz -- Rule 1 matches; replace "def" with "xy" + * and back up the cursor to before the 'y'.
adefabcx|yz -- Although "xyz" is present, rule 2 does not match + * because the cursor is before the 'y', not before the + * 'x'. Rule 3 does match. Replace "yz" with + * "q".
adefabcxq| -- The cursor is at the end; transliteration is complete.
+ * + *

The order of rules is significant. If multiple rules may match at some + * point, the first matching rule is applied. + * + *

Forward and reverse rules may have an empty output string. Otherwise, an + * empty left or right hand side of any statement is a syntax error. + * + *

Single quotes are used to quote the special characters + * =><{}[]|. To specify a single quote itself, inside or + * outside of quotes, use two single quotes in a row. For example, the rule + * "'>'>o''clock" changes the string ">" to + * the string "o'clock". + * + *

Notes + * + *

While a RuleBasedTransliterator is being built, it checks that the rules + * are added in proper order. For example, if the rule "a>x" is followed by the + * rule "ab>y", then the second rule will throw an exception. The reason is + * that the second rule can never be triggered, since the first rule always + * matches anything it matches. In other words, the first rule masks + * the second rule. There is a cost of O(n^2) to make this check; in real-world + * tests it appears to approximately double build time. + * + *

One optimization that can be made is to add a pragma to the rule language, + * "#pragma order", that turns off ordering checking. This pragma can then be + * added to all of our resource-based rules (after we build these once and + * determine that there are no ordering errors). I haven't made this change yet + * in the interests of keeping the code from getting too byzantine. + * + *

Copyright © IBM Corporation 1999. All rights reserved. + * + * @author Alan Liu + * @version $RCSfile: RuleBasedTransliterator.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $ + */ +public class RuleBasedTransliterator extends Transliterator { + /** + * Direction constant passed to constructor to create a transliterator + * using the forward rules. + */ + public static final int FORWARD = 0; + + /** + * Direction constant passed to constructor to create a transliterator + * using the reverse rules. + */ + public static final int REVERSE = 1; + + private Data data; + + static final boolean DEBUG = false; + + private static final String COPYRIGHT = + "\u00A9 IBM Corporation 1999. All rights reserved."; + + /** + * Constructs a new transliterator from the given rules. + * @param rules rules, separated by '\n' + * @param direction either FORWARD or REVERSE. + * @exception IllegalArgumentException if rules are malformed + * or direction is invalid. + */ + public RuleBasedTransliterator(String ID, String rules, int direction, + UnicodeFilter filter) { + super(ID, filter); + if (direction != FORWARD && direction != REVERSE) { + throw new IllegalArgumentException("Invalid direction"); + } + data = parse(rules, direction); + } + + /** + * Constructs a new transliterator from the given rules in the + * FORWARD direction. + * @param rules rules, separated by '\n' + * @exception IllegalArgumentException if rules are malformed + * or direction is invalid. + */ + public RuleBasedTransliterator(String ID, String rules) { + this(ID, rules, FORWARD, null); + } + + RuleBasedTransliterator(String ID, Data data, UnicodeFilter filter) { + super(ID, filter); + this.data = data; + } + + static Data parse(String rules, int direction) { + return new Parser(rules, direction).getData(); + } + + /** + * Transliterates a segment of a string. Transliterator API. + * @param text the string to be transliterated + * @param start the beginning index, inclusive; 0 <= start + * <= limit. 
+ * @param limit the ending index, exclusive; start <= limit + * <= text.length(). + * @param result buffer to receive the transliterated text; previous + * contents are discarded + */ + public void transliterate(String text, int start, int limit, + StringBuffer result) { + /* In the following loop there is a virtual buffer consisting of the + * text transliterated so far followed by the untransliterated text. There is + * also a cursor, which may be in the already transliterated buffer or just + * before the untransliterated text. + * + * Example: rules 1. ab>x|y + * 2. yc>z + * + * []|eabcd start - no match, copy e to tranlated buffer + * [e]|abcd match rule 1 - copy output & adjust cursor + * [ex|y]cd match rule 2 - copy output & adjust cursor + * [exz]|d no match, copy d to transliterated buffer + * [exzd]| done + * + * cursor: an index into the virtual buffer, 0..result.length()-1. + * Matches take place at the cursor. If there is no match, the cursor + * is advanced, and one character is moved from the source text to the + * result buffer. + * + * start, limit: these designate the substring of the source text which + * has not been processed yet. The range of offsets is start..limit-1. + * At any moment the virtual buffer consists of result + + * text.substring(start, limit). + */ + int cursor = 0; + result.setLength(0); + while (start < limit || cursor < result.length()) { + TransliterationRule r = data.ruleSet.findMatch(text, start, limit, result, + cursor, data.setVariables, getFilter()); + if (DEBUG) { + StringBuffer buf = new StringBuffer( + result.toString() + '#' + text.substring(start, limit)); + buf.insert(cursor <= result.length() + ? cursor : (cursor + 1), + '|'); + System.err.print((r == null ? 
"nomatch:" : ("match:" + r + ", ")) + + buf); + } + + if (r == null) { + if (cursor == result.length()) { + result.append(text.charAt(start++)); + } + ++cursor; + } else { + // resultPad is length of result to right of cursor; >= 0 + int resultPad = result.length() - cursor; + char[] tail = null; + if (r.getKeyLength() > resultPad) { + start += r.getKeyLength() - resultPad; + } else if (r.getKeyLength() < resultPad) { + tail = new char[resultPad - r.getKeyLength()]; + result.getChars(cursor + r.getKeyLength(), result.length(), + tail, 0); + } + result.setLength(cursor); + result.append(r.getOutput()); + if (tail != null) { + result.append(tail); + } + cursor += r.getCursorPos(); + } + + if (DEBUG) { + StringBuffer buf = new StringBuffer( + result.toString() + '#' + text.substring(start, limit)); + buf.insert(cursor <= result.length() + ? cursor : (cursor + 1), + '|'); + System.err.println(" => " + buf); + } + } + } + + /** + * Transliterates a segment of a string. Transliterator API. + * @param text the string to be transliterated + * @param start the beginning index, inclusive; 0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= text.length(). + * @return The new limit index + */ + public int transliterate(Replaceable text, int start, int limit) { + /* When using Replaceable, the algorithm is simpler, since we don't have + * two separate buffers. We keep start and limit fixed the entire time, + * relative to the text -- limit may move numerically if text is + * inserted or removed. The cursor moves from start to limit, with + * replacements happening under it. + * + * Example: rules 1. ab>x|y + * 2. 
yc>z + * + * |eabcd start - no match, advance cursor + * e|abcd match rule 1 - change text & adjust cursor + * ex|ycd match rule 2 - change text & adjust cursor + * exz|d no match, advance cursor + * exzd| done + */ + int cursor = start; + while (cursor < limit) { + TransliterationRule r = data.ruleSet.findMatch(text, start, limit, + cursor, data.setVariables, getFilter()); + if (r == null) { + ++cursor; + } else { + text.replace(cursor, cursor + r.getKeyLength(), r.getOutput()); + limit += r.getOutput().length() - r.getKeyLength(); + cursor += r.getCursorPos(); + } + } + return limit; + } + + /** + * Implements {@link Transliterator#handleKeyboardTransliterate}. + */ + protected void handleKeyboardTransliterate(Replaceable text, + int[] index) { + int start = index[START]; + int limit = index[LIMIT]; + int cursor = index[CURSOR]; + + if (DEBUG) { + System.out.print("\"" + + escape(rsubstring(text, start, cursor)) + '|' + + escape(rsubstring(text, cursor, limit)) + "\""); + } + + boolean partial[] = new boolean[1]; + + while (cursor < limit) { + TransliterationRule r = data.ruleSet.findIncrementalMatch( + text, start, limit, cursor, data.setVariables, partial, getFilter()); + /* If we match a rule then apply it by replacing the key + * with the rule output and repositioning the cursor + * appropriately. If we get a partial match, then we + * can't do anything without more text; return with the + * cursor at the current position. If we get null, then + * there is no match at this position, and we can advance + * the cursor. 
+ */ + if (r == null) { + if (partial[0]) { + break; + } else { + ++cursor; + } + } else { + text.replace(cursor, cursor + r.getKeyLength(), r.getOutput()); + limit += r.getOutput().length() - r.getKeyLength(); + cursor += r.getCursorPos(); + } + } + + if (DEBUG) { + System.out.println(" -> \"" + + escape(rsubstring(text, start, cursor)) + '|' + + escape(rsubstring(text, cursor, cursor)) + '|' + + escape(rsubstring(text, cursor, limit)) + "\""); + } + + index[LIMIT] = limit; + index[CURSOR] = cursor; + } + + /** + * Returns the length of the longest context required by this transliterator. + * This is preceding context. + * @return Maximum number of preceding context characters this + * transliterator needs to examine + */ + protected int getMaximumContextLength() { + return data.ruleSet.getMaximumContextLength(); + } + + + /** + * FOR DEBUGGING: Return a substring of a Replaceable. + */ + private static String rsubstring(Replaceable r, int start, int limit) { + StringBuffer buf = new StringBuffer(); + while (start < limit) { + buf.append(r.charAt(start++)); + } + return buf.toString(); + } + + /** + * FOR DEBUGGING: Escape non-ASCII characters as Unicode. + */ + private static final String escape(String s) { + StringBuffer buf = new StringBuffer(); + for (int i=0; i= ' ' && c <= 0x007F) { + if (c == '\\') { + buf.append("\\\\"); // That is, "\\" + } else { + buf.append(c); + } + } else { + buf.append("\\u"); + if (c < 0x1000) { + buf.append('0'); + if (c < 0x100) { + buf.append('0'); + if (c < 0x10) { + buf.append('0'); + } + } + } + buf.append(Integer.toHexString(c)); + } + } + return buf.toString(); + } + + + + + + static class Data { + public Data() { + variableNames = new Hashtable(); + setVariables = new Hashtable(); + ruleSet = new TransliterationRuleSet(); + } + + /** + * Rule table. May be empty. + */ + public TransliterationRuleSet ruleSet; + + /** + * Map variable name (String) to variable (Character). 
A variable + * name may correspond to a single literal character, in which + * case the character is stored in this hash. It may also + * correspond to a UnicodeSet, in which case a character is + * again stored in this hash, but the character is a stand-in: it + * is a key for a secondary lookup in data.setVariables. The stand-in + * also represents the UnicodeSet in the stored rules. + */ + public Hashtable variableNames; + + /** + * Map category variable (Character) to set (UnicodeSet). + * Variables that correspond to a set of characters are mapped + * from variable name to a stand-in character in data.variableNames. + * The stand-in then serves as a key in this hash to lookup the + * actual UnicodeSet object. In addition, the stand-in is + * stored in the rule text to represent the set of characters. + */ + public Hashtable setVariables; + } + + + + + + + private static class Parser { + private String rules; + + private int direction; + + private Data data; + + /** + * The next available stand-in for variables. This starts at some point in + * the private use area (discovered dynamically) and increments up toward + * variableLimit. At any point during parsing, available + * variables are variableNext..variableLimit-1. + */ + private char variableNext; + + /** + * The last available stand-in for variables. This is discovered + * dynamically. At any point during parsing, available variables are + * variableNext..variableLimit-1. 
/**
 * Constructs a parser and immediately parses the given rules into a
 * fresh Data object, which is then available from getData().
 *
 * @param rules list of rules, separated by newline characters
 * @param direction the direction to build rules for; it is compared
 * against FORWARD and REVERSE when each rule is applied, and rules
 * written for the other direction are ignored
 * @exception IllegalArgumentException if there is a syntax error in the
 * rules
 */
public Parser(String rules, int direction) {
    this.rules = rules;
    this.direction = direction;
    // data must exist before parseRules() runs, since parsing stores
    // its results there.
    data = new Data();
    parseRules();
}
+ * @exception IllegalArgumentException if there is a syntax error in the + * rules + */ + private void parseRules() { + determineVariableRange(); + + int n = rules.length(); + int i = 0; + while (i0 && rules.charAt(limit-1) == '\\') { + limit = rules.indexOf('\n', limit+1); + } + + if (limit == -1) { + limit = n; + } + // Skip over empty lines and line starting with # + if (limit > i && rules.charAt(i) != RULE_COMMENT_CHAR) { + applyRule(i, limit); + } + i = limit + 1; + } + + data.ruleSet.freeze(); + } + + /** + * Parse the given substring as a rule, and append it to the rules currently + * represented in this object. + * @param start the beginning index, inclusive; 0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= rules.length(). + * @exception IllegalArgumentException if there is a syntax error in the + * rules + */ + private void applyRule(int start, int limit) { + /* General description of parsing: Initially, rules contain two types of + * quoted characters. First, there are variable references, such as + * "{alpha}". Second, there are quotes, such as "'<'" or "''". One of + * the first steps in parsing a rule is to resolve such quoted matter. + * Quotes are removed early, leaving unquoted literal matter. Variable + * references are resolved and replaced by single characters. In some + * instances these characters represent themselves; in others, they + * stand for categories of characters. Character categories are either + * predefined (e.g., "{Lu}"), or are defined by the user using a + * statement (e.g., "vowels:aeiouAEIOU"). + * + * Another early step in parsing is to split each rule into component + * pieces. These pieces are, for every rule, a left-hand side, a right- + * hand side, and an operator. The left- and right-hand sides may not + * be empty, except for the output patterns of forward and reverse + * rules. 
In addition to this partitioning, the match patterns of + * forward and reverse rules must be partitioned into antecontext, + * postcontext, and literal pattern, where the context portions may or + * may not be present. Finally, output patterns must have the cursor + * indicator '|' detected and removed, with its position recorded. + * + * Quote removal, variable resolution, and sub-pattern splitting must + * all happen at once. This is due chiefly to the quoting mechanism, + * which allows special characters to appear at arbitrary positions in + * the final unquoted text. (For this reason, alteration of the rule + * language is somewhat clumsy; it entails reassessment and revision of + * the parsing methods as a whole.) + * + * After this processing of rules is complete, the final end products + * are unquoted pieces of text of various types, and an integer cursor + * position, if one is specified. These processed raw materials are now + * easy to deal with; other classes such as UnicodeSet and + * TransliterationRule need know nothing of quoting or variables. 
+ */ + StringBuffer left = new StringBuffer(); + StringBuffer right = new StringBuffer(); + StringBuffer anteContext = new StringBuffer(); + StringBuffer postContext = new StringBuffer(); + int cursorPos[] = new int[1]; + + char operator = parseRule(start, limit, left, right, + anteContext, postContext, cursorPos); + + switch (operator) { + case VARIABLE_DEF_OP: + applyVariableDef(left.toString(), right.toString()); + break; + case FORWARD_RULE_OP: + if (direction == FORWARD) { + data.ruleSet.addRule(new TransliterationRule( + left.toString(), right.toString(), + anteContext.toString(), postContext.toString(), + cursorPos[0])); + } // otherwise ignore the rule; it's not the direction we want + break; + case REVERSE_RULE_OP: + if (direction == REVERSE) { + data.ruleSet.addRule(new TransliterationRule( + right.toString(), left.toString(), + anteContext.toString(), postContext.toString(), + cursorPos[0])); + } // otherwise ignore the rule; it's not the direction we want + break; + } + } + + /** + * Add a variable definition. + * @param name the name of the variable. It must not already be defined. + * @param pattern the value of the variable. It may be a single character + * or a pattern describing a character set. + * @exception IllegalArgumentException if there is a syntax error + */ + private final void applyVariableDef(String name, String pattern) { + validateVariableName(name); + if (data.variableNames.get(name) != null) { + throw new IllegalArgumentException("Duplicate variable definition: " + + name + '=' + pattern); + } +//! if (UnicodeSet.getCategoryID(name) >= 0) { +//! throw new IllegalArgumentException("Reserved variable name: " +//! + name); +//! 
} + if (pattern.length() < 1) { + throw new IllegalArgumentException("Variable definition missing: " + + name); + } + if (pattern.length() == 1) { + // Got a single character variable definition + data.variableNames.put(name, new Character(pattern.charAt(0))); + } else { + // Got more than one character; parse it as a category + if (variableNext >= variableLimit) { + throw new RuntimeException("Private use variables exhausted"); + } + Character c = new Character(variableNext++); + data.variableNames.put(name, c); + data.setVariables.put(c, new UnicodeSet(pattern)); + } + } + + /** + * Given a rule, parses it into three pieces: The left side, the right side, + * and the operator. Returns the operator. Quotes and variable references + * are resolved; the otuput text in all StringBuffer parameters + * is literal text. This method delegates to other parsing methods to + * handle the match pattern, output pattern, and other sub-patterns in the + * rule. + * @param start the beginning index, inclusive; 0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= rules.length(). + * @param left left side of rule is appended to this buffer + * with the quotes removed and variables resolved + * @param right right side of rule is appended to this buffer + * with the quotes removed and variables resolved + * @param anteContext the preceding context of the match pattern, + * if there is one, is appended to this buffer + * @param postContext the following context of the match pattern, + * if there is one, is appended to this buffer + * @param cursorPos if there is a cursor in the output pattern, its + * offset is stored in cursorPos[0] + * @return The operator character, one of the characters in OPERATORS. 
+ */ + private char parseRule(int start, int limit, + StringBuffer left, StringBuffer right, + StringBuffer anteContext, + StringBuffer postContext, + int[] cursorPos) { + if (false) { + System.err.println("Parsing " + rules.substring(start, limit)); + } + /* Parse the rule into three pieces -- left, operator, and right, + * parsing out quotes. The result is that left and right will have + * unquoted text. E.g., "gt<'>'" will have right = ">". Unquoted + * operators throw an exception. Two quotes inside or outside + * quotes indicates a quote literal. E.g., "o''clock" -> "o'clock". + */ + int i = quotedIndexOf(rules, start, limit, OPERATORS); + if (i < 0) { + throw new IllegalArgumentException( + "Syntax error: " + + rules.substring(start, limit)); + } + char c = rules.charAt(i); + switch (c) { + case FORWARD_RULE_OP: + if (i == start) { + throw new IllegalArgumentException( + "Empty left side: " + + rules.substring(start, limit)); + } + parseMatchPattern(start, i, left, anteContext, postContext); + if (i != (limit-1)) { + parseOutputPattern(i+1, limit, right, cursorPos); + } + break; + case REVERSE_RULE_OP: + if (i == (limit-1)) { + throw new IllegalArgumentException( + "Empty right side: " + + rules.substring(start, limit)); + } + if (i != start) { + parseOutputPattern(start, i, left, cursorPos); + } + parseMatchPattern(i+1, limit, right, anteContext, postContext); + break; + default: + if (i == start || i == (limit-1)) { + throw new IllegalArgumentException( + "Empty left or right side: " + + rules.substring(start, limit)); + } + parseSubPattern(start, i, left); + parseDefPattern(i+1, limit, right); + break; + } + return c; + } + + /** + * Parses the match pattern of a forward or reverse rule. Given the raw + * match pattern, return the match text and the context on both sides, if + * any. Resolves all quotes and variables. + * @param start the beginning index, inclusive; 0 <= start + * <= limit. 
+ * @param limit the ending index, exclusive; start <= limit + * <= rules.length(). + * @param text the key to be matched will be appended to this buffer + * @param anteContext the preceding context, if any, will be appended + * to this buffer. + * @param postContext the following context, if any, will be appended + * to this buffer. + */ + private void parseMatchPattern(int start, int limit, + StringBuffer text, + StringBuffer anteContext, + StringBuffer postContext) { + if (start >= limit) { + throw new IllegalArgumentException( + "Empty expression in rule: " + + rules.substring(start, limit)); + } + if (anteContext != null) { + // Ignore optional opening and closing context characters + if (rules.charAt(start) == CONTEXT_OPEN) { + ++start; + } + if (rules.charAt(limit-1) == CONTEXT_CLOSE) { + --limit; + } + // The four possibilities are: + // key + // anteContext]key + // anteContext]key[postContext + // key[postContext + int ante = quotedIndexOf(rules, start, limit, String.valueOf(CONTEXT_CLOSE)); + int post = quotedIndexOf(rules, start, limit, String.valueOf(CONTEXT_OPEN)); + if (ante >= 0 && post >= 0 && ante > post) { + throw new IllegalArgumentException( + "Syntax error in context specifier: " + + rules.substring(start, limit)); + } + if (ante >= 0) { + parseSubPattern(start, ante, anteContext); + start = ante+1; + } + if (post >= 0) { + parseSubPattern(post+1, limit, postContext); + limit = post; + } + } + parseSubPattern(start, limit, text); + } + + private final void parseSubPattern(int start, int limit, + StringBuffer text) { + parseSubPattern(start, limit, text, null, SPECIALS); + } + + /** + * Parse a variable definition sub pattern. This kind of sub + * pattern differs in the set of characters that are considered + * special. In particular, the '[' and ']' characters are not + * special, since these are used in UnicodeSet patterns. 
/**
 * Parses the output pattern of a forward or reverse rule.  This is a
 * thin wrapper that delegates to the general parseSubPattern() with
 * SPECIALS as the set of characters that must be quoted, and with cursor
 * detection enabled through <code>cursorPos</code>.
 * @param start the beginning index, inclusive; <code>0 &lt;= start
 * &lt;= limit</code>.
 * @param limit the ending index, exclusive; <code>start &lt;= limit
 * &lt;= rules.length()</code>.
 * @param text the output text will be appended to this buffer
 * @param cursorPos passed through to parseSubPattern(); if not null,
 * <code>cursorPos[0]</code> is set to the cursor position, or -1 if
 * there is none.
 */
private final void parseOutputPattern(int start, int limit,
                                      StringBuffer text,
                                      int[] cursorPos) {
    parseSubPattern(start, limit, text, cursorPos, SPECIALS);
}
+ */ + private void parseSubPattern(int start, int limit, + StringBuffer text, + int[] cursorPos, + String specials) { + boolean inQuote = false; + + if (start >= limit) { + throw new IllegalArgumentException("Empty expression in rule"); + } + if (cursorPos != null) { + cursorPos[0] = -1; + } + for (int i=start; i= 0) { + throw new IllegalArgumentException("Multiple cursors: " + + rules.substring(start, limit)); + } + cursorPos[0] = text.length(); + } else if (specials.indexOf(c) >= 0) { + throw new IllegalArgumentException("Unquoted special character: " + + rules.substring(start, limit)); + } else { + text.append(c); + } + } + } + + private static void validateVariableName(String name) { + if (indexOf(name, SPECIALS) >= 0) { + throw new IllegalArgumentException( + "Special character in variable name: " + + name); + } + } + + /** + * Returns the single character value of the given variable name. Defined + * names are recognized. + * + * NO LONGER SUPPORTED: + * If a Unicode category name is given, a standard character variable + * in the range firstCategoryVariable to lastCategoryVariable is returned, + * with value firstCategoryVariable + n, where n is the category + * number. + * @exception IllegalArgumentException if the name is unknown. + */ + private Character getVariableDef(String name) { + Character ch = (Character) data.variableNames.get(name); +//! if (ch == null) { +//! int id = UnicodeSet.getCategoryID(name); +//! if (id >= 0) { +//! ch = new Character((char) (firstCategoryVariable + id)); +//! data.variableNames.put(name, ch); +//! data.setVariables.put(ch, new UnicodeSet(id)); +//! } +//! } + if (ch == null) { + throw new IllegalArgumentException("Undefined variable: " + + name); + } + return ch; + } + + /** + * Determines what part of the private use region of Unicode we can use for + * variable stand-ins. 
The correct way to do this is as follows: Parse each + * rule, and for forward and reverse rules, take the FROM expression, and + * make a hash of all characters used. The TO expression should be ignored. + * When done, everything not in the hash is available for use. In practice, + * this method may employ some other algorithm for improved speed. + */ + private final void determineVariableRange() { + Range r = new Range('\uE000', 0x1900); // Private use area + r = r.largestUnusedSubrange(rules); + + if (r == null) { + throw new RuntimeException( + "No private use characters available for variables"); + } + + variableNext = r.start; + variableLimit = (char) (r.start + r.length); + + if (variableNext >= variableLimit) { + throw new RuntimeException( + "Too few private use characters available for variables"); + } + } + + /** + * Returns the index of the first character in a set, ignoring quoted text. + * For example, in the string "abc'hide'h", the 'h' in "hide" will not be + * found by a search for "h". Unlike String.indexOf(), this method searches + * not for a single character, but for any character of the string + * setOfChars. + * @param text text to be searched + * @param start the beginning index, inclusive; 0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= text.length(). + * @param setOfChars string with one or more distinct characters + * @return Offset of the first character in setOfChars + * found, or -1 if not found. + * @see #indexOf + */ + private static int quotedIndexOf(String text, int start, int limit, + String setOfChars) { + for (int i=start; i= 0) { + return i; + } + } + return -1; + } + + /** + * Returns the index of the first character in a set. Unlike + * String.indexOf(), this method searches not for a single character, but + * for any character of the string setOfChars. + * @param text text to be searched + * @param start the beginning index, inclusive; 0 <= start + * <= limit. 
/**
 * Returns the index of the first character in a set, searching the whole
 * of <code>text</code>.  Unlike String.indexOf(), this method searches
 * not for a single character, but for any character of the string
 * <code>setOfChars</code>.  This is a convenience overload that delegates
 * to the four-argument indexOf() with the range
 * <code>0..text.length()</code>.
 * @param text text to be searched
 * @param setOfChars string with one or more distinct characters
 * @return Offset of the first character in <code>setOfChars</code>
 * found, or -1 if not found.
 * @see #quotedIndexOf
 */
private static int indexOf(String text, String setOfChars) {
    return indexOf(text, 0, text.length(), setOfChars);
}
Make this range one of the new ranges + * (modify it in place) and return the other new range. The character + * itself is not included in either range. If the split results in an + * empty range (that is, if c == start or c == start + length - 1) then + * return null. + */ + Range split(char c) { + if (c == start) { + ++start; + --length; + return null; + } else if (c - start == length - 1) { + --length; + return null; + } else { + ++c; + Range r = new Range(c, start + length - c); + length = --c - start; + return r; + } + } + + /** + * Finds the largest unused subrange by the given string. A + * subrange is unused by a string if the string contains no + * characters in that range. If the given string contains no + * characters in this range, then this range itself is + * returned. + */ + Range largestUnusedSubrange(String str) { + int n = str.length(); + + Vector v = new Vector(1); + v.addElement(clone()); + for (int i=0; i bestRange.length) { + bestRange = r; + } + } + + return bestRange; + } + } + } +} diff --git a/icu4j/src/com/ibm/text/TransliterationRule.java b/icu4j/src/com/ibm/text/TransliterationRule.java new file mode 100755 index 00000000000..383c77ed340 --- /dev/null +++ b/icu4j/src/com/ibm/text/TransliterationRule.java @@ -0,0 +1,530 @@ +package com.ibm.text; + +import java.util.Dictionary; + +/** + * A transliteration rule used by + * RuleBasedTransliterator. + * TransliterationRule is an immutable object. + * + *

A rule consists of an input pattern and an output string. When + * the input pattern is matched, the output string is emitted. The + * input pattern consists of zero or more characters which are matched + * exactly (the key) and optional context. Context must match if it + * is specified. Context may be specified before the key, after the + * key, or both. The key, preceding context, and following context + * may contain variables. Variables represent a set of Unicode + * characters, such as the letters a through z. + * Variables are detected by looking up each character in a supplied + * variable list to see if it has been so defined. + * + *

Copyright © IBM Corporation 1999. All rights reserved. + * + * @author Alan Liu + * @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $ + */ +class TransliterationRule { + /** + * Constant returned by getMatchDegree() indicating a mismatch + * between the text and this rule. One or more characters of the context or + * key do not match the text. + * @see #getMatchDegree + */ + public static final int MISMATCH = 0; + + /** + * Constant returned by getMatchDegree() indicating a partial + * match between the text and this rule. All characters of the text match + * the corresponding context or key, but more characters are required for a + * complete match. There are some key or context characters at the end of + * the pattern that remain unmatched because the text isn't long enough. + * @see #getMatchDegree + */ + public static final int PARTIAL_MATCH = 1; + + /** + * Constant returned by getMatchDegree() indicating a complete + * match between the text and this rule. The text matches all context and + * key characters. + * @see #getMatchDegree + */ + public static final int FULL_MATCH = 2; + + /** + * The string that must be matched. + */ + private String key; + + /** + * The string that is emitted if the key, anteContext, and postContext + * are matched. + */ + private String output; + + /** + * The string that must match before the key. Must not be the empty string. + * May be null; if null, then there is no matching requirement before the + * key. + */ + private String anteContext; + + /** + * The string that must match after the key. Must not be the empty string. + * May be null; if null, then there is no matching requirement after the + * key. + */ + private String postContext; + + /** + * The position of the cursor after emitting the output string, from 0 to + * output.length(). For most rules with no special cursor specification, + * the cursorPos is output.length(). 
/**
 * Construct a new rule with the given key, output text, and other
 * attributes.  Zero, one, or two context strings may be specified.  A
 * cursor position may be specified for the output text.
 * @param key the string to match
 * @param output the string to produce when the key is seen
 * @param anteContext if not null and not empty, then it must be matched
 * before the key; an empty string is stored as null
 * @param postContext if not null and not empty, then it must be matched
 * after the key; an empty string is stored as null
 * @param cursorPos a position for the cursor after the <code>output</code>
 * is emitted.  If less than zero, then the cursor is placed after the
 * <code>output</code>; that is, -1 is equivalent to
 * <code>output.length()</code>.  If greater than
 * <code>output.length()</code> then an exception is thrown.
 * @exception IllegalArgumentException if the cursor position is out of
 * range.
 */
public TransliterationRule(String key, String output,
                           String anteContext, String postContext,
                           int cursorPos) {
    this.key = key;
    this.output = output;

    // Normalize empty context strings to null ("no requirement").
    if (anteContext != null && anteContext.length() > 0) {
        this.anteContext = anteContext;
    } else {
        this.anteContext = null;
    }
    if (postContext != null && postContext.length() > 0) {
        this.postContext = postContext;
    } else {
        this.postContext = null;
    }

    // A negative cursor position means "after the output".
    if (cursorPos < 0) {
        this.cursorPos = output.length();
    } else {
        this.cursorPos = cursorPos;
    }
    if (this.cursorPos > output.length()) {
        throw new IllegalArgumentException("Illegal cursor position");
    }

    /* The mask key (key followed by the following context, when one was
     * supplied) exists purely to make masks() fast while rules are
     * being added to a rule set.  Once the set is built the mask key can
     * be discarded with freeze(); after that masks() must NOT be called.
     */
    if (postContext != null) {
        maskKey = key + postContext;
    } else {
        maskKey = key;
    }
}

This method must not be called after freeze() is called. + */ + public boolean masks(TransliterationRule r2) { + /* There are three cases of masking. In each instance, rule1 + * masks rule2. + * + * 1. KEY mask: len(key1) < len(key2), key2 starts with key1. + * + * 2. PREFIX mask: key1 == key2, len(prefix1) < len(prefix2), + * prefix2 ends with prefix1, suffix2 starts with suffix1. + * + * 3. SUFFIX mask: key1 == key2, len(suffix1) < len(suffix2), + * prefix2 ends with prefix1, suffix2 starts with suffix1. + */ + + /* LIMITATION of the current mask algorithm: Some rule + * maskings are currently not detected. For example, + * "{Lu}]a>x" masks "A]a>y". To detect these sorts of masking, + * we need a subset operator on UnicodeSet objects, which we + * currently do not have. This can be added later. + */ + return ((maskKey.length() < r2.maskKey.length() && + r2.maskKey.startsWith(maskKey)) || + (r2.anteContext != null && maskKey.equals(r2.maskKey) && + ((anteContext == null) || + (anteContext.length() < r2.anteContext.length() && + r2.anteContext.endsWith(anteContext))))); + } + + /** + * Free up space. Once this method is called, masks() must NOT be called. + * If it is called, an exception will be thrown. + */ + public void freeze() { + maskKey = null; + } + + /** + * Return a string representation of this object. + * @return string representation of this object + */ + public String toString() { + return getClass().getName() + '[' + + escape((anteContext != null ? ("[" + anteContext + ']') : "") + + key + + (postContext != null ? ("[" + postContext + ']') : "") + + " -> " + + (cursorPos < output.length() + ? (output.substring(0, cursorPos) + '|' + output.substring(cursorPos)) + : output)) + + ']'; + } + + /** + * Return true if this rule matches the given text. The text being matched + * occupies a virtual buffer consisting of the contents of + * result concatenated to a substring of text. + * The substring is specified by start and limit. 
+ * The value of cursor is an index into this virtual buffer, + * from 0 to the length of the buffer. In terms of the parameters, + * cursor must be between 0 and result.length() + limit - + * start. + * @param text the untranslated text + * @param start the beginning index, inclusive; 0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= text.length(). + * @param result translated text so far + * @param cursor position at which to translate next, an offset into result. + * If greater than or equal to result.length(), represents offset start + + * cursor - result.length() into text. + * @param filter the filter. Any character for which + * filter.isIn() returns false will not be + * altered by this transliterator. If filter is + * null then no filtering is applied. + */ + public boolean matches(String text, int start, int limit, + StringBuffer result, int cursor, + Dictionary variables, + UnicodeFilter filter) { + return + (anteContext == null + || regionMatches(text, start, limit, result, + cursor - anteContext.length(), + anteContext, variables, filter)) && + regionMatches(text, start, limit, result, cursor, + key, variables, filter) && + (postContext == null + || regionMatches(text, start, limit, result, + cursor + key.length(), + postContext, variables, filter)); + } + + /** + * Return true if this rule matches the given text. + * @param text the text, both translated and untranslated + * @param start the beginning index, inclusive; 0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= text.length(). + * @param cursor position at which to translate next, representing offset + * into text. This value must be between start and + * limit. + * @param filter the filter. Any character for which + * filter.isIn() returns false will not be + * altered by this transliterator. If filter is + * null then no filtering is applied. 
+ */ + public boolean matches(Replaceable text, int start, int limit, + int cursor, Dictionary variables, + UnicodeFilter filter) { + return + (anteContext == null + || regionMatches(text, start, limit, cursor - anteContext.length(), + anteContext, variables, filter)) && + regionMatches(text, start, limit, cursor, + key, variables, filter) && + (postContext == null + || regionMatches(text, start, limit, cursor + key.length(), + postContext, variables, filter)); + } + + /** + * Return the degree of match between this rule and the given text. The + * degree of match may be mismatch, a partial match, or a full match. A + * mismatch means at least one character of the text does not match the + * context or key. A partial match means some context and key characters + * match, but the text is not long enough to match all of them. A full + * match means all context and key characters match. + * @param text the text, both translated and untranslated + * @param start the beginning index, inclusive; 0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= text.length(). + * @param cursor position at which to translate next, representing offset + * into text. This value must be between start and + * limit. + * @param filter the filter. Any character for which + * filter.isIn() returns false will not be + * altered by this transliterator. If filter is + * null then no filtering is applied. + * @return one of MISMATCH, PARTIAL_MATCH, or + * FULL_MATCH. 
+ * @see #MISMATCH + * @see #PARTIAL_MATCH + * @see #FULL_MATCH + */ + public int getMatchDegree(Replaceable text, int start, int limit, + int cursor, Dictionary variables, + UnicodeFilter filter) { + if (anteContext != null + && !regionMatches(text, start, limit, cursor - anteContext.length(), + anteContext, variables, filter)) { + return MISMATCH; + } + int len = getRegionMatchLength(text, start, limit, cursor, + key, variables, filter); + if (len < 0) { + return MISMATCH; + } + if (len < key.length()) { + return PARTIAL_MATCH; + } + if (postContext == null) { + return FULL_MATCH; + } + len = getRegionMatchLength(text, start, limit, + cursor + key.length(), + postContext, variables, filter); + return (len < 0) ? MISMATCH + : ((len == postContext.length()) ? FULL_MATCH + : PARTIAL_MATCH); + } + + /** + * Return true if a template matches the text. The entire length of the + * template is compared to the text at the cursor. As in + * matches(), the text being matched occupies a virtual buffer + * consisting of the contents of result concatenated to a + * substring of text. See matches() for details. + * @param text the untranslated text + * @param start the beginning index, inclusive; 0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= text.length(). + * @param result translated text so far + * @param cursor position at which to translate next, an offset into result. + * If greater than or equal to result.length(), represents offset start + + * cursor - result.length() into text. + * @param template the text to match against. All characters must match. + * @param variables a dictionary of variables mapping Character + * to UnicodeSet + * @param filter the filter. Any character for which + * filter.isIn() returns false will not be + * altered by this transliterator. If filter is + * null then no filtering is applied. 
+ * @return true if there is a match + */ + protected static boolean regionMatches(String text, int start, int limit, + StringBuffer result, int cursor, + String template, + Dictionary variables, + UnicodeFilter filter) { + int rlen = result.length(); + if (cursor < 0 + || (cursor + template.length()) > (rlen + limit - start)) { + return false; + } + for (int i=0; i0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= text.length(). + * @param cursor position at which to translate next, representing offset + * into text. This value must be between start and + * limit. + * @param template the text to match against. All characters must match. + * @param variables a dictionary of variables mapping Character + * to UnicodeSet + * @param filter the filter. Any character for which + * filter.isIn() returns false will not be + * altered by this transliterator. If filter is + * null then no filtering is applied. + * @return true if there is a match + */ + protected static boolean regionMatches(Replaceable text, int start, int limit, + int cursor, + String template, Dictionary variables, + UnicodeFilter filter) { + if (cursor < start + || (cursor + template.length()) > limit) { + return false; + } + for (int i=0; i0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= text.length(). + * @param cursor position at which to translate next, representing offset + * into text. This value must be between start and + * limit. + * @param template the text to match against. All characters must match. + * @param variables a dictionary of variables mapping Character + * to UnicodeSet + * @param filter the filter. Any character for which + * filter.isIn() returns false will not be + * altered by this transliterator. If filter is + * null then no filtering is applied. 
+ * @return -1 if there is a mismatch, 0 if the text is not long enough to + * match any characters, otherwise the number of characters of text that + * match this rule. + */ + protected static int getRegionMatchLength(Replaceable text, int start, + int limit, int cursor, + String template, + Dictionary variables, + UnicodeFilter filter) { + if (cursor < start) { + return -1; + } + int i; + for (i=0; iCharacter + * to UnicodeSet + * @param filter the filter. Any character for which + * filter.isIn() returns false will not be + * altered by this transliterator. If filter is + * null then no filtering is applied. + */ + protected static boolean charMatches(char keyChar, char textChar, + Dictionary variables, UnicodeFilter filter) { + UnicodeSet set = null; + return (filter == null || filter.isIn(textChar)) && + ((set = (UnicodeSet) variables.get(new Character(keyChar))) + == null) ? + keyChar == textChar : set.contains(textChar); + } + + /** + * Escape non-ASCII characters as Unicode. + */ + public static final String escape(String s) { + StringBuffer buf = new StringBuffer(); + for (int i=0; i= ' ' && c <= 0x007F) { + buf.append(c); + } else { + buf.append("\\u"); + if (c < 0x1000) { + buf.append('0'); + if (c < 0x100) { + buf.append('0'); + if (c < 0x10) { + buf.append('0'); + } + } + } + buf.append(Integer.toHexString(c)); + } + } + return buf.toString(); + } +} diff --git a/icu4j/src/com/ibm/text/TransliterationRuleSet.java b/icu4j/src/com/ibm/text/TransliterationRuleSet.java new file mode 100755 index 00000000000..d57bf75464a --- /dev/null +++ b/icu4j/src/com/ibm/text/TransliterationRuleSet.java @@ -0,0 +1,218 @@ +package com.ibm.text; + +import java.util.*; + +/** + * A set of rules for a RuleBasedTransliterator. This set encodes + * the transliteration in one direction from one set of characters or short + * strings to another. A RuleBasedTransliterator consists of up to + * two such sets, one for the forward direction, and one for the reverse. + * + *

A TransliterationRuleSet has one important operation, that of + * finding a matching rule at a given point in the text. This is accomplished + * by the findMatch() method. + * + *

Copyright © IBM Corporation 1999. All rights reserved. + * + * @author Alan Liu + * @version $RCSfile: TransliterationRuleSet.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $ + */ +class TransliterationRuleSet { + /* Note: There was an old implementation that indexed by first letter of + * key. Problem with this is that key may not have a meaningful first + * letter; e.g., {Lu}>*. One solution is to keep a separate vector of all + * rules whose initial key letter is a category variable. However, the + * problem is that they must be kept in order with respect to other rules. + * One solution -- add a sequence number to each rule. Do the usual + * first-letter lookup, and also a lookup from the spare bin with rules like + * {Lu}>*. Take the lower sequence number. This seems complex and not + * worth the trouble, but we may revisit this later. For documentation (or + * possible resurrection) the old code is included below, commented out + * with the remark "// OLD INDEXED IMPLEMENTATION". Under the old + * implementation, rules is a Hashtable, not a Vector. + */ + + /** + * Vector of rules, in the order added. + */ + private Vector rules; + + /** + * Length of the longest preceding context + */ + private int maxContextLength; + + private static final String COPYRIGHT = + "\u00A9 IBM Corporation 1999. All rights reserved."; + + /** + * Construct a new empty rule set. + */ + public TransliterationRuleSet() { + rules = new Vector(); + maxContextLength = 0; + } + + /** + * Return the maximum context length. + * @return the length of the longest preceding context. + */ + public int getMaximumContextLength() { + return maxContextLength; + } + + /** + * Add a rule to this set. Rules are added in order, and order is + * significant. + * + *

Once freeze() is called, this method must not be called. + * @param rule the rule to add + */ + public void addRule(TransliterationRule rule) { + + // Build time, no checking : 3562 ms + // Build time, with checking: 6234 ms + + for (int i=0; i maxContextLength) { + maxContextLength = len; + } + } + + /** + * Free up space. Once this method is called, addRule() must NOT + * be called again. + */ + public void freeze() { + for (int i=0; iresult concatenated to a substring of text. + * The substring is specified by start and limit. + * The value of cursor is an index into this virtual buffer, + * from 0 to the length of the buffer. In terms of the parameters, + * cursor must be between 0 and result.length() + limit - + * start. + * @param text the untranslated text + * @param start the beginning index, inclusive; 0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= text.length(). + * @param result translated text so far + * @param cursor position at which to translate next, an offset into result. + * If greater than or equal to result.length(), represents offset start + + * cursor - result.length() into text. + * @param variables a dictionary mapping variables to the sets they + * represent (maps Character to UnicodeSet) + * @param filter the filter. Any character for which + * filter.isIn() returns false will not be + * altered by this transliterator. If filter is + * null then no filtering is applied. + * @return the matching rule, or null if none found. 
+ */ + public TransliterationRule findMatch(String text, int start, int limit, + StringBuffer result, int cursor, + Dictionary variables, + UnicodeFilter filter) { + for (Enumeration e = rules.elements(); e.hasMoreElements(); ) { + TransliterationRule rule = (TransliterationRule) e.nextElement(); + if (rule.matches(text, start, limit, result, cursor, variables, filter)) { + return rule; + } + } + return null; + } + + /** + * Attempt to find a matching rule at the specified point in the text. + * @param text the text, both translated and untranslated + * @param start the beginning index, inclusive; 0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= text.length(). + * @param cursor position at which to translate next, representing offset + * into text. This value must be between start and + * limit. + * @param variables a dictionary mapping variables to the sets they + * represent (maps Character to UnicodeSet) + * @param filter the filter. Any character for which + * filter.isIn() returns false will not be + * altered by this transliterator. If filter is + * null then no filtering is applied. + * @return the matching rule, or null if none found. + */ + public TransliterationRule findMatch(Replaceable text, int start, int limit, + int cursor, + Dictionary variables, + UnicodeFilter filter) { + for (Enumeration e = rules.elements(); e.hasMoreElements(); ) { + TransliterationRule rule = (TransliterationRule) e.nextElement(); + if (rule.matches(text, start, limit, cursor, variables, filter)) { + return rule; + } + } + return null; + } + + /** + * Attempt to find a matching rule at the specified point in the text. + * Unlike findMatch(), this method does an incremental match. + * An incremental match requires that there be no partial matches that might + * pre-empt the full match that is found. If there are partial matches, + * then null is returned. 
A non-null result indicates that a full match has + * been found, and that it cannot be pre-empted by a partial match + * regardless of what additional text is added to the translation buffer. + * @param text the text, both translated and untranslated + * @param start the beginning index, inclusive; 0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= text.length(). + * @param cursor position at which to translate next, representing offset + * into text. This value must be between start and + * limit. + * @param variables a dictionary mapping variables to the sets they + * represent (maps Character to UnicodeSet) + * @param partial output parameter. partial[0] is set to + * true if a partial match is found, in which case null is returned; + * otherwise it is set to false. + * @param filter the filter. Any character for which + * filter.isIn() returns false will not be + * altered by this transliterator. If filter is + * null then no filtering is applied. + * @return the matching rule, or null if none found, or if the text buffer + * does not have enough text yet to unambiguously match a rule. 
+ */ + public TransliterationRule findIncrementalMatch(Replaceable text, int start, + int limit, int cursor, + Dictionary variables, + boolean partial[], + UnicodeFilter filter) { + partial[0] = false; + for (Enumeration e = rules.elements(); e.hasMoreElements(); ) { + TransliterationRule rule = (TransliterationRule) e.nextElement(); + int match = rule.getMatchDegree(text, start, limit, cursor, + variables, filter); + switch (match) { + case TransliterationRule.FULL_MATCH: + return rule; + case TransliterationRule.PARTIAL_MATCH: + partial[0] = true; + return null; + } + } + return null; + } +} diff --git a/icu4j/src/com/ibm/text/Transliterator.java b/icu4j/src/com/ibm/text/Transliterator.java new file mode 100755 index 00000000000..83171a961e7 --- /dev/null +++ b/icu4j/src/com/ibm/text/Transliterator.java @@ -0,0 +1,860 @@ +package com.ibm.text; + +import java.util.*; +import java.text.MessageFormat; + +/** + * Transliterator is an abstract class that + * transliterates text from one format to another. The most common + * kind of transliterator is a script, or alphabet, transliterator. + * For example, a Russian to Latin transliterator changes Russian text + * written in Cyrillic characters to phonetically equivalent Latin + * characters. It does not translate Russian to English! + * Transliteration, unlike translation, operates on characters, without + * reference to the meanings of words and sentences. + * + *

Although script conversion is its most common use, a + * transliterator can actually perform a more general class of tasks. + * In fact, Transliterator defines a very general API + * which specifies only that a segment of the input text is replaced + * by new text. The particulars of this conversion are determined + * entirely by subclasses of Transliterator. + * + *

Transliterators are stateless + * + *

Transliterator objects are stateless; they + * retain no information between calls to + * transliterate(). As a result, threads may share + * transliterators without synchronizing them. This might seem to + * limit the complexity of the transliteration operation. In + * practice, subclasses perform complex transliterations by delaying + * the replacement of text until it is known that no other + * replacements are possible. In other words, although the + * Transliterator objects are stateless, the source text + * itself embodies all the needed information, and delayed operation + * allows arbitrary complexity. + * + *

Batch transliteration + * + *

The simplest way to perform transliteration is all at once, on a + * string of existing text. This is referred to as batch + * transliteration. For example, given a string input + * and a transliterator t, the call + * + *

String result = t.transliterate(input); + *
+ * + * will transliterate it and return the result. Other methods allow + * the client to specify a substring to be transliterated and to use + * {@link Replaceable} objects instead of strings, in order to + * preserve out-of-band information (such as text styles). + * + *

Keyboard transliteration + * + *

Somewhat more involved is keyboard, or incremental + * transliteration. This is the transliteration of text that is + * arriving from some source (typically the user's keyboard) one + * character at a time, or in some other piecemeal fashion. + * + *

In keyboard transliteration, a Replaceable buffer + * stores the text. As text is inserted, as much as possible is + * transliterated on the fly. This means a GUI that displays the + * contents of the buffer may show text being modified as each new + * character arrives. + * + *

Consider the simple RuleBasedTransliterator: + * + *

+ * th>{theta}
+ * t>{tau} + *
+ * + * When the user types 't', nothing will happen, since the + * transliterator is waiting to see if the next character is 'h'. To + * remedy this, we introduce the notion of a cursor, marked by a '|' + * in the output string: + * + *
+ * t>|{tau}
+ * {tau}h>{theta} + *
+ * + * Now when the user types 't', tau appears, and if the next character + * is 'h', the tau changes to a theta. This is accomplished by + * maintaining a cursor position (independent of the insertion point, + * and invisible in the GUI) across calls to + * keyboardTransliterate(). Typically, the cursor will + * be coincident with the insertion point, but in a case like the one + * above, it will precede the insertion point. + * + *

Keyboard transliteration methods maintain a set of three indices + * that are updated with each call to + * keyboardTransliterate(), including the cursor, start, + * and limit. Since these indices are changed by the method, they are + * passed in an int[] array. The START index + * marks the beginning of the substring that the transliterator will + * look at. It is advanced as text becomes committed (but it is not + * the committed index; that's the CURSOR). The + * CURSOR index, described above, marks the point at + * which the transliterator last stopped, either because it reached + * the end, or because it required more characters to disambiguate + * between possible inputs. The CURSOR can also be + * explicitly set by rules in a RuleBasedTransliterator. + * Any characters before the CURSOR index are frozen; + * future keyboard transliteration calls within this input sequence + * will not change them. New text is inserted at the + * LIMIT index, which marks the end of the substring that + * the transliterator looks at. + * + *

Because keyboard transliteration assumes that more characters + * are to arrive, it is conservative in its operation. It only + * transliterates when it can do so unambiguously. Otherwise it waits + * for more characters to arrive. When the client code knows that no + * more characters are forthcoming, perhaps because the user has + * performed some input termination operation, then it should call + * finishKeyboardTransliteration() to complete any + * pending transliterations. + * + *

Inverses + * + *

Pairs of transliterators may be inverses of one another. For + * example, if transliterator A transliterates characters by + * incrementing their Unicode value (so "abc" -> "def"), and + * transliterator B decrements character values, then A + * is an inverse of B and vice versa. If we compose A + * with B in a compound transliterator, the result is the + * identity transliterator, that is, a transliterator that does not + * change its input text. + * + * The Transliterator method getInverse() + * returns a transliterator's inverse, if one exists, or + * null otherwise. However, the result of + * getInverse() usually will not be a true + * mathematical inverse. This is because true inverse transliterators + * are difficult to formulate. For example, consider two + * transliterators: AB, which transliterates the character 'A' + * to 'B', and BA, which transliterates 'B' to 'A'. It might + * seem that these are exact inverses, since + * + *

"A" x AB -> "B"
+ * "B" x BA -> "A"
+ * + * where 'x' represents transliteration. However, + * + *
"ABCD" x AB -> "BBCD"
+ * "BBCD" x BA -> "AACD"
+ * + * so AB composed with BA is not the + * identity. Nonetheless, BA may be usefully considered to be + * AB's inverse, and it is on this basis that + * AB.getInverse() could legitimately return + * BA. + * + *

IDs and display names + * + *

A transliterator is designated by a short identifier string or + * ID. IDs follow the format source-destination, + * where source describes the entity being replaced, and + * destination describes the entity replacing + * source. The entities may be the names of scripts, + * particular sequences of characters, or whatever else it is that the + * transliterator converts to or from. For example, a transliterator + * from Russian to Latin might be named "Russian-Latin". A + * transliterator from keyboard escape sequences to Latin-1 characters + * might be named "KeyboardEscape-Latin1". By convention, system + * entity names are in English, with the initial letters of words + * capitalized; user entity names may follow any format so long as + * they do not contain dashes. + * + *

In addition to programmatic IDs, transliterator objects have + * display names for presentation in user interfaces, returned by + * {@link #getDisplayName}. + * + *

Factory methods and registration + * + *

In general, client code should use the factory method + * getInstance() to obtain an instance of a + * transliterator given its ID. Valid IDs may be enumerated using + * getAvailableIDs(). Since transliterators are + * stateless, multiple calls to getInstance() with the + * same ID will return the same object. + * + *

In addition to the system transliterators registered at startup, + * user transliterators may be registered by calling + * registerInstance() at run time. To register a + * transliterator subclass without instantiating it (until it is + * needed), users may call registerClass(). + * + *

Subclassing + * + *

Subclasses must implement the abstract + * transliterate() method. They should also override the + * transliterate() method taking a String + * and StringBuffer if the performance of these methods + * can be improved over the performance obtained by the default + * implementations in this class. Subclasses must also implement + * handleKeyboardTransliterate(). + * + *

Copyright © IBM Corporation 1999. All rights reserved. + * + * @author Alan Liu + * @version $RCSfile: Transliterator.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $ + */ +public abstract class Transliterator { + /** + * In the keyboardTransliterate() + * index[] array, the beginning index, inclusive + * @see #keyboardTransliterate + */ + public static final int START = 0; + + /** + * In the keyboardTransliterate() + * index[] array, the ending index, exclusive + * @see #keyboardTransliterate + */ + public static final int LIMIT = 1; + + /** + * In the keyboardTransliterate() + * index[] array, the next character to be considered + * for transliteration + * @see #keyboardTransliterate + */ + public static final int CURSOR = 2; + + /** + * Programmatic name, e.g., "Latin-Arabic". + */ + private String ID; + + /** + * This transliterator's filter. Any character for which + * filter.isIn() returns false will not be + * altered by this transliterator. If filter is + * null then no filtering is applied. + */ + private UnicodeFilter filter; + + /** + * Dictionary of known transliterators. Keys are String + * names, values are one of the following: + * + *

  • Transliterator objects + * + *
  • Class objects. Such objects must represent + * subclasses of Transliterator, and must satisfy the + * constraints described in registerClass() + * + *
  • RULE_BASED_PLACEHOLDER, in which case the ID + * will have its first '-' removed and be appended to + * RB_RULE_BASED_PREFIX to form a resource bundle name from which + * the RB_RULE key is looked up to obtain the rule. + * + *
  • REVERSE_RULE_BASED_PLACEHOLDER. Like + * RULE_BASED_PLACEHOLDER, except the entity names in + * the ID are reversed, and the argument + * RuleBasedTransliterator.REVERSE is passed to the + * RuleBasedTransliterator constructor. + *
+ */ + private static Hashtable cache; + + /** + * Internal object used to stand for instances of + * RuleBasedTransliterator that have not been + * constructed yet in the cache. When a + * getInstance() call retrieves this object, it is + * replaced by the actual RuleBasedTransliterator. + * This allows Transliterator to delay instantiation + * of such transliterators until they are needed. + */ + private static final Object RULE_BASED_PLACEHOLDER = new Object(); + + /** + * Internal object used to stand for instances of + * RuleBasedTransliterator that have not been + * constructed yet in the cache. These instances are + * constructed with an argument + * RuleBasedTransliterator.REVERSE. + */ + private static final Object REVERSE_RULE_BASED_PLACEHOLDER = new Object(); + + /** + * Prefix for resource bundle key for the display name for a + * transliterator. The ID is appended to this to form the key. + * The resource bundle value should be a String. + */ + private static final String RB_DISPLAY_NAME_PREFIX = "T:"; + + /** + * Resource bundle key for display name pattern. + * The resource bundle value should be a String forming a + * MessageFormat pattern, e.g.: + * "{0,choice,0#|1#{1} Transliterator|2#{1} to {2} Transliterator}". + */ + private static final String RB_DISPLAY_NAME_PATTERN = "TransliteratorNamePattern"; + + /** + * Resource bundle key for the list of RuleBasedTransliterator IDs. + * The resource bundle value should be a String[] with each element + * being a valid ID. The ID will be appended to RB_RULE_BASED_PREFIX + * to obtain the class name in which the RB_RULE key will be sought. + */ + private static final String RB_RULE_BASED_IDS = "RuleBasedTransliteratorIDs"; + + /** + * Resource bundle containing display name keys and the + * RB_RULE_BASED_IDS array. + * + *

If we ever integrate this with the Sun JDK, the resource bundle + * root will change to java.text.resources.LocaleElements + */ + private static final String RB_LOCALE_ELEMENTS = + "com.ibm.text.resources.LocaleElements"; + + /** + * Prefix for resource bundle containing RuleBasedTransliterator + * RB_RULE string. The ID is munged to remove the first '-' then appended + * to this String to obtain the class name. + */ + private static final String RB_RULE_BASED_PREFIX = + "com.ibm.text.resources.TransliterationRule"; + + /** + * Resource bundle key for the RuleBasedTransliterator rule. + */ + private static final String RB_RULE = "Rule"; + + private static final String COPYRIGHT = + "\u00A9 IBM Corporation 1999. All rights reserved."; + + /** + * Default constructor. + * @param ID the string identifier for this transliterator + * @param filter the filter. Any character for which + * filter.isIn() returns false will not be + * altered by this transliterator. If filter is + * null then no filtering is applied. + * @exception NullPointerException if ID is null + */ + protected Transliterator(String ID, UnicodeFilter filter) { + if (ID == null) { + throw new NullPointerException(); + } + this.ID = ID; + this.filter = filter; + } + + /** + * Transliterates the segment of a string that begins at the + * character at offset start and extends to the + * character at offset limit - 1, with optional + * filtering. A default implementation is provided here; + * subclasses should provide a more efficient implementation if + * possible. + * @param text the string to be transliterated + * @param start the beginning index, inclusive; 0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= text.length(). 
+ * @param result buffer to receive the transliterated text; previous + * contents are discarded + */ + public void transliterate(String text, int start, int limit, + StringBuffer result) { + /* This is a default implementation that should be replaced by + * a more efficient subclass implementation if possible. + * It copies the requested segment into result and then runs the + * Replaceable-based transliterate() over that copy. + * NOTE(review): this relies on ReplaceableString editing the wrapped + * StringBuffer in place -- confirm against ReplaceableString. + */ + result.setLength(0); + result.append(text.substring(start, limit)); + transliterate(new ReplaceableString(result), + 0, result.length()); + } + + /** + * Transliterates a segment of a string, with optional filtering. + * Subclasses must override this abstract method. This method takes no + * filter argument; the filter, if any, is the one supplied to this + * transliterator's constructor. + * + * @param text the string to be transliterated + * @param start the beginning index, inclusive; 0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= text.length(). + * @return The new limit index. The text previously occupying [start, + * limit) has been transliterated, possibly to a string of a different + * length, at [start, new-limit), where + * new-limit is the return value. + */ + public abstract int transliterate(Replaceable text, int start, int limit); + + /** + * Transliterates an entire string. Convenience method. + * @param text the string to be transliterated + * @param result buffer to receive the transliterated text; previous + * contents are discarded + */ + public final void transliterate(String text, StringBuffer result) { + transliterate(text, 0, text.length(), result); + } + + /** + * Transliterates an entire string and returns the result. Convenience method. 
+ * + * @param text the string to be transliterated + * @return The transliterated text + */ + public final String transliterate(String text) { + StringBuffer result = new StringBuffer(); + transliterate(text, 0, text.length(), result); + return result.toString(); + } + + /** + * Transliterates an entire string in place. Convenience method. + * @param text the string to be transliterated + */ + public final void transliterate(Replaceable text) { + transliterate(text, 0, text.length()); + } + + /** + * Transliterates the portion of the text buffer that can be + * transliterated unambiguously after new text has been inserted, + * typically as a result of a keyboard event. The new text in + * insertion will be inserted into text + * at index[LIMIT], advancing + * index[LIMIT] by insertion.length(). + * Then the transliterator will try to transliterate characters of + * text between index[CURSOR] and + * index[LIMIT]. Characters before + * index[CURSOR] will not be changed. + * + *

Upon return, values in index[] will be updated. + * index[START] will be advanced to the first + * character that future calls to this method will read. + * index[CURSOR] and index[LIMIT] will + * be adjusted to delimit the range of text that future calls to + * this method may change. + * + *

Typical usage of this method begins with an initial call + * with index[START] and index[LIMIT] + * set to indicate the portion of text to be + * transliterated, and index[CURSOR] == index[START]. + * Thereafter, index[] can be used without + * modification in future calls, provided that all changes to + * text are made via this method. + * + *

This method assumes that future calls may be made that will + * insert new text into the buffer. As a result, it only performs + * unambiguous transliterations. After the last call to this + * method, there may be untransliterated text that is waiting for + * more input to resolve an ambiguity. In order to perform these + * pending transliterations, clients should call {@link + * #finishKeyboardTransliteration} after the last call to this + * method has been made. + * + * @param text the buffer holding transliterated and untransliterated text + * @param index an array of three integers. + * + *

  • index[START]: the beginning index, + * inclusive; 0 <= index[START] <= index[LIMIT]. + * + *
  • index[LIMIT]: the ending index, exclusive; + * index[START] <= index[LIMIT] <= text.length(). + * insertion is inserted at + * index[LIMIT]. + * + *
  • index[CURSOR]: the next character to be + * considered for transliteration; index[START] <= + * index[CURSOR] <= index[LIMIT]. Characters before + * index[CURSOR] will not be changed by future calls + * to this method.
+ * + * @param insertion text to be inserted and possibly + * transliterated into the translation buffer at + * index[LIMIT]. If null then no text + * is inserted. + * @see #START + * @see #LIMIT + * @see #CURSOR + * @see #handleKeyboardTransliterate + * @exception IllegalArgumentException if index[] + * is invalid + */ + public final void keyboardTransliterate(Replaceable text, int[] index, + String insertion) { + // Reject index arrays that are too short, have a negative START, a + // LIMIT past the end of text, or a CURSOR outside [START, LIMIT]. + if (index.length < 3 || + index[START] < 0 || + index[LIMIT] > text.length() || + index[CURSOR] < index[START] || + index[CURSOR] > index[LIMIT]) { + throw new IllegalArgumentException("Invalid index array"); + } + + // Remember the caller's START; the adjustment below never moves + // START before this value. + int originalStart = index[START]; + if (insertion != null) { + // Splice the new text in at LIMIT and extend LIMIT to cover it. + text.replace(index[LIMIT], index[LIMIT], insertion); + index[LIMIT] += insertion.length(); + } + + handleKeyboardTransliterate(text, index); + + // Keep only getMaximumContextLength() characters before CURSOR in + // view for future calls. + index[START] = Math.max(index[CURSOR] - getMaximumContextLength(), + originalStart); + } + + /** + * Transliterates the portion of the text buffer that can be + * transliterated unambiguously after a new character has been + * inserted, typically as a result of a keyboard event. This is a + * convenience method; see {@link + * #keyboardTransliterate(Replaceable, int[], String)} for details. + * @param text the buffer holding transliterated and + * untransliterated text + * @param index an array of three integers. See {@link + * #keyboardTransliterate(Replaceable, int[], String)}. + * @param insertion text to be inserted and possibly + * transliterated into the translation buffer at + * index[LIMIT]. + * @see #keyboardTransliterate(Replaceable, int[], String) + */ + public final void keyboardTransliterate(Replaceable text, int[] index, + char insertion) { + keyboardTransliterate(text, index, String.valueOf(insertion)); + } + + /** + * Transliterates the portion of the text buffer that can be + * transliterated unambiguously. This is a convenience method; see + * {@link #keyboardTransliterate(Replaceable, int[], String)} for + * details. 
+ * @param text the buffer holding transliterated and + * untransliterated text + * @param index an array of three integers. See {@link + * #keyboardTransliterate(Replaceable, int[], String)}. + * @see #keyboardTransliterate(Replaceable, int[], String) + */ + public final void keyboardTransliterate(Replaceable text, int[] index) { + keyboardTransliterate(text, index, null); + } + + /** + * Finishes any pending transliterations that were waiting for + * more characters. Clients should call this method as the last + * call after a sequence of one or more calls to + * keyboardTransliterate(). + * @param text the buffer holding transliterated and + * untransliterated text. + * @param index the array of indices previously passed to {@link + * #keyboardTransliterate} + */ + public final void finishKeyboardTransliteration(Replaceable text, + int[] index) { + transliterate(text, index[START], index[LIMIT]); + } + + /** + * Abstract method that concrete subclasses define to implement + * keyboard transliteration. This method should transliterate all + * characters between index[CURSOR] and + * index[LIMIT] that can be unambiguously + * transliterated, regardless of future insertions of text at + * index[LIMIT]. index[CURSOR] should + * be advanced past committed characters (those that will not + * change in future calls to this method). + * index[LIMIT] should be updated to reflect text + * replacements that shorten or lengthen the text between + * index[CURSOR] and index[LIMIT]. Upon + * return, neither index[CURSOR] nor + * index[LIMIT] should be less than the initial value + * of index[CURSOR]. index[START] + * should not be changed. + * + * @param text the buffer holding transliterated and + * untransliterated text + * @param index an array of three integers. See {@link + * #keyboardTransliterate(Replaceable, int[], String)}. 
+ * @see #keyboardTransliterate + */ + protected abstract void handleKeyboardTransliterate(Replaceable text, + int[] index); + + /** + * Returns the length of the longest context required by this transliterator. + * This is preceding context. The default implementation supplied + * by Transliterator returns zero; subclasses + * that use preceding context should override this method to return the + * correct value. For example, if a transliterator translates "ddd" (where + * d is any digit) to "555" when preceded by "(ddd)", then the preceding + * context length is 5, the length of "(ddd)". + * + * @return The maximum number of preceding context characters this + * transliterator needs to examine + */ + protected int getMaximumContextLength() { + return 0; + } + + /** + * Returns a programmatic identifier for this transliterator. + * If this identifier is passed to getInstance(), it + * will return this object, if it has been registered. + * @see #registerInstance + * @see #registerClass + * @see #getAvailableIDs + */ + public final String getID() { + return ID; + } + + /** + * Returns a name for this transliterator that is appropriate for + * display to the user in the default locale. See {@link + * #getDisplayName(Locale)} for details. + */ + public final String getDisplayName() { + return getDisplayName(Locale.getDefault()); + } + + /** + * Returns a name for this transliterator that is appropriate for + * display to the user in the given locale. This name is taken + * from the locale resource data in the standard manner of the + * java.text package. + * + *

If no localized names exist in the system resource bundles, + * a name is synthesized using a localized + * MessageFormat pattern from the resource data. The + * arguments to this pattern are an integer followed by one or two + * strings. The integer is the number of strings, either 1 or 2. + * The strings are formed by splitting the ID for this + * transliterator at the first '-'. If there is no '-', then the + * entire ID forms the only string. + * @param inLocale the Locale in which the display name should be + * localized. + * @see java.text.MessageFormat + */ + public String getDisplayName(Locale inLocale) { + ResourceBundle bundle = ResourceBundle.getBundle( + RB_LOCALE_ELEMENTS, inLocale); + + try { + return bundle.getString(RB_DISPLAY_NAME_PREFIX + ID); + } catch (MissingResourceException e) {} + + try { + // Construct the formatter first; if getString() fails + // we'll exit the try block + MessageFormat format = new MessageFormat( + bundle.getString(RB_DISPLAY_NAME_PATTERN)); + // Construct the argument array + int i = ID.indexOf('-'); + Object[] args = (i < 0) + ? new Object[] { new Integer(1), ID } + : new Object[] { new Integer(2), ID.substring(0, i), + ID.substring(i+1) }; + // Format it using the pattern in the resource + return format.format(args); + } catch (MissingResourceException e2) {} + + // We should not reach this point unless there is something + // wrong with the build or the RB_DISPLAY_NAME_PATTERN has + // been deleted from the root RB_LOCALE_ELEMENTS resource. + throw new RuntimeException(); + } + + /** + * Returns the filter used by this transliterator, or null + * if this transliterator uses no filter. + */ + public UnicodeFilter getFilter() { + return filter; + } + + /** + * Changes the filter used by this transliterator. If the filter + * is set to null then no filtering will occur. + * + *

Callers must take care if a transliterator is in use by + * multiple threads. The filter should not be changed by one + * thread while another thread may be transliterating. + */ + public void setFilter(UnicodeFilter filter) { + this.filter = filter; + } + + /** + * Returns this transliterator's inverse. See the class + * documentation for details. This implementation simply inverts + * the two entities in the ID and attempts to retrieve the + * resulting transliterator. That is, if getID() + * returns "A-B", then this method will return the result of + * getInstance("B-A"), or null if that + * call fails. + * + *

This method does not take filtering into account. The + * returned transliterator will have no filter. + * + *

Subclasses with knowledge of their inverse may wish to + * override this method. + * + * @return a transliterator that is an inverse, not necessarily + * exact, of this transliterator, or null if no such + * transliterator is registered. + * @see #registerInstance + */ + public Transliterator getInverse() { + int i = ID.indexOf('-'); + if (i >= 0) { + String inverseID = ID.substring(i+1) + '-' + ID.substring(0, i); + return internalGetInstance(inverseID); + } + return null; + } + + /** + * Returns a Transliterator object given its ID. + * The ID must be either a system transliterator ID or a ID registered + * using registerInstance(). + * + * @param ID a valid ID, as enumerated by getAvailableIDs() + * @return A Transliterator object with the given ID + * @exception IllegalArgumentException if the given ID is invalid. + * @see #registerInstance + * @see #getAvailableIDs + * @see #getID + */ + public static Transliterator getInstance(String ID) { + Transliterator t = internalGetInstance(ID); + if (t != null) { + return t; + } + throw new IllegalArgumentException("Unsupported transliterator: " + + ID); + } + + /** + * Returns a transliterator object given its ID. Unlike getInstance(), + * this method returns null if it cannot make use of the given ID. + */ + private static Transliterator internalGetInstance(String ID) { + Object obj = cache.get(ID); + RuleBasedTransliterator.Data data = null; + + if (obj instanceof RuleBasedTransliterator.Data) { + data = (RuleBasedTransliterator.Data) obj; + // Fall through to construct transliterator from cached Data object. 
+ } else if (obj instanceof Class) { + try { + return (Transliterator) ((Class) obj).newInstance(); + } catch (InstantiationException e) { + } catch (IllegalAccessException e2) {} + } else { + synchronized (cache) { + boolean isReverse = (obj == REVERSE_RULE_BASED_PLACEHOLDER); + String resourceName = RB_RULE_BASED_PREFIX; + int i = ID.indexOf('-'); + if (i < 0) { + resourceName += ID; + } else { + String IDLeft = ID.substring(0, i); + String IDRight = ID.substring(i+1); + resourceName += isReverse ? (IDRight + IDLeft) + : (IDLeft + IDRight); + } + try { + ResourceBundle resource = ResourceBundle.getBundle(resourceName); + + data = RuleBasedTransliterator.parse(resource.getString(RB_RULE), + isReverse + ? RuleBasedTransliterator.REVERSE + : RuleBasedTransliterator.FORWARD); + + cache.put(ID, data); + // Fall through to construct transliterator from Data object. + } catch (MissingResourceException e) {} + } + } + + if (data != null) { + return new RuleBasedTransliterator(ID, data, null); + } + + return null; + } + + /** + * Registers a subclass of Transliterator with the + * system. This subclass must have a public constructor taking no + * arguments. When that constructor is called, the resulting + * object must return the ID passed to this method if + * its getID() method is called. + * + * @param ID the result of getID() for this + * transliterator + * @param transClass a subclass of Transliterator + * @see #registerInstance + * @see #unregister + */ + public static void registerClass(String ID, Class transClass) { + cache.put(ID, transClass); + } + + /** + * Unregisters a transliterator or class. This may be either + * a system transliterator or a user transliterator or class. 
+ * + * @param ID the ID of the transliterator or class + * @return the Object that was registered with + * ID, or null if none was + * @see #registerInstance + * @see #registerClass + */ + public static Object unregister(String ID) { + return cache.remove(ID); + } + + /** + * Returns an enumeration over the programmatic names of registered + * Transliterator objects. This includes both system + * transliterators and user transliterators registered using + * registerInstance(). The enumerated names may be + * passed to getInstance(). + * + * @return An Enumeration over String objects + * @see #getInstance + * @see #registerInstance + */ + public static final Enumeration getAvailableIDs() { + return cache.keys(); + } + + static { + ResourceBundle bundle = ResourceBundle.getBundle(RB_LOCALE_ELEMENTS); + + try { + String[] ruleBasedIDs = bundle.getStringArray(RB_RULE_BASED_IDS); + + cache = new Hashtable(); + + for (int i=0; iUnicodeFilter defines a protocol for selecting a + * subset of the full range (U+0000 to U+FFFF) of Unicode characters. + * Currently, filters are used in conjunction with classes like {@link + * Transliterator} to only process selected characters through a + * transformation. + * + * {@link UnicodeFilterLogic} + */ + +public interface UnicodeFilter { + + /** + * Returns true for characters that are in the selected + * subset. In other words, if a character is to be + * filtered, then isIn() returns + * false. + */ + public boolean isIn(char c); +} diff --git a/icu4j/src/com/ibm/text/UnicodeFilterLogic.java b/icu4j/src/com/ibm/text/UnicodeFilterLogic.java new file mode 100755 index 00000000000..f9e6ec1c609 --- /dev/null +++ b/icu4j/src/com/ibm/text/UnicodeFilterLogic.java @@ -0,0 +1,112 @@ +package com.ibm.text; + +/** + * UnicodeFilterLogic provides logical operators on + * {@link UnicodeFilter} objects. This class cannot be instantiated; + * it consists only of static methods. 
The static methods return + * filter objects that perform logical inversion (not), + * intersection (and), or union (or) of the given + * filter objects. + */ +public final class UnicodeFilterLogic { + + /** + * Returns a UnicodeFilter that implements the inverse of + * the given filter. + */ + public static UnicodeFilter not(final UnicodeFilter f) { + return new UnicodeFilter() { + public boolean isIn(char c) { + return !f.isIn(c); + } + }; + } + + /** + * Returns a UnicodeFilter that implements a short + * circuit AND of the result of the two given filters. That is, + * if f.isIn() is false, then g.isIn() + * is not called, and isIn() returns false. + * + *

Either f or g must be non-null. + */ + public static UnicodeFilter and(final UnicodeFilter f, + final UnicodeFilter g) { + if (f == null) { + return g; + } + if (g == null) { + return f; + } + return new UnicodeFilter() { + public boolean isIn(char c) { + return f.isIn(c) && g.isIn(c); + } + }; + } + + /** + * Returns a UnicodeFilter that implements a short + * circuit AND of the result of the given filters. That is, if + * f[i].isIn() is false, then + * f[j].isIn() is not called, where j > i, and + * isIn() returns false. + */ + public static UnicodeFilter and(final UnicodeFilter[] f) { + return new UnicodeFilter() { + public boolean isIn(char c) { + for (int i=0; iUnicodeFilter that implements a short + * circuit OR of the result of the two given filters. That is, if + * f.isIn() is true, then g.isIn() is + * not called, and isIn() returns true. + * + *

Either f or g must be non-null. + */ + public static UnicodeFilter or(final UnicodeFilter f, + final UnicodeFilter g) { + if (f == null) { + return g; + } + if (g == null) { + return f; + } + return new UnicodeFilter() { + public boolean isIn(char c) { + return f.isIn(c) || g.isIn(c); + } + }; + } + + /** + * Returns a UnicodeFilter that implements a short + * circuit OR of the result of the given filters. That is, if + * f[i].isIn() is true, then + * f[j].isIn() is not called, where j > i, and + * isIn() returns true. + */ + public static UnicodeFilter or(final UnicodeFilter[] f) { + return new UnicodeFilter() { + public boolean isIn(char c) { + for (int i=0; icharacter classes used in regular expressions. + * Such classes specify a subset of the set of all Unicode characters, + * which in this implementation is the characters from U+0000 to + * U+FFFF, ignoring surrogates. + * + *

This class supports two APIs. The first is modeled after Java 2's + * java.util.Set interface, although this class does not + * implement that interface. All methods of Set are + * supported, with the modification that they take a character range + * or single character instead of an Object, and they + * take a UnicodeSet instead of a Collection. + * + *

The second API is the + * applyPattern()/toPattern() API from the + * java.text.Format-derived classes. Unlike the + * methods that add characters, add categories, and control the logic + * of the set, the method applyPattern() sets all + * attributes of a UnicodeSet at once, based on a + * string pattern. + * + *

In addition, the set complement operation is supported through + * the complement() method. + * + *

Pattern syntax

+ * + * Patterns are accepted by the constructors and the + * applyPattern() methods and returned by the + * toPattern() method. These patterns follow a syntax + * similar to that employed by version 8 regular expression character + * classes: + * + *
+ * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
pattern :=  ('[' '^'? item* ']') | + * ('[:' '^'? category ':]')
item :=  char | (char '-' char) | pattern-expr
+ *
pattern-expr :=  pattern | pattern-expr pattern | + * pattern-expr op pattern
+ *
op :=  '&' | '-'
+ *
special :=  '[' | ']' | '-'
+ *
char :=  any character that is not special
+ * | ('\u005C'
any character)
+ * | ('\u005Cu' hex hex hex hex)
+ *
hex :=  any character for which + * Character.digit(c, 16) + * returns a non-negative result
category :=  'M' | 'N' | 'Z' | 'C' | 'L' | 'P' | + * 'S' | 'Mn' | 'Mc' | 'Me' | 'Nd' | 'Nl' | 'No' | 'Zs' | 'Zl' | + * 'Zp' | 'Cc' | 'Cf' | 'Cs' | 'Co' | 'Cn' | 'Lu' | 'Ll' | 'Lt' + * | 'Lm' | 'Lo' | 'Pc' | 'Pd' | 'Ps' | 'Pe' | 'Po' | 'Sm' | + * 'Sc' | 'Sk' | 'So'
+ *
+ * + * + * + * + *
Legend: + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
a := b  a may be replaced by b
a?zero or one instance of a
+ *
a*zero or more instances of a
+ *
a | beither a or b
+ *
'a'the literal string between the quotes
+ *
+ *
+ * + * Patterns specify individual characters, ranges of characters, and + * Unicode character categories. When elements are concatenated, they + * specify their union. To complement a set, place a '^' immediately + * after the opening '[' or '[:'. In any other location, '^' has no + * special meaning. + * + *

Ranges are indicated by placing a '-' between two + * characters, as in "a-z". This specifies the range of all + * characters from the left to the right, in Unicode order. If the + * left and right characters are the same, then the range consists of + * just that character. If the left character is greater than the + * right character it is a syntax error. If a '-' occurs as the first + * character after the opening '[' or '[^', or if it occurs as the + * last character before the closing ']', then it is taken as a + * literal. Thus "[a\u005C-b]", "[-ab]", and "[ab-]" all indicate the same + * set of three characters, 'a', 'b', and '-'. + * + *

Sets may be intersected using the '&' operator or the asymmetric + * set difference may be taken using the '-' operator, for example, + * "[[:L:]&[\u005Cu0000-\u005Cu0FFF]]" indicates the set of all Unicode letters + * with values less than 4096. Operators ('&' and '-') have equal + * precedence and bind left-to-right. Thus + * "[[:L:]-[a-z]-[\u005Cu0100-\u005Cu01FF]]" is equivalent to + * "[[[:L:]-[a-z]]-[\u005Cu0100-\u005Cu01FF]]". This only really matters for + * difference; intersection is commutative. + * + * + *
[a]The set containing 'a' + *
[a-z]The set containing 'a' + * through 'z' and all letters in between, in Unicode order + *
[^a-z]The set containing + * all characters but 'a' through 'z', + * that is, U+0000 through 'a'-1 and 'z'+1 through U+FFFF + *
[[pat1][pat2]] + * The union of sets specified by pat1 and pat2 + *
[[pat1]&[pat2]] + * The intersection of sets specified by pat1 and pat2 + *
[[pat1]-[pat2]] + * The asymmetric difference of sets specified by pat1 and + * pat2 + *
[:Lu:] + * The set of characters belonging to the given + * Unicode category, as defined by Character.getType(); in + * this case, Unicode uppercase letters + *
[:L:] + * The set of characters belonging to all Unicode categories + * starting with 'L', that is, [[:Lu:][:Ll:][:Lt:][:Lm:][:Lo:]]. + *
+ * + *

Character categories. + * + * Character categories are specified using the POSIX-like syntax + * '[:Lu:]'. The complement of a category is specified by inserting + * '^' after the opening '[:'. The following category names are + * recognized. Actual determination of category data uses + * Character.getType(), so it reflects the underlying + * implementation used by Character. As of Java 2 and + * JDK 1.1.8, this is Unicode 2.1.2. + * + *

+ * Normative
+ *     Mn = Mark, Non-Spacing
+ *     Mc = Mark, Spacing Combining
+ *     Me = Mark, Enclosing
+ * 
+ *     Nd = Number, Decimal Digit
+ *     Nl = Number, Letter
+ *     No = Number, Other
+ * 
+ *     Zs = Separator, Space
+ *     Zl = Separator, Line
+ *     Zp = Separator, Paragraph
+ * 
+ *     Cc = Other, Control
+ *     Cf = Other, Format
+ *     Cs = Other, Surrogate
+ *     Co = Other, Private Use
+ *     Cn = Other, Not Assigned
+ * 
+ * Informative
+ *     Lu = Letter, Uppercase
+ *     Ll = Letter, Lowercase
+ *     Lt = Letter, Titlecase
+ *     Lm = Letter, Modifier
+ *     Lo = Letter, Other
+ * 
+ *     Pc = Punctuation, Connector
+ *     Pd = Punctuation, Dash
+ *     Ps = Punctuation, Open
+ *     Pe = Punctuation, Close
+ *    *Pi = Punctuation, Initial quote
+ *    *Pf = Punctuation, Final quote
+ *     Po = Punctuation, Other
+ * 
+ *     Sm = Symbol, Math
+ *     Sc = Symbol, Currency
+ *     Sk = Symbol, Modifier
+ *     So = Symbol, Other
+ * 
+ * *Unsupported by Java (and hence unsupported by UnicodeSet). + * + * @author Alan Liu + * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $ */ +public class UnicodeSet { + /** + * The internal representation is a StringBuffer of even length. + * Each pair of characters represents a range that is included in + * the set. A single character c is represented as cc. Thus, the + * ranges in the set are (a,b), a and b inclusive, where a = + * pairs.charAt(i) and b = pairs.charAt(i+1) for all even i, 0 <= + * i <= pairs.length()-2. Pairs are always stored in ascending + * Unicode order. Pairs are always stored in shortest form. For + * example, if the pair "hh", representing the single character + * 'h', is added to the pairs list "agik", representing the ranges + * 'a'-'g' and 'i'-'k', the result is "ak", not "aghhik". + * + * This representation format was originally used in Richard + * Gillam's CharSet class. + */ + private StringBuffer pairs; + + private static final String CATEGORY_NAMES = + // 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 + //0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 8 9 0 1 2 3 4 5 6 7 8 + "CnLuLlLtLmLoMnMeMcNdNlNoZsZlZpCcCf--CoCsPdPsPePcPoSmScSkSo"; + + private static final int UNSUPPORTED_CATEGORY = 17; + + private static final int CATEGORY_COUNT = 29; + + /** + * A cache mapping character category integers, as returned by + * Character.getType(), to pairs strings. Entries are initially + * null and are created on demand. + */ + private static final String[] CATEGORY_PAIRS_CACHE = + new String[CATEGORY_COUNT]; + + //---------------------------------------------------------------- + // Debugging and testing + //---------------------------------------------------------------- + + /** + * Return the representation of this set as a list of character + * ranges. Ranges are listed in ascending Unicode order. For + * example, the set [a-zA-M3] is represented as "33AMaz". 
+ */ + public String getPairs() { + return pairs.toString(); + } + + //---------------------------------------------------------------- + // Public API + //---------------------------------------------------------------- + + /** + * Constructs an empty set. + */ + public UnicodeSet() { + pairs = new StringBuffer(); + } + + /** + * Constructs a set from the given pattern. See the class description + * for the syntax of the pattern language. + * @param pattern a string specifying what characters are in the set + * @exception IllegalArgumentException if the pattern contains + * a syntax error. + */ + public UnicodeSet(String pattern) { + applyPattern(pattern, false); + } + + /** + * Constructs a set from the given pattern, optionally ignoring + * white space. See the class description for the syntax of the + * pattern language. + * @param pattern a string specifying what characters are in the set + * @param ignoreSpaces if true, all spaces in the + * pattern are ignored, except those preceded by '\u005C'. Spaces are + * those characters for which Character.isSpaceChar() + * is true. + * @exception IllegalArgumentException if the pattern + * contains a syntax error. + */ + public UnicodeSet(String pattern, boolean ignoreSpaces) { + applyPattern(pattern, ignoreSpaces); + } + + /** + * Constructs a set from the given Unicode character category. + * @param category an integer indicating the character category as + * returned by Character.getType(). + * @exception IllegalArgumentException if the given + * category is invalid. + */ + public UnicodeSet(int category) { + if (category < 0 || category >= CATEGORY_COUNT || + category == UNSUPPORTED_CATEGORY) { + throw new IllegalArgumentException("Invalid category"); + } + pairs = new StringBuffer(getCategoryPairs(category)); + } + + /** + * Modifies this set to represent the set specified by the given + * pattern. See the class description for the syntax of the + * pattern language. 
+ * @param pattern a string specifying what characters are in the set + * @exception IllegalArgumentException if the pattern + * contains a syntax error. + */ + public final void applyPattern(String pattern) { + applyPattern(pattern, false); + } + + /** + * Modifies this set to represent the set specified by the given + * pattern, optionally ignoring white space. See the class + * description for the syntax of the pattern language. + * @param pattern a string specifying what characters are in the set + * @param ignoreSpaces if true, all spaces in the + * pattern are ignored. Spaces are those characters for which + * Character.isSpaceChar() is true. + * Characters preceded by '\\' are escaped, losing any special + * meaning they otherwise have. Spaces may be included by + * escaping them. + * @exception IllegalArgumentException if the pattern + * contains a syntax error. + */ + public void applyPattern(String pattern, boolean ignoreSpaces) { + ParsePosition pos = new ParsePosition(0); + + // To ignore spaces, create a new pattern without spaces. We + // have to process all '\' escapes. If '\' is encountered, + // insert it and the following character (if any -- let parse + // deal with any syntax errors) in the pattern. This allows + // escaped spaces. + if (ignoreSpaces) { + StringBuffer pat = new StringBuffer(); + for (int i=0; in, where 0 <= n <= 65536. + * + * @return the number of elements in this set (its cardinality). + */ + public int size() { + int n = 0; + for (int i=0; itrue if this set contains no elements. + * + * @return true if this set contains no elements. + */ + public boolean isEmpty() { + return pairs.length() == 0; + } + + /** + * Returns true if this set contains the specified range + * of chars. + * + * @return true if this set contains the specified range + * of chars. 
+ */ + public boolean contains(char first, char last) { + // Set i to the end of the smallest range such that its end + // point >= last, or pairs.length() if no such range exists. + int i = 1; + while (ipairs.charAt(i)) i+=2; + return i=pairs.charAt(i-1); + } + + /** + * Returns true if this set contains the specified char. + * + * @return true if this set contains the specified char. + */ + public boolean contains(char c) { + return contains(c, c); + } + + /** + * Adds the specified range to this set if it is not already + * present. If this set already contains the specified range, + * the call leaves this set unchanged. If first > last + * then an empty range is added, leaving the set unchanged. + * + * @param first first character, inclusive, of range to be added + * to this set. + * @param last last character, inclusive, of range to be added + * to this set. + */ + public void add(char first, char last) { + if (first <= last) { + addPair(pairs, first, last); + } + } + + /** + * Adds the specified character to this set if it is not already + * present. If this set already contains the specified character, + * the call leaves this set unchanged. + */ + public final void add(char c) { + add(c, c); + } + + /** + * Removes the specified range from this set if it is present. + * The set will not contain the specified range once the call + * returns. If first > last then an empty range is + * removed, leaving the set unchanged. + * + * @param first first character, inclusive, of range to be removed + * from this set. + * @param last last character, inclusive, of range to be removed + * from this set. + */ + public void remove(char first, char last) { + if (first <= last) { + removePair(pairs, first, last); + } + } + + /** + * Removes the specified character from this set if it is present. + * The set will not contain the specified range once the call + * returns.
+ */ + public final void remove(char c) { + remove(c, c); + } + + /** + * Returns true if the specified set is a subset + * of this set. + * + * @param c set to be checked for containment in this set. + * @return true if this set contains all of the elements of the + * specified set. + */ + public boolean containsAll(UnicodeSet c) { + // The specified set is a subset if all of its pairs are contained + // in this set. + int i = 1; + for (int j=0; j= last, or pairs.length() if no such range + // exists. + while (ipairs.charAt(i)) i+=2; + if (i>pairs.length() || c.pairs.charAt(j) < pairs.charAt(i-1)) { + return false; + } + } + return true; + } + + /** + * Adds all of the elements in the specified set to this set if + * they're not already present. This operation effectively + * modifies this set so that its value is the union of the two + * sets. The behavior of this operation is unspecified if the specified + * collection is modified while the operation is in progress. + * + * @param c set whose elements are to be added to this set. + * @see #add(char, char) + */ + public void addAll(UnicodeSet c) { + doUnion(pairs, c.pairs.toString()); + } + + /** + * Retains only the elements in this set that are contained in the + * specified set. In other words, removes from this set all of + * its elements that are not contained in the specified set. This + * operation effectively modifies this set so that its value is + * the intersection of the two sets. + * + * @param c set that defines which elements this set will retain. + */ + public void retainAll(UnicodeSet c) { + doIntersection(pairs, c.pairs.toString()); + } + + /** + * Removes from this set all of its elements that are contained in the + * specified set. This operation effectively modifies this + * set so that its value is the asymmetric set difference of + * the two sets. + * + * @param c set that defines which elements will be removed from + * this set. 
+ */ + public void removeAll(UnicodeSet c) { + doDifference(pairs, c.pairs.toString()); + } + + /** + * Inverts this set. This operation modifies this set so that + * its value is its complement. This is equivalent to the pseudo code: + * this = new UnicodeSet("[\u0000-\uFFFF]").removeAll(this). + */ + public void complement() { + doComplement(pairs); + } + + /** + * Removes all of the elements from this set. This set will be + * empty after this call returns. + */ + public void clear() { + pairs.setLength(0); + } + + /** + * Compares the specified object with this set for equality. Returns + * true if the specified object is also a set, the two sets + * have the same size, and every member of the specified set is + * contained in this set (or equivalently, every member of this set is + * contained in the specified set). + * + * @param o Object to be compared for equality with this set. + * @return true if the specified Object is equal to this set. + */ + public boolean equals(Object o) { + return o instanceof UnicodeSet && + pairs.equals(((UnicodeSet)o).pairs); + } + + /** + * Returns the hash code value for this set. + * + * @return the hash code value for this set. + * @see Object#hashCode() + */ + public int hashCode() { + return pairs.hashCode(); + } + + //---------------------------------------------------------------- + // Implementation: Pattern parsing + //---------------------------------------------------------------- + + /** + * Parses the given pattern, starting at the given position. The + * character at pattern.charAt(pos.getIndex()) must be '[', or the + * parse fails. Parsing continues until the corresponding closing + * ']'. If a syntax error is encountered between the opening and + * closing brace, the parse fails. Upon return from a successful + * parse, the ParsePosition is updated to point to the character + * following the closing ']', and a StringBuffer containing a + * pairs list for the parsed pattern is returned. 
This method calls + * itself recursively to parse embedded subpatterns. + * + * @param pattern the string containing the pattern to be parsed. + * The portion of the string from pos.getIndex(), which must be a + * '[', to the corresponding closing ']', is parsed. + * @param pos upon entry, the position at which to being parsing. + * The character at pattern.charAt(pos.getIndex()) must be a '['. + * Upon return from a successful parse, pos.getIndex() is either + * the character after the closing ']' of the parsed pattern, or + * pattern.length() if the closing ']' is the last character of + * the pattern string. + * @return a StringBuffer containing a pairs list for the parsed + * substring of pattern + * @exception IllegalArgumentException if the parse fails. + */ + private static StringBuffer parse(String pattern, ParsePosition pos) { + + boolean invert = false; + StringBuffer pairsBuf = new StringBuffer(); + + /** + * Nodes: 0 - idle, waiting for '[' + * 10 - like 11, but immediately after "[" or "[^" + * 11 - awaiting x, "]", "[...]", or "[:...:]" + * 21 - after x + * 23 - after x- + * + * The parsing state machine moves from node 0 through zero or more + * other nodes back to node 0, in a successful parse. + */ + int node = 0; + char first = 0; + int i; + + /** + * This loop iterates over the characters in the pattern. We + * start at the position specified by pos. We exit the loop + * when either a matching closing ']' is seen, or we read all + * characters of the pattern. + */ + for (i=pos.getIndex(); i= pattern.length()) { + throw new IllegalArgumentException("Invalid \\u escape"); + } + c = '\u0000'; + for (int j=(++i)+4; i "aq". addPair("ampz", 'n', + * 'o') => "az". + */ + private static void addPair(StringBuffer pairs, char c, char d) { + char a = 0; + char b = 0; + for (int i=0; i "ak". + * removePair("ampz", 'l', 'q') => "akrz". 
+ */ + private static void removePair(StringBuffer pairs, char c, char d) { + // Iterate over pairs until we find a pair that overlaps + // with the given range. + for (int i=0; i= a. + // rangeEdited is set to true if we have modified the + // range a-b (the range at i) in place. + boolean rangeEdited = false; + if (c > a) { + // If c is after a and before b, then we have overlap + // of this sort: a--c==b--d or a--c==d--b, where a-b + // and c-d are the ranges of interest. We need to + // add the range a,c-1. + pairs.setCharAt(i+1, (char)(c-1)); + // i is already a + rangeEdited = true; + } + if (d < b) { + // If d is after a and before b, we overlap like this: + // c--a==d--b or a--c==d--b, where a-b is the range at + // i and c-d is the range being removed. We need to + // add the range d+1,b. + if (rangeEdited) { + pairs.insert(i+2, new char[] { (char)(d+1), b }); + i += 2; + } else { + pairs.setCharAt(i, (char)(d+1)); + // i+1 is already b + rangeEdited = true; + } + } + if (!rangeEdited) { + // If we didn't add any ranges, that means the entire + // range a-b must be deleted, since we have + // c--a==b--d. + stringBufferDelete(pairs, i, i+2); + i -= 2; + } + } + } + + //---------------------------------------------------------------- + // Implementation: Fundamental operators + //---------------------------------------------------------------- + + /** + * Changes the pairs list to represent the complement of the set it + * currently represents. The pairs list will be normalized (in + * order and in shortest possible form) if the original pairs list + * was normalized. + */ + private static void doComplement(StringBuffer pairs) { + if (pairs.length() == 0) { + pairs.append('\u0000').append('\uffff'); + return; + } + + // Change each end to a start and each start to an end of the + // gaps between the ranges. That is, 3-7 9-12 becomes x-2 8-8 + // 13-x, where 'x' represents a range that must now be fixed + // up. 
+ for (int i=0; i 0 && c1.charAt(i - 1) > ub) + ub = c1.charAt(i - 1); + + // now advance j to the first character that is greater + // that "ub" plus one + while (j < c2.length() && c2.charAt(j) <= ub + 1) + ++j; + + // if j points to the endpoint of a range, update "ub" + // to that character, or if j points to the start of + // a range and the endpoint of the preceding range is + // greater than "ub", update "up" to _that_ character + if (j % 2 == 1) + ub = c2.charAt(j); + else if (j > 0 && c2.charAt(j - 1) > ub) + ub = c2.charAt(j - 1); + } + // when we finally fall out of this loop, we will have stitched + // together a series of ranges that overlap or touch, i and j + // will both point to starting points of ranges, and "ub" will + // be the endpoint of the range we're working on. Write "ub" + // to the result + result.append(ub); + + // loop back around to create the next range in the result + } + + // we fall out to here when we've exhausted all the characters in + // one of the operands. We can append all of the remaining characters + // in the other operand without doing any extra work. + if (i < c1.length()) + result.append(c1.substring(i)); + if (j < c2.length()) + result.append(c2.substring(j)); + + pairs.setLength(0); + pairs.append(result.toString()); + } + + /** + * Given two pairs lists, changes the first in place to represent + * the asymmetric difference of the two sets. + */ + private static void doDifference(StringBuffer pairs, String pairs2) { + StringBuffer p2 = new StringBuffer(pairs2); + doComplement(p2); + doIntersection(pairs, p2.toString()); + } + + /** + * Given two pairs lists, changes the first in place to represent + * the intersection of the two sets. + * + * This implementation format was stolen from Richard Gillam's + * CharSet class. 
+ */ + private static void doIntersection(StringBuffer pairs, String c2) { + StringBuffer result = new StringBuffer(); + String c1 = pairs.toString(); + + int i = 0; + int j = 0; + int oldI; + int oldJ; + + // iterate until we've exhausted one of the operands + while (i < c1.length() && j < c2.length()) { + + // advance j until it points to a character that is larger than + // the one i points to. If this is the beginning of a one- + // character range, advance j to point to the end + if (i < c1.length() && i % 2 == 0) { + while (j < c2.length() && c2.charAt(j) < c1.charAt(i)) + ++j; + if (j < c2.length() && j % 2 == 0 && c2.charAt(j) == c1.charAt(i)) + ++j; + } + + // if j points to the endpoint of a range, save the current + // value of i, then advance i until it reaches a character + // which is larger than the character pointed at + // by j. All of the characters we've advanced over (except + // the one currently pointed to by i) are added to the result + oldI = i; + while (j % 2 == 1 && i < c1.length() && c1.charAt(i) <= c2.charAt(j)) + ++i; + result.append(c1.substring(oldI, i)); + + // if i points to the endpoint of a range, save the current + // value of j, then advance j until it reaches a character + // which is larger than the character pointed at + // by i. 
All of the characters we've advanced over (except + // the one currently pointed to by i) are added to the result + oldJ = j; + while (i % 2 == 1 && j < c2.length() && c2.charAt(j) <= c1.charAt(i)) + ++j; + result.append(c2.substring(oldJ, j)); + + // advance i until it points to a character larger than j + // If it points at the beginning of a one-character range, + // advance it to the end of that range + if (j < c2.length() && j % 2 == 0) { + while (i < c1.length() && c1.charAt(i) < c2.charAt(j)) + ++i; + if (i < c1.length() && i % 2 == 0 && c2.charAt(j) == c1.charAt(i)) + ++i; + } + } + + pairs.setLength(0); + pairs.append(result.toString()); + } + + //---------------------------------------------------------------- + // Implementation: Generation of pairs for Unicode categories + //---------------------------------------------------------------- + + /** + * Returns a pairs string for the given category, given its name. + * The category name must be either a two-letter name, such as + * "Lu", or a one letter name, such as "L". One-letter names + * indicate the logical union of all two-letter names that start + * with that letter. Case is significant. If the name starts + * with the character '^' then the complement of the given + * character set is returned. + * + * Although individual categories such as "Lu" are cached, we do + * not currently cache single-letter categories such as "L" or + * complements such as "^Lu" or "^L". It would be easy to cache + * these as well in a hashtable should the need arise. 
+ */ + private static String getCategoryPairs(String catName) { + boolean invert = (catName.length() > 1 && + catName.charAt(0) == '^'); + if (invert) { + catName = catName.substring(1); + } + + StringBuffer cat = null; + + // if we have two characters, search the category map for that + // code and either construct and return a UnicodeSet from the + // data in the category map or throw an exception + if (catName.length() == 2) { + int i = CATEGORY_NAMES.indexOf(catName); + if (i>=0 && i%2==0) { + i /= 2; + if (i != UNSUPPORTED_CATEGORY) { + String pairs = getCategoryPairs(i); + if (!invert) { + return pairs; + } + cat = new StringBuffer(pairs); + } + } + } else if (catName.length() == 1) { + // if we have one character, search the category map for + // codes beginning with that letter, and union together + // all of the matching sets that we find (or throw an + // exception if there are no matches) + for (int i=0; i= 0) { + pairs.append((char)first).append((char)last); + } + first = last = i; + } + } + } + if (first >= 0) { + pairs.append((char)first).append((char)last); + } + CATEGORY_PAIRS_CACHE[cat] = pairs.toString(); + } + return CATEGORY_PAIRS_CACHE[cat]; + } + + //---------------------------------------------------------------- + // Implementation: Utility methods + //---------------------------------------------------------------- + + /** + * Returns the character after the given position, or '\uFFFF' if + * there is none. + + */ + private static final char charAfter(String str, int i) { + return ((++i) < str.length()) ? str.charAt(i) : '\uFFFF'; + } + + /** + * Deletes a range of character from a StringBuffer, from start to + * limit-1. This is not part of JDK 1.1 StringBuffer, but is + * present in Java 2. 
+ * @param start inclusive start of range + * @param limit exclusive end of range + */ + private static void stringBufferDelete(StringBuffer buf, + int start, int limit) { + // In Java 2 just use: + // buf.delete(start, limit); + char[] chars = null; + if (buf.length() > limit) { + chars = new char[buf.length() - limit]; + buf.getChars(limit, buf.length(), chars, 0); + } + buf.setLength(start); + if (chars != null) { + buf.append(chars); + } + } +} diff --git a/icu4j/src/com/ibm/text/UnicodeToHexTransliterator.java b/icu4j/src/com/ibm/text/UnicodeToHexTransliterator.java new file mode 100755 index 00000000000..1e688f65fa9 --- /dev/null +++ b/icu4j/src/com/ibm/text/UnicodeToHexTransliterator.java @@ -0,0 +1,172 @@ +package com.ibm.text; +import java.util.*; + +/** + * A transliterator that converts from Unicode characters to + * hexadecimal Unicode escape sequences. It outputs a + * prefix specified in the constructor and optionally converts the hex + * digits to uppercase. + * + *

Copyright © IBM Corporation 1999. All rights reserved. + * + * @author Alan Liu + * @version $RCSfile: UnicodeToHexTransliterator.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $ + */ +public class UnicodeToHexTransliterator extends Transliterator { + + /** + * Package accessible ID for this transliterator. + */ + static String _ID = "Unicode-Hex"; + + private String prefix; + + private boolean uppercase; + + private static final String COPYRIGHT = + "\u00A9 IBM Corporation 1999. All rights reserved."; + + /** + * Constructs a transliterator. + * @param prefix the string that will precede the four hex + * digits for UNICODE_HEX transliterators. Ignored + * if direction is HEX_UNICODE. + * @param uppercase if true, the four hex digits will be + * converted to uppercase; otherwise they will be lowercase. + * Ignored if direction is HEX_UNICODE. + */ + public UnicodeToHexTransliterator(String prefix, boolean uppercase, + UnicodeFilter filter) { + super(_ID, filter); + this.prefix = prefix; + this.uppercase = uppercase; + } + + /** + * Constructs a transliterator with the default prefix "\u" + * that outputs uppercase hex digits. + */ + public UnicodeToHexTransliterator() { + this("\\u", true, null); + } + + /** + * Returns the string that precedes the four hex digits. + * @return prefix string + */ + public String getPrefix() { + return prefix; + } + + /** + * Sets the string that precedes the four hex digits. + * + *

Callers must take care if a transliterator is in use by + * multiple threads. The prefix should not be changed by one + * thread while another thread may be transliterating. + * @param prefix prefix string + */ + public void setPrefix(String prefix) { + this.prefix = prefix; + } + + /** + * Returns true if this transliterator outputs uppercase hex digits. + */ + public boolean isUppercase() { + return uppercase; + } + + /** + * Sets if this transliterator outputs uppercase hex digits. + * + *

Callers must take care if a transliterator is in use by + * multiple threads. The uppercase mode should not be changed by + * one thread while another thread may be transliterating. + * @param outputUppercase if true, then this transliterator + * outputs uppercase hex digits. + */ + public void setUppercase(boolean outputUppercase) { + uppercase = outputUppercase; + } + + /** + * Transliterates a segment of a string. Transliterator API. + * @param text the string to be transliterated + * @param start the beginning index, inclusive; 0 <= start + * <= limit. + * @param limit the ending index, exclusive; start <= limit + * <= text.length(). + * @return the new limit index + */ + public int transliterate(Replaceable text, int start, int limit) { + int[] offsets = { start, limit, start }; + handleKeyboardTransliterate(text, offsets); + return offsets[LIMIT]; + } + + /** + * Implements {@link Transliterator#handleKeyboardTransliterate}. + */ + protected void handleKeyboardTransliterate(Replaceable text, + int[] offsets) { + /** + * Performs transliteration changing all characters to + * Unicode hexadecimal escapes. For example, '@' -> "U+0040", + * assuming the prefix is "U+". + */ + int cursor = offsets[CURSOR]; + int limit = offsets[LIMIT]; + + UnicodeFilter filter = getFilter(); + + loop: + while (cursor < limit) { + char c = text.charAt(cursor); + if (filter != null && !filter.isIn(c)) { + ++cursor; + continue; + } + String hex = hex(c); + text.replace(cursor, cursor+1, hex); + int len = hex.length(); + cursor += len; // Advance cursor by 1 and adjust for new text + --len; + limit += len; + } + + offsets[LIMIT] = limit; + offsets[CURSOR] = cursor; + } + + /** + * Return the length of the longest context required by this transliterator. + * This is preceding context. 
+ * @param direction either FORWARD or REVERSE + * @return maximum number of preceding context characters this + * transliterator needs to examine + */ + protected int getMaximumContextLength() { + return 0; + } + + /** + * Form escape sequence. + */ + private final String hex(char c) { + StringBuffer buf = new StringBuffer(); + buf.append(prefix); + if (c < 0x1000) { + buf.append('0'); + if (c < 0x100) { + buf.append('0'); + if (c < 0x10) { + buf.append('0'); + } + } + } + String h = Integer.toHexString(c); + buf.append(uppercase ? h.toUpperCase() : h); + return buf.toString(); + } +} diff --git a/icu4j/src/com/ibm/text/components/AppletFrame.java b/icu4j/src/com/ibm/text/components/AppletFrame.java new file mode 100755 index 00000000000..cf6cc399ddd --- /dev/null +++ b/icu4j/src/com/ibm/text/components/AppletFrame.java @@ -0,0 +1,126 @@ +package com.ibm.text.components; +import java.applet.*; +import java.net.URL; +import java.util.Enumeration; +import java.awt.*; +import java.awt.event.*; + +/** + *

A Frame that runs an Applet within itself, making it possible + * for an applet to run as an application. Usage: + * + *

+ * public class MyApplet extends Applet {
+ *     public static void main(String args[]) {
+ *         MyApplet applet = new MyApplet();
+ *         new AppletFrame("My Applet Running As An App", applet, 640, 480);
+ *     }
+ *     ...
+ * }
+ * 
+ *
+ * 

Copyright © IBM Corporation 1999. All rights reserved. + * + * @author Alan Liu + * @version $RCSfile: AppletFrame.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $ + */ +public class AppletFrame extends Frame implements AppletStub, AppletContext { + + Applet applet; + + private static final String COPYRIGHT = + "\u00A9 IBM Corporation 1999. All rights reserved."; + + /** + * Construct a Frame running the given Applet with the default size + * of 640 by 480. + * When the Frame is closed, the applet's stop() method is called, + * the Frame is dispose()d of, and System.exit(0) is called. + * + * @param name the Frame title + * @param applet the applet to be run + */ + public AppletFrame(String name, Applet applet) { + this(name, applet, 640, 480); + } + + /** + * Construct a Frame running the given Applet with the given size. + * When the Frame is closed, the applet's stop() method is called, + * the Frame is dispose()d of, and System.exit(0) is called. + * + * @param name the Frame title + * @param applet the applet to be run + * @param width width of the Frame + * @param height height of the Frame + */ + public AppletFrame(String name, Applet applet, int width, int height) { + super(name); + this.applet = applet; + applet.setStub(this); + + resize(width, height); + add("Center", applet); + show(); + addWindowListener(new WindowAdapter() { + public void windowClosing(WindowEvent e) { + AppletFrame.this.applet.stop(); + dispose(); + System.exit(0); + } + }); + + applet.init(); + applet.start(); + } + + // AppletStub API + public void appletResize(int width, + int height) { + resize(width, height); + } + + public AppletContext getAppletContext() { + return this; + } + + public URL getCodeBase() { + return null; + } + + public URL getDocumentBase() { + return null; + } + + public String getParameter(String name) { + return "PARAMETER"; + } + + public boolean isActive() { + return true; + } + + // AppletContext API + public Applet getApplet(String name) { + return 
applet; + } + + public Enumeration getApplets() { + return null; + } + + public AudioClip getAudioClip(URL url) { + return null; + } + + public Image getImage(URL url) { + return null; + } + + public void showDocument(URL url) {} + public void showDocument(URL url, String target) {} + + public void showStatus(String status) { + System.out.println(status); + } +} diff --git a/icu4j/src/com/ibm/text/components/DumbTextComponent.java b/icu4j/src/com/ibm/text/components/DumbTextComponent.java new file mode 100755 index 00000000000..a400b9a76f1 --- /dev/null +++ b/icu4j/src/com/ibm/text/components/DumbTextComponent.java @@ -0,0 +1,708 @@ +package com.ibm.text.components; +import java.awt.*; +import java.awt.event.*; +import java.text.*; +import java.awt.datatransfer.*; + +// LIU: Changed from final to non-final +public class DumbTextComponent extends Canvas + implements KeyListener, MouseListener, MouseMotionListener, FocusListener + { + private transient static final String copyright = + "Copyright \u00A9 1998, Mark Davis. 
All Rights Reserved."; + private transient static boolean DEBUG = false; + + private String contents = ""; + private Selection selection = new Selection(); + private boolean editable = true; + + private transient Selection tempSelection = new Selection(); + private transient boolean focus; + private transient BreakIterator lineBreaker = BreakIterator.getLineInstance(); + private transient BreakIterator wordBreaker = BreakIterator.getWordInstance(); + private transient BreakIterator charBreaker = BreakIterator.getCharacterInstance(); + private transient int lineAscent; + private transient int lineHeight; + private transient int lineLeading; + private transient int lastHeight = 10; + private transient int lastWidth = 50; + private static final int MAX_LINES = 200; // LIU: Use symbolic name + private transient int[] lineStarts = new int[MAX_LINES]; // LIU + private transient int lineCount = 1; + + private transient boolean valid = false; + private transient FontMetrics fm; + private transient boolean redoLines = true; + private transient boolean doubleClick = false; + private transient TextListener textListener; + private transient ActionListener selectionListener; + private transient Image cacheImage; + private transient Dimension mySize; + private transient int xInset = 5; + private transient int yInset = 5; + private transient Point startPoint = new Point(); + private transient Point endPoint = new Point(); + private transient Point caretPoint = new Point(); + private transient static String clipBoard; + + private static final char CR = '\015'; // LIU + + // ============================================ + + public DumbTextComponent() { + addMouseListener(this); + addMouseMotionListener(this); + addKeyListener(this); + addFocusListener(this); + setCursor(Cursor.getPredefinedCursor(Cursor.TEXT_CURSOR)); + + } + +// ================ Events ==================== + + public boolean isFocusTraversable() { return true; } + + public void addActionListener(ActionListener l) { 
+ selectionListener = AWTEventMulticaster.add(selectionListener, l); + } + + public void removeActionListener(ActionListener l) { + selectionListener = AWTEventMulticaster.remove(selectionListener, l); + } + + public void addTextListener(TextListener l) { + textListener = AWTEventMulticaster.add(textListener, l); + } + + public void removeTextListener(TextListener l) { + textListener = AWTEventMulticaster.remove(textListener, l); + } + + private transient boolean pressed; + + public void mousePressed(MouseEvent e) { + if (DEBUG) System.out.println("mousePressed"); + if (pressed) { + select(e,false); + } else { + doubleClick = e.getClickCount() > 1; + requestFocus(); + select(e, true); + pressed = true; + } + } + + public void mouseDragged(MouseEvent e) { + if (DEBUG) System.out.println("mouseDragged"); + select(e, false); + } + + public void mouseReleased(MouseEvent e) { + if (DEBUG) System.out.println("mouseReleased"); + pressed = false; + } + + public void mouseEntered(MouseEvent e) { + //if (pressed) select(e, false); + } + + public void mouseExited(MouseEvent e){ + //if (pressed) select(e, false); + } + + public void mouseClicked(MouseEvent e) {} + public void mouseMoved(MouseEvent e) {} + + + public void focusGained(FocusEvent e) { + if (DEBUG) System.out.println("focusGained"); + focus = true; + valid = false; + repaint(16); + } + public void focusLost(FocusEvent e) { + if (DEBUG) System.out.println("focusLost"); + focus = false; + valid = false; + repaint(16); + } + + public void select(MouseEvent e, boolean first) { + point2Offset(e.getPoint(), tempSelection); + if (first) { + if ((e.getModifiers() & InputEvent.SHIFT_MASK) == 0) { + tempSelection.anchor = tempSelection.caret; + } + } + // fix words + if (doubleClick) { + tempSelection.expand(wordBreaker); + } + select(tempSelection); + } + + public void keyPressed(KeyEvent e) { + int code = e.getKeyCode(); + if (DEBUG) System.out.println("keyPressed " + + hex((char)code) + ", " + 
hex((char)e.getModifiers()));
+        int start = selection.getStart();
+        int end = selection.getEnd();
+        boolean shift = (e.getModifiers() & KeyEvent.SHIFT_MASK) != 0;
+        boolean ctrl = (e.getModifiers() & KeyEvent.CTRL_MASK) != 0;
+        switch (code) {
+        case KeyEvent.VK_Q:
+            // Ctrl-Q: hex fix-up; only meaningful when the text can change
+            if (!ctrl || !editable) break;
+            fixHex();
+            break;
+        case KeyEvent.VK_V:
+            // Ctrl-V: paste the private clipboard over the selection
+            if (!ctrl || !editable) break;
+            // clipBoard stays null until the first copy or cut; pasting
+            // null would insert the text "null" and then fail on
+            // s.length() in replaceRange
+            if (clipBoard == null) break;
+            insertText(clipBoard);
+            break;
+        case KeyEvent.VK_C:
+            // Ctrl-C: copy the selection; allowed even when read-only
+            if (!ctrl) break;
+            clipBoard = contents.substring(selection.getStart(), selection.getEnd());
+            break;
+        case KeyEvent.VK_X:
+            // Ctrl-X: always copy the selection, but delete it only when
+            // the component is editable (a read-only component degrades
+            // to a plain copy)
+            if (!ctrl) break;
+            clipBoard = contents.substring(selection.getStart(), selection.getEnd());
+            if (!editable) break;
+            insertText("");
+            break;
+        case KeyEvent.VK_A:
+            // Ctrl-A: select everything; select() pins the out-of-range
+            // offset to the contents
+            if (!ctrl) break;
+            select(Integer.MAX_VALUE, 0, false);
+            break;
+        case KeyEvent.VK_RIGHT:
+            // Move by word with Ctrl, otherwise by character
+            tempSelection.set(selection);
+            tempSelection.nextBound(ctrl ? wordBreaker : charBreaker, +1, shift);
+            select(tempSelection);
+            break;
+        case KeyEvent.VK_LEFT:
+            tempSelection.set(selection);
+            tempSelection.nextBound(ctrl ?
wordBreaker : charBreaker, -1, shift); + select(tempSelection); + break; + case KeyEvent.VK_UP: // LIU: Add support for up arrow + tempSelection.set(selection); + tempSelection.caret = lineDelta(tempSelection.caret, -1); + if (!shift) { + tempSelection.anchor = tempSelection.caret; + } + select(tempSelection); + break; + case KeyEvent.VK_DOWN: // LIU: Add support for down arrow + tempSelection.set(selection); + tempSelection.caret = lineDelta(tempSelection.caret, +1); + if (!shift) { + tempSelection.anchor = tempSelection.caret; + } + select(tempSelection); + break; + case KeyEvent.VK_DELETE: // LIU: Add delete key support + if (!editable) break; + if (contents.length() == 0) break; + start = selection.getStart(); + end = selection.getEnd(); + if (start == end) { + ++end; + if (end > contents.length()) { + getToolkit().beep(); + return; + } + } + replaceRange("", start, end); + break; + } + } + + /** + * LIU: Given an offset into contents, moves up or down by lines, + * according to lineStarts[]. 
+ * @param off the offset into contents + * @param delta how many lines to move up (< 0) or down (> 0) + * @return the new offset into contents + */ + private int lineDelta(int off, int delta) { + int line = findLine(off, false); + int posInLine = off - lineStarts[line]; + // System.out.println("off=" + off + " at " + line + ":" + posInLine); + line += delta; + if (line < 0) { + line = posInLine = 0; + } else if (line >= lineCount) { + return contents.length(); + } + off = lineStarts[line] + posInLine; + if (off >= lineStarts[line+1]) { + off = lineStarts[line+1] - 1; + } + return off; + } + + public void keyReleased(KeyEvent e) { + int code = e.getKeyCode(); + if (DEBUG) System.out.println("keyReleased " + + hex((char)code) + ", " + hex((char)e.getModifiers())); + } + + public void keyTyped(KeyEvent e) { + char ch = e.getKeyChar(); + if (DEBUG) System.out.println("keyTyped " + + hex((char)ch) + ", " + hex((char)e.getModifiers())); + if ((e.getModifiers() & KeyEvent.CTRL_MASK) != 0) return; + switch (ch) { + case KeyEvent.CHAR_UNDEFINED: + break; + case KeyEvent.VK_BACK_SPACE: + if (!editable) break; + if (contents.length() == 0) break; + int start = selection.getStart(); + int end = selection.getEnd(); + if (start == end) { + --start; + if (start < 0) { + getToolkit().beep(); // LIU: Add audio feedback of NOP + return; + } + } + replaceRange("", start, end); + break; + default: + if (!editable) break; + // LIU: Dispatch to subclass API + handleKeyTyped(e); + break; + } + } + + // LIU: Subclass API for handling of key typing + protected void handleKeyTyped(KeyEvent e) { + insertText(String.valueOf(e.getKeyChar())); + } + +// ===================== Control ====================== + + public synchronized void setEditable(boolean b) { + editable = b; + } + + public boolean isEditable() { + return editable; + } + + public void select(Selection newSelection) { + newSelection.pin(contents); + if (!selection.equals(newSelection)) { + selection.set(newSelection); + if 
(selectionListener != null) { + selectionListener.actionPerformed( + new ActionEvent(this, ActionEvent.ACTION_PERFORMED, + "Selection Changed", 0)); + } + repaint(10); + valid = false; + } + } + + public void select(int start, int end) { + select(start, end, false); + } + + public void select(int start, int end, boolean clickAfter) { + tempSelection.set(start, end, clickAfter); + select(tempSelection); + } + + public int getSelectionStart() { + return selection.getStart(); + } + + public int getSelectionEnd() { + return selection.getEnd(); + } + + public void setBounds(int x, int y, int w, int h) { + super.setBounds(x,y,w,h); + redoLines = true; + } + + public Dimension getPreferredSize() { + return new Dimension(lastWidth,lastHeight); + } + + public Dimension getMaximumSize() { + return new Dimension(lastWidth,lastHeight); + } + + public Dimension getMinimumSize() { + return new Dimension(lastHeight,lastHeight); + } + + public void setText(String text) { + setText2(text); + select(tempSelection.set(selection).pin(contents)); + } + + public void setText2(String text) { + contents = text; + charBreaker.setText(text); + wordBreaker.setText(text); + lineBreaker.setText(text); + redoLines = true; + if (textListener != null) + textListener.textValueChanged( + new TextEvent(this, TextEvent.TEXT_VALUE_CHANGED)); + repaint(16); + } + + public void insertText(String text) { + replaceRange(text, selection.getStart(), selection.getEnd()); + } + + public void replaceRange(String s, int start, int end) { + setText2(contents.substring(0,start) + s + + contents.substring(end)); + select(tempSelection.set(selection). 
+ fixAfterReplace(start, end, s.length())); + } + + public String getText() { + return contents; + } + + public void setFont(Font font) { + super.setFont(font); + redoLines = true; + repaint(16); + } + + // ================== Graphics ====================== + + public void update(Graphics g) { + if (DEBUG) System.out.println("update"); + paint(g); + } + + public void paint(Graphics g) { + mySize = getSize(); + if (cacheImage == null + || cacheImage.getHeight(this) != mySize.height + || cacheImage.getWidth(this) != mySize.width) { + cacheImage = createImage(mySize.width, mySize.height); + valid = false; + } + if (!valid || redoLines) { + if (DEBUG) System.out.println("painting"); + paint2(cacheImage.getGraphics()); + valid = true; + } + //getToolkit().sync(); + if (DEBUG) System.out.println("copying"); + g.drawImage(cacheImage, + 0, 0, mySize.width, mySize.height, + 0, 0, mySize.width, mySize.height, + this); + } + + public void paint2(Graphics g) { + g.clearRect(0, 0, mySize.width, mySize.height); + if (DEBUG) System.out.println("print"); + if (focus) g.setColor(Color.black); + else g.setColor(Color.gray); + g.drawRect(0,0,mySize.width-1,mySize.height-1); + g.setClip(1,1, + mySize.width-2,mySize.height-2); + g.setColor(Color.black); + g.setFont(getFont()); + fm = g.getFontMetrics(); + lineAscent = fm.getAscent(); + lineLeading = fm.getLeading(); + lineHeight = lineAscent + fm.getDescent() + lineLeading; + int y = yInset + lineAscent; + String lastSubstring = ""; + if (redoLines) fixLineStarts(mySize.width-xInset-xInset); + for (int i = 0; i < lineCount; y += lineHeight, ++i) { + // LIU: Don't display terminating ^M characters + int lim = lineStarts[i+1]; + if (lim > 0 && contents.length() > 0 && + contents.charAt(lim-1) == CR) --lim; + lastSubstring = contents.substring(lineStarts[i],lim); + g.drawString(lastSubstring, xInset, y); + } + drawSelection(g, lastSubstring); + lastHeight = y + yInset - lineHeight + yInset; + lastWidth = mySize.width-xInset-xInset; + } + 
+ void paintRect(Graphics g, int x, int y, int w, int h) { + if (focus) { + g.fillRect(x, y, w, h); + } else { + g.drawRect(x, y, w-1, h-1); + } + } + + public void drawSelection(Graphics g, String lastSubstring) { + g.setXORMode(Color.black); + if (selection.isCaret()) { + offset2Point(selection.caret, selection.clickAfter, caretPoint); + } else { + if (focus) g.setColor(Color.blue); + else g.setColor(Color.yellow); + offset2Point(selection.getStart(), true, startPoint); + offset2Point(selection.getEnd(), false, endPoint); + if (selection.getStart() == selection.caret) + caretPoint.setLocation(startPoint); + else caretPoint.setLocation(endPoint); + if (startPoint.y == endPoint.y) { + paintRect(g, startPoint.x, startPoint.y, + Math.max(1,endPoint.x-startPoint.x), lineHeight); + } else { + paintRect(g, startPoint.x, startPoint.y, + (mySize.width-xInset)-startPoint.x, lineHeight); + if (startPoint.y + lineHeight < endPoint.y) + paintRect(g, xInset, startPoint.y + lineHeight, + (mySize.width-xInset)-xInset, endPoint.y - startPoint.y - lineHeight); + paintRect(g, xInset, endPoint.y, endPoint.x-xInset, lineHeight); + } + } + if (focus || selection.isCaret()) { + if (focus) g.setColor(Color.green); + else g.setColor(Color.red); + int line = caretPoint.x - (selection.clickAfter ? 0 : 1); + g.fillRect(line, caretPoint.y, 1, lineHeight); + int w = lineHeight/12 + 1; + int braces = line - (selection.clickAfter ? 
-1 : w); + g.fillRect(braces, caretPoint.y, w, 1); + g.fillRect(braces, caretPoint.y + lineHeight - 1, w, 1); + } + } + + public Point offset2Point(int off, boolean start, Point p) { + int line = findLine(off, start); + int width = 0; + try { + width = fm.stringWidth( + contents.substring(lineStarts[line], off)); + } catch (Exception e) { + System.out.println(e); + } + p.x = width + xInset; + if (p.x > mySize.width - xInset) + p.x = mySize.width - xInset; + p.y = lineHeight * line + yInset; + return p; + } + + private int findLine(int off, boolean start) { + // if it is start, then go to the next line! + if (start) ++off; + for (int i = 1; i < lineCount; ++i) { + // LIU: This was <= ; changed to < to make caret after + // final CR in line appear at START of next line. + if (off < lineStarts[i]) return i-1; + } + // LIU: Check for special case; after CR at end of the last line + if (off == lineStarts[lineCount] && + off > 0 && contents.length() > 0 && contents.charAt(off-1) == CR) { + return lineCount; + } + return lineCount-1; + } + + // offsets on any line will go from start,true to end,false + // excluding start,false and end,true + public Selection point2Offset(Point p, Selection o) { + if (p.y < yInset) { + o.caret = 0; + o.clickAfter = true; + return o; + } + int line = (p.y - yInset)/lineHeight; + if (line >= lineCount) { + o.caret = contents.length(); + o.clickAfter = false; + return o; + } + int target = p.x - xInset; + if (target <= 0) { + o.caret = lineStarts[line]; + o.clickAfter = true; + return o; + } + int lowGuess = lineStarts[line]; + int lowWidth = 0; + int highGuess = lineStarts[line+1]; + int highWidth = fm.stringWidth(contents.substring(lineStarts[line],highGuess)); + if (target >= highWidth) { + o.caret = lineStarts[line+1]; + o.clickAfter = false; + return o; + } + while (lowGuess < highGuess - 1) { + int guess = (lowGuess + highGuess)/2; + int width = fm.stringWidth(contents.substring(lineStarts[line],guess)); + if (width <= target) { + 
lowGuess = guess; + lowWidth = width; + if (width == target) break; + } else { + highGuess = guess; + highWidth = width; + } + } + // at end, either lowWidth < target < width(low+1), or lowWidth = target + int highBound = charBreaker.following(lowGuess); + int lowBound = charBreaker.previous(); + // we are now at character boundaries + if (lowBound != lowGuess) + lowWidth = fm.stringWidth(contents.substring(lineStarts[line],lowBound)); + if (highBound != highGuess) + highWidth = fm.stringWidth(contents.substring(lineStarts[line],highBound)); + // we now have the right widths + if (target - lowWidth < highWidth - target) { + o.caret = lowBound; + o.clickAfter = true; + } else { + o.caret = highBound; + o.clickAfter = false; + } + // we now have the closest! + return o; + } + + private void fixLineStarts(int width) { + lineCount = 1; + lineStarts[0] = 0; + if (contents.length() == 0) { + lineStarts[1] = 0; + return; + } + int end = 0; + // LIU: Add check for MAX_LINES + for (int start = 0; start < contents.length() && lineCount < MAX_LINES; + start = end) { + end = nextLine(fm, start, width); + lineStarts[lineCount++] = end; + if (end == start) { // LIU: Assertion + throw new RuntimeException("nextLine broken"); + } + } + --lineCount; + redoLines = false; + } + + // LIU: Enhanced to wrap long lines. Bug with return of start fixed. 
+ public int nextLine(FontMetrics fm, int start, int width) { + int len = contents.length(); + for (int i = start; i < len; ++i) { + // check for line separator + char ch = (contents.charAt(i)); + if (ch >= 0x000A && ch <= 0x000D || ch == 0x2028 || ch == 0x2029) { + len = i + 1; + if (ch == 0x000D && i+1 < len && contents.charAt(i+1) == 0x000A) // crlf + ++len; // grab extra char + break; + } + } + String subject = contents.substring(start,len); + if (visibleWidth(fm, subject) <= width) + return len; + + // LIU: Remainder of this method rewritten to accomodate lines + // longer than the component width by first trying to break + // into lines; then words; finally chars. + int n = findFittingBreak(fm, subject, width, lineBreaker); + if (n == 0) { + n = findFittingBreak(fm, subject, width, wordBreaker); + } + if (n == 0) { + n = findFittingBreak(fm, subject, width, charBreaker); + } + return n > 0 ? start + n : len; + } + + /** + * LIU: Finds the longest substring that fits a given width + * composed of subunits returned by a BreakIterator. If the smallest + * subunit is too long, returns 0. 
+ * @param fm metrics to use + * @param line the string to be fix into width + * @param width line.substring(0, result) must be <= width + * @param breaker the BreakIterator that will be used to find subunits + * @return maximum characters, at boundaries returned by breaker, + * that fit into width, or zero on failure + */ + private int findFittingBreak(FontMetrics fm, String line, int width, + BreakIterator breaker) { + breaker.setText(line); + int last = breaker.first(); + int end = breaker.next(); + while (end != BreakIterator.DONE && + visibleWidth(fm, line.substring(0, end)) <= width) { + last = end; + end = breaker.next(); + } + return last; + } + + public int visibleWidth(FontMetrics fm, String s) { + int i; + for (i = s.length()-1; i >= 0; --i) { + char ch = s.charAt(i); + if (!(ch == ' ' || ch >= 0x000A && ch <= 0x000D || ch == 0x2028 || ch == 0x2029)) + return fm.stringWidth(s.substring(0,i+1));; + } + return 0; + } + +// =============== Utility ==================== + + private void fixHex() { + if (selection.getEnd() == 0) return; + int store = 0; + int places = 1; + int count = 0; + int min = Math.min(8,selection.getEnd()); + for (int i = 0; i < min; ++i) { + char ch = contents.charAt(selection.getEnd()-1-i); + int value = Character.getNumericValue(ch); + if (value < 0 || value > 15) break; + store += places * value; + ++count; + places *= 16; + } + String add = ""; + int bottom = store & 0xFFFF; + if (store >= 0xD8000000 && store < 0xDC000000 + && bottom >= 0xDC00 && bottom < 0xE000) { // surrogates + add = "" + (char)(store >> 16) + (char)bottom; + } else if (store > 0xFFFF && store <= 0x10FFFF) { + store -= 0x10000; + add = "" + (char)(((store >> 10) & 0x3FF) + 0xD800) + + (char)((store & 0x3FF) + 0xDC00); + + } else if (count >= 4) { + count = 4; + add = ""+(char)(store & 0xFFFF); + } else { + count = 1; + char ch = contents.charAt(selection.getEnd()-1); + add = hex(ch); + if (ch >= 0xDC00 && ch <= 0xDFFF && selection.getEnd() > 1) { + ch = 
contents.charAt(selection.getEnd()-2); + if (ch >= 0xD800 && ch <= 0xDBFF) { + count = 2; + add = hex(ch) + add; + } + } + } + replaceRange(add, selection.getEnd()-count, selection.getEnd()); + } + + public static String hex(char ch) { + String result = Integer.toString(ch,16).toUpperCase(); + result = "0000".substring(result.length(),4) + result; + return result; + } +} diff --git a/icu4j/src/com/ibm/text/components/Selection.java b/icu4j/src/com/ibm/text/components/Selection.java new file mode 100755 index 00000000000..985b36f3521 --- /dev/null +++ b/icu4j/src/com/ibm/text/components/Selection.java @@ -0,0 +1,155 @@ +package com.ibm.text.components; +import java.text.*; + +public final class Selection { + + public int anchor; + public int caret; + public boolean clickAfter; + + public int getStart() { + return anchor < caret ? anchor : caret; + } + + public int getEnd() { + return anchor > caret ? anchor : caret; + } + + public boolean isCaret() { + return anchor == caret; + } + + public Selection set(Selection other) { + anchor = other.anchor; + caret = other.caret; + clickAfter = other.clickAfter; + return this; + } + + public Selection set(int anchor, int caret, boolean clickAfter) { + this.anchor = anchor; + this.caret = caret; + this.clickAfter = clickAfter; + return this; + } + + public boolean equals(Object other) { + Selection other2 = (Selection)other; + return anchor == other2.anchor + && caret == other2.caret + && clickAfter == other2.clickAfter; + } + + public boolean isLessThan(Selection other) { + return getStart() < other.getEnd(); + } + + public Selection pin(String text) { + if (anchor > text.length()) { + anchor = text.length(); + } else if (anchor < 0) { + anchor = 0; + } + if (caret > text.length()) { + caret = text.length(); + clickAfter = true; + } else if (caret < 0) { + caret = 0; + clickAfter = false; + } + return this; + } + + public Selection swap(Selection after) { + int temp = anchor; + anchor = after.anchor; + after.anchor = temp; + 
temp = caret; + caret = after.caret; + after.caret = temp; + boolean b = clickAfter; + clickAfter = after.clickAfter; + after.clickAfter = b; + return this; + } + + public Selection fixAfterReplace(int start, int end, int len) { + if (anchor >= start) { + if (anchor < end) anchor = end; + anchor = start + len + anchor - end; + } + if (caret >= start) { + if (caret < end) caret = end; + caret = start + len + caret - end; + } + return this; + } + + // Mac & Windows considerably different + // Mac: end++. If start!=end, start=end + // SHIFT: move end right + // CTL: no different + // Windows: + // UNSHIFTED: if start!=end, start = end, else start=end=end+1; + // anchor = tip = start + // SHIFT: tip++ + // CTL: if start!=end, start = end = nextbound(end-1), + // else start=end=nextbound(end) + // anchor = tip = start + // CTL/SHIFT: tip = nextbound(tip) + + public Selection nextBound(BreakIterator breaker, + int direction, boolean extend) { + if (!extend && anchor != caret) caret -= direction; + caret = next(caret, breaker, direction, true); + if (!extend) anchor = caret; + clickAfter = false; + return this; + } + + // expand start and end to word breaks--if they are not already on one + public void expand(BreakIterator breaker) { + if (anchor <= caret) { + anchor = next(anchor,breaker,-1,false); + caret = next(caret,breaker,1,false); + /* + try { + breaker.following(anchor); + anchor = breaker.previous(); + } catch (Exception e) {} + try { + caret = breaker.following(caret-1); + } catch (Exception e) {} + */ + } else { + anchor = next(anchor,breaker,1,false); + caret = next(caret,breaker,-1,false); + /* + try { + breaker.following(caret); + caret = breaker.previous(); + } catch (Exception e) {} + try { + anchor = breaker.following(anchor-1); + } catch (Exception e) {} + */ + } + } + + // different = false - move to next boundary, unless on one + // true - move to next boundary, even if on one + public static int next(int position, BreakIterator breaker, + int 
direction, boolean different) { + if (!different) position -= direction; + try { + if (direction > 0) { + position = breaker.following(position); + } else { + breaker.following(position-1); + position = breaker.previous(); + } + } catch (Exception e) {} + return position; + } +} + diff --git a/icu4j/src/com/ibm/text/components/TransliteratingTextComponent.java b/icu4j/src/com/ibm/text/components/TransliteratingTextComponent.java new file mode 100755 index 00000000000..02bcd5996a5 --- /dev/null +++ b/icu4j/src/com/ibm/text/components/TransliteratingTextComponent.java @@ -0,0 +1,191 @@ +package com.ibm.text.components; + +import java.awt.*; +import java.awt.event.*; +import java.text.*; +import java.awt.datatransfer.*; +import com.ibm.text.*; + +/** + * A subclass of {@link DumbTextComponent} that passes key events through + * a {@link com.ibm.text.Transliterator}. + * + *

Copyright © IBM Corporation 1999. All rights reserved. + * + * @author Alan Liu + * @version $RCSfile: TransliteratingTextComponent.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $ + */ +public class TransliteratingTextComponent extends DumbTextComponent { + + private static boolean DEBUG = false; + + private Transliterator translit = null; + + // Index into getText() where the start of transliteration is. + // As we commit text during keyboardTransliteration, we advance + // this. + private int start = 0; + + // Index into getText() where the cursor is; cursor >= start + private int cursor = 0; + + private static final String COPYRIGHT = + "\u00A9 IBM Corporation 1999. All rights reserved."; + + /** + * Constructor. + */ + public TransliteratingTextComponent() { + super(); + addActionListener(new ActionListener() { + public void actionPerformed(ActionEvent e) { + // We get an ActionEvent only when the selection changes + resetTransliterationStart(); + } + }); + } + + /** + * {@link DumbTextComponent} API. Framework method that is called + * when a KeyEvent is received. This implementation + * runs the new character through the current + * Transliterator, if one is set, and inserts the + * transliterated text into the buffer. + */ + protected void handleKeyTyped(KeyEvent e) { + char ch = e.getKeyChar(); + + if (translit == null) { + super.handleKeyTyped(e); + return; + } + + // ------------------------------------------------------------ + // The following case motivates the two lines that recompute + // start and cursor below. 
+ + // " " + // a b c q r|s t u m m + // 0 1 2 3 4 5 6 7 8 9 + // 0 1 2 + + // start 3, cursor 5, sel 6 -> { 0, 3, 2 } + // : new int[] { 0, sel - start, cursor - start }; + + // sz>99|9 + + // " { " + // a b c q r 9 9|9 t u m m + // 0 1 2 3 4 5 6 7 8 9 a b + // 0 1 2 3 4 + + // { 3, 5, 4 } -> start 6, cursor 7, sel 8 + // : start += index[0]; + // : cursor = start + index[2] - index[0]; + // ------------------------------------------------------------ + + // Need to save start because calls to replaceRange will update + // start and cursor. + int saveStart = start; + + ReplaceableString buf = new ReplaceableString(); + buf.getStringBuffer().append(getText().substring(start, + getSelectionStart())); + + int[] index = new int[] { 0, getSelectionStart() - start, + cursor - start}; + + StringBuffer log = null; + if (DEBUG) { + log = new StringBuffer(); + log.append("start " + start + ", cursor " + cursor); + log.append(", sel " + getSelectionStart()); + log.append(", {" + index[0] + ", " + index[1] + ", " + index[2] + "}, "); + log.append('"' + buf.toString() + "\" + '" + ch + "' -> \""); + } + + translit.keyboardTransliterate(buf, index, ch); + replaceRange(buf.toString(), start, getSelectionEnd()); + // At this point start has been changed by the callback to + // resetTransliteratorStart() via replaceRange() -- so use our + // local copy, saveStart. + + // The START index is zero-based. On entry to keyboardTransliterate(), + // it was zero. We can therefore just add it to our original + // getText()-based index value of start (in saveStart) to get + // the new getText()-based start. + start = saveStart + index[Transliterator.START]; + + // Make the cursor getText()-based. The CURSOR index is zero-based. + cursor = start + index[Transliterator.CURSOR] + - index[Transliterator.START]; + + if (DEBUG) { + String out = buf.toString(); + log.append(out.substring(0, index[Transliterator.START])). + append('{'). 
+ append(out.substring(index[Transliterator.START], + index[Transliterator.CURSOR])). + append('|'). + append(out.substring(index[Transliterator.CURSOR])). + append('"'); + log.append(", {" + index[0] + ", " + index[1] + ", " + index[2] + "}, "); + log.append("start " + start + ", cursor " + cursor); + log.append(", sel " + getSelectionStart()); + System.out.println(escape(log.toString())); + } + } + + /** + * Set the {@link com.ibm.text.Transliterator} and direction to + * use to process incoming KeyEvents. + * @param t the {@link com.ibm.text.Transliterator} to use + */ + public void setTransliterator(Transliterator t) { + if (translit != t) { // [sic] pointer compare ok; singletons + resetTransliterationStart(); + } + translit = t; + } + + /** + * Reset the start point at which transliteration begins. This + * needs to be done when the user moves the cursor or when the + * current {@link com.ibm.text.Transliterator} is changed. + */ + private void resetTransliterationStart() { + start = getSelectionStart(); + cursor = start; + } + + /** + * Escape non-ASCII characters as Unicode. + * JUST FOR DEBUGGING OUTPUT. 
+ */ + public static final String escape(String s) { + StringBuffer buf = new StringBuffer(); + for (int i=0; i= ' ' && c <= 0x007F) { + if (c == '\\') { + buf.append("\\\\"); // That is, "\\" + } else { + buf.append(c); + } + } else { + buf.append("\\u"); + if (c < 0x1000) { + buf.append('0'); + if (c < 0x100) { + buf.append('0'); + if (c < 0x10) { + buf.append('0'); + } + } + } + buf.append(Integer.toHexString(c)); + } + } + return buf.toString(); + } +} diff --git a/icu4j/src/com/ibm/text/resources/TransliterationRule$KeyboardEscape$Latin1.java b/icu4j/src/com/ibm/text/resources/TransliterationRule$KeyboardEscape$Latin1.java new file mode 100755 index 00000000000..fa9a89b2d60 --- /dev/null +++ b/icu4j/src/com/ibm/text/resources/TransliterationRule$KeyboardEscape$Latin1.java @@ -0,0 +1,132 @@ +package com.ibm.text.resources; + +import java.util.ListResourceBundle; + +public class TransliterationRuleKeyboardEscapeLatin1 extends ListResourceBundle { + /** + * Overrides ListResourceBundle + */ + public Object[][] getContents() { + return new Object[][] { + { "Description", + "Keyboard transliterator for Latin-1 block" }, + + { "Rule", + "esc=''\n" + + "grave=`\n" + + "acute=''\n" + + "hat=^\n" + + "tilde=~\n" + + "umlaut=:\n" + + "ring=.\n" + + "cedilla=,\n" + + "slash=/\n" + + "super=^\n" + + // Make keyboard entry of {esc} possible + // and of backslash + + "'\\'{esc}>{esc}\n" + + "'\\\\'>'\\'\n" + + // Long keys + + "cur{esc}>\u00A4\n" + + "sec{esc}>\u00A7\n" + + "not{esc}>\u00AC\n" + + "mul{esc}>\u00D7\n" + + "div{esc}>\u00F7\n" + + + " {esc}>\u00A0\n" // non-breaking space + + "!{esc}>\u00A1\n" // inverted exclamation + + "c/{esc}>\u00A2\n" // cent sign + + "lb{esc}>\u00A3\n" // pound sign + + "'|'{esc}>\u00A6\n" // broken vertical bar + + ":{esc}>\u00A8\n" // umlaut + + "{super}a{esc}>\u00AA\n" // feminine ordinal + + "'<<'{esc}>\u00AB\n" + + "r{esc}>\u00AE\n" + + "--{esc}>\u00AF\n" + + "-{esc}>\u00AD\n" + + "+-{esc}>\u00B1\n" + + "{super}2{esc}>\u00B2\n" + + 
"{super}3{esc}>\u00B3\n" + + "{acute}{esc}>\u00B4\n" + + "m{esc}>\u00B5\n" + + "para{esc}>\u00B6\n" + + "dot{esc}>\u00B7\n" + + "{cedilla}{esc}>\u00B8\n" + + "{super}1{esc}>\u00B9\n" + + "{super}o{esc}>\u00BA\n" // masculine ordinal + + "'>>'{esc}>\u00BB\n" + + "1/4{esc}>\u00BC\n" + + "1/2{esc}>\u00BD\n" + + "3/4{esc}>\u00BE\n" + + "?{esc}>\u00BF\n" + + "A{grave}{esc}>\u00C0\n" + + "A{acute}{esc}>\u00C1\n" + + "A{hat}{esc}>\u00C2\n" + + "A{tilde}{esc}>\u00C3\n" + + "A{umlaut}{esc}>\u00C4\n" + + "A{ring}{esc}>\u00C5\n" + + "AE{esc}>\u00C6\n" + + "C{cedilla}{esc}>\u00C7\n" + + "E{grave}{esc}>\u00C8\n" + + "E{acute}{esc}>\u00C9\n" + + "E{hat}{esc}>\u00CA\n" + + "E{umlaut}{esc}>\u00CB\n" + + "I{grave}{esc}>\u00CC\n" + + "I{acute}{esc}>\u00CD\n" + + "I{hat}{esc}>\u00CE\n" + + "I{umlaut}{esc}>\u00CF\n" + + "D-{esc}>\u00D0\n" + + "N{tilde}{esc}>\u00D1\n" + + "O{grave}{esc}>\u00D2\n" + + "O{acute}{esc}>\u00D3\n" + + "O{hat}{esc}>\u00D4\n" + + "O{tilde}{esc}>\u00D5\n" + + "O{umlaut}{esc}>\u00D6\n" + + "O{slash}{esc}>\u00D8\n" + + "U{grave}{esc}>\u00D9\n" + + "U{acute}{esc}>\u00DA\n" + + "U{hat}{esc}>\u00DB\n" + + "U{umlaut}{esc}>\u00DC\n" + + "Y{acute}{esc}>\u00DD\n" + + "TH{esc}>\u00DE\n" + + "ss{esc}>\u00DF\n" + + "a{grave}{esc}>\u00E0\n" + + "a{acute}{esc}>\u00E1\n" + + "a{hat}{esc}>\u00E2\n" + + "a{tilde}{esc}>\u00E3\n" + + "a{umlaut}{esc}>\u00E4\n" + + "a{ring}{esc}>\u00E5\n" + + "ae{esc}>\u00E6\n" + + "c{cedilla}{esc}>\u00E7\n" + + "c{esc}>\u00A9\n" // copyright - after c{cedilla} + + "e{grave}{esc}>\u00E8\n" + + "e{acute}{esc}>\u00E9\n" + + "e{hat}{esc}>\u00EA\n" + + "e{umlaut}{esc}>\u00EB\n" + + "i{grave}{esc}>\u00EC\n" + + "i{acute}{esc}>\u00ED\n" + + "i{hat}{esc}>\u00EE\n" + + "i{umlaut}{esc}>\u00EF\n" + + "d-{esc}>\u00F0\n" + + "n{tilde}{esc}>\u00F1\n" + + "o{grave}{esc}>\u00F2\n" + + "o{acute}{esc}>\u00F3\n" + + "o{hat}{esc}>\u00F4\n" + + "o{tilde}{esc}>\u00F5\n" + + "o{umlaut}{esc}>\u00F6\n" + + "o{slash}{esc}>\u00F8\n" + + "o{esc}>\u00B0\n" + + 
"u{grave}{esc}>\u00F9\n" + + "u{acute}{esc}>\u00FA\n" + + "u{hat}{esc}>\u00FB\n" + + "u{umlaut}{esc}>\u00FC\n" + + "y{acute}{esc}>\u00FD\n" + + "y{esc}>\u00A5\n" // yen sign + + "th{esc}>\u00FE\n" + + "ss{esc}>\u00FF\n" + } + }; + } +} diff --git a/icu4j/src/com/ibm/text/resources/TransliterationRule$Latin$Arabic.java b/icu4j/src/com/ibm/text/resources/TransliterationRule$Latin$Arabic.java new file mode 100755 index 00000000000..bb96443d051 --- /dev/null +++ b/icu4j/src/com/ibm/text/resources/TransliterationRule$Latin$Arabic.java @@ -0,0 +1,243 @@ +package com.ibm.text.resources; + +import java.util.ListResourceBundle; + +public class TransliterationRuleLatinArabic extends ListResourceBundle { + /** + * Overrides ListResourceBundle + */ + public Object[][] getContents() { + return new Object[][] { + { "HasInverse", "1" }, + + { "Rule", + // To Do: finish adding shadda, add sokoon + + "alefmadda=\u0622\n"+ + "alefuhamza=\u0623\n"+ + "wauuhamza=\u0624\n"+ + "alefhamza=\u0625\n"+ + "yehuhamza=\u0626\n"+ + "alef=\u0627\n"+ + "beh=\u0628\n"+ + "tehmarbuta=\u0629\n"+ + "teh=\u062A\n"+ + "theh=\u062B\n"+ + "geem=\u062C\n"+ + "hah=\u062D\n"+ + "kha=\u062E\n"+ + "dal=\u062F\n"+ + "dhal=\u0630\n"+ + "reh=\u0631\n"+ + "zain=\u0632\n"+ + "seen=\u0633\n"+ + "sheen=\u0634\n"+ + "sad=\u0635\n"+ + "dad=\u0636\n"+ + "tah=\u0637\n"+ + "zah=\u0638\n"+ + "ein=\u0639\n"+ + "ghein=\u063A\n"+ + "feh=\u0641\n"+ + "qaaf=\u0642\n"+ + "kaf=\u0643\n"+ + "lam=\u0644\n"+ + "meem=\u0645\n"+ + "noon=\u0646\n"+ + "heh=\u0647\n"+ + "wau=\u0648\n"+ + "yehmaqsura=\u0649\n"+ + "yeh=\u064A\n"+ + "peh=\u06A4\n"+ + + "hamza=\u0621\n"+ + "fathatein=\u064B\n"+ + "dammatein=\u064C\n"+ + "kasratein=\u064D\n"+ + "fatha=\u064E\n"+ + "damma=\u064F\n"+ + "kasra=\u0650\n"+ + "shadda=\u0651\n"+ + "sokoon=\u0652\n"+ + + // convert English to Arabic + "Arabic>"+ + "\u062a\u062a\u0645\u062a\u0639\u0020"+ + "\u0627\u0644\u0644\u063a\u0629\u0020"+ + "\u0627\u0644\u0639\u0631\u0628\u0628\u064a\u0629\u0020"+ + 
"\u0628\u0628\u0646\u0638\u0645\u0020"+ + "\u0643\u062a\u0627\u0628\u0628\u064a\u0629\u0020"+ + "\u062c\u0645\u064a\u0644\u0629\n"+ + + "ai>{alefmadda}\n"+ + "ae>{alefuhamza}\n"+ + "ao>{alefhamza}\n"+ + "aa>{alef}\n"+ + "an>{fathatein}\n"+ + "a>{fatha}\n"+ + "b>{beh}\n"+ + "c>{kaf}\n"+ + "{dhal}]dh>{shadda}\n"+ + "dh>{dhal}\n"+ + "{dad}]dd>{shadda}\n"+ + "dd>{dad}\n"+ + "{dal}]d>{shadda}\n"+ + "d>{dal}\n"+ + "e>{ein}\n"+ + "f>{feh}\n"+ + "gh>{ghein}\n"+ + "g>{geem}\n"+ + "hh>{hah}\n"+ + "h>{heh}\n"+ + "ii>{kasratein}\n"+ + "i>{kasra}\n"+ + "j>{geem}\n"+ + "kh>{kha}\n"+ + "k>{kaf}\n"+ + "l>{lam}\n"+ + "m>{meem}\n"+ + "n>{noon}\n"+ + "o>{hamza}\n"+ + "p>{peh}\n"+ + "q>{qaaf}\n"+ + "r>{reh}\n"+ + "sh>{sheen}\n"+ + "ss>{sad}\n"+ + "s>{seen}\n"+ + "th>{theh}\n"+ + "tm>{tehmarbuta}\n"+ + "tt>{tah}\n"+ + "t>{teh}\n"+ + "uu>{dammatein}\n"+ + "u>{damma}\n"+ + "v>{beh}\n"+ + "we>{wauuhamza}\n"+ + "w>{wau}\n"+ + "x>{kaf}{shadda}{seen}\n"+ + "ye>{yehuhamza}\n"+ + "ym>{yehmaqsura}\n"+ + "y>{yeh}\n"+ + "zz>{zah}\n"+ + "z>{zain}\n"+ + + "0>\u0660\n"+ // Arabic digit 0 + "1>\u0661\n"+ // Arabic digit 1 + "2>\u0662\n"+ // Arabic digit 2 + "3>\u0663\n"+ // Arabic digit 3 + "4>\u0664\n"+ // Arabic digit 4 + "5>\u0665\n"+ // Arabic digit 5 + "6>\u0666\n"+ // Arabic digit 6 + "7>\u0667\n"+ // Arabic digit 7 + "8>\u0668\n"+ // Arabic digit 8 + "9>\u0669\n"+ // Arabic digit 9 + "%>\u066A\n"+ // Arabic % + ".>\u066B\n"+ // Arabic decimal separator + ",>\u066C\n"+ // Arabic thousands separator + "*>\u066D\n"+ // Arabic five-pointed star + + "`0>0\n"+ // Escaped forms of the above + "`1>1\n"+ + "`2>2\n"+ + "`3>3\n"+ + "`4>4\n"+ + "`5>5\n"+ + "`6>6\n"+ + "`7>7\n"+ + "`8>8\n"+ + "`9>9\n"+ + "`%>%\n"+ + "`.>.\n"+ + "`,>,\n"+ + "`*>*\n"+ + "``>`\n"+ + + "''>\n"+ + + // now Arabic to English + + 
"''ai\u041f\u0420\u0410\u0412\u0414\u0410\u00D1\u0020\u0411\u044d\u043b\u0430\u0440\u0443\u0441\u043a\u0430\u044f\u002c\u0020\u043a\u044b\u0440\u0433\u044b\u0437\u002c\u0020\u041c\u043e\u043b\u0434\u043e\u0432\u044d\u043d\u044f\u0441\u043a\u044d\u002e\n" + + + //special equivs for ay, oy, ... + "Y{a}{i}>{cyYa}{cyY}\n" + + "Y{e}{i}>{cyYe}{cyY}\n" + + "Y{i}{i}>{cyYi}{cyY}\n" + + "Y{o}{i}>{cyYo}{cyY}\n" + + "Y{u}{i}>{cyYu}{cyY}\n" + + "A{i}>{cyA}{cyY}\n" + + "E{i}>{cyE}{cyY}\n" + + //skip II, since it is the soft sign + "O{i}>{cyO}{cyY}\n" + + "U{i}>{cyU}{cyY}\n" + + + "A>{cyA}\n" + + "B>{cyBe}\n" + + "C{h}>{cyChe}\n" + + "C[{iey}>{cySe}\n" + + "C>{cyKe}\n" + + "D>{cyDe}\n" + + "E>{cyE}\n" + + "F>{cyFe}\n" + + "G>{cyGe}\n" + + "H>{cyHard}\n" + + "I{i}>{cySoft}\n" + + "I>{cyI}\n" + + "J>{cyDe}{cyZhe}\n" + + "K{h}>{cyKhe}\n" + + "K>{cyKe}\n" + + "L>{cyLe}\n" + + "M>{cyMe}\n" + + "N>{cyNe}\n" + + "O>{cyO}\n" + + "P>{cyPe}\n" + + "Q{u}>{cyKe}{cyVe}\n" + + "R>{cyRe}\n" + + "S{h}{t}{c}{h}>{cyShche}\n" + + "S{h}{c}{h}>{cyShche}\n" + + "S{h}>{cyShe}\n" + + "S>{cySe}\n" + + "T{c}{h}>{cyChe}\n" + + "T{h}>{cyZe}\n" + + "T{s}>{cyTse}\n" + + "T>{cyTe}\n" + + "U>{cyU}\n" + + "V>{cyVe}\n" + + "W{h}>{cyVe}\n" + + "W>{cyVe}\n" + + "X>{cyKe}{cySe}\n" + + "Y{e}>{cyYe}\n" + + "Y{o}>{cyYo}\n" + + "Y{u}>{cyYu}\n" + + "Y{a}>{cyYa}\n" + + "Y{i}>{cyYi}\n" + + "Y>{cyY}\n" + + "Z{h}>{cyZhe}\n" + + "Z>{cyZe}\n" + + "X>{cyKe}{cySe}\n" + + + //lower case: doesn''t solve join bug + "y{a}{i}>{cyya}{cyy}\n" + + "y{e}{i}>{cyye}{cyy}\n" + + "y{i}{i}>{cyyi}{cyy}\n" + + "y{o}{i}>{cyyo}{cyy}\n" + + "y{u}{i}>{cyyu}{cyy}\n" + + "a{i}>{cya}{cyy}\n" + + "e{i}>{cye}{cyy}\n" + + //skip ii, since it is the soft sign + "o{i}>{cyo}{cyy}\n" + + "u{i}>{cyu}{cyy}\n" + + + "a>{cya}\n" + + "b>{cybe}\n" + + "c{h}>{cyche}\n" + + "c[{iey}>{cyse}\n" + + "c>{cyke}\n" + + "d>{cyde}\n" + + "e>{cye}\n" + + "f>{cyfe}\n" + + "g>{cyge}\n" + + "h>{cyhard}\n" + + "i{i}>{cysoft}\n" + + "i>{cyi}\n" + + "j>{cyde}{cyzhe}\n" + + 
"k{h}>{cykhe}\n" + + "k>{cyke}\n" + + "l>{cyle}\n" + + "m>{cyme}\n" + + "n>{cyne}\n" + + "o>{cyo}\n" + + "p>{cype}\n" + + "q{u}>{cyke}{cyve}\n" + + "r>{cyre}\n" + + "s{h}{t}{c}{h}>{cyshche}\n" + + "s{h}{c}{h}>{cyshche}\n" + + "s{h}>{cyshe}\n" + + "s>{cyse}\n" + + "t{c}{h}>{cyche}\n" + + "t{h}>{cyze}\n" + + "t{s}>{cytse}\n" + + "t>{cyte}\n" + + "u>{cyu}\n" + + "v>{cyve}\n" + + "w{h}>{cyve}\n" + + "w>{cyve}\n" + + "x>{cyke}{cyse}\n" + + "y{e}>{cyye}\n" + + "y{o}>{cyyo}\n" + + "y{u}>{cyyu}\n" + + "y{a}>{cyya}\n" + + "y{i}>{cyyi}\n" + + "y>{cyy}\n" + + "z{h}>{cyzhe}\n" + + "z>{cyze}\n" + + "x>{cyke}{cyse}\n" + + + //generally the last rule + "''>\n" + + + //now Russian to English + + "Y''<{cyY}[{cyA}\n" + + "Y''<{cyY}[{cyE}\n" + + "Y''<{cyY}[{cyI}\n" + + "Y''<{cyY}[{cyO}\n" + + "Y''<{cyY}[{cyU}\n" + + "Y''<{cyY}[{cya}\n" + + "Y''<{cyY}[{cye}\n" + + "Y''<{cyY}[{cyi}\n" + + "Y''<{cyY}[{cyo}\n" + + "Y''<{cyY}[{cyu}\n" + + "A<{cyA}\n" + + "B<{cyBe}\n" + + "J<{cyDe}{cyZhe}\n" + + "J<{cyDe}{cyzhe}\n" + + "D<{cyDe}\n" + + "V<{cyVe}\n" + + "G<{cyGe}\n" + + "Zh<{cyZhe}[{lower}\n" + + "ZH<{cyZhe}\n" + + "Z''<{cyZe}[{cyHard}\n" + + "Z''<{cyZe}[{cyhard}\n" + + "Z<{cyZe}\n" + + "Ye<{cyYe}[{lower}\n" + + "YE<{cyYe}\n" + + "Yo<{cyYo}[{lower}\n" + + "YO<{cyYo}\n" + + "Yu<{cyYu}[{lower}\n" + + "YU<{cyYu}\n" + + "Ya<{cyYa}[{lower}\n" + + "YA<{cyYa}\n" + + "Yi<{cyYi}[{lower}\n" + + "YI<{cyYi}\n" + + "Y<{cyY}\n" + + "Kh<{cyKhe}[{lower}\n" + + "KH<{cyKhe}\n" + + "K''<{cyKe}[{cyHard}\n" + + "K''<{cyKe}[{cyhard}\n" + + "X<{cyKe}{cySe}\n" + + "X<{cyKe}{cyse}\n" + + "K<{cyKe}\n" + + "L<{cyLe}\n" + + "M<{cyMe}\n" + + "N<{cyNe}\n" + + "O<{cyO}\n" + + "P<{cyPe}\n" + + + "R<{cyRe}\n" + + "Shch<{cyShche}[{lower}\n" + + "SHCH<{cyShche}\n" + + "Sh''<{cyShe}[{cyche}\n" + + "SH''<{cyShe}[{cyChe}\n" + + "Sh<{cyShe}[{lower}\n" + + "SH<{cyShe}\n" + + "S''<{cySe}[{cyHard}\n" + + "S''<{cySe}[{cyhard}\n" + + "S<{cySe}\n" + + "Ts<{cyTse}[{lower}\n" + + "TS<{cyTse}\n" + + "T''<{cyTe}[{cySe}\n" + + 
"T''<{cyTe}[{cyse}\n" + + "T''<{cyTe}[{cyHard}\n" + + "T''<{cyTe}[{cyhard}\n" + + "T<{cyTe}\n" + + "U<{cyU}\n" + + "F<{cyFe}\n" + + "Ch<{cyChe}[{lower}\n" + + "CH<{cyChe}\n" + + "H<{cyHard}\n" + + "I''<{cyI}[{cyI}\n" + + "I''<{cyI}[{cyi}\n" + + "I<{cyI}\n" + + "Ii<{cySoft}[{lower}\n" + + "II<{cySoft}\n" + + "E<{cyE}\n" + + + //lowercase + "y''<{cyy}[{cya}\n" + + "y''<{cyy}[{cye}\n" + + "y''<{cyy}[{cyi}\n" + + "y''<{cyy}[{cyo}\n" + + "y''<{cyy}[{cyu}\n" + + "y''<{cyy}[{cyA}\n" + + "y''<{cyy}[{cyE}\n" + + "y''<{cyy}[{cyI}\n" + + "y''<{cyy}[{cyO}\n" + + "y''<{cyy}[{cyU}\n" + + "a<{cya}\n" + + "b<{cybe}\n" + + "j<{cyde}{cyzhe}\n" + + "j<{cyde}{cyZhe}\n" + + "d<{cyde}\n" + + "v<{cyve}\n" + + "g<{cyge}\n" + + "zh<{cyzhe}\n" + + "z''<{cyze}[{cyhard}\n" + + "z''<{cyze}[{cyHard}\n" + + "z<{cyze}\n" + + "ye<{cyye}\n" + + "yo<{cyyo}\n" + + "yu<{cyyu}\n" + + "ya<{cyya}\n" + + "yi<{cyyi}\n" + + "y<{cyy}\n" + + "kh<{cykhe}\n" + + "k''<{cyke}[{cyhard}\n" + + "k''<{cyke}[{cyHard}\n" + + "x<{cyke}{cyse}\n" + + "x<{cyke}{cySe}\n" + + "k<{cyke}\n" + + "l<{cyle}\n" + + "m<{cyme}\n" + + "n<{cyne}\n" + + "o<{cyo}\n" + + "p<{cype}\n" + + + "r<{cyre}\n" + + "shch<{cyshche}\n" + + "sh''<{cyshe}[{cyche}\n" + + "sh''<{cyshe}[{cyChe}\n" + + "sh<{cyshe}\n" + + "s''<{cyse}[{cyhard}\n" + + "s''<{cyse}[{cyHard}\n" + + "s<{cyse}\n" + + "ts<{cytse}\n" + + "t''<{cyte}[{cyse}\n" + + "t''<{cyte}[{cySe}\n" + + "t''<{cyte}[{cyhard}\n" + + "t''<{cyte}[{cyHard}\n" + + "t<{cyte}\n" + + "u<{cyu}\n" + + "f<{cyfe}\n" + + "ch<{cyche}\n" + + "h<{cyhard}\n" + + "i''<{cyi}[{cyI}\n" + + "i''<{cyi}[{cyi}\n" + + "i<{cyi}\n" + + "ii<{cysoft}\n" + + "e<{cye}\n" + + + //generally the last rule + "''>\n" + //the end + } + }; + } +} diff --git a/icu4j/src/com/ibm/text/resources/TransliterationRule$Latin$Devanagari.java b/icu4j/src/com/ibm/text/resources/TransliterationRule$Latin$Devanagari.java new file mode 100755 index 00000000000..d359adde14a --- /dev/null +++ 
b/icu4j/src/com/ibm/text/resources/TransliterationRule$Latin$Devanagari.java @@ -0,0 +1,412 @@ +package com.ibm.text.resources; + +import java.util.ListResourceBundle; + +public class TransliterationRuleLatinDevanagari extends ListResourceBundle { + /** + * Overrides ListResourceBundle + */ + public Object[][] getContents() { + return new Object[][] { + { "Description", + "Latin to Devanagari" }, + + { "Rule", + //##################################################################### + // Keyboard Transliteration Table + //##################################################################### + // Conversions should be: + // 1. complete + // * convert every sequence of Latin letters (a to z plus apostrophe) + // to a sequence of Native letters + // * convert every sequence of Native letters to Latin letters + // 2. reversable + // * any string of Native converted to Latin and back should be the same + // * this is not true for English converted to Native & back, e.g.: + // k -> {kaf} -> k + // c -> {kaf} -> k + //##################################################################### + // Sequences of Latin letters may convert to a single Native letter. + // When this is the case, an apostrophe can be used to indicate separate + // letters.$ + // E.g. 
sh -> {shin} + // s'h -> {sin}{heh} + // ss -> {sad} + // s's -> {sin}{shadda} + //##################################################################### + // To Do: + // finish adding shadda, add sokoon, fix uppercase + // make two transliteration tables: one with vowels, one without + //##################################################################### + // Modifications + // Devanagari Transliterator: broken up with consonsants/vowels + //##################################################################### + // Unicode character name definitions + //##################################################################### + + //consonants + "candrabindu=\u0901\n" + + "bindu=\u0902\n" + + "visarga=\u0903\n" + + // w represents the stand-alone form + + "wa=\u0905\n" + + "waa=\u0906\n" + + "wi=\u0907\n" + + "wii=\u0908\n" + + "wu=\u0909\n" + + "wuu=\u090A\n" + + "wr=\u090B\n" + + "wl=\u090C\n" + + "we=\u090F\n" + + "wai=\u0910\n" + + "wo=\u0913\n" + + "wau=\u0914\n" + + + "ka=\u0915\n" + + "kha=\u0916\n" + + "ga=\u0917\n" + + "gha=\u0918\n" + + "nga=\u0919\n" + + + "ca=\u091A\n" + + "cha=\u091B\n" + + "ja=\u091C\n" + + "jha=\u091D\n" + + "nya=\u091E\n" + + + "tta=\u091F\n" + + "ttha=\u0920\n" + + "dda=\u0921\n" + + "ddha=\u0922\n" + + "nna=\u0923\n" + + + "ta=\u0924\n" + + "tha=\u0925\n" + + "da=\u0926\n" + + "dha=\u0927\n" + + "na=\u0928\n" + + + "pa=\u092A\n" + + "pha=\u092B\n" + + "ba=\u092C\n" + + "bha=\u092D\n" + + "ma=\u092E\n" + + + "ya=\u092F\n" + + "ra=\u0930\n" + + "rra=\u0931\n" + + "la=\u0933\n" + + "va=\u0935\n" + + + "sha=\u0936\n" + + "ssa=\u0937\n" + + "sa=\u0938\n" + + "ha=\u0939\n" + + // represents the dependent form + + "aa=\u093E\n" + + "i=\u093F\n" + + "ii=\u0940\n" + + "u=\u0941\n" + + "uu=\u0942\n" + + "rh=\u0943\n" + + "lh=\u0944\n" + + "e=\u0947\n" + + "ai=\u0948\n" + + "o=\u094B\n" + + "au=\u094C\n" + + + "virama=\u094D\n" + + + "wrr=\u0960\n" + + "rrh=\u0962\n" + + + "danda=\u0964\n" + + "doubleDanda=\u0965\n" + + 
"depVowelAbove=[\u093E-\u0940\u0945-\u094C]\n" + + "depVowelBelow=[\u0941-\u0944]\n" + + "endThing=[{danda}{doubleDanda}\u0000-\u08FF\u0980-\uFFFF]\n" + + + "&=[{virama}{aa}{ai}{au}{ii}{i}{uu}{u}{rrh}{rh}{lh}{e}{o}]\n" + + "%=[bcdfghjklmnpqrstvwxyz]\n" + + //##################################################################### + // convert from Latin letters to Native letters + //##################################################################### + //Hindi>\u092d\u093e\u0930\u0924--\u0020\u0926\u0947\u0936\u0020\u092c\u0928\u094d\u0927\u0941\u002e + + // special forms with no good conversion + + + "mm>{bindu}\n" + + "x>{visarga}\n" + + // convert to independent forms at start of word or syllable: + // e.g. keai -> {ka}{e}{wai}; k'ai -> {ka}{wai}; (ai) -> ({wai}) + // Moved up [LIU] + + + "aa>{waa}\n" + + "ai>{wai}\n" + + "au>{wau}\n" + + "ii>{wii}\n" + + "i>{wi}\n" + + "uu>{wuu}\n" + + "u>{wu}\n" + + "rrh>{wrr}\n" + + "rh>{wr}\n" + + "lh>{wl}\n" + + "e>{we}\n" + + "o>{wo}\n" + + "a>{wa}\n" + + // normal consonants + + + "kh>{kha}|{virama}\n" + + "k>{ka}|{virama}\n" + + "q>{ka}|{virama}\n" + + "gh>{gha}|{virama}\n" + + "g>{ga}|{virama}\n" + + "ng>{nga}|{virama}\n" + + "ch>{cha}|{virama}\n" + + "c>{ca}|{virama}\n" + + "jh>{jha}|{virama}\n" + + "j>{ja}|{virama}\n" + + "ny>{nya}|{virama}\n" + + "tth>{ttha}|{virama}\n" + + "tt>{tta}|{virama}\n" + + "ddh>{ddha}|{virama}\n" + + "dd>{dda}|{virama}\n" + + "nn>{nna}|{virama}\n" + + "th>{tha}|{virama}\n" + + "t>{ta}|{virama}\n" + + "dh>{dha}|{virama}\n" + + "d>{da}|{virama}\n" + + "n>{na}|{virama}\n" + + "ph>{pha}|{virama}\n" + + "p>{pa}|{virama}\n" + + "bh>{bha}|{virama}\n" + + "b>{ba}|{virama}\n" + + "m>{ma}|{virama}\n" + + "y>{ya}|{virama}\n" + + "r>{ra}|{virama}\n" + + "l>{la}|{virama}\n" + + "v>{va}|{virama}\n" + + "f>{va}|{virama}\n" + + "w>{va}|{virama}\n" + + "sh>{sha}|{virama}\n" + + "ss>{ssa}|{virama}\n" + + "s>{sa}|{virama}\n" + + "z>{sa}|{virama}\n" + + "h>{ha}|{virama}\n" + + + ".>{danda}\n" + + 
"{danda}.>{doubleDanda}\n" + + "{depVowelAbove}]~>{bindu}\n" + + "{depVowelBelow}]~>{candrabindu}\n" + + // convert to dependent forms after consonant with no vowel: + // e.g. kai -> {ka}{virama}ai -> {ka}{ai} + + + "{virama}aa>{aa}\n" + + "{virama}ai>{ai}\n" + + "{virama}au>{au}\n" + + "{virama}ii>{ii}\n" + + "{virama}i>{i}\n" + + "{virama}uu>{uu}\n" + + "{virama}u>{u}\n" + + "{virama}rrh>{rrh}\n" + + "{virama}rh>{rh}\n" + + "{virama}lh>{lh}\n" + + "{virama}e>{e}\n" + + "{virama}o>{o}\n" + + "{virama}a>\n" + + // otherwise convert independent forms when separated by ': k'ai -> {ka}{virama}{wai} + + + "{virama}''aa>{waa}\n" + + "{virama}''ai>{wai}\n" + + "{virama}''au>{wau}\n" + + "{virama}''ii>{wii}\n" + + "{virama}''i>{wi}\n" + + "{virama}''uu>{wuu}\n" + + "{virama}''u>{wu}\n" + + "{virama}''rrh>{wrr}\n" + + "{virama}''rh>{wr}\n" + + "{virama}''lh>{wl}\n" + + "{virama}''e>{we}\n" + + "{virama}''o>{wo}\n" + + "{virama}''a>{wa}\n" + + + "{virama}[{endThing}>\n" + + // convert any left-over apostrophes used for separation + + + "''>\n" + + //##################################################################### + // convert from Native letters to Latin letters + //##################################################################### + + // special forms with no good conversion + + + "mm<{bindu}\n" + + "x<{visarga}\n" + + // normal consonants + + + "kh<{kha}[&\n" + + "kha<{kha}\n" + + "k''<{ka}{virama}[{ha}\n" + + "k<{ka}[&\n" + + "ka<{ka}\n" + + "gh<{gha}[&\n" + + "gha<{gha}\n" + + "g''<{ga}{virama}[{ha}\n" + + "g<{ga}[&\n" + + "ga<{ga}\n" + + "ng<{nga}[&\n" + + "nga<{nga}\n" + + "ch<{cha}[&\n" + + "cha<{cha}\n" + + "c''<{ca}{virama}[{ha}\n" + + "c<{ca}[&\n" + + "ca<{ca}\n" + + "jh<{jha}[&\n" + + "jha<{jha}\n" + + "j''<{ja}{virama}[{ha}\n" + + "j<{ja}[&\n" + + "ja<{ja}\n" + + "ny<{nya}[&\n" + + "nya<{nya}\n" + + "tth<{ttha}[&\n" + + "ttha<{ttha}\n" + + "tt''<{tta}{virama}[{ha}\n" + + "tt<{tta}[&\n" + + "tta<{tta}\n" + + "ddh<{ddha}[&\n" + + "ddha<{ddha}\n" + + 
"dd''<{dda}[&{ha}\n" + + "dd<{dda}[&\n" + + "dda<{dda}\n" + + "dh<{dha}[&\n" + + "dha<{dha}\n" + + "d''<{da}{virama}[{ha}\n" + + "d''<{da}{virama}[{ddha}\n" + + "d''<{da}{virama}[{dda}\n" + + "d''<{da}{virama}[{dha}\n" + + "d''<{da}{virama}[{da}\n" + + "d<{da}[&\n" + + "da<{da}\n" + + "th<{tha}[&\n" + + "tha<{tha}\n" + + "t''<{ta}{virama}[{ha}\n" + + "t''<{ta}{virama}[{ttha}\n" + + "t''<{ta}{virama}[{tta}\n" + + "t''<{ta}{virama}[{tha}\n" + + "t''<{ta}{virama}[{ta}\n" + + "t<{ta}[&\n" + + "ta<{ta}\n" + + "n''<{na}{virama}[{ga}\n" + + "n''<{na}{virama}[{ya}\n" + + "n<{na}[&\n" + + "na<{na}\n" + + "ph<{pha}[&\n" + + "pha<{pha}\n" + + "p''<{pa}{virama}[{ha}\n" + + "p<{pa}[&\n" + + "pa<{pa}\n" + + "bh<{bha}[&\n" + + "bha<{bha}\n" + + "b''<{ba}{virama}[{ha}\n" + + "b<{ba}[&\n" + + "ba<{ba}\n" + + "m''<{ma}{virama}[{ma}\n" + + "m''<{ma}{virama}[{bindu}\n" + + "m<{ma}[&\n" + + "ma<{ma}\n" + + "y<{ya}[&\n" + + "ya<{ya}\n" + + "r''<{ra}{virama}[{ha}\n" + + "r<{ra}[&\n" + + "ra<{ra}\n" + + "l''<{la}{virama}[{ha}\n" + + "l<{la}[&\n" + + "la<{la}\n" + + "v<{va}[&\n" + + "va<{va}\n" + + "sh<{sha}[&\n" + + "sha<{sha}\n" + + "ss<{ssa}[&\n" + + "ssa<{ssa}\n" + + "s''<{sa}{virama}[{ha}\n" + + "s''<{sa}{virama}[{sha}\n" + + "s''<{sa}{virama}[{ssa}\n" + + "s''<{sa}{virama}[{sa}\n" + + "s<{sa}[&\n" + + "sa<{sa}\n" + + "h<{ha}[&\n" + + "ha<{ha}\n" + + // dependent vowels (should never occur except following consonants) + + + "aa<{aa}\n" + + "ai<{ai}\n" + + "au<{au}\n" + + "ii<{ii}\n" + + "i<{i}\n" + + "uu<{uu}\n" + + "u<{u}\n" + + "rrh<{rrh}\n" + + "rh<{rh}\n" + + "lh<{lh}\n" + + "e<{e}\n" + + "o<{o}\n" + + // independent vowels (when following consonants) + + + "''aa\u039c\u0397\u039d\u0399\u039d\u0020\u0391\u0395\u0399\u0394\u0395\u002c\u0020\u0398\u0395\u0391\u002c\u0020--\u0397\u039b\u0397\u0399\u0391\u0394\u0395\u03a9\u0020\u0391\u03a7\u0399\u039b\u0397\u039f\u03a3\n" + + + "AV`>{grAl}{grAcUp}\n" + + "EV`>{grEp}{grAcUp}\n" + + "AV>{grAl}{grUp}\n" + + "EV>{grEp}{grUp}\n" + + 
"NG>{grGa}{grGa}\n" + + "NK>{grGa}{grKa}\n" + + "NX>{grGa}{grKs}\n" + + "NCH>{grGa}{grKh}\n" + + //+ "final = [ .;]\n" // Syntax error, unused anyway - Liu + + + "A`>{grAcAl}\n" + + "EE`>{grAcEt}\n" + + "E`>{grAcEp}\n" + + "I`>{grAcIo}\n" + + "U`>{grAcUp}\n" + + "OO`>{grAcOme}\n" + + "O`>{grAcOm}\n" + + "''I>{grDiIo}\n" + + "''U>{grDiUp}\n" + + "A>{grAl}\n" + + "B>{grBe}\n" + + "C[I>{grSi}\n" + + "C[E>{grSi}\n" + + "C[Y>{grSi}\n" + + "CH>{grKh}\n" + + "C>{grKa}\n" + + "D>{grDe}\n" + + "EE>{grEt}\n" + + "E>{grEp}\n" + + "F>{grPh}\n" + + "G>{grGa}\n" + + "H>{grKh}\n" + + "I>{grIo}\n" + + "J>{grIo}\n" + + "KS>{grKs}\n" + + "KH>{grKh}\n" + + "K>{grKa}\n" + + "L>{grLa}\n" + + "M>{grMu}\n" + + "N>{grNu}\n" + + "OO>{grOme}\n" + + "O>{grOm}\n" + + "PS>{grPs}\n" + + "PH>{grPh}\n" + + "P>{grPi}\n" + + "Q>{grKa}\n" + + "R>{grRh}\n" + + "S>{grSi}\n" + + "TH>{grTh}\n" + + "T>{grTa}\n" + + "W>{grUp}{grUp}\n" + + "U>{grUp}\n" + + "V>{grUp}\n" + + "X>{grKs}\n" + + "Y>{grUp}\n" + + "Z>{grZe}\n" + + //now Native to Roman + + + "AV<{grAl}{grUp}\n" + + "EV<{grEp}{grUp}\n" + + "AV`<{grAl}{grAcUp}\n" + + "EV`<{grEp}{grAcUp}\n" + + "N''<{grNu}[{grGa}\n" + + "NG<{grGa}{grGa}\n" + + "N''<{grNu}[{grKa}\n" + + "NK<{grGa}{grKa}\n" + + "N''<{grNu}[{grKs}\n" + + "NX<{grGa}{grKs}\n" + + "N''<{grNu}[{grKh}\n" + + "NCH<{grGa}{grKh}\n" + + + "A<{grAl}\n" + + "B<{grBe}\n" + + "G<{grGa}\n" + + "D<{grDe}\n" + + "E''<{grEp}[{grEp}\n" + + "E''<{grEp}[{grEt}\n" + + "E''<{grEp}[{grAcEp}\n" + + "E''<{grEp}[{grAcEt}\n" + + "E<{grEp}\n" + + "Z<{grZe}\n" + + "EE<{grEt}\n" + + "TH<{grTh}\n" + + "I<{grIo}\n" + + "K<{grKa}\n" + + "L<{grLa}\n" + + "M<{grMu}\n" + + "N<{grNu}\n" + + "X<{grKs}\n" + + "O''<{grOm}[{grOm}\n" + + "O''<{grOm}[{grOme}\n" + + "O''<{grOm}[{grAcOm}\n" + + "O''<{grOm}[{grAcOme}\n" + + "O<{grOm}\n" + + "P''<{grPi}[{grSi}\n" + + "P''<{grPi}[{grfinal}\n" + + "P<{grPi}\n" + + "R<{grRh}\n" + + "S<{grSi}\n" + + "T<{grTa}\n" + + "W<{grUp}{grUp}\n" + + + "V<{grUp}[{grAcAl}\n" + + 
"V<{grUp}[{grAcEp}\n" + + "V<{grUp}[{grAcEt}\n" + + "V<{grUp}[{grAcIo}\n" + + "V<{grUp}[{grAcOm}\n" + + "V<{grUp}[{grAcUp}\n" + + "V<{grUp}[{grAcOme}\n" + + + "V<{grUp}[{grAl}\n" + + "V<{grUp}[{grEp}\n" + + "V<{grUp}[{grEt}\n" + + "V<{grUp}[{grIo}\n" + + "V<{grUp}[{grOm}\n" + //{grUp}[{grUp}{gral}{gracup}\n" + + "ev`>{grep}{gracup}\n" + + "av>{gral}{grup}\n" + + "ev>{grep}{grup}\n" + + "ng>{grga}{grga}\n" + + "nk>{grga}{grka}\n" + + "nx>{grga}{grks}\n" + + "nch>{grga}{grkh}\n" + + + "a`>{gracal}\n" + + "ee`>{gracet}\n" + + "e`>{gracep}\n" + + "i`>{gracio}\n" + + "u`>{gracup}\n" + + "oo`>{gracome}\n" + + "o`>{gracom}\n" + + "''i>{grdiio}\n" + + "''u>{grdiup}\n" + + "a>{gral}\n" + + "b>{grbe}\n" + + "c[i>{grsi}\n" + + "c[e>{grsi}\n" + + "c[y>{grsi}\n" + + "ch>{grkh}\n" + + "c>{grka}\n" + + "d>{grde}\n" + + "ee>{gret}\n" + + "e>{grep}\n" + + "f>{grph}\n" + + "g>{grga}\n" + + "h>{grkh}\n" + + "i>{grio}\n" + + "j>{grio}\n" + + "ks>{grks}\n" + + "kh>{grkh}\n" + + "k>{grka}\n" + + "l>{grla}\n" + + "m>{grmu}\n" + + "n>{grnu}\n" + + "oo>{grome}\n" + + "o>{grom}\n" + + "ps>{grps}\n" + + "ph>{grph}\n" + + "p>{grpi}\n" + + "q>{grka}\n" + + "r>{grrh}\n" + + "s>|{grfinal}\n" + + "{grfinal}[{letter}>{grsi}\n" + + "th>{grth}\n" + + "t>{grta}\n" + + "w>{grup}{grup}\n" + + "u>{grup}\n" + + "v>{grup}\n" + + "x>{grks}\n" + + "y>{grup}\n" + + "z>{grze}\n" + + + //forms + + "''>\n" + //now native to roman + + + "av<{gral}{grup}\n" + + "ev<{grep}{grup}\n" + + "av`<{gral}{gracup}\n" + + "ev`<{grep}{gracup}\n" + + "n''<{grnu}[{grga}\n" + + "ng<{grga}{grga}\n" + + "n''<{grnu}[{grka}\n" + + "nk<{grga}{grka}\n" + + "n''<{grnu}[{grks}\n" + + "nx<{grga}{grks}\n" + + "n''<{grnu}[{grkh}\n" + + "nch<{grga}{grkh}\n" + + + "a<{gral}\n" + + "b<{grbe}\n" + + "g<{grga}\n" + + "d<{grde}\n" + + "e''<{grep}[{grep}\n" + + "e''<{grep}[{gret}\n" + + "e''<{grep}[{gracep}\n" + + "e''<{grep}[{gracet}\n" + + "e<{grep}\n" + + "z<{grze}\n" + + "ee<{gret}\n" + + "th<{grth}\n" + + "i<{grio}\n" + + "k<{grka}\n" + + 
"l<{grla}\n" + + "m<{grmu}\n" + + "n<{grnu}\n" + + "x<{grks}\n" + + "o''<{grom}[{grom}\n" + + "o''<{grom}[{grome}\n" + + "o''<{grom}[{gracom}\n" + + "o''<{grom}[{gracome}\n" + + "o<{grom}\n" + + "p''<{grpi}[{grsi}\n" + + "p''<{grpi}[{grfinal}\n" + + "p<{grpi}\n" + + "r<{grrh}\n" + + "s<{grsi}\n" + + "s<{grfinal}\n" + + "t<{grta}\n" + + "w<{grup}{grup}\n" + + + "v<{grup}[{gracal}\n" + + "v<{grup}[{gracep}\n" + + "v<{grup}[{gracet}\n" + + "v<{grup}[{gracio}\n" + + "v<{grup}[{gracom}\n" + + "v<{grup}[{gracup}\n" + + "v<{grup}[{gracome}\n" + + + "v<{grup}[{gral}\n" + + "v<{grup}[{grep}\n" + + "v<{grup}[{gret}\n" + + "v<{grup}[{grio}\n" + + "v<{grup}[{grom}\n" + //{grup}[{grup}{POINT_SHEVA} + //?>{POINT_HATAF_SEGOL} + //?>{POINT_HATAF_PATAH} + //?>{POINT_HATAF_QAMATS} + //?>{POINT_HIRIQ} + //?>{POINT_TSERE} + //?>{POINT_SEGOL} + //?>{POINT_PATAH} + //?>{POINT_QAMATS} + //?>{POINT_HOLAM} + //?>{POINT_QUBUTS} + //?>{POINT_DAGESH_OR_MAPIQ} + //?>{POINT_METEG} + //?>{PUNCTUATION_MAQAF} + //?>{POINT_RAFE} + //?>{PUNCTUATION_PASEQ} + //?>{POINT_SHIN_DOT} + //?>{POINT_SIN_DOT} + //?>{PUNCTUATION_SOF_PASUQ} + + + "a>{ALEF}\n" + + "A>{ALEF}\n" + + + "b>{BET}\n" + + "B>{BET}\n" + + + "c[{softvowel}>{SAMEKH}\n" + + "C[{softvowel}>{SAMEKH}\n" + + "c[{letter}>{KAF}\n" + + "C[{letter}>{KAF}\n" + + "c>{FINAL_KAF}\n" + + "C>{FINAL_KAF}\n" + + + "d>{DALET}\n" + + "D>{DALET}\n" + + + "e>{AYIN}\n" + + "E>{AYIN}\n" + + + "f[{letter}>{PE}\n" + + "f>{FINAL_PE}\n" + + "F[{letter}>{PE}\n" + + "F>{FINAL_PE}\n" + + + "g>{GIMEL}\n" + + "G>{GIMEL}\n" + + + "h>{HE}\n" + + "H>{HE}\n" + + + "i>{YOD}\n" + + "I>{YOD}\n" + + + "j>{DALET}{SHIN}\n" + + "J>{DALET}{SHIN}\n" + + + "kH>{HET}\n" + + "kh>{HET}\n" + + "KH>{HET}\n" + + "Kh>{HET}\n" + + "k[{letter}>{KAF}\n" + + "K[{letter}>{KAF}\n" + + "k>{FINAL_KAF}\n" + + "K>{FINAL_KAF}\n" + + + "l>{LAMED}\n" + + "L>{LAMED}\n" + + + "m[{letter}>{MEM}\n" + + "m>{FINAL_MEM}\n" + + "M[{letter}>{MEM}\n" + + "M>{FINAL_MEM}\n" + + + "n[{letter}>{NUN}\n" + + 
"n>{FINAL_NUN}\n" + + "N[{letter}>{NUN}\n" + + "N>{FINAL_NUN}\n" + + + "o>{VAV}\n" + + "O>{VAV}\n" + + + "p[{letter}>{PE}\n" + + "p>{FINAL_PE}\n" + + "P[{letter}>{PE}\n" + + "P>{FINAL_PE}\n" + + + "q>{QOF}\n" + + "Q>{QOF}\n" + + + "r>{RESH}\n" + + "R>{RESH}\n" + + + "sH>{SHIN}\n" + + "sh>{SHIN}\n" + + "SH>{SHIN}\n" + + "Sh>{SHIN}\n" + + "s>{SAMEKH}\n" + + "S>{SAMEKH}\n" + + + "th>{TAV}\n" + + "tH>{TAV}\n" + + "TH>{TAV}\n" + + "Th>{TAV}\n" + + "tS[{letter}>{TSADI}\n" + + "ts[{letter}>{TSADI}\n" + + "Ts[{letter}>{TSADI}\n" + + "TS[{letter}>{TSADI}\n" + + "tS>{FINAL_TSADI}\n" + + "ts>{FINAL_TSADI}\n" + + "Ts>{FINAL_TSADI}\n" + + "TS>{FINAL_TSADI}\n" + + "t>{TET}\n" + + "T>{TET}\n" + + + "u>{VAV}\n" + + "U>{VAV}\n" + + + "v>{VAV}\n" + + "V>{VAV}\n" + + + "w>{VAV}\n" + + "W>{VAV}\n" + + + "x>{KAF}{SAMEKH}\n" + + "X>{KAF}{SAMEKH}\n" + + + "y>{YOD}\n" + + "Y>{YOD}\n" + + + "z>{ZAYIN}\n" + + "Z>{ZAYIN}\n" + + //#?>{YIDDISH_DOUBLE_VAV} + //?>{YIDDISH_VAV_YOD} + //?>{YIDDISH_DOUBLE_YOD} + //?>{PUNCTUATION_GERESH} + //?>{PUNCTUATION_GERSHAYIM} + + + "''>\n" + + //{POINT_SHEVA}>@ + //{POINT_HATAF_SEGOL}>@ + //{POINT_HATAF_PATAH}>@ + //{POINT_HATAF_QAMATS}>@ + //{POINT_HIRIQ}>@ + //{POINT_TSERE}>@ + //{POINT_SEGOL}>@ + //{POINT_PATAH}>@ + //{POINT_QAMATS}>@ + //{POINT_HOLAM}>@ + //{POINT_QUBUTS}>@ + //{POINT_DAGESH_OR_MAPIQ}>@ + //{POINT_METEG}>@ + //{PUNCTUATION_MAQAF}>@ + //{POINT_RAFE}>@ + //{PUNCTUATION_PASEQ}>@ + //{POINT_SHIN_DOT}>@ + //{POINT_SIN_DOT}>@ + //{PUNCTUATION_SOF_PASUQ}>@ + + + "a<{ALEF}\n" + + "e<{AYIN}\n" + + "b<{BET}\n" + + "d<{DALET}\n" + + "k<{FINAL_KAF}\n" + + "m<{FINAL_MEM}\n" + + "n<{FINAL_NUN}\n" + + "p<{FINAL_PE}\n" + + "ts<{FINAL_TSADI}\n" + + "g<{GIMEL}\n" + + "kh<{HET}\n" + + "h<{HE}\n" + + "k''<{KAF}[{HE}\n" + + "k<{KAF}\n" + + "l<{LAMED}\n" + + "m<{MEM}\n" + + "n<{NUN}\n" + + "p<{PE}\n" + + "q<{QOF}\n" + + "r<{RESH}\n" + + "s''<{SAMEKH}[{HE}\n" + + "s<{SAMEKH}\n" + + "sh<{SHIN}\n" + + "th<{TAV}\n" + + "t''<{TET}[{HE}\n" + + "t''<{TET}[{HE}\n" + 
+ "t''<{TET}[{SAMEKH}\n" + + "t''<{TET}[{SHIN}\n" + + "t<{TET}\n" + + "ts<{TSADI}\n" + + "v<{VAV}[{vowellike}\n" + + "u<{VAV}\n" + + "y<{YOD}\n" + + "z<{ZAYIN}\n" + + //{YIDDISH_DOUBLE_VAV}>@ + //{YIDDISH_VAV_YOD}>@ + //{YIDDISH_DOUBLE_YOD}>@ + //{PUNCTUATION_GERESH}>@ + //{PUNCTUATION_GERSHAYIM}>@ + + + "<''\n" + } + }; + } +} diff --git a/icu4j/src/com/ibm/text/resources/TransliterationRule$Latin$Kana.java b/icu4j/src/com/ibm/text/resources/TransliterationRule$Latin$Kana.java new file mode 100755 index 00000000000..47b6e2a3de2 --- /dev/null +++ b/icu4j/src/com/ibm/text/resources/TransliterationRule$Latin$Kana.java @@ -0,0 +1,883 @@ +package com.ibm.text.resources; + +import java.util.ListResourceBundle; + +/** + * Resource bundle whose contents are a "Description" string and a + * "Rule" string holding the Latin/kana transliteration rule source. + * + * Rewritten April 1999 to implement Hepburn (Hebon-shiki) + * transliteration. Reference: CJKV Information Processing, Lunde, + * 1999, pp. 30-35. + * @author Alan Liu + */ +public class TransliterationRuleLatinKana extends ListResourceBundle { + /** + * Overrides ListResourceBundle. + * + * @return a two-row table: the "Description" entry and the "Rule" + * entry. The rule value is a single string of newline-terminated + * statements assembled by the concatenation below. + */ + public Object[][] getContents() { + return new Object[][] { + // NOTE(review): no statement in the rule text below has an + // uppercase Latin source side, so the "Uppercase Latin to + // Katakana" half of this description does not appear to be + // implemented by this bundle on its own -- confirm. + { "Description", + "Lowercase Latin to Hiragana; Uppercase Latin to Katakana" }, + + { "Rule", + + //------------------------------------------------------------ + // Variables + //------------------------------------------------------------ + + // Each "name=value" statement defines a variable that later + // statements reference as {name}. + + // Hiragana. These are named according to the + // regularized Nippon romanization (the naming system + // used by Unicode). Thus \u3062 is called "di", not + // "ji". "x_" is the small form of "_", e.g. "xa" is + // small "a". 
+ + "xa=\u3041\n" + + "a=\u3042\n" + + "xi=\u3043\n" + + "i=\u3044\n" + + "xu=\u3045\n" + + "u=\u3046\n" + + "xe=\u3047\n" + + "e=\u3048\n" + + "xo=\u3049\n" + + "o=\u304A\n" + + + "ka=\u304B\n" + + "ga=\u304C\n" + + "ki=\u304D\n" + + "gi=\u304E\n" + + "ku=\u304F\n" + + "gu=\u3050\n" + + "ke=\u3051\n" + + "ge=\u3052\n" + + "ko=\u3053\n" + + "go=\u3054\n" + + + "sa=\u3055\n" + + "za=\u3056\n" + + "si=\u3057\n" + + "zi=\u3058\n" + + "su=\u3059\n" + + "zu=\u305A\n" + + "se=\u305B\n" + + "ze=\u305C\n" + + "so=\u305D\n" + + "zo=\u305E\n" + + + "ta=\u305F\n" + + "da=\u3060\n" + + "ti=\u3061\n" + + "di=\u3062\n" + + "xtu=\u3063\n" + + "tu=\u3064\n" + + "du=\u3065\n" + + "te=\u3066\n" + + "de=\u3067\n" + + "to=\u3068\n" + + "do=\u3069\n" + + + "na=\u306A\n" + + "ni=\u306B\n" + + "nu=\u306C\n" + + "ne=\u306D\n" + + "no=\u306E\n" + + + "ha=\u306F\n" + + "ba=\u3070\n" + + "pa=\u3071\n" + + "hi=\u3072\n" + + "bi=\u3073\n" + + "pi=\u3074\n" + + "hu=\u3075\n" + + "bu=\u3076\n" + + "pu=\u3077\n" + + "he=\u3078\n" + + "be=\u3079\n" + + "pe=\u307A\n" + + "ho=\u307B\n" + + "bo=\u307C\n" + + "po=\u307D\n" + + + "ma=\u307E\n" + + "mi=\u307F\n" + + "mu=\u3080\n" + + "me=\u3081\n" + + "mo=\u3082\n" + + + "xya=\u3083\n" + + "ya=\u3084\n" + + "xyu=\u3085\n" + + "yu=\u3086\n" + + "xyo=\u3087\n" + + "yo=\u3088\n" + + + "ra=\u3089\n" + + "ri=\u308A\n" + + "ru=\u308B\n" + + "re=\u308C\n" + + "ro=\u308D\n" + + + "xwa=\u308E\n" + + "wa=\u308F\n" + + "wi=\u3090\n" + + "we=\u3091\n" + + "wo=\u3092\n" + + + "n=\u3093\n" + + "vu=\u3094\n" + + // Katakana. "X_" is the small form of "_", e.g. "XA" + // is small "A". 
+ + + "XA=\u30A1\n" + + "A=\u30A2\n" + + "XI=\u30A3\n" + + "I=\u30A4\n" + + "XU=\u30A5\n" + + "U=\u30A6\n" + + "XE=\u30A7\n" + + "E=\u30A8\n" + + "XO=\u30A9\n" + + "O=\u30AA\n" + + + "KA=\u30AB\n" + + "GA=\u30AC\n" + + "KI=\u30AD\n" + + "GI=\u30AE\n" + + "KU=\u30AF\n" + + "GU=\u30B0\n" + + "KE=\u30B1\n" + + "GE=\u30B2\n" + + "KO=\u30B3\n" + + "GO=\u30B4\n" + + + "SA=\u30B5\n" + + "ZA=\u30B6\n" + + "SI=\u30B7\n" + + "ZI=\u30B8\n" + + "SU=\u30B9\n" + + "ZU=\u30BA\n" + + "SE=\u30BB\n" + + "ZE=\u30BC\n" + + "SO=\u30BD\n" + + "ZO=\u30BE\n" + + + "TA=\u30BF\n" + + "DA=\u30C0\n" + + "TI=\u30C1\n" + + "DI=\u30C2\n" + + "XTU=\u30C3\n" + + "TU=\u30C4\n" + + "DU=\u30C5\n" + + "TE=\u30C6\n" + + "DE=\u30C7\n" + + "TO=\u30C8\n" + + "DO=\u30C9\n" + + + "NA=\u30CA\n" + + "NI=\u30CB\n" + + "NU=\u30CC\n" + + "NE=\u30CD\n" + + "NO=\u30CE\n" + + + "HA=\u30CF\n" + + "BA=\u30D0\n" + + "PA=\u30D1\n" + + "HI=\u30D2\n" + + "BI=\u30D3\n" + + "PI=\u30D4\n" + + "HU=\u30D5\n" + + "BU=\u30D6\n" + + "PU=\u30D7\n" + + "HE=\u30D8\n" + + "BE=\u30D9\n" + + "PE=\u30DA\n" + + "HO=\u30DB\n" + + "BO=\u30DC\n" + + "PO=\u30DD\n" + + + "MA=\u30DE\n" + + "MI=\u30DF\n" + + "MU=\u30E0\n" + + "ME=\u30E1\n" + + "MO=\u30E2\n" + + + "XYA=\u30E3\n" + + "YA=\u30E4\n" + + "XYU=\u30E5\n" + + "YU=\u30E6\n" + + "XYO=\u30E7\n" + + "YO=\u30E8\n" + + + "RA=\u30E9\n" + + "RI=\u30EA\n" + + "RU=\u30EB\n" + + "RE=\u30EC\n" + + "RO=\u30ED\n" + + + "XWA=\u30EE\n" + + "WA=\u30EF\n" + + "WI=\u30F0\n" + + "WE=\u30F1\n" + + "WO=\u30F2\n" + + + "N=\u30F3\n" + + "VU=\u30F4\n" + + + "XKA=\u30F5\n" + + "XKE=\u30F6\n" + + + "VA=\u30F7\n" + + "VI=\u30F8\n" + + "VE=\u30F9\n" + + "VO=\u30FA\n" + + + "DOT=\u30FB\n" // Middle dot + + "LONG=\u30FC\n" // Prolonged sound mark + + // Categories and programmatic variables + // {small} and {hvr} are set to U+E000 and U+E001 and act as + // markers: some rules write them and the "Convert ..." rules + // at the end of each direction consume them. {hv} is the set + // of small kana tested as following context by the + // kana-to-Latin rules. + + + "vowel=[aiueo]\n" + + "small=\uE000\n" + + "hvr=\uE001\n" + + "hv=[{xya}{xi}{xyu}{xe}{xyo}]\n" + + //------------------------------------------------------------ + // Rules + 
//------------------------------------------------------------ + /* +// NOTE: everything inside this block comment is a disabled rule +// listing; the statements that are actually part of the rule string +// follow the end of the comment. + +// Hepburn equivalents + +shi>|si +ji>|zi +chi>|ti +// ji>|di // By default we use the ji-zi mapping +tsu>|tu +fu>|hu + +sh[{vowel}>|sy +ja>|zya +// ji = zi +ju>|zyu +je>|zye +jo>|zyo +cha>|tya +// chi = ti +chu>|tyu +che>|tye +cho>|tyo +// j[{vowel} = dy{vowel}, but we use zy{vowel} by default + +// Historically, m preceded b, p, or m; now n is used +// in all cases +m[b>n +m[p>n +m[m>n + +// Compatibility + +// 'f' group +fa>{fu}{xa} +fi>{fu}{xi} +// fu = hu +fe>{fu}{xe} +fo>{fu}{xo} + +// 'jy' group; these will not round-trip, except for "jyi" +// See also the 'j' group. +jya>|zya +jyi>{zi}{xyi} +jyu>|zyu +jye>|zye +jyo>|zyo + +// Nippon romanized forms + +a>{a} +i>{i} +u>{u} +e>{e} +o>{o} +ka>{ka} +ki>{ki} +ku>{ku} +ke>{ke} +ko>{ko} +ga>{ga} +gi>{gi} +gu>{gu} +ge>{ge} +go>{go} +sa>{sa} +si>{si} +su>{su} +se>{se} +so>{so} +za>{za} +zi>{zi} +zu>{zu} +ze>{ze} +zo>{zo} +ta>{ta} +ti>{ti} +tu>{tu} +te>{te} +to>{to} +da>{da} +di>{di} +du>{du} +de>{de} +do>{do} +na>{na} +ni>{ni} +nu>{nu} +ne>{ne} +no>{no} +ha>{ha} +hi>{hi} +hu>{hu} +he>{he} +ho>{ho} +ba>{ba} +bi>{bi} +bu>{bu} +be>{be} +bo>{bo} +pa>{pa} +pi>{pi} +pu>{pu} +pe>{pe} +po>{po} +ma>{ma} +mi>{mi} +mu>{mu} +me>{me} +mo>{mo} +ya>{ya} +yu>{yu} +yo>{yo} +ra>{ra} +ri>{ri} +ru>{ru} +re>{re} +ro>{ro} +wa>{wa} +wi>{wi} +// No "wu" +we>{we} +wo>{wo} // Reverse {wo} to "o", not "wo" +n''>{n} +n>{n} + +// Palatalized Nippon romanized syllables + +ky[{vowel}>{ki}|{small} +gy[{vowel}>{gi}|{small} +sy[{vowel}>{si}|{small} +zy[{vowel}>{zi}|{small} +ty[{vowel}>{ti}|{small} +dy[{vowel}>{di}|{small} +ny[{vowel}>{ni}|{small} +my[{vowel}>{mi}|{small} +hy[{vowel}>{hi}|{small} +by[{vowel}>{bi}|{small} +py[{vowel}>{pi}|{small} +ry[{vowel}>{ri}|{small} + +// Doubled consonants + +c[c>{xtu} +k[k>{xtu} +g[g>{xtu} +s[s>{xtu} +z[z>{xtu} +j[j>{xtu} +t[t>{xtu} +d[d>{xtu} +h[h>{xtu} +f[f>{xtu} +p[p>{xtu} +b[b>{xtu} +m[m>{xtu} +y[y>{xtu} +r[r>{xtu} +w[w>{xtu} + */ + + // Latin to kana rules, grouped by leading Latin letter. + // NOTE(review): "[" appears to introduce following context and + // "|" to mark where matching resumes in the output -- confirm + // against the rule parser. + 
"a>{a}\n" + + + "ba>{ba}\n" + + "bi>{bi}\n" + + "bu>{bu}\n" + + "be>{be}\n" + + "bo>{bo}\n" + + "by[{vowel}>{bi}|{small}\n" + + "b[b>{xtu}\n" + + + "da>{da}\n" + + "di>{di}\n" + + "du>{du}\n" + + "de>{de}\n" + + "do>{do}\n" + + "dy[{vowel}>{di}|{small}\n" + + "dh[{vowel}>{de}|{small}\n" + + "d[d>{xtu}\n" + + + "e>{e}\n" + + + "fa>{hu}{xa}\n" + + "fi>{hu}{xi}\n" + + "fe>{hu}{xe}\n" + + "fo>{hu}{xo}\n" + + "fya>{hu}{xya}\n" + + "fyu>{hu}{xyu}\n" + + "fyo>{hu}{xyo}\n" + + "f[f>{xtu}\n" + + + "ga>{ga}\n" + + "gi>{gi}\n" + + "gu>{gu}\n" + + "ge>{ge}\n" + + "go>{go}\n" + + "gy[{vowel}>{gi}|{small}\n" + + "gwa>{gu}{xwa}\n" + + "gwi>{gu}{xi}\n" + + "gwu>{gu}{xu}\n" + + "gwe>{gu}{xe}\n" + + "gwo>{gu}{xo}\n" + + "g[g>{xtu}\n" + + + "ha>{ha}\n" + + "hi>{hi}\n" + + "hu>{hu}\n" + + "he>{he}\n" + + "ho>{ho}\n" + + "hy[{vowel}>{hi}|{small}\n" + + "h[h>{xtu}\n" + + + "i>{i}\n" + + + "ka>{ka}\n" + + "ki>{ki}\n" + + "ku>{ku}\n" + + "ke>{ke}\n" + + "ko>{ko}\n" + + "kwa>{ku}{xwa}\n" + + "kwi>{ku}{xi}\n" + + "kwu>{ku}{xu}\n" + + "kwe>{ku}{xe}\n" + + "kwo>{ku}{xo}\n" + + "ky[{vowel}>{ki}|{small}\n" + + "k[k>{xtu}\n" + + + "ma>{ma}\n" + + "mi>{mi}\n" + + "mu>{mu}\n" + + "me>{me}\n" + + "mo>{mo}\n" + + "my[{vowel}>{mi}|{small}\n" + + "m[b>{n}\n" + + "m[f>{n}\n" + + "m[m>{n}\n" + + "m[p>{n}\n" + + "m[v>{n}\n" + + "m''>{n}\n" + + + "na>{na}\n" + + "ni>{ni}\n" + + "nu>{nu}\n" + + "ne>{ne}\n" + + "no>{no}\n" + + "ny[{vowel}>{ni}|{small}\n" + + "nn>{n}\n" + + "n''>{n}\n" + + "n>{n}\n" + + + "o>{o}\n" + + + "pa>{pa}\n" + + "pi>{pi}\n" + + "pu>{pu}\n" + + "pe>{pe}\n" + + "po>{po}\n" + + "py[{vowel}>{pi}|{small}\n" + + "p[p>{xtu}\n" + + + "qa>{ku}{xa}\n" + + "qi>{ku}{xi}\n" + + "qu>{ku}{xu}\n" + + "qe>{ku}{xe}\n" + + "qo>{ku}{xo}\n" + + "qy[{vowel}>{ku}|{small}\n" + + "q[q>{xtu}\n" + + + "ra>{ra}\n" + + "ri>{ri}\n" + + "ru>{ru}\n" + + "re>{re}\n" + + "ro>{ro}\n" + + "ry[{vowel}>{ri}|{small}\n" + + "r[r>{xtu}\n" + + + "sa>{sa}\n" + + "si>{si}\n" + + "su>{su}\n" + + "se>{se}\n" + + "so>{so}\n" + + 
"sy[{vowel}>{si}|{small}\n" + + "s[sh>{xtu}\n" + + "s[s>{xtu}\n" + + + "ta>{ta}\n" + + "ti>{ti}\n" + + "tu>{tu}\n" + + "te>{te}\n" + + "to>{to}\n" + + "th[{vowel}>{te}|{small}\n" + + "tsa>{tu}{xa}\n" + + "tsi>{tu}{xi}\n" + + "tse>{tu}{xe}\n" + + "tso>{tu}{xo}\n" + + "ty[{vowel}>{ti}|{small}\n" + + "t[ts>{xtu}\n" + + "t[ch>{xtu}\n" + + "t[t>{xtu}\n" + + + "u>{u}\n" + + + "va>{VA}\n" + + "vi>{VI}\n" + + "vu>{vu}\n" + + "ve>{VE}\n" + + "vo>{VO}\n" + + "vy[{vowel}>{VI}|{small}\n" + + "v[v>{xtu}\n" + + + "wa>{wa}\n" + + "wi>{wi}\n" + + "we>{we}\n" + + "wo>{wo}\n" + + "w[w>{xtu}\n" + + + "ya>{ya}\n" + + "yu>{yu}\n" + + "ye>{i}{xe}\n" + + "yo>{yo}\n" + + "y[y>{xtu}\n" + + + "za>{za}\n" + + "zi>{zi}\n" + + "zu>{zu}\n" + + "ze>{ze}\n" + + "zo>{zo}\n" + + "zy[{vowel}>{zi}|{small}\n" + + "z[z>{xtu}\n" + + + "xa>{xa}\n" + + "xi>{xi}\n" + + "xu>{xu}\n" + + "xe>{xe}\n" + + "xo>{xo}\n" + + "xka>{XKA}\n" + + "xke>{XKE}\n" + + "xtu>{xtu}\n" + + "xwa>{xwa}\n" + + "xya>{xya}\n" + + "xyu>{xyu}\n" + + "xyo>{xyo}\n" + + // optional mappings + + "wu>{u}\n" + + + "ca>{ka}\n" + + "ci>{si}\n" + + "cu>{ku}\n" + + "ce>{se}\n" + + "co>{ko}\n" + + "cha>{ti}{xya}\n" + + "chi>{ti}\n" + + "chu>{ti}{xyu}\n" + + "che>{ti}{xe}\n" + + "cho>{ti}{xyo}\n" + + "cy[{vowel}>{ti}|{small}\n" + + "c[k>{xtu}\n" + + "c[c>{xtu}\n" + + + "fu>{hu}\n" + + + "ja>{zi}{xya}\n" + + "ji>{zi}\n" + + "ju>{zi}{xyu}\n" + + "je>{zi}{xe}\n" + + "jo>{zi}{xyo}\n" + + "jy[{vowel}>{zi}|{small}\n" + + "j[j>{xtu}\n" + + + "la>{ra}\n" + + "li>{ri}\n" + + "lu>{ru}\n" + + "le>{re}\n" + + "lo>{ro}\n" + + "ly[{vowel}>{ri}|{small}\n" + + "l[l>{xtu}\n" + + + "sha>{si}{xya}\n" + + "shi>{si}\n" + + "shu>{si}{xyu}\n" + + "she>{si}{xe}\n" + + "sho>{si}{xyo}\n" + + + "tsu>{tu}\n" + + + "yi>{i}\n" + + + "xtsu>{xtu}\n" + + "xyi>{xi}\n" + + "xye>{xe}\n" + + + + + + + + // Convert vowels to small form + // (each rule consumes the {small} marker that the "consonant + y", + // "th" and "dh" rules above leave in front of the vowel) + + "{small}a>{xya}\n" + + "{small}i>{xi}\n" + + "{small}u>{xyu}\n" + + "{small}e>{xe}\n" + + "{small}o>{xyo}\n" + + + + // Kana to Latin (reverse) rules start here. + + + "gy|{hvr}<{gi}[{hv}\n" + 
+ "gwa<{gu}{xwa}\n" + + "gwi<{gu}{xi}\n" + + "gwu<{gu}{xu}\n" + + "gwe<{gu}{xe}\n" + + "gwo<{gu}{xo}\n" + + "ga<{ga}\n" + + "gi<{gi}\n" + + "gu<{gu}\n" + + "ge<{ge}\n" + + "go<{go}\n" + + + "ky|{hvr}<{ki}[{hv}\n" + + "kwa<{ku}{xwa}\n" + + "kwi<{ku}{xi}\n" + + "kwu<{ku}{xu}\n" + + "kwe<{ku}{xe}\n" + + "kwo<{ku}{xo}\n" + + "qa<{ku}{xa}\n" + + "qya<{ku}{xya}\n" + + "qyu<{ku}{xyu}\n" + + "qyo<{ku}{xyo}\n" + + "ka<{ka}\n" + + "ki<{ki}\n" + + "ku<{ku}\n" + + "ke<{ke}\n" + + "ko<{ko}\n" + + + "j|{hvr}<{zi}[{hv}\n" // Hepburn + + "za<{za}\n" + + "ji<{zi}\n" // Hepburn + + "zu<{zu}\n" + + "ze<{ze}\n" + + "zo<{zo}\n" + + + "sh|{hvr}<{si}[{hv}\n" // Hepburn + + "sa<{sa}\n" + + "shi<{si}\n" + + "su<{su}\n" + + "se<{se}\n" + + "so<{so}\n" + + + "j|{hvr}<{di}[{hv}\n" // Hepburn + + "dh|{hvr}<{de}[{hv}\n" + + "da<{da}\n" + + "ji<{di}\n" // Hepburn + + "de<{de}\n" + + "do<{do}\n" + + "zu<{du}\n" // Hepburn + + + "ch|{hvr}<{ti}[{hv}\n" // Hepburn + + "tsa<{tu}{xa}\n" + + "tsi<{tu}{xi}\n" + + "tse<{tu}{xe}\n" + + "tso<{tu}{xo}\n" + + "th|{hvr}<{te}[{hv}\n" + + "ta<{ta}\n" + + "chi<{ti}\n" // Hepburn + + "tsu<{tu}\n" // Hepburn + + "te<{te}\n" + + "to<{to}\n" + + + "ny|{hvr}<{ni}[{hv}\n" + + "na<{na}\n" + + "ni<{ni}\n" + + "nu<{nu}\n" + + "ne<{ne}\n" + + "no<{no}\n" + + + "by|{hvr}<{bi}[{hv}\n" + + "ba<{ba}\n" + + "bi<{bi}\n" + + "bu<{bu}\n" + + "be<{be}\n" + + "bo<{bo}\n" + + + "py|{hvr}<{pi}[{hv}\n" + + "pa<{pa}\n" + + "pi<{pi}\n" + + "pu<{pu}\n" + + "pe<{pe}\n" + + "po<{po}\n" + + + "hy|{hvr}<{hi}[{hv}\n" + + "fa<{hu}{xa}\n" + + "fi<{hu}{xi}\n" + + "fe<{hu}{xe}\n" + + "fo<{hu}{xo}\n" + + "fya<{hu}{xya}\n" + + "fyu<{hu}{xyu}\n" + + "fyo<{hu}{xyo}\n" + + "ha<{ha}\n" + + "hi<{hi}\n" + + "fu<{hu}\n" // Hepburn + + "he<{he}\n" + + "ho<{ho}\n" + + + "my|{hvr}<{mi}[{hv}\n" + + "ma<{ma}\n" + + "mi<{mi}\n" + + "mu<{mu}\n" + + "me<{me}\n" + + "mo<{mo}\n" + + + "ya<{ya}\n" + + "yu<{yu}\n" + + "ye<{i}{xe}\n" + + "yo<{yo}\n" + + "xya<{xya}\n" + + "xyu<{xyu}\n" + + "xyo<{xyo}\n" + + + 
"ry|{hvr}<{ri}[{hv}\n" + + "ra<{ra}\n" + + "ri<{ri}\n" + + "ru<{ru}\n" + + "re<{re}\n" + + "ro<{ro}\n" + + + "wa<{wa}\n" + + "wi<{wi}\n" + + "we<{we}\n" + + "wo<{wo}\n" + + + "vu<{vu}\n" + + "vy|{hvr}<{VI}[{hv}\n" + + "v<{xtu}[{vu}\n" + + + "xa<{xa}\n" + + "xi<{xi}\n" + + "xu<{xu}\n" + + "xe<{xe}\n" + + "xo<{xo}\n" + + + "n''<{n}[{a}\n" + + "n''<{n}[{i}\n" + + "n''<{n}[{u}\n" + + "n''<{n}[{e}\n" + + "n''<{n}[{o}\n" + + "n''<{n}[{na}\n" + + "n''<{n}[{ni}\n" + + "n''<{n}[{nu}\n" + + "n''<{n}[{ne}\n" + + "n''<{n}[{no}\n" + + "n''<{n}[{ya}\n" + + "n''<{n}[{yu}\n" + + "n''<{n}[{yo}\n" + + "n''<{n}[{n}\n" + + "n<{n}\n" + + + + "g<{xtu}[{ga}\n" + + "g<{xtu}[{gi}\n" + + "g<{xtu}[{gu}\n" + + "g<{xtu}[{ge}\n" + + "g<{xtu}[{go}\n" + + "k<{xtu}[{ka}\n" + + "k<{xtu}[{ki}\n" + + "k<{xtu}[{ku}\n" + + "k<{xtu}[{ke}\n" + + "k<{xtu}[{ko}\n" + + + "z<{xtu}[{za}\n" + + "z<{xtu}[{zi}\n" + + "z<{xtu}[{zu}\n" + + "z<{xtu}[{ze}\n" + + "z<{xtu}[{zo}\n" + + "s<{xtu}[{sa}\n" + + "s<{xtu}[{si}\n" + + "s<{xtu}[{su}\n" + + "s<{xtu}[{se}\n" + + "s<{xtu}[{so}\n" + + + "d<{xtu}[{da}\n" + + "d<{xtu}[{di}\n" + + "d<{xtu}[{du}\n" + + "d<{xtu}[{de}\n" + + "d<{xtu}[{do}\n" + + "t<{xtu}[{ta}\n" + + "t<{xtu}[{ti}\n" + + "t<{xtu}[{tu}\n" + + "t<{xtu}[{te}\n" + + "t<{xtu}[{to}\n" + + + + "b<{xtu}[{ba}\n" + + "b<{xtu}[{bi}\n" + + "b<{xtu}[{bu}\n" + + "b<{xtu}[{be}\n" + + "b<{xtu}[{bo}\n" + + "p<{xtu}[{pa}\n" + + "p<{xtu}[{pi}\n" + + "p<{xtu}[{pu}\n" + + "p<{xtu}[{pe}\n" + + "p<{xtu}[{po}\n" + + "h<{xtu}[{ha}\n" + + "h<{xtu}[{hi}\n" + + "h<{xtu}[{hu}\n" + + "h<{xtu}[{he}\n" + + "h<{xtu}[{ho}\n" + + + + "r<{xtu}[{ra}\n" + + "r<{xtu}[{ri}\n" + + "r<{xtu}[{ru}\n" + + "r<{xtu}[{re}\n" + + "r<{xtu}[{ro}\n" + + + "w<{xtu}[{wa}\n" + + "xtu<{xtu}\n" + + + "a<{a}\n" + + "i<{i}\n" + + "u<{u}\n" + + "e<{e}\n" + + "o<{o}\n" + + + + // Convert small forms to vowels + // (each rule consumes the {hvr} marker written by the "...|{hvr}" + // rules above together with the small kana that follows it) + + "a<{hvr}{xya}\n" + + "i<{hvr}{xi}\n" + + "u<{hvr}{xyu}\n" + + "e<{hvr}{xe}\n" + + "o<{hvr}{xyo}\n" + } + }; + } +} + + + diff --git
a/icu4j/src/com/ibm/text/resources/TransliterationRule$StraightQuotes$CurlyQuotes.java b/icu4j/src/com/ibm/text/resources/TransliterationRule$StraightQuotes$CurlyQuotes.java new file mode 100755 index 00000000000..409d0a1e29b --- /dev/null +++ b/icu4j/src/com/ibm/text/resources/TransliterationRule$StraightQuotes$CurlyQuotes.java @@ -0,0 +1,87 @@ +package com.ibm.text.resources; + +import java.util.ListResourceBundle; + +/** + * Resource bundle whose contents are a "Description" string and a + * "Rule" string holding the rule source for converting straight + * quotes to curly quotes (plus guillemet joining, double-space + * removal and hyphen-to-dash joining) and for converting curly + * quotes and the dash back. + */ +public class TransliterationRuleStraightQuotesCurlyQuotes extends ListResourceBundle { + /** + * Overrides ListResourceBundle. + * + * @return a two-row table: the "Description" entry and the "Rule" + * entry. The rule value is a single string of newline-terminated + * statements; their order is kept exactly as written here. + */ + public Object[][] getContents() { + return new Object[][] { + { "Description", + "Use left and right double quotes" }, + + { "Rule", + // Rewritten using character codes [LIU] + // Variable definitions ("name=value", referenced below as {name}). + // NOTE(review): lAng, ldAng, lBrk and lBrc are defined but not + // referenced by any rule in this string. + "white=[[:Zs:][:Zl:][:Zp:]]\n" + + "black=[^[:Zs:][:Zl:][:Zp:]]\n" + + "open=[[:Ps:]]\n" + + "dquote=\"\n" + + + "lAng=\u3008\n" + + "ldAng=\u300A\n" + + "lBrk='['\n" + + "lBrc='{'\n" + + + "lquote=\u2018\n" + + "rquote=\u2019\n" + + "ldquote=\u201C\n" + + "rdquote=\u201D\n" + + + "ldguill=\u00AB\n" + + "rdguill=\u00BB\n" + + "lguill=\u2039\n" + + "rguill=\u203A\n" + + + "mdash=\u2014\n" + + //####################################### + // Conversions from input + //####################################### + + // join single quotes + // (a curly single quote followed by a straight or matching + // curly single quote becomes the corresponding double quote) + + "{lquote}''>{ldquote}\n" + + "{lquote}{lquote}>{ldquote}\n" + + "{rquote}''>{rdquote}\n" + + "{rquote}{rquote}>{rdquote}\n" + + //smart single quotes + // NOTE(review): "x]" appears to be preceding context -- a + // straight quote after {white} or {open} opens, after {black} + // closes, and otherwise opens; confirm against the rule parser. + + "{white}]''>{lquote}\n" + + "{open}]''>{lquote}\n" + + "{black}]''>{rquote}\n" + + "''>{lquote}\n" + + //smart doubles + + "{white}]{dquote}>{ldquote}\n" + + "{open}]{dquote}>{ldquote}\n" + + "{black}]{dquote}>{rdquote}\n" + + "{dquote}>{ldquote}\n" + + // join single guillemets + + "{rguill}{rguill}>{rdguill}\n" + + "'>>'>{rdguill}\n" + + "{lguill}{lguill}>{ldguill}\n" + + "'<<'>{ldguill}\n" + + // prevent double spaces + + " ] >\n" + + // join hyphens into dash + + "-->{mdash}\n" + + //####################################### + // Conversions back to input + 
//####################################### + // NOTE: only the curly quotes and the dash have reverse rules; + // this section has no rule for the guillemet variables. + + //smart quotes + + "''<{lquote}\n" + + "''<{rquote}\n" + + "{dquote}<{ldquote}\n" + + "{dquote}<{rdquote}\n" + + //hyphens + + "--<{mdash}\n" + } + }; + } +}