mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 22:15:31 +00:00
Initial revision
X-SVN-Rev: 437
This commit is contained in:
parent
34f6f3b538
commit
42035450ff
48 changed files with 16605 additions and 0 deletions
253
icu4j/src/com/ibm/demo/translit/Demo.java
Executable file
253
icu4j/src/com/ibm/demo/translit/Demo.java
Executable file
|
@ -0,0 +1,253 @@
|
|||
import java.applet.*;
|
||||
import java.awt.*;
|
||||
import java.awt.event.*;
|
||||
import java.util.*;
|
||||
import com.ibm.text.components.*;
|
||||
import com.ibm.text.*;
|
||||
|
||||
/**
|
||||
* A frame that allows the user to experiment with keyboard
|
||||
* transliteration. This class has a main() method so it can be run
|
||||
* as an application. The frame contains an editable text component
|
||||
* and uses keyboard transliteration to process keyboard events.
|
||||
*
|
||||
* <p>Copyright (c) IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: Demo.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
|
||||
*/
|
||||
public class Demo extends Frame {
|
||||
|
||||
static final boolean DEBUG = false;
|
||||
|
||||
Transliterator translit = null;
|
||||
|
||||
boolean compound = false;
|
||||
Transliterator[] compoundTranslit = new Transliterator[MAX_COMPOUND];
|
||||
static final int MAX_COMPOUND = 128;
|
||||
int compoundCount = 0;
|
||||
|
||||
TransliteratingTextComponent text = null;
|
||||
|
||||
Menu translitMenu;
|
||||
CheckboxMenuItem translitItem;
|
||||
CheckboxMenuItem noTranslitItem;
|
||||
|
||||
static final String NO_TRANSLITERATOR = "None";
|
||||
|
||||
private static final String COPYRIGHT =
|
||||
"\u00A9 IBM Corporation 1999. All rights reserved.";
|
||||
|
||||
public static void main(String[] args) {
|
||||
Frame f = new Demo(600, 200);
|
||||
f.addWindowListener(new WindowAdapter() {
|
||||
public void windowClosing(WindowEvent e) {
|
||||
System.exit(0);
|
||||
}
|
||||
});
|
||||
f.setVisible(true);
|
||||
}
|
||||
|
||||
public Demo(int width, int height) {
|
||||
super("Transliteration Demo");
|
||||
|
||||
initMenus();
|
||||
|
||||
addWindowListener(new WindowAdapter() {
|
||||
public void windowClosing(WindowEvent e) {
|
||||
handleClose();
|
||||
}
|
||||
});
|
||||
|
||||
text = new TransliteratingTextComponent();
|
||||
Font font = new Font("serif", Font.PLAIN, 48);
|
||||
text.setFont(font);
|
||||
text.setSize(width, height);
|
||||
text.setVisible(true);
|
||||
text.setText("\u03B1\u05D0\u3042\u4E80");
|
||||
add(text);
|
||||
|
||||
setSize(width, height);
|
||||
}
|
||||
|
||||
private void initMenus() {
|
||||
MenuBar mbar;
|
||||
Menu menu;
|
||||
MenuItem mitem;
|
||||
CheckboxMenuItem citem;
|
||||
|
||||
setMenuBar(mbar = new MenuBar());
|
||||
mbar.add(menu = new Menu("File"));
|
||||
menu.add(mitem = new MenuItem("Quit"));
|
||||
mitem.addActionListener(new ActionListener() {
|
||||
public void actionPerformed(ActionEvent e) {
|
||||
handleClose();
|
||||
}
|
||||
});
|
||||
|
||||
final ItemListener setTransliteratorListener = new ItemListener() {
|
||||
public void itemStateChanged(ItemEvent e) {
|
||||
CheckboxMenuItem item = (CheckboxMenuItem) e.getSource();
|
||||
if (e.getStateChange() == ItemEvent.DESELECTED) {
|
||||
// Don't let the current transliterator be deselected.
|
||||
// Just reselect it.
|
||||
item.setState(true);
|
||||
} else if (compound) {
|
||||
// Adding an item to a compound transliterator
|
||||
handleAddToCompound(item.getLabel());
|
||||
} else if (item != translitItem) {
|
||||
// Deselect previous choice. Don't need to call
|
||||
// setState(true) on new choice.
|
||||
translitItem.setState(false);
|
||||
translitItem = item;
|
||||
handleSetTransliterator(item.getLabel());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
translit = null;
|
||||
mbar.add(translitMenu = new Menu("Transliterator"));
|
||||
translitMenu.add(translitItem = noTranslitItem =
|
||||
new CheckboxMenuItem(NO_TRANSLITERATOR, true));
|
||||
noTranslitItem.addItemListener(new ItemListener() {
|
||||
public void itemStateChanged(ItemEvent e) {
|
||||
// Can't uncheck None -- any action here sets None to true
|
||||
setNoTransliterator();
|
||||
}
|
||||
});
|
||||
|
||||
translitMenu.addSeparator();
|
||||
|
||||
translitMenu.add(citem = new CheckboxMenuItem("Compound"));
|
||||
citem.addItemListener(new ItemListener() {
|
||||
public void itemStateChanged(ItemEvent e) {
|
||||
CheckboxMenuItem item = (CheckboxMenuItem) e.getSource();
|
||||
if (e.getStateChange() == ItemEvent.DESELECTED) {
|
||||
// If compound gets deselected, then select NONE
|
||||
setNoTransliterator();
|
||||
} else if (!compound) {
|
||||
// Switching from non-compound to compound
|
||||
translitItem.setState(false);
|
||||
translitItem = item;
|
||||
translit = null;
|
||||
compound = true;
|
||||
compoundCount = 0;
|
||||
for (int i=0; i<MAX_COMPOUND; ++i) {
|
||||
compoundTranslit[i] = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
translitMenu.addSeparator();
|
||||
|
||||
for (Enumeration e=getSystemTransliteratorNames().elements();
|
||||
e.hasMoreElements(); ) {
|
||||
String s = (String) e.nextElement();
|
||||
translitMenu.add(citem = new CheckboxMenuItem(s));
|
||||
citem.addItemListener(setTransliteratorListener);
|
||||
}
|
||||
|
||||
mbar.add(menu = new Menu("Batch"));
|
||||
menu.add(mitem = new MenuItem("Transliterate Selection"));
|
||||
mitem.addActionListener(new ActionListener() {
|
||||
public void actionPerformed(ActionEvent e) {
|
||||
handleBatchTransliterate();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a sorted list of the system transliterators.
|
||||
*/
|
||||
private static Vector getSystemTransliteratorNames() {
|
||||
Vector v = new Vector();
|
||||
for (Enumeration e=Transliterator.getAvailableIDs();
|
||||
e.hasMoreElements(); ) {
|
||||
v.addElement(e.nextElement());
|
||||
}
|
||||
// Insertion sort, O(n^2) acceptable for small n
|
||||
for (int i=0; i<(v.size()-1); ++i) {
|
||||
String a = (String) v.elementAt(i);
|
||||
for (int j=i+1; j<v.size(); ++j) {
|
||||
String b = (String) v.elementAt(j);
|
||||
if (a.compareTo(b) > 0) {
|
||||
v.setElementAt(b, i);
|
||||
v.setElementAt(a, j);
|
||||
a = b;
|
||||
}
|
||||
}
|
||||
}
|
||||
return v;
|
||||
}
|
||||
|
||||
private void setNoTransliterator() {
|
||||
translitItem = noTranslitItem;
|
||||
noTranslitItem.setState(true);
|
||||
handleSetTransliterator(noTranslitItem.getLabel());
|
||||
compound = false;
|
||||
for (int i=0; i<translitMenu.getItemCount(); ++i) {
|
||||
MenuItem it = translitMenu.getItem(i);
|
||||
if (it != noTranslitItem && it instanceof CheckboxMenuItem) {
|
||||
((CheckboxMenuItem) it).setState(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void handleAddToCompound(String name) {
|
||||
if (compoundCount < MAX_COMPOUND) {
|
||||
compoundTranslit[compoundCount] = decodeTranslitItem(name);
|
||||
++compoundCount;
|
||||
Transliterator t[] = new Transliterator[compoundCount];
|
||||
System.arraycopy(compoundTranslit, 0, t, 0, compoundCount);
|
||||
translit = new CompoundTransliterator("Compound", t);
|
||||
text.setTransliterator(translit);
|
||||
}
|
||||
}
|
||||
|
||||
private void handleSetTransliterator(String name) {
|
||||
translit = decodeTranslitItem(name);
|
||||
text.setTransliterator(translit);
|
||||
}
|
||||
|
||||
/**
|
||||
* Decode a menu item that looks like <translit name>.
|
||||
*/
|
||||
private static Transliterator decodeTranslitItem(String name) {
|
||||
return (name.equals(NO_TRANSLITERATOR))
|
||||
? null : Transliterator.getInstance(name);
|
||||
}
|
||||
|
||||
private void handleBatchTransliterate() {
|
||||
if (translit == null) {
|
||||
return;
|
||||
}
|
||||
|
||||
int start = text.getSelectionStart();
|
||||
int end = text.getSelectionEnd();
|
||||
ReplaceableString s =
|
||||
new ReplaceableString(text.getText().substring(start, end));
|
||||
|
||||
StringBuffer log = null;
|
||||
if (DEBUG) {
|
||||
log = new StringBuffer();
|
||||
log.append('"' + s.toString() + "\" (start " + start +
|
||||
", end " + end + ") -> \"");
|
||||
}
|
||||
|
||||
translit.transliterate(s);
|
||||
String str = s.toString();
|
||||
|
||||
if (DEBUG) {
|
||||
log.append(str + "\"");
|
||||
System.out.println("Batch " + translit.getID() + ": " + log.toString());
|
||||
}
|
||||
|
||||
text.replaceRange(str, start, end);
|
||||
text.select(start, start + str.length());
|
||||
}
|
||||
|
||||
private void handleClose() {
|
||||
dispose();
|
||||
}
|
||||
}
|
62
icu4j/src/com/ibm/demo/translit/DemoApplet.java
Executable file
62
icu4j/src/com/ibm/demo/translit/DemoApplet.java
Executable file
|
@ -0,0 +1,62 @@
|
|||
|
||||
import java.awt.*;
|
||||
import java.awt.event.*;
|
||||
import java.applet.*;
|
||||
import com.ibm.text.components.AppletFrame;
|
||||
|
||||
/**
|
||||
* A simple Applet that shows a button. When pressed, the button
|
||||
* shows the Demo frame. This Applet is meant to be embedded
|
||||
* in a web page.
|
||||
*
|
||||
* <p>Copyright (c) IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: DemoApplet.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
|
||||
*/
|
||||
public class DemoApplet extends Applet {
|
||||
|
||||
Demo frame = null;
|
||||
|
||||
private static final String COPYRIGHT =
|
||||
"\u00A9 IBM Corporation 1999. All rights reserved.";
|
||||
|
||||
public static void main(String args[]) {
|
||||
final DemoApplet applet = new DemoApplet();
|
||||
new AppletFrame("Transliteration Demo", applet, 640, 480);
|
||||
}
|
||||
|
||||
public void init() {
|
||||
|
||||
Button button = new Button("Transliteration Demo");
|
||||
button.addActionListener(new ActionListener() {
|
||||
public void actionPerformed(ActionEvent e) {
|
||||
if (frame == null) {
|
||||
frame = new Demo(600, 200);
|
||||
frame.addWindowListener(new WindowAdapter() {
|
||||
public void windowClosing(WindowEvent we) {
|
||||
frame = null;
|
||||
}
|
||||
});
|
||||
}
|
||||
frame.setVisible(true);
|
||||
frame.toFront();
|
||||
}
|
||||
});
|
||||
|
||||
add(button);
|
||||
|
||||
Dimension size = button.getPreferredSize();
|
||||
size.width += 10;
|
||||
size.height += 10;
|
||||
|
||||
resize(size);
|
||||
}
|
||||
|
||||
public void stop() {
|
||||
if (frame != null) {
|
||||
frame.dispose();
|
||||
}
|
||||
frame = null;
|
||||
}
|
||||
}
|
7
icu4j/src/com/ibm/demo/translit/demo.bat
Executable file
7
icu4j/src/com/ibm/demo/translit/demo.bat
Executable file
|
@ -0,0 +1,7 @@
|
|||
REM For best results, run the demo as an applet inside of Netscape
|
||||
REM with Bitstream Cyberbit installed.
|
||||
|
||||
REM Set up your JDK 1.1.x path and classpath here:
|
||||
call JDK11
|
||||
set CLASSPATH=../translit.jar;%CLASSPATH%
|
||||
javaw Demo
|
8
icu4j/src/com/ibm/demo/translit/demo.html
Executable file
8
icu4j/src/com/ibm/demo/translit/demo.html
Executable file
|
@ -0,0 +1,8 @@
|
|||
<HTML>
|
||||
<HEAD>
|
||||
<TITLE>Transliteration Demo</TITLE>
|
||||
</HEAD>
|
||||
<BODY>
|
||||
<APPLET CODE="DemoApplet.class" WIDTH=140 HEIGHT=33></APPLET>
|
||||
</BODY>
|
||||
</HTML>
|
253
icu4j/src/com/ibm/icu/dev/demo/translit/Demo.java
Executable file
253
icu4j/src/com/ibm/icu/dev/demo/translit/Demo.java
Executable file
|
@ -0,0 +1,253 @@
|
|||
import java.applet.*;
|
||||
import java.awt.*;
|
||||
import java.awt.event.*;
|
||||
import java.util.*;
|
||||
import com.ibm.text.components.*;
|
||||
import com.ibm.text.*;
|
||||
|
||||
/**
|
||||
* A frame that allows the user to experiment with keyboard
|
||||
* transliteration. This class has a main() method so it can be run
|
||||
* as an application. The frame contains an editable text component
|
||||
* and uses keyboard transliteration to process keyboard events.
|
||||
*
|
||||
* <p>Copyright (c) IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: Demo.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
|
||||
*/
|
||||
public class Demo extends Frame {
|
||||
|
||||
static final boolean DEBUG = false;
|
||||
|
||||
Transliterator translit = null;
|
||||
|
||||
boolean compound = false;
|
||||
Transliterator[] compoundTranslit = new Transliterator[MAX_COMPOUND];
|
||||
static final int MAX_COMPOUND = 128;
|
||||
int compoundCount = 0;
|
||||
|
||||
TransliteratingTextComponent text = null;
|
||||
|
||||
Menu translitMenu;
|
||||
CheckboxMenuItem translitItem;
|
||||
CheckboxMenuItem noTranslitItem;
|
||||
|
||||
static final String NO_TRANSLITERATOR = "None";
|
||||
|
||||
private static final String COPYRIGHT =
|
||||
"\u00A9 IBM Corporation 1999. All rights reserved.";
|
||||
|
||||
public static void main(String[] args) {
|
||||
Frame f = new Demo(600, 200);
|
||||
f.addWindowListener(new WindowAdapter() {
|
||||
public void windowClosing(WindowEvent e) {
|
||||
System.exit(0);
|
||||
}
|
||||
});
|
||||
f.setVisible(true);
|
||||
}
|
||||
|
||||
public Demo(int width, int height) {
|
||||
super("Transliteration Demo");
|
||||
|
||||
initMenus();
|
||||
|
||||
addWindowListener(new WindowAdapter() {
|
||||
public void windowClosing(WindowEvent e) {
|
||||
handleClose();
|
||||
}
|
||||
});
|
||||
|
||||
text = new TransliteratingTextComponent();
|
||||
Font font = new Font("serif", Font.PLAIN, 48);
|
||||
text.setFont(font);
|
||||
text.setSize(width, height);
|
||||
text.setVisible(true);
|
||||
text.setText("\u03B1\u05D0\u3042\u4E80");
|
||||
add(text);
|
||||
|
||||
setSize(width, height);
|
||||
}
|
||||
|
||||
private void initMenus() {
|
||||
MenuBar mbar;
|
||||
Menu menu;
|
||||
MenuItem mitem;
|
||||
CheckboxMenuItem citem;
|
||||
|
||||
setMenuBar(mbar = new MenuBar());
|
||||
mbar.add(menu = new Menu("File"));
|
||||
menu.add(mitem = new MenuItem("Quit"));
|
||||
mitem.addActionListener(new ActionListener() {
|
||||
public void actionPerformed(ActionEvent e) {
|
||||
handleClose();
|
||||
}
|
||||
});
|
||||
|
||||
final ItemListener setTransliteratorListener = new ItemListener() {
|
||||
public void itemStateChanged(ItemEvent e) {
|
||||
CheckboxMenuItem item = (CheckboxMenuItem) e.getSource();
|
||||
if (e.getStateChange() == ItemEvent.DESELECTED) {
|
||||
// Don't let the current transliterator be deselected.
|
||||
// Just reselect it.
|
||||
item.setState(true);
|
||||
} else if (compound) {
|
||||
// Adding an item to a compound transliterator
|
||||
handleAddToCompound(item.getLabel());
|
||||
} else if (item != translitItem) {
|
||||
// Deselect previous choice. Don't need to call
|
||||
// setState(true) on new choice.
|
||||
translitItem.setState(false);
|
||||
translitItem = item;
|
||||
handleSetTransliterator(item.getLabel());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
translit = null;
|
||||
mbar.add(translitMenu = new Menu("Transliterator"));
|
||||
translitMenu.add(translitItem = noTranslitItem =
|
||||
new CheckboxMenuItem(NO_TRANSLITERATOR, true));
|
||||
noTranslitItem.addItemListener(new ItemListener() {
|
||||
public void itemStateChanged(ItemEvent e) {
|
||||
// Can't uncheck None -- any action here sets None to true
|
||||
setNoTransliterator();
|
||||
}
|
||||
});
|
||||
|
||||
translitMenu.addSeparator();
|
||||
|
||||
translitMenu.add(citem = new CheckboxMenuItem("Compound"));
|
||||
citem.addItemListener(new ItemListener() {
|
||||
public void itemStateChanged(ItemEvent e) {
|
||||
CheckboxMenuItem item = (CheckboxMenuItem) e.getSource();
|
||||
if (e.getStateChange() == ItemEvent.DESELECTED) {
|
||||
// If compound gets deselected, then select NONE
|
||||
setNoTransliterator();
|
||||
} else if (!compound) {
|
||||
// Switching from non-compound to compound
|
||||
translitItem.setState(false);
|
||||
translitItem = item;
|
||||
translit = null;
|
||||
compound = true;
|
||||
compoundCount = 0;
|
||||
for (int i=0; i<MAX_COMPOUND; ++i) {
|
||||
compoundTranslit[i] = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
translitMenu.addSeparator();
|
||||
|
||||
for (Enumeration e=getSystemTransliteratorNames().elements();
|
||||
e.hasMoreElements(); ) {
|
||||
String s = (String) e.nextElement();
|
||||
translitMenu.add(citem = new CheckboxMenuItem(s));
|
||||
citem.addItemListener(setTransliteratorListener);
|
||||
}
|
||||
|
||||
mbar.add(menu = new Menu("Batch"));
|
||||
menu.add(mitem = new MenuItem("Transliterate Selection"));
|
||||
mitem.addActionListener(new ActionListener() {
|
||||
public void actionPerformed(ActionEvent e) {
|
||||
handleBatchTransliterate();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a sorted list of the system transliterators.
|
||||
*/
|
||||
private static Vector getSystemTransliteratorNames() {
|
||||
Vector v = new Vector();
|
||||
for (Enumeration e=Transliterator.getAvailableIDs();
|
||||
e.hasMoreElements(); ) {
|
||||
v.addElement(e.nextElement());
|
||||
}
|
||||
// Insertion sort, O(n^2) acceptable for small n
|
||||
for (int i=0; i<(v.size()-1); ++i) {
|
||||
String a = (String) v.elementAt(i);
|
||||
for (int j=i+1; j<v.size(); ++j) {
|
||||
String b = (String) v.elementAt(j);
|
||||
if (a.compareTo(b) > 0) {
|
||||
v.setElementAt(b, i);
|
||||
v.setElementAt(a, j);
|
||||
a = b;
|
||||
}
|
||||
}
|
||||
}
|
||||
return v;
|
||||
}
|
||||
|
||||
private void setNoTransliterator() {
|
||||
translitItem = noTranslitItem;
|
||||
noTranslitItem.setState(true);
|
||||
handleSetTransliterator(noTranslitItem.getLabel());
|
||||
compound = false;
|
||||
for (int i=0; i<translitMenu.getItemCount(); ++i) {
|
||||
MenuItem it = translitMenu.getItem(i);
|
||||
if (it != noTranslitItem && it instanceof CheckboxMenuItem) {
|
||||
((CheckboxMenuItem) it).setState(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void handleAddToCompound(String name) {
|
||||
if (compoundCount < MAX_COMPOUND) {
|
||||
compoundTranslit[compoundCount] = decodeTranslitItem(name);
|
||||
++compoundCount;
|
||||
Transliterator t[] = new Transliterator[compoundCount];
|
||||
System.arraycopy(compoundTranslit, 0, t, 0, compoundCount);
|
||||
translit = new CompoundTransliterator("Compound", t);
|
||||
text.setTransliterator(translit);
|
||||
}
|
||||
}
|
||||
|
||||
private void handleSetTransliterator(String name) {
|
||||
translit = decodeTranslitItem(name);
|
||||
text.setTransliterator(translit);
|
||||
}
|
||||
|
||||
/**
|
||||
* Decode a menu item that looks like <translit name>.
|
||||
*/
|
||||
private static Transliterator decodeTranslitItem(String name) {
|
||||
return (name.equals(NO_TRANSLITERATOR))
|
||||
? null : Transliterator.getInstance(name);
|
||||
}
|
||||
|
||||
private void handleBatchTransliterate() {
|
||||
if (translit == null) {
|
||||
return;
|
||||
}
|
||||
|
||||
int start = text.getSelectionStart();
|
||||
int end = text.getSelectionEnd();
|
||||
ReplaceableString s =
|
||||
new ReplaceableString(text.getText().substring(start, end));
|
||||
|
||||
StringBuffer log = null;
|
||||
if (DEBUG) {
|
||||
log = new StringBuffer();
|
||||
log.append('"' + s.toString() + "\" (start " + start +
|
||||
", end " + end + ") -> \"");
|
||||
}
|
||||
|
||||
translit.transliterate(s);
|
||||
String str = s.toString();
|
||||
|
||||
if (DEBUG) {
|
||||
log.append(str + "\"");
|
||||
System.out.println("Batch " + translit.getID() + ": " + log.toString());
|
||||
}
|
||||
|
||||
text.replaceRange(str, start, end);
|
||||
text.select(start, start + str.length());
|
||||
}
|
||||
|
||||
private void handleClose() {
|
||||
dispose();
|
||||
}
|
||||
}
|
62
icu4j/src/com/ibm/icu/dev/demo/translit/DemoApplet.java
Executable file
62
icu4j/src/com/ibm/icu/dev/demo/translit/DemoApplet.java
Executable file
|
@ -0,0 +1,62 @@
|
|||
|
||||
import java.awt.*;
|
||||
import java.awt.event.*;
|
||||
import java.applet.*;
|
||||
import com.ibm.text.components.AppletFrame;
|
||||
|
||||
/**
|
||||
* A simple Applet that shows a button. When pressed, the button
|
||||
* shows the Demo frame. This Applet is meant to be embedded
|
||||
* in a web page.
|
||||
*
|
||||
* <p>Copyright (c) IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: DemoApplet.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
|
||||
*/
|
||||
public class DemoApplet extends Applet {
|
||||
|
||||
Demo frame = null;
|
||||
|
||||
private static final String COPYRIGHT =
|
||||
"\u00A9 IBM Corporation 1999. All rights reserved.";
|
||||
|
||||
public static void main(String args[]) {
|
||||
final DemoApplet applet = new DemoApplet();
|
||||
new AppletFrame("Transliteration Demo", applet, 640, 480);
|
||||
}
|
||||
|
||||
public void init() {
|
||||
|
||||
Button button = new Button("Transliteration Demo");
|
||||
button.addActionListener(new ActionListener() {
|
||||
public void actionPerformed(ActionEvent e) {
|
||||
if (frame == null) {
|
||||
frame = new Demo(600, 200);
|
||||
frame.addWindowListener(new WindowAdapter() {
|
||||
public void windowClosing(WindowEvent we) {
|
||||
frame = null;
|
||||
}
|
||||
});
|
||||
}
|
||||
frame.setVisible(true);
|
||||
frame.toFront();
|
||||
}
|
||||
});
|
||||
|
||||
add(button);
|
||||
|
||||
Dimension size = button.getPreferredSize();
|
||||
size.width += 10;
|
||||
size.height += 10;
|
||||
|
||||
resize(size);
|
||||
}
|
||||
|
||||
public void stop() {
|
||||
if (frame != null) {
|
||||
frame.dispose();
|
||||
}
|
||||
frame = null;
|
||||
}
|
||||
}
|
7
icu4j/src/com/ibm/icu/dev/demo/translit/demo.bat
Executable file
7
icu4j/src/com/ibm/icu/dev/demo/translit/demo.bat
Executable file
|
@ -0,0 +1,7 @@
|
|||
REM For best results, run the demo as an applet inside of Netscape
|
||||
REM with Bitstream Cyberbit installed.
|
||||
|
||||
REM Set up your JDK 1.1.x path and classpath here:
|
||||
call JDK11
|
||||
set CLASSPATH=../translit.jar;%CLASSPATH%
|
||||
javaw Demo
|
8
icu4j/src/com/ibm/icu/dev/demo/translit/demo.html
Executable file
8
icu4j/src/com/ibm/icu/dev/demo/translit/demo.html
Executable file
|
@ -0,0 +1,8 @@
|
|||
<HTML>
|
||||
<HEAD>
|
||||
<TITLE>Transliteration Demo</TITLE>
|
||||
</HEAD>
|
||||
<BODY>
|
||||
<APPLET CODE="DemoApplet.class" WIDTH=140 HEIGHT=33></APPLET>
|
||||
</BODY>
|
||||
</HTML>
|
763
icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java
Executable file
763
icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java
Executable file
|
@ -0,0 +1,763 @@
|
|||
import com.ibm.text.*;
|
||||
import java.text.*;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* @test
|
||||
* @summary General test of Transliterator
|
||||
*/
|
||||
public class TransliteratorTest extends IntlTest {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
new TransliteratorTest().run(args);
|
||||
}
|
||||
|
||||
/**
|
||||
* A CommonPoint legacy round-trip test for the Kana transliterator.
|
||||
*/
|
||||
// public void TestKanaRoundTrip() {
|
||||
// Transliterator t = Transliterator.getInstance("Kana");
|
||||
// StringTokenizer tok = new StringTokenizer(KANA_RT_DATA);
|
||||
// while (tok.hasMoreTokens()) {
|
||||
// String str = tok.nextToken();
|
||||
// ReplaceableString tmp = new ReplaceableString(str);
|
||||
// t.transliterate(tmp, Transliterator.FORWARD);
|
||||
//
|
||||
// str = tmp.toString();
|
||||
// tmp = new ReplaceableString(str);
|
||||
// t.transliterate(tmp, Transliterator.REVERSE);
|
||||
// t.transliterate(tmp, Transliterator.FORWARD);
|
||||
// if (!tmp.toString().equals(str)) {
|
||||
// tmp = new ReplaceableString(str);
|
||||
// t.transliterate(tmp, Transliterator.REVERSE);
|
||||
// String a = tmp.toString();
|
||||
// t.transliterate(tmp, Transliterator.FORWARD);
|
||||
// errln("FAIL: " + escape(str) + " -> " +
|
||||
// escape(a) + " -> " + escape(tmp.toString()));
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
public void TestInstantiation() {
|
||||
long ms = System.currentTimeMillis();
|
||||
String ID;
|
||||
for (Enumeration e = Transliterator.getAvailableIDs(); e.hasMoreElements(); ) {
|
||||
ID = (String) e.nextElement();
|
||||
try {
|
||||
Transliterator t = Transliterator.getInstance(ID);
|
||||
// We should get a new instance if we try again
|
||||
Transliterator t2 = Transliterator.getInstance(ID);
|
||||
if (t != t2) {
|
||||
logln(ID + ":" + t);
|
||||
} else {
|
||||
errln("FAIL: " + ID + " returned identical instances");
|
||||
}
|
||||
} catch (IllegalArgumentException ex) {
|
||||
errln("FAIL: " + ID);
|
||||
throw ex;
|
||||
}
|
||||
}
|
||||
|
||||
// Now test the failure path
|
||||
try {
|
||||
ID = "<Not a valid Transliterator ID>";
|
||||
Transliterator t = Transliterator.getInstance(ID);
|
||||
errln("FAIL: " + ID + " returned " + t);
|
||||
} catch (IllegalArgumentException ex) {
|
||||
logln("OK: Bogus ID handled properly");
|
||||
}
|
||||
|
||||
ms = System.currentTimeMillis() - ms;
|
||||
logln("Elapsed time: " + ms + " ms");
|
||||
}
|
||||
|
||||
public void TestSimpleRules() {
|
||||
/* Example: rules 1. ab>x|y
|
||||
* 2. yc>z
|
||||
*
|
||||
* []|eabcd start - no match, copy e to tranlated buffer
|
||||
* [e]|abcd match rule 1 - copy output & adjust cursor
|
||||
* [ex|y]cd match rule 2 - copy output & adjust cursor
|
||||
* [exz]|d no match, copy d to transliterated buffer
|
||||
* [exzd]| done
|
||||
*/
|
||||
expect("ab>x|y\n" +
|
||||
"yc>z",
|
||||
"eabcd", "exzd");
|
||||
|
||||
/* Another set of rules:
|
||||
* 1. ab>x|yzacw
|
||||
* 2. za>q
|
||||
* 3. qc>r
|
||||
* 4. cw>n
|
||||
*
|
||||
* []|ab Rule 1
|
||||
* [x|yzacw] No match
|
||||
* [xy|zacw] Rule 2
|
||||
* [xyq|cw] Rule 4
|
||||
* [xyqn]| Done
|
||||
*/
|
||||
expect("ab>x|yzacw\n" +
|
||||
"za>q\n" +
|
||||
"qc>r\n" +
|
||||
"cw>n",
|
||||
"ab", "xyqn");
|
||||
|
||||
/* Test categories
|
||||
*/
|
||||
Transliterator t = new RuleBasedTransliterator("<ID>",
|
||||
"dummy=\uE100\n" +
|
||||
"vowel=[aeiouAEIOU]\n" +
|
||||
"lu=[:Lu:]\n" +
|
||||
"{vowel}[{lu}>!\n" +
|
||||
"{vowel}>&\n" +
|
||||
"!]{lu}>^\n" +
|
||||
"{lu}>*\n" +
|
||||
"a>ERROR");
|
||||
expect(t, "abcdefgABCDEFGU", "&bcd&fg!^**!^*&");
|
||||
}
|
||||
|
||||
// Restore this test if/when it's been deciphered. In general,
|
||||
// tests that depend on a specific transliterator are subject
|
||||
// to the same fragility as tests that depend on resource data.
|
||||
|
||||
// public void TestKana() {
|
||||
// String DATA[] = {
|
||||
// "a", "\u3042",
|
||||
// "A", "\u30A2",
|
||||
// "aA", "\u3042\u30A2",
|
||||
// "aaaa", "\u3042\u3042\u3042\u3042",
|
||||
// "akasata", "\u3042\u304B\u3055\u305F",
|
||||
// };
|
||||
//
|
||||
// Transliterator t = Transliterator.getInstance("Latin-Kana");
|
||||
// Transliterator rt = Transliterator.getInstance("Kana-Latin");
|
||||
// for (int i=0; i<DATA.length; i+=2) {
|
||||
// expect(t, DATA[i], DATA[i+1], rt);
|
||||
// }
|
||||
// }
|
||||
|
||||
|
||||
/**
|
||||
* Create some inverses and confirm that they work. We have to be
|
||||
* careful how we do this, since the inverses will not be true
|
||||
* inverses -- we can't throw any random string at the composition
|
||||
* of the transliterators and expect the identity function. F x
|
||||
* F' != I. However, if we are careful about the input, we will
|
||||
* get the expected results.
|
||||
*/
|
||||
public void TestRuleBasedInverse() {
|
||||
String RULES =
|
||||
"abc>zyx\n" +
|
||||
"ab>yz\n" +
|
||||
"bc>zx\n" +
|
||||
"ca>xy\n" +
|
||||
"a>x\n" +
|
||||
"b>y\n" +
|
||||
"c>z\n" +
|
||||
|
||||
"abc<zyx\n" +
|
||||
"ab<yz\n" +
|
||||
"bc<zx\n" +
|
||||
"ca<xy\n" +
|
||||
"a<x\n" +
|
||||
"b<y\n" +
|
||||
"c<z\n" +
|
||||
|
||||
"";
|
||||
|
||||
String[] DATA = {
|
||||
// Careful here -- random strings will not work. If we keep
|
||||
// the left side to the domain and the right side to the range
|
||||
// we will be okay though (left, abc; right xyz).
|
||||
"a", "x",
|
||||
"abcacab", "zyxxxyy",
|
||||
"caccb", "xyzzy",
|
||||
};
|
||||
|
||||
Transliterator fwd = new RuleBasedTransliterator("<ID>", RULES);
|
||||
Transliterator rev = new RuleBasedTransliterator("<ID>", RULES,
|
||||
RuleBasedTransliterator.REVERSE, null);
|
||||
for (int i=0; i<DATA.length; i+=2) {
|
||||
expect(fwd, DATA[i], DATA[i+1]);
|
||||
expect(rev, DATA[i+1], DATA[i]);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Basic test of keyboard.
|
||||
*/
|
||||
public void TestKeyboard() {
|
||||
Transliterator t = new RuleBasedTransliterator("<ID>",
|
||||
"psch>Y\n"
|
||||
+"ps>y\n"
|
||||
+"ch>x\n"
|
||||
+"a>A\n");
|
||||
String DATA[] = {
|
||||
// insertion, buffer
|
||||
"a", "A",
|
||||
"p", "Ap",
|
||||
"s", "Aps",
|
||||
"c", "Apsc",
|
||||
"a", "AycA",
|
||||
"psch", "AycAY",
|
||||
null, "AycAY", // null means finishKeyboardTransliteration
|
||||
};
|
||||
|
||||
keyboardAux(t, DATA);
|
||||
}
|
||||
|
||||
/**
|
||||
* Basic test of keyboard with cursor.
|
||||
*/
|
||||
public void TestKeyboard2() {
|
||||
Transliterator t = new RuleBasedTransliterator("<ID>",
|
||||
"ych>Y\n"
|
||||
+"ps>|y\n"
|
||||
+"ch>x\n"
|
||||
+"a>A\n");
|
||||
String DATA[] = {
|
||||
// insertion, buffer
|
||||
"a", "A",
|
||||
"p", "Ap",
|
||||
"s", "Ay",
|
||||
"c", "Ayc",
|
||||
"a", "AycA",
|
||||
"p", "AycAp",
|
||||
"s", "AycAy",
|
||||
"c", "AycAyc",
|
||||
"h", "AycAY",
|
||||
null, "AycAY", // null means finishKeyboardTransliteration
|
||||
};
|
||||
|
||||
keyboardAux(t, DATA);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test keyboard transliteration with back-replacement.
|
||||
*/
|
||||
public void TestKeyboard3() {
|
||||
// We want th>z but t>y. Furthermore, during keyboard
|
||||
// transliteration we want t>y then yh>z if t, then h are
|
||||
// typed.
|
||||
String RULES =
|
||||
"t>|y\n" +
|
||||
"yh>z\n" +
|
||||
"";
|
||||
|
||||
String[] DATA = {
|
||||
// Column 1: characters to add to buffer (as if typed)
|
||||
// Column 2: expected appearance of buffer after
|
||||
// keyboard xliteration.
|
||||
"a", "a",
|
||||
"b", "ab",
|
||||
"t", "aby",
|
||||
"c", "abyc",
|
||||
"t", "abycy",
|
||||
"h", "abycz",
|
||||
null, "abycz", // null means finishKeyboardTransliteration
|
||||
};
|
||||
|
||||
Transliterator t = new RuleBasedTransliterator("<ID>", RULES);
|
||||
keyboardAux(t, DATA);
|
||||
}
|
||||
|
||||
private void keyboardAux(Transliterator t, String[] DATA) {
|
||||
int[] index = {0, 0, 0};
|
||||
ReplaceableString s = new ReplaceableString();
|
||||
for (int i=0; i<DATA.length; i+=2) {
|
||||
StringBuffer log;
|
||||
if (DATA[i] != null) {
|
||||
log = new StringBuffer(s.toString() + " + "
|
||||
+ DATA[i]
|
||||
+ " -> ");
|
||||
t.keyboardTransliterate(s, index, DATA[i]);
|
||||
} else {
|
||||
log = new StringBuffer(s.toString() + " => ");
|
||||
t.finishKeyboardTransliteration(s, index);
|
||||
}
|
||||
String str = s.toString();
|
||||
// Show the start index '{' and the cursor '|'
|
||||
log.append(str.substring(0, index[Transliterator.START])).
|
||||
append('{').
|
||||
append(str.substring(index[Transliterator.START],
|
||||
index[Transliterator.CURSOR])).
|
||||
append('|').
|
||||
append(str.substring(index[Transliterator.CURSOR]));
|
||||
if (str.equals(DATA[i+1])) {
|
||||
logln(log.toString());
|
||||
} else {
|
||||
errln("FAIL: " + log.toString() + ", expected " + DATA[i+1]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void TestArabic() {
|
||||
String DATA[] = {
|
||||
"Arabic", "\u062a\u062a\u0645\u062a\u0639\u0020"+
|
||||
"\u0627\u0644\u0644\u063a\u0629\u0020"+
|
||||
"\u0627\u0644\u0639\u0631\u0628\u0628\u064a\u0629\u0020"+
|
||||
"\u0628\u0628\u0646\u0638\u0645\u0020"+
|
||||
"\u0643\u062a\u0627\u0628\u0628\u064a\u0629\u0020"+
|
||||
"\u062c\u0645\u064a\u0644\u0629",
|
||||
};
|
||||
|
||||
Transliterator t = Transliterator.getInstance("Latin-Arabic");
|
||||
for (int i=0; i<DATA.length; i+=2) {
|
||||
expect(t, DATA[i], DATA[i+1]);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Compose the Kana transliterator forward and reverse and try
|
||||
* some strings that should come out unchanged.
|
||||
*/
|
||||
public void TestCompoundKana() {
|
||||
Transliterator kana = Transliterator.getInstance("Latin-Kana");
|
||||
Transliterator rkana = Transliterator.getInstance("Kana-Latin");
|
||||
Transliterator[] trans = { kana, rkana };
|
||||
Transliterator t = new CompoundTransliterator("<ID>", trans);
|
||||
|
||||
expect(t, "aaaaa", "aaaaa");
|
||||
}
|
||||
|
||||
/**
|
||||
* Compose the hex transliterators forward and reverse.
|
||||
*/
|
||||
public void TestCompoundHex() {
|
||||
Transliterator a = Transliterator.getInstance("Unicode-Hex");
|
||||
Transliterator b = Transliterator.getInstance("Hex-Unicode");
|
||||
Transliterator[] trans = { a, b };
|
||||
Transliterator ab = new CompoundTransliterator("ab", trans);
|
||||
String s = "abcde";
|
||||
expect(ab, s, s);
|
||||
|
||||
trans = new Transliterator[] { b, a };
|
||||
Transliterator ba = new CompoundTransliterator("ba", trans);
|
||||
ReplaceableString str = new ReplaceableString(s);
|
||||
a.transliterate(str);
|
||||
expect(ba, str.toString(), str.toString());
|
||||
}
|
||||
|
||||
/**
|
||||
* Do some basic tests of filtering.
|
||||
*/
|
||||
public void TestFiltering() {
|
||||
Transliterator hex = Transliterator.getInstance("Unicode-Hex");
|
||||
hex.setFilter(new UnicodeFilter() {
|
||||
public boolean isIn(char c) {
|
||||
return c != 'c';
|
||||
}
|
||||
});
|
||||
String s = "abcde";
|
||||
String out = hex.transliterate(s);
|
||||
String exp = "\\u0061\\u0062c\\u0064\\u0065";
|
||||
if (out.equals(exp)) {
|
||||
logln("Ok: \"" + exp + "\"");
|
||||
} else {
|
||||
logln("FAIL: \"" + out + "\", wanted \"" + exp + "\"");
|
||||
}
|
||||
}
|
||||
|
||||
//======================================================================
|
||||
// Support methods
|
||||
//======================================================================
|
||||
|
||||
void expect(String rules, String source, String expectedResult) {
|
||||
expect(new RuleBasedTransliterator("<ID>", rules), source, expectedResult);
|
||||
}
|
||||
|
||||
void expect(Transliterator t, String source, String expectedResult,
|
||||
Transliterator reverseTransliterator) {
|
||||
expect(t, source, expectedResult);
|
||||
if (reverseTransliterator != null) {
|
||||
expect(reverseTransliterator, expectedResult, source);
|
||||
}
|
||||
}
|
||||
|
||||
void expect(Transliterator t, String source, String expectedResult) {
|
||||
String result = t.transliterate(source);
|
||||
expectAux(t.getID() + ":String", source, result, expectedResult);
|
||||
|
||||
ReplaceableString rsource = new ReplaceableString(source);
|
||||
t.transliterate(rsource);
|
||||
result = rsource.toString();
|
||||
expectAux(t.getID() + ":Replaceable", source, result, expectedResult);
|
||||
|
||||
// Test keyboard (incremental) transliteration -- this result
|
||||
// must be the same after we finalize (see below).
|
||||
rsource.getStringBuffer().setLength(0);
|
||||
int[] index = { 0, 0, 0 };
|
||||
StringBuffer log = new StringBuffer();
|
||||
|
||||
for (int i=0; i<source.length(); ++i) {
|
||||
if (i != 0) {
|
||||
log.append(" + ");
|
||||
}
|
||||
log.append(source.charAt(i)).append(" -> ");
|
||||
t.keyboardTransliterate(rsource, index,
|
||||
String.valueOf(source.charAt(i)));
|
||||
// Append the string buffer with a vertical bar '|' where
|
||||
// the committed index is.
|
||||
String s = rsource.toString();
|
||||
log.append(s.substring(0, index[Transliterator.CURSOR])).
|
||||
append('|').
|
||||
append(s.substring(index[Transliterator.CURSOR]));
|
||||
}
|
||||
|
||||
// As a final step in keyboard transliteration, we must call
|
||||
// transliterate to finish off any pending partial matches that
|
||||
// were waiting for more input.
|
||||
t.finishKeyboardTransliteration(rsource, index);
|
||||
result = rsource.toString();
|
||||
log.append(" => ").append(rsource.toString());
|
||||
|
||||
expectAux(t.getID() + ":Keyboard", log.toString(),
|
||||
result.equals(expectedResult),
|
||||
expectedResult);
|
||||
}
|
||||
|
||||
void expectAux(String tag, String source,
|
||||
String result, String expectedResult) {
|
||||
expectAux(tag, source + " -> " + result,
|
||||
result.equals(expectedResult),
|
||||
expectedResult);
|
||||
}
|
||||
|
||||
void expectAux(String tag, String summary, boolean pass,
|
||||
String expectedResult) {
|
||||
if (pass) {
|
||||
logln("("+tag+") " + escape(summary));
|
||||
} else {
|
||||
errln("FAIL: ("+tag+") "
|
||||
+ escape(summary)
|
||||
+ ", expected " + escape(expectedResult));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Escape non-ASCII characters as Unicode.
|
||||
*/
|
||||
public static final String escape(String s) {
|
||||
StringBuffer buf = new StringBuffer();
|
||||
for (int i=0; i<s.length(); ++i) {
|
||||
char c = s.charAt(i);
|
||||
if (c >= ' ' && c <= 0x007F) {
|
||||
buf.append(c);
|
||||
} else {
|
||||
buf.append("\\u");
|
||||
if (c < 0x1000) {
|
||||
buf.append('0');
|
||||
if (c < 0x100) {
|
||||
buf.append('0');
|
||||
if (c < 0x10) {
|
||||
buf.append('0');
|
||||
}
|
||||
}
|
||||
}
|
||||
buf.append(Integer.toHexString(c));
|
||||
}
|
||||
}
|
||||
return buf.toString();
|
||||
}
|
||||
|
||||
/*
|
||||
static final String KANA_RT_DATA =
|
||||
"a "+
|
||||
|
||||
"ba bi bu be bo "+
|
||||
"bya byi byu bye byo "+
|
||||
"bba "+
|
||||
|
||||
"da di du de do "+
|
||||
"dya dyi dyu dye dyo "+
|
||||
"dha dhi dhu dhe dho "+
|
||||
"dda "+
|
||||
|
||||
"e "+
|
||||
|
||||
"fa fi fe fo "+
|
||||
"fya fyu fyo "+
|
||||
"ffa "+
|
||||
|
||||
"ga gi gu ge go "+
|
||||
"gya gyi gyu gye gyo "+
|
||||
"gwa gwi gwu gwe gwo "+
|
||||
"gga "+
|
||||
|
||||
"ha hi hu he ho "+
|
||||
"hya hyi hyu hye hyo "+
|
||||
"hha "+
|
||||
|
||||
"i "+
|
||||
|
||||
"ka ki ku ke ko "+
|
||||
"kwa kwi kwu kwe kwo "+
|
||||
"kya kyi kyu kye kyo "+
|
||||
"kka "+
|
||||
|
||||
"ma mi mu me mo "+
|
||||
"mya myi myu mye myo "+
|
||||
"mba mfa mma mpa mva "+
|
||||
"m'' "+
|
||||
|
||||
"na ni nu ne no "+
|
||||
"nya nyi nyu nye nyo "+
|
||||
"nn n'' n "+
|
||||
|
||||
"o "+
|
||||
|
||||
"pa pi pu pe po "+
|
||||
"pya pyi pyu pye pyo "+
|
||||
"ppa "+
|
||||
|
||||
"qa qi qu qe qo "+
|
||||
"qya qyi qyu qye qyo "+
|
||||
"qqa "+
|
||||
|
||||
"ra ri ru re ro "+
|
||||
"rya ryi ryu rye ryo "+
|
||||
"rra "+
|
||||
|
||||
"sa si su se so "+
|
||||
"sya syi syu sye syo "+
|
||||
"ssya ssa "+
|
||||
|
||||
"ta ti tu te to "+
|
||||
"tha thi thu the tho "+
|
||||
"tsa tsi tse tso "+
|
||||
"tya tyi tyu tye tyo "+
|
||||
"ttsa "+
|
||||
"tta "+
|
||||
|
||||
"u "+
|
||||
|
||||
"va vi vu ve vo "+
|
||||
"vya vyi vyu vye vyo "+
|
||||
"vva "+
|
||||
|
||||
"wa wi we wo "+
|
||||
"wwa "+
|
||||
|
||||
"ya yu ye yo "+
|
||||
"yya "+
|
||||
|
||||
"za zi zu ze zo "+
|
||||
"zya zyi zyu zye zyo "+
|
||||
"zza "+
|
||||
|
||||
"xa xi xu xe xo "+
|
||||
"xka xke "+
|
||||
"xtu "+
|
||||
"xwa "+
|
||||
"xya xyu xyo "+
|
||||
|
||||
"akka akki akku akke akko "+
|
||||
"akkya akkyu akkyo "+
|
||||
|
||||
"atta atti attu atte atto "+
|
||||
"attya attyu attyo "+
|
||||
"adda addi addu adde addo "+
|
||||
|
||||
"atcha atchi atchu atche atcho "+
|
||||
|
||||
"assa assi assu asse asso "+
|
||||
"assya assyu assyo "+
|
||||
|
||||
"ahha ahhi ahhu ahhe ahho "+
|
||||
"appa appi appu appe appo "+
|
||||
|
||||
"an "+
|
||||
"ana ani anu ane ano "+
|
||||
"anna anni annu anne anno "+
|
||||
"an'a an'i an'u an'e an'o "+
|
||||
|
||||
"annna annni annnu annne annno "+
|
||||
"an'na an'ni an'nu an'ne an'no "+
|
||||
|
||||
"anka anki anku anke anko "+
|
||||
"anga angi angu ange ango "+
|
||||
|
||||
"ansa ansi ansu anse anso "+
|
||||
"anza anzi anzu anze anzo "+
|
||||
"anzya anzyu anzyo "+
|
||||
|
||||
"anta anti antu ante anto "+
|
||||
"antya antyu antyo "+
|
||||
"anda andi andu ande ando "+
|
||||
|
||||
"ancha anchi anchu anche ancho "+
|
||||
"anja anji anju anje anjo "+
|
||||
"antsa antsu antso "+
|
||||
|
||||
"anpa anpi anpu anpe anpo "+
|
||||
"ampa ampi ampu ampe ampo "+
|
||||
|
||||
"anba anbi anbu anbe anbo "+
|
||||
"amba ambi ambu ambe ambo "+
|
||||
|
||||
"anma anmi anmu anme anmo "+
|
||||
"amma ammi ammu amme ammo "+
|
||||
|
||||
"anwa anwi anwu anwe anwo "+
|
||||
|
||||
"anha anhi anhu anhe anho "+
|
||||
|
||||
"anya anyi anyu anye anyo "+
|
||||
"annya annyi annyu annye annyo "+
|
||||
"an'ya an'yi an'yu an'ye an'yo "+
|
||||
|
||||
"kkk "+
|
||||
"ggg "+
|
||||
"sss "+
|
||||
"zzz "+
|
||||
"ttt "+
|
||||
"ddd "+
|
||||
"nnn "+
|
||||
"hhh "+
|
||||
"bbb "+
|
||||
"ppp "+
|
||||
"mmm "+
|
||||
"yyy "+
|
||||
"rrr "+
|
||||
"www ";
|
||||
*/
|
||||
|
||||
/*+
|
||||
|
||||
"A I U E O "+
|
||||
"XA XI XU XE XO "+
|
||||
|
||||
"KA KI KU KE KO "+
|
||||
"KYA KYI KYU KYE KYO "+
|
||||
"KWA KWI KWU KWE KWO "+
|
||||
"QA QI QU QE QO "+
|
||||
"QYA QYI QYU QYE QYO "+
|
||||
"XKA XKE "+
|
||||
|
||||
"GA GI GU GE GO "+
|
||||
"GYA GYI GYU GYE GYO "+
|
||||
"GWA GWI GWU GWE GWO "+
|
||||
|
||||
"SA SI SU SE SO "+
|
||||
"SHA SHI SHU SHE SHO "+
|
||||
"SYA SYI SYU SYE SYO "+
|
||||
|
||||
"ZA ZI ZU ZE ZO "+
|
||||
"ZYA ZYI ZYU ZYE ZYO "+
|
||||
"JA JI JU JE JO "+
|
||||
"JYA JYU JYO "+
|
||||
|
||||
"TA TI TU TE TO "+
|
||||
"XTU XTSU "+
|
||||
"TYA TYU TYO "+
|
||||
"CYA CYU CYO "+
|
||||
"CHA CHI CHU CHE CHO "+
|
||||
"TSA TSI TSU TSE TSO "+
|
||||
"DA DI DU DE DO "+
|
||||
"DYA DYU DYO "+
|
||||
"THA THI THU THE THO "+
|
||||
"DHA DHI DHU DHE DHO "+
|
||||
|
||||
"NA NI NU NE NO "+
|
||||
"NYA NYU NYO "+
|
||||
|
||||
"HA HI HU HE HO "+
|
||||
"HYA HYU HYO "+
|
||||
"FA FI FU FE FO "+
|
||||
"FYA FYU FYO "+
|
||||
"BA BI BU BE BO "+
|
||||
"BYA BYU BYO "+
|
||||
"PA PI PU PE PO "+
|
||||
"PYA PYU PYO "+
|
||||
|
||||
"MA MI MU ME MO "+
|
||||
"MYA MYU MYO "+
|
||||
"YA YI YU YE YO "+
|
||||
"XYA XYI XYU XYE XYO "+
|
||||
|
||||
"RA RI RU RE RO "+
|
||||
"LA LI LU LE LO "+
|
||||
"RYA RYI RYU RYE RYO "+
|
||||
"LYA LYI LYU LYE LYO "+
|
||||
|
||||
"WA WI WU WE WO "+
|
||||
"VA VI VU VE VO "+
|
||||
"VYA VYU VYO "+
|
||||
|
||||
"CYA CYI CYU CYE CYO "+
|
||||
|
||||
"NN "+
|
||||
"N' "+
|
||||
"N "+
|
||||
|
||||
"AKKA AKKI AKKU AKKE AKKO "+
|
||||
"AKKYA AKKYU AKKYO "+
|
||||
|
||||
"ATTA ATTI ATTU ATTE ATTO "+
|
||||
"ATTYA ATTYU ATTYO "+
|
||||
"ADDA ADDI ADDU ADDE ADDO "+
|
||||
|
||||
"ATCHA ATCHI ATCHU ATCHE ATCHO "+
|
||||
|
||||
"ASSA ASSI ASSU ASSE ASSO "+
|
||||
"ASSYA ASSYU ASSYO "+
|
||||
|
||||
"AHHA AHHI AHHU AHHE AHHO "+
|
||||
"APPA APPI APPU APPE APPO "+
|
||||
|
||||
"AN "+
|
||||
"ANA ANI ANU ANE ANO "+
|
||||
"ANNA ANNI ANNU ANNE ANNO "+
|
||||
"AN'A AN'I AN'U AN'E AN'O "+
|
||||
|
||||
"ANNNA ANNNI ANNNU ANNNE ANNNO "+
|
||||
"AN'NA AN'NI AN'NU AN'NE AN'NO "+
|
||||
|
||||
"ANKA ANKI ANKU ANKE ANKO "+
|
||||
"ANGA ANGI ANGU ANGE ANGO "+
|
||||
|
||||
"ANSA ANSI ANSU ANSE ANSO "+
|
||||
"ANZA ANZI ANZU ANZE ANZO "+
|
||||
"ANZYA ANZYU ANZYO "+
|
||||
|
||||
"ANTA ANTI ANTU ANTE ANTO "+
|
||||
"ANTYA ANTYU ANTYO "+
|
||||
"ANDA ANDI ANDU ANDE ANDO "+
|
||||
|
||||
"ANCHA ANCHI ANCHU ANCHE ANCHO "+
|
||||
"ANJA ANJI ANJU ANJE ANJO "+
|
||||
"ANTSA ANTSU ANTSO "+
|
||||
|
||||
"ANPA ANPI ANPU ANPE ANPO "+
|
||||
"AMPA AMPI AMPU AMPE AMPO "+
|
||||
|
||||
"ANBA ANBI ANBU ANBE ANBO "+
|
||||
"AMBA AMBI AMBU AMBE AMBO "+
|
||||
|
||||
"ANMA ANMI ANMU ANME ANMO "+
|
||||
"AMMA AMMI AMMU AMME AMMO "+
|
||||
|
||||
"ANWA ANWI ANWU ANWE ANWO "+
|
||||
|
||||
"ANHA ANHI ANHU ANHE ANHO "+
|
||||
|
||||
"ANYA ANYI ANYU ANYE ANYO "+
|
||||
"ANNYA ANNYI ANNYU ANNYE ANNYO "+
|
||||
"AN'YA AN'YI AN'YU AN'YE AN'YO "+
|
||||
|
||||
"KKK "+
|
||||
"GGG "+
|
||||
"SSS "+
|
||||
"ZZZ "+
|
||||
"TTT "+
|
||||
"DDD "+
|
||||
"NNN "+
|
||||
"HHH "+
|
||||
"BBB "+
|
||||
"PPP "+
|
||||
"MMM "+
|
||||
"YYY "+
|
||||
"RRR "+
|
||||
"WWW";*/
|
||||
}
|
118
icu4j/src/com/ibm/icu/dev/test/translit/UnicodeSetTest.java
Executable file
118
icu4j/src/com/ibm/icu/dev/test/translit/UnicodeSetTest.java
Executable file
|
@ -0,0 +1,118 @@
|
|||
import com.ibm.text.*;
|
||||
import java.text.*;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* @test
|
||||
* @summary General test of UnicodeSet
|
||||
*/
|
||||
public class UnicodeSetTest extends IntlTest {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
new UnicodeSetTest().run(args);
|
||||
}
|
||||
|
||||
public void TestPatterns() {
|
||||
UnicodeSet set = new UnicodeSet();
|
||||
expectPattern(set, "[[a-m]&[d-z]&[k-y]]", "km");
|
||||
expectPattern(set, "[[a-z]-[m-y]-[d-r]]", "aczz");
|
||||
expectPattern(set, "[a\\-z]", "--aazz");
|
||||
expectPattern(set, "[-az]", "--aazz");
|
||||
expectPattern(set, "[az-]", "--aazz");
|
||||
expectPattern(set, "[[[a-z]-[aeiou]i]]", "bdfnptvz");
|
||||
|
||||
// Throw in a test of complement
|
||||
set.complement();
|
||||
String exp = '\u0000' + "aeeoouu" + (char)('z'+1) + '\uFFFF';
|
||||
expectPairs(set, exp);
|
||||
}
|
||||
|
||||
public void TestAddRemove() {
|
||||
UnicodeSet set = new UnicodeSet();
|
||||
set.add('a', 'z');
|
||||
expectPairs(set, "az");
|
||||
set.remove('m', 'p');
|
||||
expectPairs(set, "alqz");
|
||||
set.remove('e', 'g');
|
||||
expectPairs(set, "adhlqz");
|
||||
set.remove('d', 'i');
|
||||
expectPairs(set, "acjlqz");
|
||||
set.remove('c', 'r');
|
||||
expectPairs(set, "absz");
|
||||
set.add('f', 'q');
|
||||
expectPairs(set, "abfqsz");
|
||||
set.remove('a', 'g');
|
||||
expectPairs(set, "hqsz");
|
||||
set.remove('a', 'z');
|
||||
expectPairs(set, "");
|
||||
|
||||
// Try removing an entire set from another set
|
||||
expectPattern(set, "[c-x]", "cx");
|
||||
UnicodeSet set2 = new UnicodeSet();
|
||||
expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
|
||||
set.removeAll(set2);
|
||||
expectPairs(set, "deluxx");
|
||||
|
||||
// Try adding an entire set to another set
|
||||
expectPattern(set, "[jackiemclean]", "aacceein");
|
||||
expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
|
||||
set.addAll(set2);
|
||||
expectPairs(set, "aacehort");
|
||||
|
||||
// Test commutativity
|
||||
expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
|
||||
expectPattern(set2, "[jackiemclean]", "aacceein");
|
||||
set.addAll(set2);
|
||||
expectPairs(set, "aacehort");
|
||||
}
|
||||
|
||||
void expectPattern(UnicodeSet set,
|
||||
String pattern,
|
||||
String expectedPairs) {
|
||||
set.applyPattern(pattern);
|
||||
if (!set.getPairs().equals(expectedPairs)) {
|
||||
errln("FAIL: applyPattern(\"" + pattern +
|
||||
"\") => pairs \"" +
|
||||
escape(set.getPairs()) + "\", expected \"" +
|
||||
escape(expectedPairs) + "\"");
|
||||
} else {
|
||||
logln("Ok: applyPattern(\"" + pattern +
|
||||
"\") => pairs \"" +
|
||||
escape(set.getPairs()) + "\"");
|
||||
}
|
||||
}
|
||||
|
||||
void expectPairs(UnicodeSet set, String expectedPairs) {
|
||||
if (!set.getPairs().equals(expectedPairs)) {
|
||||
errln("FAIL: Expected pair list \"" +
|
||||
escape(expectedPairs) + "\", got \"" +
|
||||
escape(set.getPairs()) + "\"");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Escape non-ASCII characters as Unicode.
|
||||
*/
|
||||
static final String escape(String s) {
|
||||
StringBuffer buf = new StringBuffer();
|
||||
for (int i=0; i<s.length(); ++i) {
|
||||
char c = s.charAt(i);
|
||||
if (c >= ' ' && c <= 0x007F) {
|
||||
buf.append(c);
|
||||
} else {
|
||||
buf.append("\\u");
|
||||
if (c < 0x1000) {
|
||||
buf.append('0');
|
||||
if (c < 0x100) {
|
||||
buf.append('0');
|
||||
if (c < 0x10) {
|
||||
buf.append('0');
|
||||
}
|
||||
}
|
||||
}
|
||||
buf.append(Integer.toHexString(c));
|
||||
}
|
||||
}
|
||||
return buf.toString();
|
||||
}
|
||||
}
|
285
icu4j/src/com/ibm/icu/text/CompoundTransliterator.java
Executable file
285
icu4j/src/com/ibm/icu/text/CompoundTransliterator.java
Executable file
|
@ -0,0 +1,285 @@
|
|||
package com.ibm.text;
|
||||
|
||||
import java.util.Enumeration;
|
||||
import java.util.Vector;
|
||||
|
||||
/**
|
||||
* A transliterator that is composed of two or more other
|
||||
* transliterator objects linked together. For example, if one
|
||||
* transliterator transliterates from script A to script B, and
|
||||
* another transliterates from script B to script C, the two may be
|
||||
* combined to form a new transliterator from A to C.
|
||||
*
|
||||
* <p>Composed transliterators may not behave as expected. For
|
||||
* example, inverses may not combine to form the identity
|
||||
* transliterator. See the class documentation for {@link
|
||||
* Transliterator} for details.
|
||||
*
|
||||
* <p>If a non-<tt>null</tt> <tt>UnicodeFilter</tt> is applied to a
|
||||
* <tt>CompoundTransliterator</tt>, it has the effect of being
|
||||
* logically <b>and</b>ed with the filter of each transliterator in
|
||||
* the chain.
|
||||
*
|
||||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: CompoundTransliterator.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
|
||||
*/
|
||||
public class CompoundTransliterator extends Transliterator {
|
||||
|
||||
private static final boolean DEBUG = false;
|
||||
|
||||
private Transliterator[] trans;
|
||||
|
||||
private static final String COPYRIGHT =
|
||||
"\u00A9 IBM Corporation 1999. All rights reserved.";
|
||||
|
||||
/**
|
||||
* Constructs a new compound transliterator given an array of
|
||||
* transliterators. The array of transliterators may be of any
|
||||
* length, including zero or one, however, useful compound
|
||||
* transliterators have at least two components.
|
||||
* @param transliterators array of <code>Transliterator</code>
|
||||
* objects
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
*/
|
||||
public CompoundTransliterator(String ID, Transliterator[] transliterators,
|
||||
UnicodeFilter filter) {
|
||||
super(ID, filter);
|
||||
trans = new Transliterator[transliterators.length];
|
||||
System.arraycopy(transliterators, 0, trans, 0, trans.length);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a new compound transliterator given an array of
|
||||
* transliterators. The array of transliterators may be of any
|
||||
* length, including zero or one, however, useful compound
|
||||
* transliterators have at least two components.
|
||||
* @param transliterators array of <code>Transliterator</code>
|
||||
* objects
|
||||
*/
|
||||
public CompoundTransliterator(String ID, Transliterator[] transliterators) {
|
||||
this(ID, transliterators, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of transliterators in this chain.
|
||||
* @return number of transliterators in this chain.
|
||||
*/
|
||||
public int getCount() {
|
||||
return trans.length;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the transliterator at the given index in this chain.
|
||||
* @param index index into chain, from 0 to <code>getCount() - 1</code>
|
||||
* @return transliterator at the given index
|
||||
*/
|
||||
public Transliterator getTransliterator(int index) {
|
||||
return trans[index];
|
||||
}
|
||||
|
||||
/**
|
||||
* Transliterates a segment of a string. <code>Transliterator</code> API.
|
||||
* @param text the string to be transliterated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @return the new limit index
|
||||
*/
|
||||
public int transliterate(Replaceable text, int start, int limit) {
|
||||
for (int i=0; i<trans.length; ++i) {
|
||||
limit = trans[i].transliterate(text, start, limit);
|
||||
}
|
||||
return limit;
|
||||
}
|
||||
|
||||
/**
|
||||
* Implements {@link Transliterator#handleKeyboardTransliterate}.
|
||||
*/
|
||||
protected void handleKeyboardTransliterate(Replaceable text,
|
||||
int[] index) {
|
||||
/* Call each transliterator with the same start value and
|
||||
* initial cursor index, but with the limit index as modified
|
||||
* by preceding transliterators. The cursor index must be
|
||||
* reset for each transliterator to give each a chance to
|
||||
* transliterate the text. The initial cursor index is known
|
||||
* to still point to the same place after each transliterator
|
||||
* is called because each transliterator will not change the
|
||||
* text between start and the initial value of cursor.
|
||||
*
|
||||
* IMPORTANT: After the first transliterator, each subsequent
|
||||
* transliterator only gets to transliterate text committed by
|
||||
* preceding transliterators; that is, the cursor (output
|
||||
* value) of transliterator i becomes the limit (input value)
|
||||
* of transliterator i+1. Finally, the overall limit is fixed
|
||||
* up before we return.
|
||||
*
|
||||
* Assumptions we make here:
|
||||
* (1) start <= cursor <= limit ;cursor valid on entry
|
||||
* (2) cursor <= cursor' <= limit' ;cursor doesn't move back
|
||||
* (3) cursor <= limit' ;text before cursor unchanged
|
||||
* - cursor' is the value of cursor after calling handleKT
|
||||
* - limit' is the value of limit after calling handleKT
|
||||
*/
|
||||
|
||||
/**
|
||||
* Example: 3 transliterators. This example illustrates the
|
||||
* mechanics we need to implement. S, C, and L are the start,
|
||||
* cursor, and limit. gl is the globalLimit.
|
||||
*
|
||||
* 1. h-u, changes hex to Unicode
|
||||
*
|
||||
* 4 7 a d 0 4 7 a
|
||||
* abc/u0061/u => abca/u
|
||||
* S C L S C L gl=f->a
|
||||
*
|
||||
* 2. upup, changes "x" to "XX"
|
||||
*
|
||||
* 4 7 a 4 7 a
|
||||
* abca/u => abcAA/u
|
||||
* S CL S C
|
||||
* L gl=a->b
|
||||
* 3. u-h, changes Unicode to hex
|
||||
*
|
||||
* 4 7 a 4 7 a d 0 3
|
||||
* abcAA/u => abc/u0041/u0041/u
|
||||
* S C L S C
|
||||
* L gl=b->15
|
||||
* 4. return
|
||||
*
|
||||
* 4 7 a d 0 3
|
||||
* abc/u0041/u0041/u
|
||||
* S C L
|
||||
*/
|
||||
|
||||
/**
|
||||
* One more wrinkle. If there is a filter F for the compound
|
||||
* transliterator as a whole, then we need to modify every
|
||||
* non-null filter f in the chain to be f' = F & f. Then,
|
||||
* when we're done, we restore the original filters.
|
||||
*
|
||||
* A possible future optimization is to change f to f' at
|
||||
* construction time, but then if anyone else is using the
|
||||
* transliterators in the chain outside of this context, they
|
||||
* will get unexpected results.
|
||||
*/
|
||||
UnicodeFilter F = getFilter();
|
||||
UnicodeFilter[] f = null;
|
||||
if (F != null) {
|
||||
f = new UnicodeFilter[trans.length];
|
||||
for (int i=0; i<f.length; ++i) {
|
||||
f[i] = trans[i].getFilter();
|
||||
trans[i].setFilter(UnicodeFilterLogic.and(F, f[i]));
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
int cursor = index[CURSOR];
|
||||
int limit = index[LIMIT];
|
||||
int globalLimit = limit;
|
||||
/* globalLimit is the overall limit. We keep track of this
|
||||
* since we overwrite index[LIMIT] with the previous
|
||||
* index[CURSOR]. After each transliteration, we update
|
||||
* globalLimit for insertions or deletions that have happened.
|
||||
*/
|
||||
|
||||
for (int i=0; i<trans.length; ++i) {
|
||||
index[CURSOR] = cursor; // Reset cursor
|
||||
index[LIMIT] = limit;
|
||||
|
||||
if (DEBUG) {
|
||||
System.out.print(escape(i + ": \"" +
|
||||
substring(text, index[START], index[CURSOR]) + '|' +
|
||||
substring(text, index[CURSOR], index[LIMIT]) +
|
||||
"\" -> \""));
|
||||
}
|
||||
|
||||
trans[i].handleKeyboardTransliterate(text, index);
|
||||
|
||||
if (DEBUG) {
|
||||
System.out.println(escape(
|
||||
substring(text, index[START], index[CURSOR]) + '|' +
|
||||
substring(text, index[CURSOR], index[LIMIT]) +
|
||||
'"'));
|
||||
}
|
||||
|
||||
// Adjust overall limit for insertions/deletions
|
||||
globalLimit += index[LIMIT] - limit;
|
||||
limit = index[CURSOR]; // Move limit to end of committed text
|
||||
}
|
||||
// Cursor is good where it is -- where the last
|
||||
// transliterator left it. Limit needs to be put back
|
||||
// where it was, modulo adjustments for deletions/insertions.
|
||||
index[LIMIT] = globalLimit;
|
||||
|
||||
} finally {
|
||||
// Fixup the transliterator filters, if we had to modify them.
|
||||
if (f != null) {
|
||||
for (int i=0; i<f.length; ++i) {
|
||||
trans[i].setFilter(f[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the length of the longest context required by this transliterator.
|
||||
* This is <em>preceding</em> context.
|
||||
* @return maximum number of preceding context characters this
|
||||
* transliterator needs to examine
|
||||
*/
|
||||
protected int getMaximumContextLength() {
|
||||
int max = 0;
|
||||
for (int i=0; i<trans.length; ++i) {
|
||||
int len = trans[i].getMaximumContextLength();
|
||||
if (len > max) {
|
||||
max = len;
|
||||
}
|
||||
}
|
||||
return max;
|
||||
}
|
||||
|
||||
/**
|
||||
* DEBUG
|
||||
* Returns a substring of a Replaceable.
|
||||
*/
|
||||
private static final String substring(Replaceable str, int start, int limit) {
|
||||
StringBuffer buf = new StringBuffer();
|
||||
while (start < limit) {
|
||||
buf.append(str.charAt(start++));
|
||||
}
|
||||
return buf.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* DEBUG
|
||||
* Escapes non-ASCII characters as Unicode.
|
||||
*/
|
||||
private static final String escape(String s) {
|
||||
StringBuffer buf = new StringBuffer();
|
||||
for (int i=0; i<s.length(); ++i) {
|
||||
char c = s.charAt(i);
|
||||
if (c >= ' ' && c <= 0x007F) {
|
||||
buf.append(c);
|
||||
} else {
|
||||
buf.append("\\u");
|
||||
if (c < 0x1000) {
|
||||
buf.append('0');
|
||||
if (c < 0x100) {
|
||||
buf.append('0');
|
||||
if (c < 0x10) {
|
||||
buf.append('0');
|
||||
}
|
||||
}
|
||||
}
|
||||
buf.append(Integer.toHexString(c));
|
||||
}
|
||||
}
|
||||
return buf.toString();
|
||||
}
|
||||
}
|
130
icu4j/src/com/ibm/icu/text/HexToUnicodeTransliterator.java
Executable file
130
icu4j/src/com/ibm/icu/text/HexToUnicodeTransliterator.java
Executable file
|
@ -0,0 +1,130 @@
|
|||
package com.ibm.text;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* A transliterator that converts from hexadecimal Unicode
|
||||
* escape sequences to the characters they represent. For example, "U+0040"
|
||||
* and '\u0040'. It recognizes the
|
||||
* prefixes "U+", "u+", "\U", and "\u". Hex values may be
|
||||
* upper- or lowercase.
|
||||
*
|
||||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: HexToUnicodeTransliterator.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
|
||||
*/
|
||||
public class HexToUnicodeTransliterator extends Transliterator {
|
||||
private static final String COPYRIGHT =
|
||||
"\u00A9 IBM Corporation 1999. All rights reserved.";
|
||||
|
||||
/**
|
||||
* Package accessible ID for this transliterator.
|
||||
*/
|
||||
static String _ID = "Hex-Unicode";
|
||||
|
||||
/**
|
||||
* Constructs a transliterator.
|
||||
*/
|
||||
public HexToUnicodeTransliterator() {
|
||||
super(_ID, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Transliterates a segment of a string. <code>Transliterator</code> API.
|
||||
* @param text the string to be transliterated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @return the new limit index
|
||||
*/
|
||||
public int transliterate(Replaceable text, int start, int limit) {
|
||||
int[] offsets = { start, limit, start };
|
||||
handleKeyboardTransliterate(text, offsets);
|
||||
return offsets[LIMIT];
|
||||
}
|
||||
|
||||
/**
|
||||
* Implements {@link Transliterator#handleKeyboardTransliterate}.
|
||||
*/
|
||||
protected void handleKeyboardTransliterate(Replaceable text,
|
||||
int[] offsets) {
|
||||
/**
|
||||
* Performs transliteration changing Unicode hexadecimal
|
||||
* escapes to characters. For example, "U+0040" -> '@'. A fixed
|
||||
* set of prefixes is recognized: "\u", "\U", "u+", "U+".
|
||||
*/
|
||||
int cursor = offsets[CURSOR];
|
||||
int limit = offsets[LIMIT];
|
||||
|
||||
int maxCursor = limit - 6;
|
||||
loop:
|
||||
while (cursor <= maxCursor) {
|
||||
char c = filteredCharAt(text, cursor + 5);
|
||||
int digit0 = Character.digit(c, 16);
|
||||
if (digit0 < 0) {
|
||||
if (c == '\\') {
|
||||
cursor += 5;
|
||||
} else if (c == 'U' || c == 'u' || c == '+') {
|
||||
cursor += 4;
|
||||
} else {
|
||||
cursor += 6;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
int u = digit0;
|
||||
|
||||
for (int i=4; i>=2; --i) {
|
||||
c = filteredCharAt(text, cursor + i);
|
||||
int digit = Character.digit(c, 16);
|
||||
if (digit < 0) {
|
||||
if (c == 'U' || c == 'u' || c == '+') {
|
||||
cursor += i-1;
|
||||
} else {
|
||||
cursor += 6;
|
||||
}
|
||||
continue loop;
|
||||
}
|
||||
u |= digit << (4 * (5-i));
|
||||
}
|
||||
|
||||
c = filteredCharAt(text, cursor);
|
||||
char d = filteredCharAt(text, cursor + 1);
|
||||
if (((c == 'U' || c == 'u') && d == '+')
|
||||
|| (c == '\\' && (d == 'U' || d == 'u'))) {
|
||||
|
||||
// At this point, we have a match; replace cursor..cursor+5
|
||||
// with u.
|
||||
text.replace(cursor, cursor+6, String.valueOf((char) u));
|
||||
limit -= 5;
|
||||
maxCursor -= 5;
|
||||
|
||||
++cursor;
|
||||
} else {
|
||||
cursor += 6;
|
||||
}
|
||||
}
|
||||
|
||||
offsets[LIMIT] = limit;
|
||||
offsets[CURSOR] = cursor;
|
||||
}
|
||||
|
||||
private char filteredCharAt(Replaceable text, int i) {
|
||||
char c;
|
||||
UnicodeFilter filter = getFilter();
|
||||
return (filter == null) ? text.charAt(i) :
|
||||
(filter.isIn(c = text.charAt(i)) ? c : '\uFFFF');
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the length of the longest context required by this transliterator.
|
||||
* This is <em>preceding</em> context.
|
||||
* @param direction either <code>FORWARD</code> or <code>REVERSE</code>
|
||||
* @return maximum number of preceding context characters this
|
||||
* transliterator needs to examine
|
||||
*/
|
||||
protected int getMaximumContextLength() {
|
||||
return 0;
|
||||
}
|
||||
}
|
77
icu4j/src/com/ibm/icu/text/Replaceable.java
Executable file
77
icu4j/src/com/ibm/icu/text/Replaceable.java
Executable file
|
@ -0,0 +1,77 @@
|
|||
package com.ibm.text;
|
||||
|
||||
/**
|
||||
* <code>Replaceable</code> is an interface that supports the
|
||||
* operation of replacing a substring with another piece of text.
|
||||
* <code>Replaceable</code> is needed in order to change a piece of
|
||||
* text while retaining style attributes. For example, if the string
|
||||
* "the <b>bold</b> font" has range (4, 8) replaced with "strong",
|
||||
* then it becomes "the <b>strong</b> font".
|
||||
*
|
||||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: Replaceable.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
|
||||
*/
|
||||
public interface Replaceable {
|
||||
/**
|
||||
* Return the number of characters in the text.
|
||||
* @return number of characters in text
|
||||
*/
|
||||
int length();
|
||||
|
||||
/**
|
||||
* Return the character at the given offset into the text.
|
||||
* @param offset an integer between 0 and <code>length()</code>-1
|
||||
* inclusive
|
||||
* @return character of text at given offset
|
||||
*/
|
||||
char charAt(int offset);
|
||||
|
||||
/**
|
||||
* Copies characters from this object into the destination
|
||||
* character array. The first character to be copied is at index
|
||||
* <code>srcStart</code>; the last character to be copied is at
|
||||
* index <code>srcLimit-1</code> (thus the total number of
|
||||
* characters to be copied is <code>srcLimit-srcStart</code>). The
|
||||
* characters are copied into the subarray of <code>dst</code>
|
||||
* starting at index <code>dstStart</code> and ending at index
|
||||
* <code>dstStart + (srcLimit-srcStart) - 1</code>.
|
||||
*
|
||||
* @param srcStart the beginning index to copy, inclusive; <code>0
|
||||
* <= start <= limit</code>.
|
||||
* @param srcLimit the ending index to copy, exclusive;
|
||||
* <code>start <= limit <= length()</code>.
|
||||
* @param dst the destination array.
|
||||
* @param dstStart the start offset in the destination array.
|
||||
*/
|
||||
void getChars(int srcStart, int srcLimit, char dst[], int dstStart);
|
||||
|
||||
/**
|
||||
* Replace a substring of this object with the given text.
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= length()</code>.
|
||||
* @param text the text to replace characters <code>start</code>
|
||||
* to <code>limit - 1</code>
|
||||
*/
|
||||
void replace(int start, int limit, String text);
|
||||
|
||||
/**
|
||||
* Replace a substring of this object with the given text.
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= length()</code>.
|
||||
* @param chars the text to replace characters <code>start</code>
|
||||
* to <code>limit - 1</code>
|
||||
* @param charsStart the beginning index into <code>chars</code>,
|
||||
* inclusive; <code>0 <= charsStart <= chars.length</code>.
|
||||
* @param charsLen the number of characters of <code>chars</code> to use, starting at <code>charsStart</code>.
|
||||
*/
|
||||
void replace(int start, int limit, char[] chars,
|
||||
int charsStart, int charsLen);
|
||||
// Note: We use length rather than limit to conform to StringBuffer
|
||||
// and System.arraycopy.
|
||||
}
|
159
icu4j/src/com/ibm/icu/text/ReplaceableString.java
Executable file
159
icu4j/src/com/ibm/icu/text/ReplaceableString.java
Executable file
|
@ -0,0 +1,159 @@
|
|||
package com.ibm.text;
|
||||
|
||||
/**
|
||||
* <code>ReplaceableString</code> is an adapter class that implements the
|
||||
* <code>Replaceable</code> API around an ordinary <code>StringBuffer</code>.
|
||||
*
|
||||
* <p><em>Note:</em> This class does not support attributes and is not
|
||||
* intended for general use. Most clients will need to implement
|
||||
* {@link Replaceable} in their text representation class.
|
||||
*
|
||||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @see Replaceable
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: ReplaceableString.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
|
||||
*/
|
||||
public class ReplaceableString implements Replaceable {
|
||||
private StringBuffer buf;
|
||||
|
||||
private static final String COPYRIGHT =
|
||||
"\u00A9 IBM Corporation 1999. All rights reserved.";
|
||||
|
||||
/**
|
||||
* Construct a new object with the given initial contents.
|
||||
* @param str initial contents
|
||||
*/
|
||||
public ReplaceableString(String str) {
|
||||
buf = new StringBuffer(str);
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct a new object using <code>buf</code> for internal
|
||||
* storage. The contents of <code>buf</code> at the time of
|
||||
* construction are used as the initial contents. <em>Note!
|
||||
* Modifications to <code>buf</code> will modify this object, and
|
||||
* vice versa.</em>
|
||||
* @param buf object to be used as internal storage
|
||||
*/
|
||||
public ReplaceableString(StringBuffer buf) {
|
||||
this.buf = buf;
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct a new empty object.
|
||||
*/
|
||||
public ReplaceableString() {
|
||||
buf = new StringBuffer();
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the contents of this object as a <code>String</code>.
|
||||
* @return string contents of this object
|
||||
*/
|
||||
public String toString() {
|
||||
return buf.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the internal storage of this object. <em>Note! Any
|
||||
* changes made to the returned object affect this object's
|
||||
* contents, and vice versa.</em>
|
||||
* @return internal buffer used by this object
|
||||
*/
|
||||
public StringBuffer getStringBuffer() {
|
||||
return buf;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the number of characters contained in this object.
|
||||
* <code>Replaceable</code> API.
|
||||
*/
|
||||
public int length() {
|
||||
return buf.length();
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the character at the given position in this object.
|
||||
* <code>Replaceable</code> API.
|
||||
* @param offset offset into the contents, from 0 to
|
||||
* <code>length()</code> - 1
|
||||
*/
|
||||
public char charAt(int offset) {
|
||||
return buf.charAt(offset);
|
||||
}
|
||||
|
||||
/**
|
||||
* Copies characters from this object into the destination
|
||||
* character array. The first character to be copied is at index
|
||||
* <code>srcStart</code>; the last character to be copied is at
|
||||
* index <code>srcLimit-1</code> (thus the total number of
|
||||
* characters to be copied is <code>srcLimit-srcStart</code>). The
|
||||
* characters are copied into the subarray of <code>dst</code>
|
||||
* starting at index <code>dstStart</code> and ending at index
|
||||
* <code>dstStart + (srcLimit-srcStart) - 1</code>.
|
||||
*
|
||||
* @param srcStart the beginning index to copy, inclusive; <code>0
|
||||
* <= start <= limit</code>.
|
||||
* @param srcLimit the ending index to copy, exclusive;
|
||||
* <code>start <= limit <= length()</code>.
|
||||
* @param dst the destination array.
|
||||
* @param dstStart the start offset in the destination array.
|
||||
*/
|
||||
public void getChars(int srcStart, int srcLimit, char dst[], int dstStart) {
|
||||
buf.getChars(srcStart, srcLimit, dst, dstStart);
|
||||
}
|
||||
|
||||
/**
|
||||
* Replace zero or more characters with new characters.
|
||||
* <code>Replaceable</code> API.
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= length()</code>.
|
||||
* @param text new text to replace characters <code>start</code> to
|
||||
* <code>limit - 1</code>
|
||||
*/
|
||||
public void replace(int start, int limit, String text) {
|
||||
if (start == limit) {
|
||||
buf.insert(start, text);
|
||||
} else {
|
||||
char[] tail = null;
|
||||
if (limit < buf.length()) {
|
||||
tail = new char[buf.length() - limit];
|
||||
buf.getChars(limit, buf.length(), tail, 0);
|
||||
}
|
||||
buf.setLength(start);
|
||||
buf.append(text);
|
||||
if (tail != null) {
|
||||
buf.append(tail);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Replace a substring of this object with the given text.
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= length()</code>.
|
||||
* @param chars the text to replace characters <code>start</code>
|
||||
* to <code>limit - 1</code>
|
||||
* @param charsStart the beginning index into <code>chars</code>,
|
||||
* inclusive; <code>0 <= start <= limit</code>.
|
||||
* @param charsLen the number of characters of <code>chars</code>.
|
||||
*/
|
||||
public void replace(int start, int limit, char[] chars,
|
||||
int charsStart, int charsLen) {
|
||||
char[] tail = null;
|
||||
if (limit < buf.length()) {
|
||||
tail = new char[buf.length() - limit];
|
||||
buf.getChars(limit, buf.length(), tail, 0);
|
||||
}
|
||||
buf.setLength(start);
|
||||
buf.append(chars, charsStart, charsLen);
|
||||
if (tail != null) {
|
||||
buf.append(tail);
|
||||
}
|
||||
}
|
||||
}
|
1187
icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java
Executable file
1187
icu4j/src/com/ibm/icu/text/RuleBasedTransliterator.java
Executable file
File diff suppressed because it is too large
Load diff
530
icu4j/src/com/ibm/icu/text/TransliterationRule.java
Executable file
530
icu4j/src/com/ibm/icu/text/TransliterationRule.java
Executable file
|
@ -0,0 +1,530 @@
|
|||
package com.ibm.text;
|
||||
|
||||
import java.util.Dictionary;
|
||||
|
||||
/**
|
||||
* A transliteration rule used by
|
||||
* <code>RuleBasedTransliterator</code>.
|
||||
* <code>TransliterationRule</code> is an immutable object.
|
||||
*
|
||||
* <p>A rule consists of an input pattern and an output string. When
|
||||
* the input pattern is matched, the output string is emitted. The
|
||||
* input pattern consists of zero or more characters which are matched
|
||||
* exactly (the key) and optional context. Context must match if it
|
||||
* is specified. Context may be specified before the key, after the
|
||||
* key, or both. The key, preceding context, and following context
|
||||
* may contain variables. Variables represent a set of Unicode
|
||||
* characters, such as the letters <i>a</i> through <i>z</i>.
|
||||
* Variables are detected by looking up each character in a supplied
|
||||
* variable list to see if it has been so defined.
|
||||
*
|
||||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
|
||||
*/
|
||||
class TransliterationRule {
|
||||
/**
|
||||
* Constant returned by <code>getMatchDegree()</code> indicating a mismatch
|
||||
* between the text and this rule. One or more characters of the context or
|
||||
* key do not match the text.
|
||||
* @see #getMatchDegree
|
||||
*/
|
||||
public static final int MISMATCH = 0;
|
||||
|
||||
/**
|
||||
* Constant returned by <code>getMatchDegree()</code> indicating a partial
|
||||
* match between the text and this rule. All characters of the text match
|
||||
* the corresponding context or key, but more characters are required for a
|
||||
* complete match. There are some key or context characters at the end of
|
||||
* the pattern that remain unmatched because the text isn't long enough.
|
||||
* @see #getMatchDegree
|
||||
*/
|
||||
public static final int PARTIAL_MATCH = 1;
|
||||
|
||||
/**
|
||||
* Constant returned by <code>getMatchDegree()</code> indicating a complete
|
||||
* match between the text and this rule. The text matches all context and
|
||||
* key characters.
|
||||
* @see #getMatchDegree
|
||||
*/
|
||||
public static final int FULL_MATCH = 2;
|
||||
|
||||
/**
|
||||
* The string that must be matched.
|
||||
*/
|
||||
private String key;
|
||||
|
||||
/**
|
||||
* The string that is emitted if the key, anteContext, and postContext
|
||||
* are matched.
|
||||
*/
|
||||
private String output;
|
||||
|
||||
/**
|
||||
* The string that must match before the key. Must not be the empty string.
|
||||
* May be null; if null, then there is no matching requirement before the
|
||||
* key.
|
||||
*/
|
||||
private String anteContext;
|
||||
|
||||
/**
|
||||
* The string that must match after the key. Must not be the empty string.
|
||||
* May be null; if null, then there is no matching requirement after the
|
||||
* key.
|
||||
*/
|
||||
private String postContext;
|
||||
|
||||
/**
|
||||
* The position of the cursor after emitting the output string, from 0 to
|
||||
* output.length(). For most rules with no special cursor specification,
|
||||
* the cursorPos is output.length().
|
||||
*/
|
||||
private int cursorPos;
|
||||
|
||||
/**
|
||||
* A string used to implement masks().
|
||||
*/
|
||||
private String maskKey;
|
||||
|
||||
private static final String COPYRIGHT =
|
||||
"\u00A9 IBM Corporation 1999. All rights reserved.";
|
||||
|
||||
/**
|
||||
* Construct a new rule with the given key, output text, and other
|
||||
* attributes. Zero, one, or two context strings may be specified. A
|
||||
* cursor position may be specified for the output text.
|
||||
* @param key the string to match
|
||||
* @param output the string to produce when the <code>key</code> is seen
|
||||
* @param anteContext if not null and not empty, then it must be matched
|
||||
* before the <code>key</code>
|
||||
* @param postContext if not null and not empty, then it must be matched
|
||||
* after the <code>key</code>
|
||||
* @param cursorPos a position for the cursor after the <code>output</code>
|
||||
* is emitted. If less than zero, then the cursor is placed after the
|
||||
* <code>output</code>; that is, -1 is equivalent to
|
||||
* <code>output.length()</code>. If greater than
|
||||
* <code>output.length()</code> then an exception is thrown.
|
||||
* @exception IllegalArgumentException if the cursor position is out of
|
||||
* range.
|
||||
*/
|
||||
public TransliterationRule(String key, String output,
|
||||
String anteContext, String postContext,
|
||||
int cursorPos) {
|
||||
this.key = key;
|
||||
this.output = output;
|
||||
this.anteContext = (anteContext != null && anteContext.length() > 0)
|
||||
? anteContext : null;
|
||||
this.postContext = (postContext != null && postContext.length() > 0)
|
||||
? postContext : null;
|
||||
this.cursorPos = cursorPos < 0 ? output.length() : cursorPos;
|
||||
if (this.cursorPos > output.length()) {
|
||||
throw new IllegalArgumentException("Illegal cursor position");
|
||||
}
|
||||
|
||||
/* The mask key is needed when we are adding individual rules to a rule
|
||||
* set, for performance. Here are the numbers: Without mask key, 13.0
|
||||
* seconds. With mask key, 6.2 seconds. However, once the rules have
|
||||
* been added to the set, then they can be discarded to free up space.
|
||||
* This is what the freeze() method does. After freeze() has been
|
||||
* called, the method masks() must NOT be called.
|
||||
*/
|
||||
maskKey = key;
|
||||
if (postContext != null) {
|
||||
maskKey += postContext;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the length of the key. Equivalent to <code>getKey().length()</code>.
|
||||
* @return the length of the match key.
|
||||
*/
|
||||
public int getKeyLength() {
|
||||
return key.length();
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the key.
|
||||
* @return the match key.
|
||||
*/
|
||||
public String getKey() {
|
||||
return key;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the output string.
|
||||
* @return the output string.
|
||||
*/
|
||||
public String getOutput() {
|
||||
return output;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the position of the cursor within the output string.
|
||||
* @return a value from 0 to <code>getOutput().length()</code>, inclusive.
|
||||
*/
|
||||
public int getCursorPos() {
|
||||
return cursorPos;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the preceding context length. This method is needed to
|
||||
* support the <code>Transliterator</code> method
|
||||
* <code>getMaximumContextLength()</code>.
|
||||
*/
|
||||
public int getAnteContextLength() {
|
||||
return anteContext == null ? 0 : anteContext.length();
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true if this rule masks another rule. If r1 masks r2 then
|
||||
* r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks
|
||||
* r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y".
|
||||
* "[c]a>x" masks "[dc]a>y".
|
||||
*
|
||||
* <p>This method must not be called after freeze() is called.
|
||||
*/
|
||||
public boolean masks(TransliterationRule r2) {
|
||||
/* There are three cases of masking. In each instance, rule1
|
||||
* masks rule2.
|
||||
*
|
||||
* 1. KEY mask: len(key1) < len(key2), key2 starts with key1.
|
||||
*
|
||||
* 2. PREFIX mask: key1 == key2, len(prefix1) < len(prefix2),
|
||||
* prefix2 ends with prefix1, suffix2 starts with suffix1.
|
||||
*
|
||||
* 3. SUFFIX mask: key1 == key2, len(suffix1) < len(suffix2),
|
||||
* prefix2 ends with prefix1, suffix2 starts with suffix1.
|
||||
*/
|
||||
|
||||
/* LIMITATION of the current mask algorithm: Some rule
|
||||
* maskings are currently not detected. For example,
|
||||
* "{Lu}]a>x" masks "A]a>y". To detect these sorts of masking,
|
||||
* we need a subset operator on UnicodeSet objects, which we
|
||||
* currently do not have. This can be added later.
|
||||
*/
|
||||
return ((maskKey.length() < r2.maskKey.length() &&
|
||||
r2.maskKey.startsWith(maskKey)) ||
|
||||
(r2.anteContext != null && maskKey.equals(r2.maskKey) &&
|
||||
((anteContext == null) ||
|
||||
(anteContext.length() < r2.anteContext.length() &&
|
||||
r2.anteContext.endsWith(anteContext)))));
|
||||
}
|
||||
|
||||
/**
|
||||
* Free up space. Once this method is called, masks() must NOT be called.
|
||||
* If it is called, an exception will be thrown.
|
||||
*/
|
||||
public void freeze() {
|
||||
maskKey = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a string representation of this object.
|
||||
* @return string representation of this object
|
||||
*/
|
||||
public String toString() {
|
||||
return getClass().getName() + '['
|
||||
+ escape((anteContext != null ? ("[" + anteContext + ']') : "")
|
||||
+ key
|
||||
+ (postContext != null ? ("[" + postContext + ']') : "")
|
||||
+ " -> "
|
||||
+ (cursorPos < output.length()
|
||||
? (output.substring(0, cursorPos) + '|' + output.substring(cursorPos))
|
||||
: output))
|
||||
+ ']';
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true if this rule matches the given text. The text being matched
|
||||
* occupies a virtual buffer consisting of the contents of
|
||||
* <code>result</code> concatenated to a substring of <code>text</code>.
|
||||
* The substring is specified by <code>start</code> and <code>limit</code>.
|
||||
* The value of <code>cursor</code> is an index into this virtual buffer,
|
||||
* from 0 to the length of the buffer. In terms of the parameters,
|
||||
* <code>cursor</code> must be between 0 and <code>result.length() + limit -
|
||||
* start</code>.
|
||||
* @param text the untranslated text
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param result translated text so far
|
||||
* @param cursor position at which to translate next, an offset into result.
|
||||
* If greater than or equal to result.length(), represents offset start +
|
||||
* cursor - result.length() into text.
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
*/
|
||||
public boolean matches(String text, int start, int limit,
|
||||
StringBuffer result, int cursor,
|
||||
Dictionary variables,
|
||||
UnicodeFilter filter) {
|
||||
return
|
||||
(anteContext == null
|
||||
|| regionMatches(text, start, limit, result,
|
||||
cursor - anteContext.length(),
|
||||
anteContext, variables, filter)) &&
|
||||
regionMatches(text, start, limit, result, cursor,
|
||||
key, variables, filter) &&
|
||||
(postContext == null
|
||||
|| regionMatches(text, start, limit, result,
|
||||
cursor + key.length(),
|
||||
postContext, variables, filter));
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true if this rule matches the given text.
|
||||
* @param text the text, both translated and untranslated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param cursor position at which to translate next, representing offset
|
||||
* into text. This value must be between <code>start</code> and
|
||||
* <code>limit</code>.
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
*/
|
||||
public boolean matches(Replaceable text, int start, int limit,
|
||||
int cursor, Dictionary variables,
|
||||
UnicodeFilter filter) {
|
||||
return
|
||||
(anteContext == null
|
||||
|| regionMatches(text, start, limit, cursor - anteContext.length(),
|
||||
anteContext, variables, filter)) &&
|
||||
regionMatches(text, start, limit, cursor,
|
||||
key, variables, filter) &&
|
||||
(postContext == null
|
||||
|| regionMatches(text, start, limit, cursor + key.length(),
|
||||
postContext, variables, filter));
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the degree of match between this rule and the given text. The
|
||||
* degree of match may be mismatch, a partial match, or a full match. A
|
||||
* mismatch means at least one character of the text does not match the
|
||||
* context or key. A partial match means some context and key characters
|
||||
* match, but the text is not long enough to match all of them. A full
|
||||
* match means all context and key characters match.
|
||||
* @param text the text, both translated and untranslated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param cursor position at which to translate next, representing offset
|
||||
* into text. This value must be between <code>start</code> and
|
||||
* <code>limit</code>.
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
* @return one of <code>MISMATCH</code>, <code>PARTIAL_MATCH</code>, or
|
||||
* <code>FULL_MATCH</code>.
|
||||
* @see #MISMATCH
|
||||
* @see #PARTIAL_MATCH
|
||||
* @see #FULL_MATCH
|
||||
*/
|
||||
public int getMatchDegree(Replaceable text, int start, int limit,
|
||||
int cursor, Dictionary variables,
|
||||
UnicodeFilter filter) {
|
||||
if (anteContext != null
|
||||
&& !regionMatches(text, start, limit, cursor - anteContext.length(),
|
||||
anteContext, variables, filter)) {
|
||||
return MISMATCH;
|
||||
}
|
||||
int len = getRegionMatchLength(text, start, limit, cursor,
|
||||
key, variables, filter);
|
||||
if (len < 0) {
|
||||
return MISMATCH;
|
||||
}
|
||||
if (len < key.length()) {
|
||||
return PARTIAL_MATCH;
|
||||
}
|
||||
if (postContext == null) {
|
||||
return FULL_MATCH;
|
||||
}
|
||||
len = getRegionMatchLength(text, start, limit,
|
||||
cursor + key.length(),
|
||||
postContext, variables, filter);
|
||||
return (len < 0) ? MISMATCH
|
||||
: ((len == postContext.length()) ? FULL_MATCH
|
||||
: PARTIAL_MATCH);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true if a template matches the text. The entire length of the
|
||||
* template is compared to the text at the cursor. As in
|
||||
* <code>matches()</code>, the text being matched occupies a virtual buffer
|
||||
* consisting of the contents of <code>result</code> concatenated to a
|
||||
* substring of <code>text</code>. See <code>matches()</code> for details.
|
||||
* @param text the untranslated text
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param result translated text so far
|
||||
* @param cursor position at which to translate next, an offset into result.
|
||||
* If greater than or equal to result.length(), represents offset start +
|
||||
* cursor - result.length() into text.
|
||||
* @param template the text to match against. All characters must match.
|
||||
* @param variables a dictionary of variables mapping <code>Character</code>
|
||||
* to <code>UnicodeSet</code>
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
* @return true if there is a match
|
||||
*/
|
||||
protected static boolean regionMatches(String text, int start, int limit,
|
||||
StringBuffer result, int cursor,
|
||||
String template,
|
||||
Dictionary variables,
|
||||
UnicodeFilter filter) {
|
||||
int rlen = result.length();
|
||||
if (cursor < 0
|
||||
|| (cursor + template.length()) > (rlen + limit - start)) {
|
||||
return false;
|
||||
}
|
||||
for (int i=0; i<template.length(); ++i, ++cursor) {
|
||||
if (!charMatches(template.charAt(i),
|
||||
cursor < rlen ? result.charAt(cursor)
|
||||
: text.charAt(cursor - rlen + start),
|
||||
variables, filter)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true if a template matches the text. The entire length of the
|
||||
* template is compared to the text at the cursor.
|
||||
* @param text the text, both translated and untranslated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param cursor position at which to translate next, representing offset
|
||||
* into text. This value must be between <code>start</code> and
|
||||
* <code>limit</code>.
|
||||
* @param template the text to match against. All characters must match.
|
||||
* @param variables a dictionary of variables mapping <code>Character</code>
|
||||
* to <code>UnicodeSet</code>
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
* @return true if there is a match
|
||||
*/
|
||||
protected static boolean regionMatches(Replaceable text, int start, int limit,
|
||||
int cursor,
|
||||
String template, Dictionary variables,
|
||||
UnicodeFilter filter) {
|
||||
if (cursor < start
|
||||
|| (cursor + template.length()) > limit) {
|
||||
return false;
|
||||
}
|
||||
for (int i=0; i<template.length(); ++i, ++cursor) {
|
||||
if (!charMatches(template.charAt(i), text.charAt(cursor),
|
||||
variables, filter)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the number of characters of the text that match this rule. If
|
||||
* there is a mismatch, return -1. If the text is not long enough to match
|
||||
* any characters, return 0.
|
||||
* @param text the text, both translated and untranslated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param cursor position at which to translate next, representing offset
|
||||
* into text. This value must be between <code>start</code> and
|
||||
* <code>limit</code>.
|
||||
* @param template the text to match against. All characters must match.
|
||||
* @param variables a dictionary of variables mapping <code>Character</code>
|
||||
* to <code>UnicodeSet</code>
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
* @return -1 if there is a mismatch, 0 if the text is not long enough to
|
||||
* match any characters, otherwise the number of characters of text that
|
||||
* match this rule.
|
||||
*/
|
||||
protected static int getRegionMatchLength(Replaceable text, int start,
|
||||
int limit, int cursor,
|
||||
String template,
|
||||
Dictionary variables,
|
||||
UnicodeFilter filter) {
|
||||
if (cursor < start) {
|
||||
return -1;
|
||||
}
|
||||
int i;
|
||||
for (i=0; i<template.length() && cursor<limit; ++i, ++cursor) {
|
||||
if (!charMatches(template.charAt(i), text.charAt(cursor),
|
||||
variables, filter)) {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true if the given key matches the given text. This method
|
||||
* accounts for the fact that the key character may represent a character
|
||||
* set. Note that the key and text characters may not be interchanged
|
||||
* without altering the results.
|
||||
* @param keyChar a character in the match key
|
||||
* @param textChar a character in the text being transliterated
|
||||
* @param variables a dictionary of variables mapping <code>Character</code>
|
||||
* to <code>UnicodeSet</code>
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
*/
|
||||
protected static boolean charMatches(char keyChar, char textChar,
|
||||
Dictionary variables, UnicodeFilter filter) {
|
||||
UnicodeSet set = null;
|
||||
return (filter == null || filter.isIn(textChar)) &&
|
||||
((set = (UnicodeSet) variables.get(new Character(keyChar)))
|
||||
== null) ?
|
||||
keyChar == textChar : set.contains(textChar);
|
||||
}
|
||||
|
||||
/**
|
||||
* Escape non-ASCII characters as Unicode.
|
||||
*/
|
||||
public static final String escape(String s) {
|
||||
StringBuffer buf = new StringBuffer();
|
||||
for (int i=0; i<s.length(); ++i) {
|
||||
char c = s.charAt(i);
|
||||
if (c >= ' ' && c <= 0x007F) {
|
||||
buf.append(c);
|
||||
} else {
|
||||
buf.append("\\u");
|
||||
if (c < 0x1000) {
|
||||
buf.append('0');
|
||||
if (c < 0x100) {
|
||||
buf.append('0');
|
||||
if (c < 0x10) {
|
||||
buf.append('0');
|
||||
}
|
||||
}
|
||||
}
|
||||
buf.append(Integer.toHexString(c));
|
||||
}
|
||||
}
|
||||
return buf.toString();
|
||||
}
|
||||
}
|
218
icu4j/src/com/ibm/icu/text/TransliterationRuleSet.java
Executable file
218
icu4j/src/com/ibm/icu/text/TransliterationRuleSet.java
Executable file
|
@ -0,0 +1,218 @@
|
|||
package com.ibm.text;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* A set of rules for a <code>RuleBasedTransliterator</code>. This set encodes
|
||||
* the transliteration in one direction from one set of characters or short
|
||||
* strings to another. A <code>RuleBasedTransliterator</code> consists of up to
|
||||
* two such sets, one for the forward direction, and one for the reverse.
|
||||
*
|
||||
* <p>A <code>TransliterationRuleSet</code> has one important operation, that of
|
||||
* finding a matching rule at a given point in the text. This is accomplished
|
||||
* by the <code>findMatch()</code> method.
|
||||
*
|
||||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: TransliterationRuleSet.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
|
||||
*/
|
||||
class TransliterationRuleSet {
|
||||
/* Note: There was an old implementation that indexed by first letter of
|
||||
* key. Problem with this is that key may not have a meaningful first
|
||||
* letter; e.g., {Lu}>*. One solution is to keep a separate vector of all
|
||||
* rules whose initial key letter is a category variable. However, the
|
||||
* problem is that they must be kept in order with respect to other rules.
|
||||
* One solution -- add a sequence number to each rule. Do the usual
|
||||
* first-letter lookup, and also a lookup from the spare bin with rules like
|
||||
* {Lu}>*. Take the lower sequence number. This seems complex and not
|
||||
* worth the trouble, but we may revisit this later. For documentation (or
|
||||
* possible resurrection) the old code is included below, commented out
|
||||
* with the remark "// OLD INDEXED IMPLEMENTATION". Under the old
|
||||
* implementation, <code>rules</code> is a Hashtable, not a Vector.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Vector of rules, in the order added.
|
||||
*/
|
||||
private Vector rules;
|
||||
|
||||
/**
|
||||
* Length of the longest preceding context
|
||||
*/
|
||||
private int maxContextLength;
|
||||
|
||||
private static final String COPYRIGHT =
|
||||
"\u00A9 IBM Corporation 1999. All rights reserved.";
|
||||
|
||||
/**
 * Create a rule set that contains no rules and has a maximum context
 * length of zero.
 */
public TransliterationRuleSet() {
    this.rules = new Vector();
    this.maxContextLength = 0;
}
|
||||
|
||||
/**
|
||||
* Return the maximum context length.
|
||||
* @return the length of the longest preceding context.
|
||||
*/
|
||||
public int getMaximumContextLength() {
|
||||
return maxContextLength;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a rule to this set. Rules are added in order, and order is
|
||||
* significant.
|
||||
*
|
||||
* <p>Once freeze() is called, this method must not be called.
|
||||
* @param rule the rule to add
|
||||
*/
|
||||
public void addRule(TransliterationRule rule) {
|
||||
|
||||
// Build time, no checking : 3562 ms
|
||||
// Build time, with checking: 6234 ms
|
||||
|
||||
for (int i=0; i<rules.size(); ++i) {
|
||||
TransliterationRule r = (TransliterationRule) rules.elementAt(i);
|
||||
if (r.masks(rule)) {
|
||||
throw new IllegalArgumentException("Rule " + rule +
|
||||
" must precede " + r);
|
||||
}
|
||||
}
|
||||
|
||||
rules.addElement(rule);
|
||||
int len;
|
||||
if ((len = rule.getAnteContextLength()) > maxContextLength) {
|
||||
maxContextLength = len;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Free up space. Once this method is called, addRule() must NOT
|
||||
* be called again.
|
||||
*/
|
||||
public void freeze() {
|
||||
for (int i=0; i<rules.size(); ++i) {
|
||||
((TransliterationRule) rules.elementAt(i)).freeze();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Attempt to find a matching rule at the specified point in the text. The
|
||||
* text being matched occupies a virtual buffer consisting of the contents
|
||||
* of <code>result</code> concatenated to a substring of <code>text</code>.
|
||||
* The substring is specified by <code>start</code> and <code>limit</code>.
|
||||
* The value of <code>cursor</code> is an index into this virtual buffer,
|
||||
* from 0 to the length of the buffer. In terms of the parameters,
|
||||
* <code>cursor</code> must be between 0 and <code>result.length() + limit -
|
||||
* start</code>.
|
||||
* @param text the untranslated text
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param result tranlated text
|
||||
* @param cursor position at which to translate next, an offset into result.
|
||||
* If greater than or equal to result.length(), represents offset start +
|
||||
* cursor - result.length() into text.
|
||||
* @param variables a dictionary mapping variables to the sets they
|
||||
* represent (maps <code>Character</code> to <code>UnicodeSet</code>)
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
* @return the matching rule, or null if none found.
|
||||
*/
|
||||
public TransliterationRule findMatch(String text, int start, int limit,
|
||||
StringBuffer result, int cursor,
|
||||
Dictionary variables,
|
||||
UnicodeFilter filter) {
|
||||
for (Enumeration e = rules.elements(); e.hasMoreElements(); ) {
|
||||
TransliterationRule rule = (TransliterationRule) e.nextElement();
|
||||
if (rule.matches(text, start, limit, result, cursor, variables, filter)) {
|
||||
return rule;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Attempt to find a matching rule at the specified point in the text.
|
||||
* @param text the text, both translated and untranslated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param cursor position at which to translate next, representing offset
|
||||
* into text. This value must be between <code>start</code> and
|
||||
* <code>limit</code>.
|
||||
* @param variables a dictionary mapping variables to the sets they
|
||||
* represent (maps <code>Character</code> to <code>UnicodeSet</code>)
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
* @return the matching rule, or null if none found.
|
||||
*/
|
||||
public TransliterationRule findMatch(Replaceable text, int start, int limit,
|
||||
int cursor,
|
||||
Dictionary variables,
|
||||
UnicodeFilter filter) {
|
||||
for (Enumeration e = rules.elements(); e.hasMoreElements(); ) {
|
||||
TransliterationRule rule = (TransliterationRule) e.nextElement();
|
||||
if (rule.matches(text, start, limit, cursor, variables, filter)) {
|
||||
return rule;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Attempt to find a matching rule at the specified point in the text.
|
||||
* Unlike <code>findMatch()</code>, this method does an incremental match.
|
||||
* An incremental match requires that there be no partial matches that might
|
||||
* pre-empt the full match that is found. If there are partial matches,
|
||||
* then null is returned. A non-null result indicates that a full match has
|
||||
* been found, and that it cannot be pre-empted by a partial match
|
||||
* regardless of what additional text is added to the translation buffer.
|
||||
* @param text the text, both translated and untranslated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param cursor position at which to translate next, representing offset
|
||||
* into text. This value must be between <code>start</code> and
|
||||
* <code>limit</code>.
|
||||
* @param variables a dictionary mapping variables to the sets they
|
||||
* represent (maps <code>Character</code> to <code>UnicodeSet</code>)
|
||||
* @param partial output parameter. <code>partial[0]</code> is set to
|
||||
* true if a partial match is returned.
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
* @return the matching rule, or null if none found, or if the text buffer
|
||||
* does not have enough text yet to unambiguously match a rule.
|
||||
*/
|
||||
public TransliterationRule findIncrementalMatch(Replaceable text, int start,
|
||||
int limit, int cursor,
|
||||
Dictionary variables,
|
||||
boolean partial[],
|
||||
UnicodeFilter filter) {
|
||||
partial[0] = false;
|
||||
for (Enumeration e = rules.elements(); e.hasMoreElements(); ) {
|
||||
TransliterationRule rule = (TransliterationRule) e.nextElement();
|
||||
int match = rule.getMatchDegree(text, start, limit, cursor,
|
||||
variables, filter);
|
||||
switch (match) {
|
||||
case TransliterationRule.FULL_MATCH:
|
||||
return rule;
|
||||
case TransliterationRule.PARTIAL_MATCH:
|
||||
partial[0] = true;
|
||||
return null;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
860
icu4j/src/com/ibm/icu/text/Transliterator.java
Executable file
860
icu4j/src/com/ibm/icu/text/Transliterator.java
Executable file
|
@ -0,0 +1,860 @@
|
|||
package com.ibm.text;
|
||||
|
||||
import java.util.*;
|
||||
import java.text.MessageFormat;
|
||||
|
||||
/**
|
||||
* <code>Transliterator</code> is an abstract class that
|
||||
* transliterates text from one format to another. The most common
|
||||
* kind of transliterator is a script, or alphabet, transliterator.
|
||||
* For example, a Russian to Latin transliterator changes Russian text
|
||||
* written in Cyrillic characters to phonetically equivalent Latin
|
||||
* characters. It does not <em>translate</em> Russian to English!
|
||||
* Transliteration, unlike translation, operates on characters, without
|
||||
* reference to the meanings of words and sentences.
|
||||
*
|
||||
* <p>Although script conversion is its most common use, a
|
||||
* transliterator can actually perform a more general class of tasks.
|
||||
* In fact, <code>Transliterator</code> defines a very general API
|
||||
* which specifies only that a segment of the input text is replaced
|
||||
* by new text. The particulars of this conversion are determined
|
||||
* entirely by subclasses of <code>Transliterator</code>.
|
||||
*
|
||||
* <p><b>Transliterators are stateless</b>
|
||||
*
|
||||
* <p><code>Transliterator</code> objects are <em>stateless</em>; they
|
||||
* retain no information between calls to
|
||||
* <code>transliterate()</code>. As a result, threads may share
|
||||
* transliterators without synchronizing them. This might seem to
|
||||
* limit the complexity of the transliteration operation. In
|
||||
* practice, subclasses perform complex transliterations by delaying
|
||||
* the replacement of text until it is known that no other
|
||||
* replacements are possible. In other words, although the
|
||||
* <code>Transliterator</code> objects are stateless, the source text
|
||||
* itself embodies all the needed information, and delayed operation
|
||||
* allows arbitrary complexity.
|
||||
*
|
||||
* <p><b>Batch transliteration</b>
|
||||
*
|
||||
* <p>The simplest way to perform transliteration is all at once, on a
|
||||
* string of existing text. This is referred to as <em>batch</em>
|
||||
* transliteration. For example, given a string <code>input</code>
|
||||
* and a transliterator <code>t</code>, the call
|
||||
*
|
||||
* <blockquote><code>String result = t.transliterate(input);
|
||||
* </code></blockquote>
|
||||
*
|
||||
* will transliterate it and return the result. Other methods allow
|
||||
* the client to specify a substring to be transliterated and to use
|
||||
* {@link Replaceable} objects instead of strings, in order to
|
||||
* preserve out-of-band information (such as text styles).
|
||||
*
|
||||
* <p><b>Keyboard transliteration</b>
|
||||
*
|
||||
* <p>Somewhat more involved is <em>keyboard</em>, or incremental
|
||||
* transliteration. This is the transliteration of text that is
|
||||
* arriving from some source (typically the user's keyboard) one
|
||||
* character at a time, or in some other piecemeal fashion.
|
||||
*
|
||||
* <p>In keyboard transliteration, a <code>Replaceable</code> buffer
|
||||
* stores the text. As text is inserted, as much as possible is
|
||||
* transliterated on the fly. This means a GUI that displays the
|
||||
* contents of the buffer may show text being modified as each new
|
||||
* character arrives.
|
||||
*
|
||||
* <p>Consider the simple <code>RuleBasedTransliterator</code>:
|
||||
*
|
||||
* <blockquote><code>
|
||||
* th>{theta}<br>
|
||||
* t>{tau}
|
||||
* </code></blockquote>
|
||||
*
|
||||
* When the user types 't', nothing will happen, since the
|
||||
* transliterator is waiting to see if the next character is 'h'. To
|
||||
* remedy this, we introduce the notion of a cursor, marked by a '|'
|
||||
* in the output string:
|
||||
*
|
||||
* <blockquote><code>
|
||||
* t>|{tau}<br>
|
||||
* {tau}h>{theta}
|
||||
* </code></blockquote>
|
||||
*
|
||||
* Now when the user types 't', tau appears, and if the next character
|
||||
* is 'h', the tau changes to a theta. This is accomplished by
|
||||
* maintaining a cursor position (independent of the insertion point,
|
||||
* and invisible in the GUI) across calls to
|
||||
* <code>keyboardTransliterate()</code>. Typically, the cursor will
|
||||
* be coincident with the insertion point, but in a case like the one
|
||||
* above, it will precede the insertion point.
|
||||
*
|
||||
* <p>Keyboard transliteration methods maintain a set of three indices
|
||||
* that are updated with each call to
|
||||
* <code>keyboardTransliterate()</code>, including the cursor, start,
|
||||
* and limit. Since these indices are changed by the method, they are
|
||||
* passed in an <code>int[]</code> array. The <code>START</code> index
|
||||
* marks the beginning of the substring that the transliterator will
|
||||
* look at. It is advanced as text becomes committed (but it is not
|
||||
* the committed index; that's the <code>CURSOR</code>). The
|
||||
* <code>CURSOR</code> index, described above, marks the point at
|
||||
* which the transliterator last stopped, either because it reached
|
||||
* the end, or because it required more characters to disambiguate
|
||||
* between possible inputs. The <code>CURSOR</code> can also be
|
||||
* explicitly set by rules in a <code>RuleBasedTransliterator</code>.
|
||||
* Any characters before the <code>CURSOR</code> index are frozen;
|
||||
* future keyboard transliteration calls within this input sequence
|
||||
* will not change them. New text is inserted at the
|
||||
* <code>LIMIT</code> index, which marks the end of the substring that
|
||||
* the transliterator looks at.
|
||||
*
|
||||
* <p>Because keyboard transliteration assumes that more characters
|
||||
* are to arrive, it is conservative in its operation. It only
|
||||
* transliterates when it can do so unambiguously. Otherwise it waits
|
||||
* for more characters to arrive. When the client code knows that no
|
||||
* more characters are forthcoming, perhaps because the user has
|
||||
* performed some input termination operation, then it should call
|
||||
* <code>finishKeyboardTransliteration()</code> to complete any
|
||||
* pending transliterations.
|
||||
*
|
||||
* <p><b>Inverses</b>
|
||||
*
|
||||
* <p>Pairs of transliterators may be inverses of one another. For
|
||||
* example, if transliterator <b>A</b> transliterates characters by
|
||||
* incrementing their Unicode value (so "abc" -> "def"), and
|
||||
* transliterator <b>B</b> decrements character values, then <b>A</b>
|
||||
* is an inverse of <b>B</b> and vice versa. If we compose <b>A</b>
|
||||
* with <b>B</b> in a compound transliterator, the result is the
|
||||
* identity transliterator, that is, a transliterator that does not
|
||||
* change its input text.
|
||||
*
|
||||
* The <code>Transliterator</code> method <code>getInverse()</code>
|
||||
* returns a transliterator's inverse, if one exists, or
|
||||
* <code>null</code> otherwise. However, the result of
|
||||
* <code>getInverse()</code> usually will <em>not</em> be a true
|
||||
* mathematical inverse. This is because true inverse transliterators
|
||||
* are difficult to formulate. For example, consider two
|
||||
* transliterators: <b>AB</b>, which transliterates the character 'A'
|
||||
* to 'B', and <b>BA</b>, which transliterates 'B' to 'A'. It might
|
||||
* seem that these are exact inverses, since
|
||||
*
|
||||
* <blockquote>"A" x <b>AB</b> -> "B"<br>
|
||||
* "B" x <b>BA</b> -> "A"</blockquote>
|
||||
*
|
||||
* where 'x' represents transliteration. However,
|
||||
*
|
||||
* <blockquote>"ABCD" x <b>AB</b> -> "BBCD"<br>
|
||||
* "BBCD" x <b>BA</b> -> "AACD"</blockquote>
|
||||
*
|
||||
* so <b>AB</b> composed with <b>BA</b> is not the
|
||||
* identity. Nonetheless, <b>BA</b> may be usefully considered to be
|
||||
* <b>AB</b>'s inverse, and it is on this basis that
|
||||
* <b>AB</b><code>.getInverse()</code> could legitimately return
|
||||
* <b>BA</b>.
|
||||
*
|
||||
* <p><b>IDs and display names</b>
|
||||
*
|
||||
* <p>A transliterator is designated by a short identifier string or
|
||||
* <em>ID</em>. IDs follow the format <em>source-destination</em>,
|
||||
* where <em>source</em> describes the entity being replaced, and
|
||||
* <em>destination</em> describes the entity replacing
|
||||
* <em>source</em>. The entities may be the names of scripts,
|
||||
* particular sequences of characters, or whatever else it is that the
|
||||
* transliterator converts to or from. For example, a transliterator
|
||||
* from Russian to Latin might be named "Russian-Latin". A
|
||||
* transliterator from keyboard escape sequences to Latin-1 characters
|
||||
* might be named "KeyboardEscape-Latin1". By convention, system
|
||||
* entity names are in English, with the initial letters of words
|
||||
* capitalized; user entity names may follow any format so long as
|
||||
* they do not contain dashes.
|
||||
*
|
||||
* <p>In addition to programmatic IDs, transliterator objects have
|
||||
* display names for presentation in user interfaces, returned by
|
||||
* {@link #getDisplayName}.
|
||||
*
|
||||
* <p><b>Factory methods and registration</b>
|
||||
*
|
||||
* <p>In general, client code should use the factory method
|
||||
* <code>getInstance()</code> to obtain an instance of a
|
||||
* transliterator given its ID. Valid IDs may be enumerated using
|
||||
* <code>getAvailableIDs()</code>. Since transliterators are
|
||||
* stateless, multiple calls to <code>getInstance()</code> with the
|
||||
* same ID will return the same object.
|
||||
*
|
||||
* <p>In addition to the system transliterators registered at startup,
|
||||
* user transliterators may be registered by calling
|
||||
* <code>registerInstance()</code> at run time. To register a
|
||||
* transliterator subclass without instantiating it (until it is
|
||||
* needed), users may call <code>registerClass()</code>.
|
||||
*
|
||||
* <p><b>Subclassing</b>
|
||||
*
|
||||
* <p>Subclasses must implement the abstract
|
||||
* <code>transliterate()</code> method. They should also override the
|
||||
* <code>transliterate()</code> method taking a <code>String</code>
|
||||
* and <code>StringBuffer</code> if the performance of these methods
|
||||
* can be improved over the performance obtained by the default
|
||||
* implementations in this class. Subclasses must also implement
|
||||
* <code>handleKeyboardTransliterate()</code>.
|
||||
*
|
||||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: Transliterator.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
|
||||
*/
|
||||
public abstract class Transliterator {
|
||||
/**
|
||||
* In the <code>keyboardTransliterate()</code>
|
||||
* <code>index[]</code> array, the beginning index, inclusive
|
||||
* @see #keyboardTransliterate
|
||||
*/
|
||||
public static final int START = 0;
|
||||
|
||||
/**
|
||||
* In the <code>keyboardTransliterate()</code>
|
||||
* <code>index[]</code> array, the ending index, exclusive
|
||||
* @see #keyboardTransliterate
|
||||
*/
|
||||
public static final int LIMIT = 1;
|
||||
|
||||
/**
|
||||
* In the <code>keyboardTransliterate()</code>
|
||||
* <code>index[]</code> array, the next character to be considered
|
||||
* for transliteration
|
||||
* @see #keyboardTransliterate
|
||||
*/
|
||||
public static final int CURSOR = 2;
|
||||
|
||||
/**
|
||||
* Programmatic name, e.g., "Latin-Arabic".
|
||||
*/
|
||||
private String ID;
|
||||
|
||||
/**
|
||||
* This transliterator's filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
*/
|
||||
private UnicodeFilter filter;
|
||||
|
||||
/**
|
||||
* Dictionary of known transliterators. Keys are <code>String</code>
|
||||
* names, values are one of the following:
|
||||
*
|
||||
* <ul><li><code>Transliterator</code> objects
|
||||
*
|
||||
* <li><code>Class</code> objects. Such objects must represent
|
||||
* subclasses of <code>Transliterator</code>, and must satisfy the
|
||||
* constraints described in <code>registerClass()</code>
|
||||
*
|
||||
* <li><code>RULE_BASED_PLACEHOLDER</code>, in which case the ID
|
||||
* will have its first '-' removed and be appended to
|
||||
* RB_RULE_BASED_PREFIX to form a resource bundle name from which
|
||||
* the RB_RULE key is looked up to obtain the rule.
|
||||
*
|
||||
* <li><code>REVERSE_RULE_BASED_PLACEHOLDER</code>. Like
|
||||
* <code>RULE_BASED_PLACEHOLDER</code>, except the entity names in
|
||||
* the ID are reversed, and the argument
|
||||
* RuleBasedTransliterator.REVERSE is passed to the
|
||||
* RuleBasedTransliterator constructor.
|
||||
* </ul>
|
||||
*/
|
||||
private static Hashtable cache;
|
||||
|
||||
/**
|
||||
* Internal object used to stand for instances of
|
||||
* <code>RuleBasedTransliterator</code> that have not been
|
||||
* constructed yet in the <code>cache</code>. When a
|
||||
* <code>getInstance()</code> call retrieves this object, it is
|
||||
* replaced by the actual <code>RuleBasedTransliterator</code>.
|
||||
* This allows <code>Transliterator</code> to delay instantiation
|
||||
* of such transliterators until they are needed.
|
||||
*/
|
||||
private static final Object RULE_BASED_PLACEHOLDER = new Object();
|
||||
|
||||
/**
|
||||
* Internal object used to stand for instances of
|
||||
* <code>RuleBasedTransliterator</code> that have not been
|
||||
* constructed yet in the <code>cache</code>. These instances are
|
||||
* constructed with an argument
|
||||
* <code>RuleBasedTransliterator.REVERSE</code>.
|
||||
*/
|
||||
private static final Object REVERSE_RULE_BASED_PLACEHOLDER = new Object();
|
||||
|
||||
/**
|
||||
* Prefix for resource bundle key for the display name for a
|
||||
* transliterator. The ID is appended to this to form the key.
|
||||
* The resource bundle value should be a String.
|
||||
*/
|
||||
private static final String RB_DISPLAY_NAME_PREFIX = "T:";
|
||||
|
||||
/**
|
||||
* Resource bundle key for display name pattern.
|
||||
* The resource bundle value should be a String forming a
|
||||
* MessageFormat pattern, e.g.:
|
||||
* "{0,choice,0#|1#{1} Transliterator|2#{1} to {2} Transliterator}".
|
||||
*/
|
||||
private static final String RB_DISPLAY_NAME_PATTERN = "TransliteratorNamePattern";
|
||||
|
||||
/**
|
||||
* Resource bundle key for the list of RuleBasedTransliterator IDs.
|
||||
* The resource bundle value should be a String[] with each element
|
||||
* being a valid ID. The ID will be appended to RB_RULE_BASED_PREFIX
|
||||
* to obtain the class name in which the RB_RULE key will be sought.
|
||||
*/
|
||||
private static final String RB_RULE_BASED_IDS = "RuleBasedTransliteratorIDs";
|
||||
|
||||
/**
|
||||
* Resource bundle containing display name keys and the
|
||||
* RB_RULE_BASED_IDS array.
|
||||
*
|
||||
* <p>If we ever integrate this with the Sun JDK, the resource bundle
|
||||
* root will change to java.text.resources.LocaleElements
|
||||
*/
|
||||
private static final String RB_LOCALE_ELEMENTS =
|
||||
"com.ibm.text.resources.LocaleElements";
|
||||
|
||||
/**
|
||||
* Prefix for resource bundle containing RuleBasedTransliterator
|
||||
* RB_RULE string. The ID is munged to remove the first '-' then appended
|
||||
* to this String to obtain the class name.
|
||||
*/
|
||||
private static final String RB_RULE_BASED_PREFIX =
|
||||
"com.ibm.text.resources.TransliterationRule";
|
||||
|
||||
/**
|
||||
* Resource bundle key for the RuleBasedTransliterator rule.
|
||||
*/
|
||||
private static final String RB_RULE = "Rule";
|
||||
|
||||
private static final String COPYRIGHT =
|
||||
"\u00A9 IBM Corporation 1999. All rights reserved.";
|
||||
|
||||
/**
|
||||
* Default constructor.
|
||||
* @param ID the string identifier for this transliterator
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
*/
|
||||
protected Transliterator(String ID, UnicodeFilter filter) {
    // Every transliterator must carry an identifier; a null filter is
    // allowed and is stored as-is.
    if (null == ID) {
        throw new NullPointerException();
    }
    this.filter = filter;
    this.ID = ID;
}
|
||||
|
||||
/**
|
||||
* Transliterates the segment of a string that begins at the
|
||||
* character at offset <code>start</code> and extends to the
|
||||
* character at offset <code>limit - 1</code>, with optional
|
||||
* filtering.  A default implementation is provided here;
|
||||
* subclasses should provide a more efficient implementation if
|
||||
* possible.
|
||||
* @param text the string to be transliterated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param result buffer to receive the transliterated text; previous
|
||||
* contents are discarded
|
||||
*/
|
||||
public void transliterate(String text, int start, int limit,
|
||||
StringBuffer result) {
|
||||
/* This is a default implementation that should be replaced by
|
||||
* a more efficient subclass implementation if possible.
|
||||
*/
|
||||
result.setLength(0);
|
||||
result.append(text.substring(start, limit));
|
||||
transliterate(new ReplaceableString(result),
|
||||
0, result.length());
|
||||
}
|
||||
|
||||
/**
|
||||
* Transliterates a segment of a string, with optional filtering.
|
||||
* Subclasses must override this abstract method.
|
||||
*
|
||||
* @param text the string to be transliterated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
* @return The new limit index. The text previously occupying <code>[start,
|
||||
* limit)</code> has been transliterated, possibly to a string of a different
|
||||
* length, at <code>[start, </code><em>new-limit</em><code>)</code>, where
|
||||
* <em>new-limit</em> is the return value.
|
||||
*/
|
||||
// NOTE(review): the Javadoc above documents a "filter" parameter, but this
// signature declares none; presumably the filter given to the constructor
// is the one meant -- confirm and adjust the Javadoc.
public abstract int transliterate(Replaceable text, int start, int limit);
|
||||
|
||||
/**
|
||||
* Transliterates an entire string. Convenience method.
|
||||
* @param text the string to be transliterated
|
||||
* @param result buffer to receive the transliterated text; previous
|
||||
* contents are discarded
|
||||
*/
|
||||
public final void transliterate(String text, StringBuffer result) {
|
||||
transliterate(text, 0, text.length(), result);
|
||||
}
|
||||
|
||||
/**
|
||||
* Transliterates an entire string and returns the result. Convenience method.
|
||||
*
|
||||
* @param text the string to be transliterated
|
||||
* @return The transliterated text
|
||||
*/
|
||||
public final String transliterate(String text) {
|
||||
StringBuffer result = new StringBuffer();
|
||||
transliterate(text, 0, text.length(), result);
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Transliterates an entire string in place. Convenience method.
|
||||
* @param text the string to be transliterated
|
||||
*/
|
||||
public final void transliterate(Replaceable text) {
|
||||
transliterate(text, 0, text.length());
|
||||
}
|
||||
|
||||
/**
|
||||
* Transliterates the portion of the text buffer that can be
|
||||
* transliterated unambiguously after new text has been inserted,
|
||||
* typically as a result of a keyboard event. The new text in
|
||||
* <code>insertion</code> will be inserted into <code>text</code>
|
||||
* at <code>index[LIMIT]</code>, advancing
|
||||
* <code>index[LIMIT]</code> by <code>insertion.length()</code>.
|
||||
* Then the transliterator will try to transliterate characters of
|
||||
* <code>text</code> between <code>index[CURSOR]</code> and
|
||||
* <code>index[LIMIT]</code>. Characters before
|
||||
* <code>index[CURSOR]</code> will not be changed.
|
||||
*
|
||||
* <p>Upon return, values in <code>index[]</code> will be updated.
|
||||
* <code>index[START]</code> will be advanced to the first
|
||||
* character that future calls to this method will read.
|
||||
* <code>index[CURSOR]</code> and <code>index[LIMIT]</code> will
|
||||
* be adjusted to delimit the range of text that future calls to
|
||||
* this method may change.
|
||||
*
|
||||
* <p>Typical usage of this method begins with an initial call
|
||||
* with <code>index[START]</code> and <code>index[LIMIT]</code>
|
||||
* set to indicate the portion of <code>text</code> to be
|
||||
* transliterated, and <code>index[CURSOR] == index[START]</code>.
|
||||
* Thereafter, <code>index[]</code> can be used without
|
||||
* modification in future calls, provided that all changes to
|
||||
* <code>text</code> are made via this method.
|
||||
*
|
||||
* <p>This method assumes that future calls may be made that will
|
||||
* insert new text into the buffer. As a result, it only performs
|
||||
* unambiguous transliterations. After the last call to this
|
||||
* method, there may be untransliterated text that is waiting for
|
||||
* more input to resolve an ambiguity. In order to perform these
|
||||
* pending transliterations, clients should call {@link
|
||||
* #finishKeyboardTransliteration} after the last call to this
|
||||
* method has been made.
|
||||
*
|
||||
* @param text the buffer holding transliterated and untransliterated text
|
||||
* @param index an array of three integers.
|
||||
*
|
||||
* <ul><li><code>index[START]</code>: the beginning index,
|
||||
* inclusive; <code>0 <= index[START] <= index[LIMIT]</code>.
|
||||
*
|
||||
* <li><code>index[LIMIT]</code>: the ending index, exclusive;
|
||||
* <code>index[START] <= index[LIMIT] <= text.length()</code>.
|
||||
* <code>insertion</code> is inserted at
|
||||
* <code>index[LIMIT]</code>.
|
||||
*
|
||||
* <li><code>index[CURSOR]</code>: the next character to be
|
||||
* considered for transliteration; <code>index[START] <=
|
||||
* index[CURSOR] <= index[LIMIT]</code>. Characters before
|
||||
* <code>index[CURSOR]</code> will not be changed by future calls
|
||||
* to this method.</ul>
|
||||
*
|
||||
* @param insertion text to be inserted and possibly
|
||||
* transliterated into the translation buffer at
|
||||
* <code>index[LIMIT]</code>. If <code>null</code> then no text
|
||||
* is inserted.
|
||||
* @see #START
|
||||
* @see #LIMIT
|
||||
* @see #CURSOR
|
||||
* @see #handleKeyboardTransliterate
|
||||
* @exception IllegalArgumentException if <code>index[]</code>
|
||||
* is invalid
|
||||
*/
|
||||
public final void keyboardTransliterate(Replaceable text, int[] index,
|
||||
String insertion) {
|
||||
if (index.length < 3 ||
|
||||
index[START] < 0 ||
|
||||
index[LIMIT] > text.length() ||
|
||||
index[CURSOR] < index[START] ||
|
||||
index[CURSOR] > index[LIMIT]) {
|
||||
throw new IllegalArgumentException("Invalid index array");
|
||||
}
|
||||
|
||||
int originalStart = index[START];
|
||||
if (insertion != null) {
|
||||
text.replace(index[LIMIT], index[LIMIT], insertion);
|
||||
index[LIMIT] += insertion.length();
|
||||
}
|
||||
|
||||
handleKeyboardTransliterate(text, index);
|
||||
|
||||
index[START] = Math.max(index[CURSOR] - getMaximumContextLength(),
|
||||
originalStart);
|
||||
}
|
||||
|
||||
/**
|
||||
* Transliterates the portion of the text buffer that can be
|
||||
* transliterated unambiguously after a new character has been
|
||||
* inserted, typically as a result of a keyboard event. This is a
|
||||
* convenience method; see {@link
|
||||
* #keyboardTransliterate(Replaceable, int[], String)} for details.
|
||||
* @param text the buffer holding transliterated and
|
||||
* untransliterated text
|
||||
* @param index an array of three integers. See {@link
|
||||
* #keyboardTransliterate(Replaceable, int[], String)}.
|
||||
* @param insertion text to be inserted and possibly
|
||||
* transliterated into the translation buffer at
|
||||
* <code>index[LIMIT]</code>.
|
||||
* @see #keyboardTransliterate(Replaceable, int[], String)
|
||||
*/
|
||||
public final void keyboardTransliterate(Replaceable text, int[] index,
|
||||
char insertion) {
|
||||
keyboardTransliterate(text, index, String.valueOf(insertion));
|
||||
}
|
||||
|
||||
/**
|
||||
* Transliterates the portion of the text buffer that can be
|
||||
* transliterated unambiguosly. This is a convenience method; see
|
||||
* {@link #keyboardTransliterate(Replaceable, int[], String)} for
|
||||
* details.
|
||||
* @param text the buffer holding transliterated and
|
||||
* untransliterated text
|
||||
* @param index an array of three integers. See {@link
|
||||
* #keyboardTransliterate(Replaceable, int[], String)}.
|
||||
* @see #keyboardTransliterate(Replaceable, int[], String)
|
||||
*/
|
||||
public final void keyboardTransliterate(Replaceable text, int[] index) {
|
||||
keyboardTransliterate(text, index, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Finishes any pending transliterations that were waiting for
|
||||
* more characters. Clients should call this method as the last
|
||||
* call after a sequence of one or more calls to
|
||||
* <code>keyboardTransliterate()</code>.
|
||||
* @param text the buffer holding transliterated and
|
||||
* untransliterated text.
|
||||
* @param index the array of indices previously passed to {@link
|
||||
* #keyboardTransliterate}
|
||||
*/
|
||||
public final void finishKeyboardTransliteration(Replaceable text,
|
||||
int[] index) {
|
||||
transliterate(text, index[START], index[LIMIT]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Abstract method that concrete subclasses define to implement
|
||||
* keyboard transliteration. This method should transliterate all
|
||||
* characters between <code>index[CURSOR]</code> and
|
||||
* <code>index[LIMIT]</code> that can be unambiguously
|
||||
* transliterated, regardless of future insertions of text at
|
||||
* <code>index[LIMIT]</code>. <code>index[CURSOR]</code> should
|
||||
* be advanced past committed characters (those that will not
|
||||
* change in future calls to this method).
|
||||
* <code>index[LIMIT]</code> should be updated to reflect text
|
||||
* replacements that shorten or lengthen the text between
|
||||
* <code>index[CURSOR]</code> and <code>index[LIMIT]</code>. Upon
|
||||
* return, neither <code>index[CURSOR]</code> nor
|
||||
* <code>index[LIMIT]</code> should be less than the initial value
|
||||
* of <code>index[CURSOR]</code>. <code>index[START]</code>
|
||||
* should <em>not</em> be changed.
|
||||
*
|
||||
* @param text the buffer holding transliterated and
|
||||
* untransliterated text
|
||||
* @param index an array of three integers. See {@link
|
||||
* #keyboardTransliterate(Replaceable, int[], String)}.
|
||||
* @see #keyboardTransliterate
|
||||
*/
|
||||
protected abstract void handleKeyboardTransliterate(Replaceable text,
|
||||
int[] index);
|
||||
|
||||
/**
|
||||
* Returns the length of the longest context required by this transliterator.
|
||||
* This is <em>preceding</em> context. The default implementation supplied
|
||||
* by <code>Transliterator</code> returns zero; subclasses
|
||||
* that use preceding context should override this method to return the
|
||||
* correct value. For example, if a transliterator translates "ddd" (where
|
||||
* d is any digit) to "555" when preceded by "(ddd)", then the preceding
|
||||
* context length is 5, the length of "(ddd)".
|
||||
*
|
||||
* @return The maximum number of preceding context characters this
|
||||
* transliterator needs to examine
|
||||
*/
|
||||
protected int getMaximumContextLength() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a programmatic identifier for this transliterator.
|
||||
* If this identifier is passed to <code>getInstance()</code>, it
|
||||
* will return this object, if it has been registered.
|
||||
* @see #registerInstance
|
||||
* @see #registerClass
|
||||
* @see #getAvailableIDs
|
||||
*/
|
||||
public final String getID() {
|
||||
return ID;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a name for this transliterator that is appropriate for
|
||||
* display to the user in the default locale. See {@link
|
||||
* #getDisplayName(Locale)} for details.
|
||||
*/
|
||||
public final String getDisplayName() {
|
||||
return getDisplayName(Locale.getDefault());
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a name for this transliterator that is appropriate for
|
||||
* display to the user in the given locale. This name is taken
|
||||
* from the locale resource data in the standard manner of the
|
||||
* <code>java.text</code> package.
|
||||
*
|
||||
* <p>If no localized names exist in the system resource bundles,
|
||||
* a name is synthesized using a localized
|
||||
* <code>MessageFormat</code> pattern from the resource data. The
|
||||
* arguments to this pattern are an integer followed by one or two
|
||||
* strings. The integer is the number of strings, either 1 or 2.
|
||||
* The strings are formed by splitting the ID for this
|
||||
* transliterator at the first '-'. If there is no '-', then the
|
||||
* entire ID forms the only string.
|
||||
* @param inLocale the Locale in which the display name should be
|
||||
* localized.
|
||||
* @see java.text.MessageFormat
|
||||
*/
|
||||
public String getDisplayName(Locale inLocale) {
|
||||
ResourceBundle bundle = ResourceBundle.getBundle(
|
||||
RB_LOCALE_ELEMENTS, inLocale);
|
||||
|
||||
try {
|
||||
return bundle.getString(RB_DISPLAY_NAME_PREFIX + ID);
|
||||
} catch (MissingResourceException e) {}
|
||||
|
||||
try {
|
||||
// Construct the formatter first; if getString() fails
|
||||
// we'll exit the try block
|
||||
MessageFormat format = new MessageFormat(
|
||||
bundle.getString(RB_DISPLAY_NAME_PATTERN));
|
||||
// Construct the argument array
|
||||
int i = ID.indexOf('-');
|
||||
Object[] args = (i < 0)
|
||||
? new Object[] { new Integer(1), ID }
|
||||
: new Object[] { new Integer(2), ID.substring(0, i),
|
||||
ID.substring(i+1) };
|
||||
// Format it using the pattern in the resource
|
||||
return format.format(args);
|
||||
} catch (MissingResourceException e2) {}
|
||||
|
||||
// We should not reach this point unless there is something
|
||||
// wrong with the build or the RB_DISPLAY_NAME_PATTERN has
|
||||
// been deleted from the root RB_LOCALE_ELEMENTS resource.
|
||||
throw new RuntimeException();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the filter used by this transliterator, or <tt>null</tt>
|
||||
* if this transliterator uses no filter.
|
||||
*/
|
||||
public UnicodeFilter getFilter() {
|
||||
return filter;
|
||||
}
|
||||
|
||||
/**
|
||||
* Changes the filter used by this transliterator. If the filter
|
||||
* is set to <tt>null</tt> then no filtering will occur.
|
||||
*
|
||||
* <p>Callers must take care if a transliterator is in use by
|
||||
* multiple threads. The filter should not be changed by one
|
||||
* thread while another thread may be transliterating.
|
||||
*/
|
||||
public void setFilter(UnicodeFilter filter) {
|
||||
this.filter = filter;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns this transliterator's inverse. See the class
|
||||
* documentation for details. This implementation simply inverts
|
||||
* the two entities in the ID and attempts to retrieve the
|
||||
* resulting transliterator. That is, if <code>getID()</code>
|
||||
* returns "A-B", then this method will return the result of
|
||||
* <code>getInstance("B-A")</code>, or <code>null</code> if that
|
||||
* call fails.
|
||||
*
|
||||
* <p>This method does not take filtering into account. The
|
||||
* returned transliterator will have no filter.
|
||||
*
|
||||
* <p>Subclasses with knowledge of their inverse may wish to
|
||||
* override this method.
|
||||
*
|
||||
* @return a transliterator that is an inverse, not necessarily
|
||||
* exact, of this transliterator, or <code>null</code> if no such
|
||||
* transliterator is registered.
|
||||
* @see #registerInstance
|
||||
*/
|
||||
public Transliterator getInverse() {
|
||||
int i = ID.indexOf('-');
|
||||
if (i >= 0) {
|
||||
String inverseID = ID.substring(i+1) + '-' + ID.substring(0, i);
|
||||
return internalGetInstance(inverseID);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a <code>Transliterator</code> object given its ID.
|
||||
* The ID must be either a system transliterator ID or a ID registered
|
||||
* using <code>registerInstance()</code>.
|
||||
*
|
||||
* @param ID a valid ID, as enumerated by <code>getAvailableIDs()</code>
|
||||
* @return A <code>Transliterator</code> object with the given ID
|
||||
* @exception IllegalArgumentException if the given ID is invalid.
|
||||
* @see #registerInstance
|
||||
* @see #getAvailableIDs
|
||||
* @see #getID
|
||||
*/
|
||||
public static Transliterator getInstance(String ID) {
|
||||
Transliterator t = internalGetInstance(ID);
|
||||
if (t != null) {
|
||||
return t;
|
||||
}
|
||||
throw new IllegalArgumentException("Unsupported transliterator: "
|
||||
+ ID);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a transliterator object given its ID. Unlike getInstance(),
|
||||
* this method returns null if it cannot make use of the given ID.
|
||||
*/
|
||||
private static Transliterator internalGetInstance(String ID) {
|
||||
Object obj = cache.get(ID);
|
||||
RuleBasedTransliterator.Data data = null;
|
||||
|
||||
if (obj instanceof RuleBasedTransliterator.Data) {
|
||||
data = (RuleBasedTransliterator.Data) obj;
|
||||
// Fall through to construct transliterator from cached Data object.
|
||||
} else if (obj instanceof Class) {
|
||||
try {
|
||||
return (Transliterator) ((Class) obj).newInstance();
|
||||
} catch (InstantiationException e) {
|
||||
} catch (IllegalAccessException e2) {}
|
||||
} else {
|
||||
synchronized (cache) {
|
||||
boolean isReverse = (obj == REVERSE_RULE_BASED_PLACEHOLDER);
|
||||
String resourceName = RB_RULE_BASED_PREFIX;
|
||||
int i = ID.indexOf('-');
|
||||
if (i < 0) {
|
||||
resourceName += ID;
|
||||
} else {
|
||||
String IDLeft = ID.substring(0, i);
|
||||
String IDRight = ID.substring(i+1);
|
||||
resourceName += isReverse ? (IDRight + IDLeft)
|
||||
: (IDLeft + IDRight);
|
||||
}
|
||||
try {
|
||||
ResourceBundle resource = ResourceBundle.getBundle(resourceName);
|
||||
|
||||
data = RuleBasedTransliterator.parse(resource.getString(RB_RULE),
|
||||
isReverse
|
||||
? RuleBasedTransliterator.REVERSE
|
||||
: RuleBasedTransliterator.FORWARD);
|
||||
|
||||
cache.put(ID, data);
|
||||
// Fall through to construct transliterator from Data object.
|
||||
} catch (MissingResourceException e) {}
|
||||
}
|
||||
}
|
||||
|
||||
if (data != null) {
|
||||
return new RuleBasedTransliterator(ID, data, null);
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Registers a subclass of <code>Transliterator</code> with the
|
||||
* system. This subclass must have a public constructor taking no
|
||||
* arguments. When that constructor is called, the resulting
|
||||
* object must return the <code>ID</code> passed to this method if
|
||||
* its <code>getID()</code> method is called.
|
||||
*
|
||||
* @param ID the result of <code>getID()</code> for this
|
||||
* transliterator
|
||||
* @param transClass a subclass of <code>Transliterator</code>
|
||||
* @see #registerInstance
|
||||
* @see #unregister
|
||||
*/
|
||||
public static void registerClass(String ID, Class transClass) {
|
||||
cache.put(ID, transClass);
|
||||
}
|
||||
|
||||
/**
|
||||
* Unregisters a transliterator or class. This may be either
|
||||
* a system transliterator or a user transliterator or class.
|
||||
*
|
||||
* @param ID the ID of the transliterator or class
|
||||
* @return the <code>Object</code> that was registered with
|
||||
* <code>ID</code>, or <code>null</code> if none was
|
||||
* @see #registerInstance
|
||||
* @see #registerClass
|
||||
*/
|
||||
public static Object unregister(String ID) {
|
||||
return cache.remove(ID);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an enumeration over the programmatic names of registered
|
||||
* <code>Transliterator</code> objects. This includes both system
|
||||
* transliterators and user transliterators registered using
|
||||
* <code>registerInstance()</code>. The enumerated names may be
|
||||
* passed to <code>getInstance()</code>.
|
||||
*
|
||||
* @return An <code>Enumeration</code> over <code>String</code> objects
|
||||
* @see #getInstance
|
||||
* @see #registerInstance
|
||||
*/
|
||||
public static final Enumeration getAvailableIDs() {
|
||||
return cache.keys();
|
||||
}
|
||||
|
||||
static {
|
||||
ResourceBundle bundle = ResourceBundle.getBundle(RB_LOCALE_ELEMENTS);
|
||||
|
||||
try {
|
||||
String[] ruleBasedIDs = bundle.getStringArray(RB_RULE_BASED_IDS);
|
||||
|
||||
cache = new Hashtable();
|
||||
|
||||
for (int i=0; i<ruleBasedIDs.length; ++i) {
|
||||
String ID = ruleBasedIDs[i];
|
||||
boolean isReverse = (ID.charAt(0) == '*');
|
||||
if (isReverse) {
|
||||
ID = ID.substring(1);
|
||||
}
|
||||
cache.put(ID, isReverse ? REVERSE_RULE_BASED_PLACEHOLDER
|
||||
: RULE_BASED_PLACEHOLDER);
|
||||
}
|
||||
} catch (MissingResourceException e) {}
|
||||
|
||||
cache.put(HexToUnicodeTransliterator._ID,
|
||||
HexToUnicodeTransliterator.class);
|
||||
cache.put(UnicodeToHexTransliterator._ID,
|
||||
UnicodeToHexTransliterator.class);
|
||||
}
|
||||
}
|
22
icu4j/src/com/ibm/icu/text/UnicodeFilter.java
Executable file
22
icu4j/src/com/ibm/icu/text/UnicodeFilter.java
Executable file
|
@ -0,0 +1,22 @@
|
|||
package com.ibm.text;
|
||||
|
||||
/**
 * <code>UnicodeFilter</code> defines a protocol for selecting a
 * subset of the full range (U+0000 to U+FFFF) of Unicode characters.
 * Currently, filters are used in conjunction with classes like {@link
 * Transliterator} to only process selected characters through a
 * transformation.
 *
 * @see UnicodeFilterLogic
 */
public interface UnicodeFilter {

    /**
     * Tests whether a character belongs to the selected subset.
     * Returns <tt>true</tt> for characters that are in the subset;
     * in other words, if a character is <b>to be filtered</b>, then
     * <tt>isIn()</tt> returns <b><tt>false</tt></b>.
     *
     * @param c the character to test
     * @return <tt>true</tt> if <tt>c</tt> is in the selected subset
     */
    boolean isIn(char c);
}
|
112
icu4j/src/com/ibm/icu/text/UnicodeFilterLogic.java
Executable file
112
icu4j/src/com/ibm/icu/text/UnicodeFilterLogic.java
Executable file
|
@ -0,0 +1,112 @@
|
|||
package com.ibm.text;
|
||||
|
||||
/**
|
||||
* <code>UnicodeFilterLogic</code> provides logical operators on
|
||||
* {@link UnicodeFilter} objects. This class cannot be instantiated;
|
||||
* it consists only of static methods. The static methods return
|
||||
* filter objects that perform logical inversion (<tt>not</tt>),
|
||||
* intersection (<tt>and</tt>), or union (<tt>or</tt>) of the given
|
||||
* filter objects.
|
||||
*/
|
||||
public final class UnicodeFilterLogic {
|
||||
|
||||
/**
|
||||
* Returns a <tt>UnicodeFilter</tt> that implements the inverse of
|
||||
* the given filter.
|
||||
*/
|
||||
public static UnicodeFilter not(final UnicodeFilter f) {
|
||||
return new UnicodeFilter() {
|
||||
public boolean isIn(char c) {
|
||||
return !f.isIn(c);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a <tt>UnicodeFilter</tt> that implements a short
|
||||
* circuit AND of the result of the two given filters. That is,
|
||||
* if <tt>f.isIn()</tt> is <tt>false</tt>, then <tt>g.isIn()</tt>
|
||||
* is not called, and <tt>isIn()</tt> returns <tt>false</tt>.
|
||||
*
|
||||
* <p>Either <tt>f</tt> or <tt>g</tt> must be non-null.
|
||||
*/
|
||||
public static UnicodeFilter and(final UnicodeFilter f,
|
||||
final UnicodeFilter g) {
|
||||
if (f == null) {
|
||||
return g;
|
||||
}
|
||||
if (g == null) {
|
||||
return f;
|
||||
}
|
||||
return new UnicodeFilter() {
|
||||
public boolean isIn(char c) {
|
||||
return f.isIn(c) && g.isIn(c);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a <tt>UnicodeFilter</tt> that implements a short
|
||||
* circuit AND of the result of the given filters. That is, if
|
||||
* <tt>f[i].isIn()</tt> is <tt>false</tt>, then
|
||||
* <tt>f[j].isIn()</tt> is not called, where <tt>j > i</tt>, and
|
||||
* <tt>isIn()</tt> returns <tt>false</tt>.
|
||||
*/
|
||||
public static UnicodeFilter and(final UnicodeFilter[] f) {
|
||||
return new UnicodeFilter() {
|
||||
public boolean isIn(char c) {
|
||||
for (int i=0; i<f.length; ++i) {
|
||||
if (!f[i].isIn(c)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a <tt>UnicodeFilter</tt> that implements a short
|
||||
* circuit OR of the result of the two given filters. That is, if
|
||||
* <tt>f.isIn()</tt> is <tt>true</tt>, then <tt>g.isIn()</tt> is
|
||||
* not called, and <tt>isIn()</tt> returns <tt>true</tt>.
|
||||
*
|
||||
* <p>Either <tt>f</tt> or <tt>g</tt> must be non-null.
|
||||
*/
|
||||
public static UnicodeFilter or(final UnicodeFilter f,
|
||||
final UnicodeFilter g) {
|
||||
if (f == null) {
|
||||
return g;
|
||||
}
|
||||
if (g == null) {
|
||||
return f;
|
||||
}
|
||||
return new UnicodeFilter() {
|
||||
public boolean isIn(char c) {
|
||||
return f.isIn(c) || g.isIn(c);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a <tt>UnicodeFilter</tt> that implements a short
|
||||
* circuit OR of the result of the given filters. That is, if
|
||||
* <tt>f[i].isIn()</tt> is <tt>false</tt>, then
|
||||
* <tt>f[j].isIn()</tt> is not called, where <tt>j > i</tt>, and
|
||||
* <tt>isIn()</tt> returns <tt>true</tt>.
|
||||
*/
|
||||
public static UnicodeFilter or(final UnicodeFilter[] f) {
|
||||
return new UnicodeFilter() {
|
||||
public boolean isIn(char c) {
|
||||
for (int i=0; i<f.length; ++i) {
|
||||
if (f[i].isIn(c)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// TODO: Add nand() & nor() for convenience, if needed.
|
||||
}
|
1354
icu4j/src/com/ibm/icu/text/UnicodeSet.java
Executable file
1354
icu4j/src/com/ibm/icu/text/UnicodeSet.java
Executable file
File diff suppressed because it is too large
Load diff
172
icu4j/src/com/ibm/icu/text/UnicodeToHexTransliterator.java
Executable file
172
icu4j/src/com/ibm/icu/text/UnicodeToHexTransliterator.java
Executable file
|
@ -0,0 +1,172 @@
|
|||
package com.ibm.text;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* A transliterator that converts from Unicode characters to
|
||||
* hexadecimal Unicode escape sequences. It outputs a
|
||||
* prefix specified in the constructor and optionally converts the hex
|
||||
* digits to uppercase.
|
||||
*
|
||||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: UnicodeToHexTransliterator.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
|
||||
*/
|
||||
public class UnicodeToHexTransliterator extends Transliterator {
|
||||
|
||||
/**
|
||||
* Package accessible ID for this transliterator.
|
||||
*/
|
||||
static String _ID = "Unicode-Hex";
|
||||
|
||||
private String prefix;
|
||||
|
||||
private boolean uppercase;
|
||||
|
||||
private static final String COPYRIGHT =
|
||||
"\u00A9 IBM Corporation 1999. All rights reserved.";
|
||||
|
||||
/**
|
||||
* Constructs a transliterator.
|
||||
* @param prefix the string that will precede the four hex
|
||||
* digits for UNICODE_HEX transliterators. Ignored
|
||||
* if direction is HEX_UNICODE.
|
||||
* @param uppercase if true, the four hex digits will be
|
||||
* converted to uppercase; otherwise they will be lowercase.
|
||||
* Ignored if direction is HEX_UNICODE.
|
||||
*/
|
||||
public UnicodeToHexTransliterator(String prefix, boolean uppercase,
|
||||
UnicodeFilter filter) {
|
||||
super(_ID, filter);
|
||||
this.prefix = prefix;
|
||||
this.uppercase = uppercase;
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a transliterator with the default prefix "\u"
|
||||
* that outputs uppercase hex digits.
|
||||
*/
|
||||
public UnicodeToHexTransliterator() {
|
||||
this("\\u", true, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the string that precedes the four hex digits.
|
||||
* @return prefix string
|
||||
*/
|
||||
public String getPrefix() {
|
||||
return prefix;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the string that precedes the four hex digits.
|
||||
*
|
||||
* <p>Callers must take care if a transliterator is in use by
|
||||
* multiple threads. The prefix should not be changed by one
|
||||
* thread while another thread may be transliterating.
|
||||
* @param prefix prefix string
|
||||
*/
|
||||
public void setPrefix(String prefix) {
|
||||
this.prefix = prefix;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if this transliterator outputs uppercase hex digits.
|
||||
*/
|
||||
public boolean isUppercase() {
|
||||
return uppercase;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets if this transliterator outputs uppercase hex digits.
|
||||
*
|
||||
* <p>Callers must take care if a transliterator is in use by
|
||||
* multiple threads. The uppercase mode should not be changed by
|
||||
* one thread while another thread may be transliterating.
|
||||
* @param outputUppercase if true, then this transliterator
|
||||
* outputs uppercase hex digits.
|
||||
*/
|
||||
public void setUppercase(boolean outputUppercase) {
|
||||
uppercase = outputUppercase;
|
||||
}
|
||||
|
||||
/**
|
||||
* Transliterates a segment of a string. <code>Transliterator</code> API.
|
||||
* @param text the string to be transliterated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @return the new limit index
|
||||
*/
|
||||
public int transliterate(Replaceable text, int start, int limit) {
|
||||
int[] offsets = { start, limit, start };
|
||||
handleKeyboardTransliterate(text, offsets);
|
||||
return offsets[LIMIT];
|
||||
}
|
||||
|
||||
/**
|
||||
* Implements {@link Transliterator#handleKeyboardTransliterate}.
|
||||
*/
|
||||
protected void handleKeyboardTransliterate(Replaceable text,
|
||||
int[] offsets) {
|
||||
/**
|
||||
* Performs transliteration changing all characters to
|
||||
* Unicode hexadecimal escapes. For example, '@' -> "U+0040",
|
||||
* assuming the prefix is "U+".
|
||||
*/
|
||||
int cursor = offsets[CURSOR];
|
||||
int limit = offsets[LIMIT];
|
||||
|
||||
UnicodeFilter filter = getFilter();
|
||||
|
||||
loop:
|
||||
while (cursor < limit) {
|
||||
char c = text.charAt(cursor);
|
||||
if (filter != null && !filter.isIn(c)) {
|
||||
++cursor;
|
||||
continue;
|
||||
}
|
||||
String hex = hex(c);
|
||||
text.replace(cursor, cursor+1, hex);
|
||||
int len = hex.length();
|
||||
cursor += len; // Advance cursor by 1 and adjust for new text
|
||||
--len;
|
||||
limit += len;
|
||||
}
|
||||
|
||||
offsets[LIMIT] = limit;
|
||||
offsets[CURSOR] = cursor;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the length of the longest context required by this transliterator.
|
||||
* This is <em>preceding</em> context.
|
||||
* @param direction either <code>FORWARD</code> or <code>REVERSE</code>
|
||||
* @return maximum number of preceding context characters this
|
||||
* transliterator needs to examine
|
||||
*/
|
||||
protected int getMaximumContextLength() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Form escape sequence.
|
||||
*/
|
||||
private final String hex(char c) {
|
||||
StringBuffer buf = new StringBuffer();
|
||||
buf.append(prefix);
|
||||
if (c < 0x1000) {
|
||||
buf.append('0');
|
||||
if (c < 0x100) {
|
||||
buf.append('0');
|
||||
if (c < 0x10) {
|
||||
buf.append('0');
|
||||
}
|
||||
}
|
||||
}
|
||||
String h = Integer.toHexString(c);
|
||||
buf.append(uppercase ? h.toUpperCase() : h);
|
||||
return buf.toString();
|
||||
}
|
||||
}
|
763
icu4j/src/com/ibm/test/translit/TransliteratorTest.java
Executable file
763
icu4j/src/com/ibm/test/translit/TransliteratorTest.java
Executable file
|
@ -0,0 +1,763 @@
|
|||
import com.ibm.text.*;
|
||||
import java.text.*;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* @test
|
||||
* @summary General test of Transliterator
|
||||
*/
|
||||
public class TransliteratorTest extends IntlTest {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
new TransliteratorTest().run(args);
|
||||
}
|
||||
|
||||
/**
|
||||
* A CommonPoint legacy round-trip test for the Kana transliterator.
|
||||
*/
|
||||
// public void TestKanaRoundTrip() {
|
||||
// Transliterator t = Transliterator.getInstance("Kana");
|
||||
// StringTokenizer tok = new StringTokenizer(KANA_RT_DATA);
|
||||
// while (tok.hasMoreTokens()) {
|
||||
// String str = tok.nextToken();
|
||||
// ReplaceableString tmp = new ReplaceableString(str);
|
||||
// t.transliterate(tmp, Transliterator.FORWARD);
|
||||
//
|
||||
// str = tmp.toString();
|
||||
// tmp = new ReplaceableString(str);
|
||||
// t.transliterate(tmp, Transliterator.REVERSE);
|
||||
// t.transliterate(tmp, Transliterator.FORWARD);
|
||||
// if (!tmp.toString().equals(str)) {
|
||||
// tmp = new ReplaceableString(str);
|
||||
// t.transliterate(tmp, Transliterator.REVERSE);
|
||||
// String a = tmp.toString();
|
||||
// t.transliterate(tmp, Transliterator.FORWARD);
|
||||
// errln("FAIL: " + escape(str) + " -> " +
|
||||
// escape(a) + " -> " + escape(tmp.toString()));
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
public void TestInstantiation() {
|
||||
long ms = System.currentTimeMillis();
|
||||
String ID;
|
||||
for (Enumeration e = Transliterator.getAvailableIDs(); e.hasMoreElements(); ) {
|
||||
ID = (String) e.nextElement();
|
||||
try {
|
||||
Transliterator t = Transliterator.getInstance(ID);
|
||||
// We should get a new instance if we try again
|
||||
Transliterator t2 = Transliterator.getInstance(ID);
|
||||
if (t != t2) {
|
||||
logln(ID + ":" + t);
|
||||
} else {
|
||||
errln("FAIL: " + ID + " returned identical instances");
|
||||
}
|
||||
} catch (IllegalArgumentException ex) {
|
||||
errln("FAIL: " + ID);
|
||||
throw ex;
|
||||
}
|
||||
}
|
||||
|
||||
// Now test the failure path
|
||||
try {
|
||||
ID = "<Not a valid Transliterator ID>";
|
||||
Transliterator t = Transliterator.getInstance(ID);
|
||||
errln("FAIL: " + ID + " returned " + t);
|
||||
} catch (IllegalArgumentException ex) {
|
||||
logln("OK: Bogus ID handled properly");
|
||||
}
|
||||
|
||||
ms = System.currentTimeMillis() - ms;
|
||||
logln("Elapsed time: " + ms + " ms");
|
||||
}
|
||||
|
||||
public void TestSimpleRules() {
|
||||
/* Example: rules 1. ab>x|y
|
||||
* 2. yc>z
|
||||
*
|
||||
* []|eabcd start - no match, copy e to tranlated buffer
|
||||
* [e]|abcd match rule 1 - copy output & adjust cursor
|
||||
* [ex|y]cd match rule 2 - copy output & adjust cursor
|
||||
* [exz]|d no match, copy d to transliterated buffer
|
||||
* [exzd]| done
|
||||
*/
|
||||
expect("ab>x|y\n" +
|
||||
"yc>z",
|
||||
"eabcd", "exzd");
|
||||
|
||||
/* Another set of rules:
|
||||
* 1. ab>x|yzacw
|
||||
* 2. za>q
|
||||
* 3. qc>r
|
||||
* 4. cw>n
|
||||
*
|
||||
* []|ab Rule 1
|
||||
* [x|yzacw] No match
|
||||
* [xy|zacw] Rule 2
|
||||
* [xyq|cw] Rule 4
|
||||
* [xyqn]| Done
|
||||
*/
|
||||
expect("ab>x|yzacw\n" +
|
||||
"za>q\n" +
|
||||
"qc>r\n" +
|
||||
"cw>n",
|
||||
"ab", "xyqn");
|
||||
|
||||
/* Test categories
|
||||
*/
|
||||
Transliterator t = new RuleBasedTransliterator("<ID>",
|
||||
"dummy=\uE100\n" +
|
||||
"vowel=[aeiouAEIOU]\n" +
|
||||
"lu=[:Lu:]\n" +
|
||||
"{vowel}[{lu}>!\n" +
|
||||
"{vowel}>&\n" +
|
||||
"!]{lu}>^\n" +
|
||||
"{lu}>*\n" +
|
||||
"a>ERROR");
|
||||
expect(t, "abcdefgABCDEFGU", "&bcd&fg!^**!^*&");
|
||||
}
|
||||
|
||||
// Restore this test if/when it's been deciphered. In general,
|
||||
// tests that depend on a specific transliterator are subject
|
||||
// to the same fragility as tests that depend on resource data.
|
||||
|
||||
// public void TestKana() {
|
||||
// String DATA[] = {
|
||||
// "a", "\u3042",
|
||||
// "A", "\u30A2",
|
||||
// "aA", "\u3042\u30A2",
|
||||
// "aaaa", "\u3042\u3042\u3042\u3042",
|
||||
// "akasata", "\u3042\u304B\u3055\u305F",
|
||||
// };
|
||||
//
|
||||
// Transliterator t = Transliterator.getInstance("Latin-Kana");
|
||||
// Transliterator rt = Transliterator.getInstance("Kana-Latin");
|
||||
// for (int i=0; i<DATA.length; i+=2) {
|
||||
// expect(t, DATA[i], DATA[i+1], rt);
|
||||
// }
|
||||
// }
|
||||
|
||||
|
||||
/**
|
||||
* Create some inverses and confirm that they work. We have to be
|
||||
* careful how we do this, since the inverses will not be true
|
||||
* inverses -- we can't throw any random string at the composition
|
||||
* of the transliterators and expect the identity function. F x
|
||||
* F' != I. However, if we are careful about the input, we will
|
||||
* get the expected results.
|
||||
*/
|
||||
public void TestRuleBasedInverse() {
|
||||
String RULES =
|
||||
"abc>zyx\n" +
|
||||
"ab>yz\n" +
|
||||
"bc>zx\n" +
|
||||
"ca>xy\n" +
|
||||
"a>x\n" +
|
||||
"b>y\n" +
|
||||
"c>z\n" +
|
||||
|
||||
"abc<zyx\n" +
|
||||
"ab<yz\n" +
|
||||
"bc<zx\n" +
|
||||
"ca<xy\n" +
|
||||
"a<x\n" +
|
||||
"b<y\n" +
|
||||
"c<z\n" +
|
||||
|
||||
"";
|
||||
|
||||
String[] DATA = {
|
||||
// Careful here -- random strings will not work. If we keep
|
||||
// the left side to the domain and the right side to the range
|
||||
// we will be okay though (left, abc; right xyz).
|
||||
"a", "x",
|
||||
"abcacab", "zyxxxyy",
|
||||
"caccb", "xyzzy",
|
||||
};
|
||||
|
||||
Transliterator fwd = new RuleBasedTransliterator("<ID>", RULES);
|
||||
Transliterator rev = new RuleBasedTransliterator("<ID>", RULES,
|
||||
RuleBasedTransliterator.REVERSE, null);
|
||||
for (int i=0; i<DATA.length; i+=2) {
|
||||
expect(fwd, DATA[i], DATA[i+1]);
|
||||
expect(rev, DATA[i+1], DATA[i]);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Basic test of keyboard.
|
||||
*/
|
||||
public void TestKeyboard() {
|
||||
Transliterator t = new RuleBasedTransliterator("<ID>",
|
||||
"psch>Y\n"
|
||||
+"ps>y\n"
|
||||
+"ch>x\n"
|
||||
+"a>A\n");
|
||||
String DATA[] = {
|
||||
// insertion, buffer
|
||||
"a", "A",
|
||||
"p", "Ap",
|
||||
"s", "Aps",
|
||||
"c", "Apsc",
|
||||
"a", "AycA",
|
||||
"psch", "AycAY",
|
||||
null, "AycAY", // null means finishKeyboardTransliteration
|
||||
};
|
||||
|
||||
keyboardAux(t, DATA);
|
||||
}
|
||||
|
||||
/**
|
||||
* Basic test of keyboard with cursor.
|
||||
*/
|
||||
public void TestKeyboard2() {
|
||||
Transliterator t = new RuleBasedTransliterator("<ID>",
|
||||
"ych>Y\n"
|
||||
+"ps>|y\n"
|
||||
+"ch>x\n"
|
||||
+"a>A\n");
|
||||
String DATA[] = {
|
||||
// insertion, buffer
|
||||
"a", "A",
|
||||
"p", "Ap",
|
||||
"s", "Ay",
|
||||
"c", "Ayc",
|
||||
"a", "AycA",
|
||||
"p", "AycAp",
|
||||
"s", "AycAy",
|
||||
"c", "AycAyc",
|
||||
"h", "AycAY",
|
||||
null, "AycAY", // null means finishKeyboardTransliteration
|
||||
};
|
||||
|
||||
keyboardAux(t, DATA);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test keyboard transliteration with back-replacement.
|
||||
*/
|
||||
public void TestKeyboard3() {
|
||||
// We want th>z but t>y. Furthermore, during keyboard
|
||||
// transliteration we want t>y then yh>z if t, then h are
|
||||
// typed.
|
||||
String RULES =
|
||||
"t>|y\n" +
|
||||
"yh>z\n" +
|
||||
"";
|
||||
|
||||
String[] DATA = {
|
||||
// Column 1: characters to add to buffer (as if typed)
|
||||
// Column 2: expected appearance of buffer after
|
||||
// keyboard xliteration.
|
||||
"a", "a",
|
||||
"b", "ab",
|
||||
"t", "aby",
|
||||
"c", "abyc",
|
||||
"t", "abycy",
|
||||
"h", "abycz",
|
||||
null, "abycz", // null means finishKeyboardTransliteration
|
||||
};
|
||||
|
||||
Transliterator t = new RuleBasedTransliterator("<ID>", RULES);
|
||||
keyboardAux(t, DATA);
|
||||
}
|
||||
|
||||
private void keyboardAux(Transliterator t, String[] DATA) {
|
||||
int[] index = {0, 0, 0};
|
||||
ReplaceableString s = new ReplaceableString();
|
||||
for (int i=0; i<DATA.length; i+=2) {
|
||||
StringBuffer log;
|
||||
if (DATA[i] != null) {
|
||||
log = new StringBuffer(s.toString() + " + "
|
||||
+ DATA[i]
|
||||
+ " -> ");
|
||||
t.keyboardTransliterate(s, index, DATA[i]);
|
||||
} else {
|
||||
log = new StringBuffer(s.toString() + " => ");
|
||||
t.finishKeyboardTransliteration(s, index);
|
||||
}
|
||||
String str = s.toString();
|
||||
// Show the start index '{' and the cursor '|'
|
||||
log.append(str.substring(0, index[Transliterator.START])).
|
||||
append('{').
|
||||
append(str.substring(index[Transliterator.START],
|
||||
index[Transliterator.CURSOR])).
|
||||
append('|').
|
||||
append(str.substring(index[Transliterator.CURSOR]));
|
||||
if (str.equals(DATA[i+1])) {
|
||||
logln(log.toString());
|
||||
} else {
|
||||
errln("FAIL: " + log.toString() + ", expected " + DATA[i+1]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void TestArabic() {
|
||||
String DATA[] = {
|
||||
"Arabic", "\u062a\u062a\u0645\u062a\u0639\u0020"+
|
||||
"\u0627\u0644\u0644\u063a\u0629\u0020"+
|
||||
"\u0627\u0644\u0639\u0631\u0628\u0628\u064a\u0629\u0020"+
|
||||
"\u0628\u0628\u0646\u0638\u0645\u0020"+
|
||||
"\u0643\u062a\u0627\u0628\u0628\u064a\u0629\u0020"+
|
||||
"\u062c\u0645\u064a\u0644\u0629",
|
||||
};
|
||||
|
||||
Transliterator t = Transliterator.getInstance("Latin-Arabic");
|
||||
for (int i=0; i<DATA.length; i+=2) {
|
||||
expect(t, DATA[i], DATA[i+1]);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Compose the Kana transliterator forward and reverse and try
|
||||
* some strings that should come out unchanged.
|
||||
*/
|
||||
public void TestCompoundKana() {
|
||||
Transliterator kana = Transliterator.getInstance("Latin-Kana");
|
||||
Transliterator rkana = Transliterator.getInstance("Kana-Latin");
|
||||
Transliterator[] trans = { kana, rkana };
|
||||
Transliterator t = new CompoundTransliterator("<ID>", trans);
|
||||
|
||||
expect(t, "aaaaa", "aaaaa");
|
||||
}
|
||||
|
||||
/**
|
||||
* Compose the hex transliterators forward and reverse.
|
||||
*/
|
||||
public void TestCompoundHex() {
|
||||
Transliterator a = Transliterator.getInstance("Unicode-Hex");
|
||||
Transliterator b = Transliterator.getInstance("Hex-Unicode");
|
||||
Transliterator[] trans = { a, b };
|
||||
Transliterator ab = new CompoundTransliterator("ab", trans);
|
||||
String s = "abcde";
|
||||
expect(ab, s, s);
|
||||
|
||||
trans = new Transliterator[] { b, a };
|
||||
Transliterator ba = new CompoundTransliterator("ba", trans);
|
||||
ReplaceableString str = new ReplaceableString(s);
|
||||
a.transliterate(str);
|
||||
expect(ba, str.toString(), str.toString());
|
||||
}
|
||||
|
||||
/**
|
||||
* Do some basic tests of filtering.
|
||||
*/
|
||||
public void TestFiltering() {
|
||||
Transliterator hex = Transliterator.getInstance("Unicode-Hex");
|
||||
hex.setFilter(new UnicodeFilter() {
|
||||
public boolean isIn(char c) {
|
||||
return c != 'c';
|
||||
}
|
||||
});
|
||||
String s = "abcde";
|
||||
String out = hex.transliterate(s);
|
||||
String exp = "\\u0061\\u0062c\\u0064\\u0065";
|
||||
if (out.equals(exp)) {
|
||||
logln("Ok: \"" + exp + "\"");
|
||||
} else {
|
||||
logln("FAIL: \"" + out + "\", wanted \"" + exp + "\"");
|
||||
}
|
||||
}
|
||||
|
||||
//======================================================================
|
||||
// Support methods
|
||||
//======================================================================
|
||||
|
||||
void expect(String rules, String source, String expectedResult) {
|
||||
expect(new RuleBasedTransliterator("<ID>", rules), source, expectedResult);
|
||||
}
|
||||
|
||||
void expect(Transliterator t, String source, String expectedResult,
|
||||
Transliterator reverseTransliterator) {
|
||||
expect(t, source, expectedResult);
|
||||
if (reverseTransliterator != null) {
|
||||
expect(reverseTransliterator, expectedResult, source);
|
||||
}
|
||||
}
|
||||
|
||||
void expect(Transliterator t, String source, String expectedResult) {
|
||||
String result = t.transliterate(source);
|
||||
expectAux(t.getID() + ":String", source, result, expectedResult);
|
||||
|
||||
ReplaceableString rsource = new ReplaceableString(source);
|
||||
t.transliterate(rsource);
|
||||
result = rsource.toString();
|
||||
expectAux(t.getID() + ":Replaceable", source, result, expectedResult);
|
||||
|
||||
// Test keyboard (incremental) transliteration -- this result
|
||||
// must be the same after we finalize (see below).
|
||||
rsource.getStringBuffer().setLength(0);
|
||||
int[] index = { 0, 0, 0 };
|
||||
StringBuffer log = new StringBuffer();
|
||||
|
||||
for (int i=0; i<source.length(); ++i) {
|
||||
if (i != 0) {
|
||||
log.append(" + ");
|
||||
}
|
||||
log.append(source.charAt(i)).append(" -> ");
|
||||
t.keyboardTransliterate(rsource, index,
|
||||
String.valueOf(source.charAt(i)));
|
||||
// Append the string buffer with a vertical bar '|' where
|
||||
// the committed index is.
|
||||
String s = rsource.toString();
|
||||
log.append(s.substring(0, index[Transliterator.CURSOR])).
|
||||
append('|').
|
||||
append(s.substring(index[Transliterator.CURSOR]));
|
||||
}
|
||||
|
||||
// As a final step in keyboard transliteration, we must call
|
||||
// transliterate to finish off any pending partial matches that
|
||||
// were waiting for more input.
|
||||
t.finishKeyboardTransliteration(rsource, index);
|
||||
result = rsource.toString();
|
||||
log.append(" => ").append(rsource.toString());
|
||||
|
||||
expectAux(t.getID() + ":Keyboard", log.toString(),
|
||||
result.equals(expectedResult),
|
||||
expectedResult);
|
||||
}
|
||||
|
||||
void expectAux(String tag, String source,
|
||||
String result, String expectedResult) {
|
||||
expectAux(tag, source + " -> " + result,
|
||||
result.equals(expectedResult),
|
||||
expectedResult);
|
||||
}
|
||||
|
||||
void expectAux(String tag, String summary, boolean pass,
|
||||
String expectedResult) {
|
||||
if (pass) {
|
||||
logln("("+tag+") " + escape(summary));
|
||||
} else {
|
||||
errln("FAIL: ("+tag+") "
|
||||
+ escape(summary)
|
||||
+ ", expected " + escape(expectedResult));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Escape non-ASCII characters as Unicode.
|
||||
*/
|
||||
public static final String escape(String s) {
|
||||
StringBuffer buf = new StringBuffer();
|
||||
for (int i=0; i<s.length(); ++i) {
|
||||
char c = s.charAt(i);
|
||||
if (c >= ' ' && c <= 0x007F) {
|
||||
buf.append(c);
|
||||
} else {
|
||||
buf.append("\\u");
|
||||
if (c < 0x1000) {
|
||||
buf.append('0');
|
||||
if (c < 0x100) {
|
||||
buf.append('0');
|
||||
if (c < 0x10) {
|
||||
buf.append('0');
|
||||
}
|
||||
}
|
||||
}
|
||||
buf.append(Integer.toHexString(c));
|
||||
}
|
||||
}
|
||||
return buf.toString();
|
||||
}
|
||||
|
||||
/*
|
||||
static final String KANA_RT_DATA =
|
||||
"a "+
|
||||
|
||||
"ba bi bu be bo "+
|
||||
"bya byi byu bye byo "+
|
||||
"bba "+
|
||||
|
||||
"da di du de do "+
|
||||
"dya dyi dyu dye dyo "+
|
||||
"dha dhi dhu dhe dho "+
|
||||
"dda "+
|
||||
|
||||
"e "+
|
||||
|
||||
"fa fi fe fo "+
|
||||
"fya fyu fyo "+
|
||||
"ffa "+
|
||||
|
||||
"ga gi gu ge go "+
|
||||
"gya gyi gyu gye gyo "+
|
||||
"gwa gwi gwu gwe gwo "+
|
||||
"gga "+
|
||||
|
||||
"ha hi hu he ho "+
|
||||
"hya hyi hyu hye hyo "+
|
||||
"hha "+
|
||||
|
||||
"i "+
|
||||
|
||||
"ka ki ku ke ko "+
|
||||
"kwa kwi kwu kwe kwo "+
|
||||
"kya kyi kyu kye kyo "+
|
||||
"kka "+
|
||||
|
||||
"ma mi mu me mo "+
|
||||
"mya myi myu mye myo "+
|
||||
"mba mfa mma mpa mva "+
|
||||
"m'' "+
|
||||
|
||||
"na ni nu ne no "+
|
||||
"nya nyi nyu nye nyo "+
|
||||
"nn n'' n "+
|
||||
|
||||
"o "+
|
||||
|
||||
"pa pi pu pe po "+
|
||||
"pya pyi pyu pye pyo "+
|
||||
"ppa "+
|
||||
|
||||
"qa qi qu qe qo "+
|
||||
"qya qyi qyu qye qyo "+
|
||||
"qqa "+
|
||||
|
||||
"ra ri ru re ro "+
|
||||
"rya ryi ryu rye ryo "+
|
||||
"rra "+
|
||||
|
||||
"sa si su se so "+
|
||||
"sya syi syu sye syo "+
|
||||
"ssya ssa "+
|
||||
|
||||
"ta ti tu te to "+
|
||||
"tha thi thu the tho "+
|
||||
"tsa tsi tse tso "+
|
||||
"tya tyi tyu tye tyo "+
|
||||
"ttsa "+
|
||||
"tta "+
|
||||
|
||||
"u "+
|
||||
|
||||
"va vi vu ve vo "+
|
||||
"vya vyi vyu vye vyo "+
|
||||
"vva "+
|
||||
|
||||
"wa wi we wo "+
|
||||
"wwa "+
|
||||
|
||||
"ya yu ye yo "+
|
||||
"yya "+
|
||||
|
||||
"za zi zu ze zo "+
|
||||
"zya zyi zyu zye zyo "+
|
||||
"zza "+
|
||||
|
||||
"xa xi xu xe xo "+
|
||||
"xka xke "+
|
||||
"xtu "+
|
||||
"xwa "+
|
||||
"xya xyu xyo "+
|
||||
|
||||
"akka akki akku akke akko "+
|
||||
"akkya akkyu akkyo "+
|
||||
|
||||
"atta atti attu atte atto "+
|
||||
"attya attyu attyo "+
|
||||
"adda addi addu adde addo "+
|
||||
|
||||
"atcha atchi atchu atche atcho "+
|
||||
|
||||
"assa assi assu asse asso "+
|
||||
"assya assyu assyo "+
|
||||
|
||||
"ahha ahhi ahhu ahhe ahho "+
|
||||
"appa appi appu appe appo "+
|
||||
|
||||
"an "+
|
||||
"ana ani anu ane ano "+
|
||||
"anna anni annu anne anno "+
|
||||
"an'a an'i an'u an'e an'o "+
|
||||
|
||||
"annna annni annnu annne annno "+
|
||||
"an'na an'ni an'nu an'ne an'no "+
|
||||
|
||||
"anka anki anku anke anko "+
|
||||
"anga angi angu ange ango "+
|
||||
|
||||
"ansa ansi ansu anse anso "+
|
||||
"anza anzi anzu anze anzo "+
|
||||
"anzya anzyu anzyo "+
|
||||
|
||||
"anta anti antu ante anto "+
|
||||
"antya antyu antyo "+
|
||||
"anda andi andu ande ando "+
|
||||
|
||||
"ancha anchi anchu anche ancho "+
|
||||
"anja anji anju anje anjo "+
|
||||
"antsa antsu antso "+
|
||||
|
||||
"anpa anpi anpu anpe anpo "+
|
||||
"ampa ampi ampu ampe ampo "+
|
||||
|
||||
"anba anbi anbu anbe anbo "+
|
||||
"amba ambi ambu ambe ambo "+
|
||||
|
||||
"anma anmi anmu anme anmo "+
|
||||
"amma ammi ammu amme ammo "+
|
||||
|
||||
"anwa anwi anwu anwe anwo "+
|
||||
|
||||
"anha anhi anhu anhe anho "+
|
||||
|
||||
"anya anyi anyu anye anyo "+
|
||||
"annya annyi annyu annye annyo "+
|
||||
"an'ya an'yi an'yu an'ye an'yo "+
|
||||
|
||||
"kkk "+
|
||||
"ggg "+
|
||||
"sss "+
|
||||
"zzz "+
|
||||
"ttt "+
|
||||
"ddd "+
|
||||
"nnn "+
|
||||
"hhh "+
|
||||
"bbb "+
|
||||
"ppp "+
|
||||
"mmm "+
|
||||
"yyy "+
|
||||
"rrr "+
|
||||
"www ";
|
||||
*/
|
||||
|
||||
/*+
|
||||
|
||||
"A I U E O "+
|
||||
"XA XI XU XE XO "+
|
||||
|
||||
"KA KI KU KE KO "+
|
||||
"KYA KYI KYU KYE KYO "+
|
||||
"KWA KWI KWU KWE KWO "+
|
||||
"QA QI QU QE QO "+
|
||||
"QYA QYI QYU QYE QYO "+
|
||||
"XKA XKE "+
|
||||
|
||||
"GA GI GU GE GO "+
|
||||
"GYA GYI GYU GYE GYO "+
|
||||
"GWA GWI GWU GWE GWO "+
|
||||
|
||||
"SA SI SU SE SO "+
|
||||
"SHA SHI SHU SHE SHO "+
|
||||
"SYA SYI SYU SYE SYO "+
|
||||
|
||||
"ZA ZI ZU ZE ZO "+
|
||||
"ZYA ZYI ZYU ZYE ZYO "+
|
||||
"JA JI JU JE JO "+
|
||||
"JYA JYU JYO "+
|
||||
|
||||
"TA TI TU TE TO "+
|
||||
"XTU XTSU "+
|
||||
"TYA TYU TYO "+
|
||||
"CYA CYU CYO "+
|
||||
"CHA CHI CHU CHE CHO "+
|
||||
"TSA TSI TSU TSE TSO "+
|
||||
"DA DI DU DE DO "+
|
||||
"DYA DYU DYO "+
|
||||
"THA THI THU THE THO "+
|
||||
"DHA DHI DHU DHE DHO "+
|
||||
|
||||
"NA NI NU NE NO "+
|
||||
"NYA NYU NYO "+
|
||||
|
||||
"HA HI HU HE HO "+
|
||||
"HYA HYU HYO "+
|
||||
"FA FI FU FE FO "+
|
||||
"FYA FYU FYO "+
|
||||
"BA BI BU BE BO "+
|
||||
"BYA BYU BYO "+
|
||||
"PA PI PU PE PO "+
|
||||
"PYA PYU PYO "+
|
||||
|
||||
"MA MI MU ME MO "+
|
||||
"MYA MYU MYO "+
|
||||
"YA YI YU YE YO "+
|
||||
"XYA XYI XYU XYE XYO "+
|
||||
|
||||
"RA RI RU RE RO "+
|
||||
"LA LI LU LE LO "+
|
||||
"RYA RYI RYU RYE RYO "+
|
||||
"LYA LYI LYU LYE LYO "+
|
||||
|
||||
"WA WI WU WE WO "+
|
||||
"VA VI VU VE VO "+
|
||||
"VYA VYU VYO "+
|
||||
|
||||
"CYA CYI CYU CYE CYO "+
|
||||
|
||||
"NN "+
|
||||
"N' "+
|
||||
"N "+
|
||||
|
||||
"AKKA AKKI AKKU AKKE AKKO "+
|
||||
"AKKYA AKKYU AKKYO "+
|
||||
|
||||
"ATTA ATTI ATTU ATTE ATTO "+
|
||||
"ATTYA ATTYU ATTYO "+
|
||||
"ADDA ADDI ADDU ADDE ADDO "+
|
||||
|
||||
"ATCHA ATCHI ATCHU ATCHE ATCHO "+
|
||||
|
||||
"ASSA ASSI ASSU ASSE ASSO "+
|
||||
"ASSYA ASSYU ASSYO "+
|
||||
|
||||
"AHHA AHHI AHHU AHHE AHHO "+
|
||||
"APPA APPI APPU APPE APPO "+
|
||||
|
||||
"AN "+
|
||||
"ANA ANI ANU ANE ANO "+
|
||||
"ANNA ANNI ANNU ANNE ANNO "+
|
||||
"AN'A AN'I AN'U AN'E AN'O "+
|
||||
|
||||
"ANNNA ANNNI ANNNU ANNNE ANNNO "+
|
||||
"AN'NA AN'NI AN'NU AN'NE AN'NO "+
|
||||
|
||||
"ANKA ANKI ANKU ANKE ANKO "+
|
||||
"ANGA ANGI ANGU ANGE ANGO "+
|
||||
|
||||
"ANSA ANSI ANSU ANSE ANSO "+
|
||||
"ANZA ANZI ANZU ANZE ANZO "+
|
||||
"ANZYA ANZYU ANZYO "+
|
||||
|
||||
"ANTA ANTI ANTU ANTE ANTO "+
|
||||
"ANTYA ANTYU ANTYO "+
|
||||
"ANDA ANDI ANDU ANDE ANDO "+
|
||||
|
||||
"ANCHA ANCHI ANCHU ANCHE ANCHO "+
|
||||
"ANJA ANJI ANJU ANJE ANJO "+
|
||||
"ANTSA ANTSU ANTSO "+
|
||||
|
||||
"ANPA ANPI ANPU ANPE ANPO "+
|
||||
"AMPA AMPI AMPU AMPE AMPO "+
|
||||
|
||||
"ANBA ANBI ANBU ANBE ANBO "+
|
||||
"AMBA AMBI AMBU AMBE AMBO "+
|
||||
|
||||
"ANMA ANMI ANMU ANME ANMO "+
|
||||
"AMMA AMMI AMMU AMME AMMO "+
|
||||
|
||||
"ANWA ANWI ANWU ANWE ANWO "+
|
||||
|
||||
"ANHA ANHI ANHU ANHE ANHO "+
|
||||
|
||||
"ANYA ANYI ANYU ANYE ANYO "+
|
||||
"ANNYA ANNYI ANNYU ANNYE ANNYO "+
|
||||
"AN'YA AN'YI AN'YU AN'YE AN'YO "+
|
||||
|
||||
"KKK "+
|
||||
"GGG "+
|
||||
"SSS "+
|
||||
"ZZZ "+
|
||||
"TTT "+
|
||||
"DDD "+
|
||||
"NNN "+
|
||||
"HHH "+
|
||||
"BBB "+
|
||||
"PPP "+
|
||||
"MMM "+
|
||||
"YYY "+
|
||||
"RRR "+
|
||||
"WWW";*/
|
||||
}
|
118
icu4j/src/com/ibm/test/translit/UnicodeSetTest.java
Executable file
118
icu4j/src/com/ibm/test/translit/UnicodeSetTest.java
Executable file
|
@ -0,0 +1,118 @@
|
|||
import com.ibm.text.*;
|
||||
import java.text.*;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* @test
|
||||
* @summary General test of UnicodeSet
|
||||
*/
|
||||
public class UnicodeSetTest extends IntlTest {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
new UnicodeSetTest().run(args);
|
||||
}
|
||||
|
||||
public void TestPatterns() {
|
||||
UnicodeSet set = new UnicodeSet();
|
||||
expectPattern(set, "[[a-m]&[d-z]&[k-y]]", "km");
|
||||
expectPattern(set, "[[a-z]-[m-y]-[d-r]]", "aczz");
|
||||
expectPattern(set, "[a\\-z]", "--aazz");
|
||||
expectPattern(set, "[-az]", "--aazz");
|
||||
expectPattern(set, "[az-]", "--aazz");
|
||||
expectPattern(set, "[[[a-z]-[aeiou]i]]", "bdfnptvz");
|
||||
|
||||
// Throw in a test of complement
|
||||
set.complement();
|
||||
String exp = '\u0000' + "aeeoouu" + (char)('z'+1) + '\uFFFF';
|
||||
expectPairs(set, exp);
|
||||
}
|
||||
|
||||
public void TestAddRemove() {
|
||||
UnicodeSet set = new UnicodeSet();
|
||||
set.add('a', 'z');
|
||||
expectPairs(set, "az");
|
||||
set.remove('m', 'p');
|
||||
expectPairs(set, "alqz");
|
||||
set.remove('e', 'g');
|
||||
expectPairs(set, "adhlqz");
|
||||
set.remove('d', 'i');
|
||||
expectPairs(set, "acjlqz");
|
||||
set.remove('c', 'r');
|
||||
expectPairs(set, "absz");
|
||||
set.add('f', 'q');
|
||||
expectPairs(set, "abfqsz");
|
||||
set.remove('a', 'g');
|
||||
expectPairs(set, "hqsz");
|
||||
set.remove('a', 'z');
|
||||
expectPairs(set, "");
|
||||
|
||||
// Try removing an entire set from another set
|
||||
expectPattern(set, "[c-x]", "cx");
|
||||
UnicodeSet set2 = new UnicodeSet();
|
||||
expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
|
||||
set.removeAll(set2);
|
||||
expectPairs(set, "deluxx");
|
||||
|
||||
// Try adding an entire set to another set
|
||||
expectPattern(set, "[jackiemclean]", "aacceein");
|
||||
expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
|
||||
set.addAll(set2);
|
||||
expectPairs(set, "aacehort");
|
||||
|
||||
// Test commutativity
|
||||
expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
|
||||
expectPattern(set2, "[jackiemclean]", "aacceein");
|
||||
set.addAll(set2);
|
||||
expectPairs(set, "aacehort");
|
||||
}
|
||||
|
||||
void expectPattern(UnicodeSet set,
|
||||
String pattern,
|
||||
String expectedPairs) {
|
||||
set.applyPattern(pattern);
|
||||
if (!set.getPairs().equals(expectedPairs)) {
|
||||
errln("FAIL: applyPattern(\"" + pattern +
|
||||
"\") => pairs \"" +
|
||||
escape(set.getPairs()) + "\", expected \"" +
|
||||
escape(expectedPairs) + "\"");
|
||||
} else {
|
||||
logln("Ok: applyPattern(\"" + pattern +
|
||||
"\") => pairs \"" +
|
||||
escape(set.getPairs()) + "\"");
|
||||
}
|
||||
}
|
||||
|
||||
void expectPairs(UnicodeSet set, String expectedPairs) {
|
||||
if (!set.getPairs().equals(expectedPairs)) {
|
||||
errln("FAIL: Expected pair list \"" +
|
||||
escape(expectedPairs) + "\", got \"" +
|
||||
escape(set.getPairs()) + "\"");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Escape non-ASCII characters as Unicode.
|
||||
*/
|
||||
static final String escape(String s) {
|
||||
StringBuffer buf = new StringBuffer();
|
||||
for (int i=0; i<s.length(); ++i) {
|
||||
char c = s.charAt(i);
|
||||
if (c >= ' ' && c <= 0x007F) {
|
||||
buf.append(c);
|
||||
} else {
|
||||
buf.append("\\u");
|
||||
if (c < 0x1000) {
|
||||
buf.append('0');
|
||||
if (c < 0x100) {
|
||||
buf.append('0');
|
||||
if (c < 0x10) {
|
||||
buf.append('0');
|
||||
}
|
||||
}
|
||||
}
|
||||
buf.append(Integer.toHexString(c));
|
||||
}
|
||||
}
|
||||
return buf.toString();
|
||||
}
|
||||
}
|
285
icu4j/src/com/ibm/text/CompoundTransliterator.java
Executable file
285
icu4j/src/com/ibm/text/CompoundTransliterator.java
Executable file
|
@ -0,0 +1,285 @@
|
|||
package com.ibm.text;
|
||||
|
||||
import java.util.Enumeration;
|
||||
import java.util.Vector;
|
||||
|
||||
/**
|
||||
* A transliterator that is composed of two or more other
|
||||
* transliterator objects linked together. For example, if one
|
||||
* transliterator transliterates from script A to script B, and
|
||||
* another transliterates from script B to script C, the two may be
|
||||
* combined to form a new transliterator from A to C.
|
||||
*
|
||||
* <p>Composed transliterators may not behave as expected. For
|
||||
* example, inverses may not combine to form the identity
|
||||
* transliterator. See the class documentation for {@link
|
||||
* Transliterator} for details.
|
||||
*
|
||||
* <p>If a non-<tt>null</tt> <tt>UnicodeFilter</tt> is applied to a
|
||||
* <tt>CompoundTransliterator</tt>, it has the effect of being
|
||||
* logically <b>and</b>ed with the filter of each transliterator in
|
||||
* the chain.
|
||||
*
|
||||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: CompoundTransliterator.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
|
||||
*/
|
||||
public class CompoundTransliterator extends Transliterator {
|
||||
|
||||
private static final boolean DEBUG = false;
|
||||
|
||||
private Transliterator[] trans;
|
||||
|
||||
private static final String COPYRIGHT =
|
||||
"\u00A9 IBM Corporation 1999. All rights reserved.";
|
||||
|
||||
/**
|
||||
* Constructs a new compound transliterator given an array of
|
||||
* transliterators. The array of transliterators may be of any
|
||||
* length, including zero or one, however, useful compound
|
||||
* transliterators have at least two components.
|
||||
* @param transliterators array of <code>Transliterator</code>
|
||||
* objects
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
*/
|
||||
public CompoundTransliterator(String ID, Transliterator[] transliterators,
|
||||
UnicodeFilter filter) {
|
||||
super(ID, filter);
|
||||
trans = new Transliterator[transliterators.length];
|
||||
System.arraycopy(transliterators, 0, trans, 0, trans.length);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a new compound transliterator given an array of
|
||||
* transliterators. The array of transliterators may be of any
|
||||
* length, including zero or one, however, useful compound
|
||||
* transliterators have at least two components.
|
||||
* @param transliterators array of <code>Transliterator</code>
|
||||
* objects
|
||||
*/
|
||||
public CompoundTransliterator(String ID, Transliterator[] transliterators) {
|
||||
this(ID, transliterators, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of transliterators in this chain.
|
||||
* @return number of transliterators in this chain.
|
||||
*/
|
||||
public int getCount() {
|
||||
return trans.length;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the transliterator at the given index in this chain.
|
||||
* @param index index into chain, from 0 to <code>getCount() - 1</code>
|
||||
* @return transliterator at the given index
|
||||
*/
|
||||
public Transliterator getTransliterator(int index) {
|
||||
return trans[index];
|
||||
}
|
||||
|
||||
/**
|
||||
* Transliterates a segment of a string. <code>Transliterator</code> API.
|
||||
* @param text the string to be transliterated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @return the new limit index
|
||||
*/
|
||||
public int transliterate(Replaceable text, int start, int limit) {
|
||||
for (int i=0; i<trans.length; ++i) {
|
||||
limit = trans[i].transliterate(text, start, limit);
|
||||
}
|
||||
return limit;
|
||||
}
|
||||
|
||||
/**
|
||||
* Implements {@link Transliterator#handleKeyboardTransliterate}.
|
||||
*/
|
||||
protected void handleKeyboardTransliterate(Replaceable text,
|
||||
int[] index) {
|
||||
/* Call each transliterator with the same start value and
|
||||
* initial cursor index, but with the limit index as modified
|
||||
* by preceding transliterators. The cursor index must be
|
||||
* reset for each transliterator to give each a chance to
|
||||
* transliterate the text. The initial cursor index is known
|
||||
* to still point to the same place after each transliterator
|
||||
* is called because each transliterator will not change the
|
||||
* text between start and the initial value of cursor.
|
||||
*
|
||||
* IMPORTANT: After the first transliterator, each subsequent
|
||||
* transliterator only gets to transliterate text committed by
|
||||
* preceding transliterators; that is, the cursor (output
|
||||
* value) of transliterator i becomes the limit (input value)
|
||||
* of transliterator i+1. Finally, the overall limit is fixed
|
||||
* up before we return.
|
||||
*
|
||||
* Assumptions we make here:
|
||||
* (1) start <= cursor <= limit ;cursor valid on entry
|
||||
* (2) cursor <= cursor' <= limit' ;cursor doesn't move back
|
||||
* (3) cursor <= limit' ;text before cursor unchanged
|
||||
* - cursor' is the value of cursor after calling handleKT
|
||||
* - limit' is the value of limit after calling handleKT
|
||||
*/
|
||||
|
||||
/**
|
||||
* Example: 3 transliterators. This example illustrates the
|
||||
* mechanics we need to implement. S, C, and L are the start,
|
||||
* cursor, and limit. gl is the globalLimit.
|
||||
*
|
||||
* 1. h-u, changes hex to Unicode
|
||||
*
|
||||
* 4 7 a d 0 4 7 a
|
||||
* abc/u0061/u => abca/u
|
||||
* S C L S C L gl=f->a
|
||||
*
|
||||
* 2. upup, changes "x" to "XX"
|
||||
*
|
||||
* 4 7 a 4 7 a
|
||||
* abca/u => abcAA/u
|
||||
* S CL S C
|
||||
* L gl=a->b
|
||||
* 3. u-h, changes Unicode to hex
|
||||
*
|
||||
* 4 7 a 4 7 a d 0 3
|
||||
* abcAA/u => abc/u0041/u0041/u
|
||||
* S C L S C
|
||||
* L gl=b->15
|
||||
* 4. return
|
||||
*
|
||||
* 4 7 a d 0 3
|
||||
* abc/u0041/u0041/u
|
||||
* S C L
|
||||
*/
|
||||
|
||||
/**
|
||||
* One more wrinkle. If there is a filter F for the compound
|
||||
* transliterator as a whole, then we need to modify every
|
||||
* non-null filter f in the chain to be f' = F & f. Then,
|
||||
* when we're done, we restore the original filters.
|
||||
*
|
||||
* A possible future optimization is to change f to f' at
|
||||
* construction time, but then if anyone else is using the
|
||||
* transliterators in the chain outside of this context, they
|
||||
* will get unexpected results.
|
||||
*/
|
||||
UnicodeFilter F = getFilter();
|
||||
UnicodeFilter[] f = null;
|
||||
if (F != null) {
|
||||
f = new UnicodeFilter[trans.length];
|
||||
for (int i=0; i<f.length; ++i) {
|
||||
f[i] = trans[i].getFilter();
|
||||
trans[i].setFilter(UnicodeFilterLogic.and(F, f[i]));
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
int cursor = index[CURSOR];
|
||||
int limit = index[LIMIT];
|
||||
int globalLimit = limit;
|
||||
/* globalLimit is the overall limit. We keep track of this
|
||||
* since we overwrite index[LIMIT] with the previous
|
||||
* index[CURSOR]. After each transliteration, we update
|
||||
* globalLimit for insertions or deletions that have happened.
|
||||
*/
|
||||
|
||||
for (int i=0; i<trans.length; ++i) {
|
||||
index[CURSOR] = cursor; // Reset cursor
|
||||
index[LIMIT] = limit;
|
||||
|
||||
if (DEBUG) {
|
||||
System.out.print(escape(i + ": \"" +
|
||||
substring(text, index[START], index[CURSOR]) + '|' +
|
||||
substring(text, index[CURSOR], index[LIMIT]) +
|
||||
"\" -> \""));
|
||||
}
|
||||
|
||||
trans[i].handleKeyboardTransliterate(text, index);
|
||||
|
||||
if (DEBUG) {
|
||||
System.out.println(escape(
|
||||
substring(text, index[START], index[CURSOR]) + '|' +
|
||||
substring(text, index[CURSOR], index[LIMIT]) +
|
||||
'"'));
|
||||
}
|
||||
|
||||
// Adjust overall limit for insertions/deletions
|
||||
globalLimit += index[LIMIT] - limit;
|
||||
limit = index[CURSOR]; // Move limit to end of committed text
|
||||
}
|
||||
// Cursor is good where it is -- where the last
|
||||
// transliterator left it. Limit needs to be put back
|
||||
// where it was, modulo adjustments for deletions/insertions.
|
||||
index[LIMIT] = globalLimit;
|
||||
|
||||
} finally {
|
||||
// Fixup the transliterator filters, if we had to modify them.
|
||||
if (f != null) {
|
||||
for (int i=0; i<f.length; ++i) {
|
||||
trans[i].setFilter(f[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the length of the longest context required by this transliterator.
|
||||
* This is <em>preceding</em> context.
|
||||
* @return maximum number of preceding context characters this
|
||||
* transliterator needs to examine
|
||||
*/
|
||||
protected int getMaximumContextLength() {
|
||||
int max = 0;
|
||||
for (int i=0; i<trans.length; ++i) {
|
||||
int len = trans[i].getMaximumContextLength();
|
||||
if (len > max) {
|
||||
max = len;
|
||||
}
|
||||
}
|
||||
return max;
|
||||
}
|
||||
|
||||
/**
|
||||
* DEBUG
|
||||
* Returns a substring of a Replaceable.
|
||||
*/
|
||||
private static final String substring(Replaceable str, int start, int limit) {
|
||||
StringBuffer buf = new StringBuffer();
|
||||
while (start < limit) {
|
||||
buf.append(str.charAt(start++));
|
||||
}
|
||||
return buf.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* DEBUG
|
||||
* Escapes non-ASCII characters as Unicode.
|
||||
*/
|
||||
private static final String escape(String s) {
|
||||
StringBuffer buf = new StringBuffer();
|
||||
for (int i=0; i<s.length(); ++i) {
|
||||
char c = s.charAt(i);
|
||||
if (c >= ' ' && c <= 0x007F) {
|
||||
buf.append(c);
|
||||
} else {
|
||||
buf.append("\\u");
|
||||
if (c < 0x1000) {
|
||||
buf.append('0');
|
||||
if (c < 0x100) {
|
||||
buf.append('0');
|
||||
if (c < 0x10) {
|
||||
buf.append('0');
|
||||
}
|
||||
}
|
||||
}
|
||||
buf.append(Integer.toHexString(c));
|
||||
}
|
||||
}
|
||||
return buf.toString();
|
||||
}
|
||||
}
|
130
icu4j/src/com/ibm/text/HexToUnicodeTransliterator.java
Executable file
130
icu4j/src/com/ibm/text/HexToUnicodeTransliterator.java
Executable file
|
@ -0,0 +1,130 @@
|
|||
package com.ibm.text;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* A transliterator that converts from hexadecimal Unicode
|
||||
* escape sequences to the characters they represent. For example, "U+0040"
|
||||
* and '\u0040'. It recognizes the
|
||||
* prefixes "U+", "u+", "\U", and "\u". Hex values may be
|
||||
* upper- or lowercase.
|
||||
*
|
||||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: HexToUnicodeTransliterator.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
|
||||
*/
|
||||
public class HexToUnicodeTransliterator extends Transliterator {
|
||||
private static final String COPYRIGHT =
|
||||
"\u00A9 IBM Corporation 1999. All rights reserved.";
|
||||
|
||||
/**
|
||||
* Package accessible ID for this transliterator.
|
||||
*/
|
||||
static String _ID = "Hex-Unicode";
|
||||
|
||||
/**
|
||||
* Constructs a transliterator.
|
||||
*/
|
||||
public HexToUnicodeTransliterator() {
|
||||
super(_ID, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Transliterates a segment of a string. <code>Transliterator</code> API.
|
||||
* @param text the string to be transliterated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @return the new limit index
|
||||
*/
|
||||
public int transliterate(Replaceable text, int start, int limit) {
|
||||
int[] offsets = { start, limit, start };
|
||||
handleKeyboardTransliterate(text, offsets);
|
||||
return offsets[LIMIT];
|
||||
}
|
||||
|
||||
/**
|
||||
* Implements {@link Transliterator#handleKeyboardTransliterate}.
|
||||
*/
|
||||
protected void handleKeyboardTransliterate(Replaceable text,
|
||||
int[] offsets) {
|
||||
/**
|
||||
* Performs transliteration changing Unicode hexadecimal
|
||||
* escapes to characters. For example, "U+0040" -> '@'. A fixed
|
||||
* set of prefixes is recognized: "\u", "\U", "u+", "U+".
|
||||
*/
|
||||
int cursor = offsets[CURSOR];
|
||||
int limit = offsets[LIMIT];
|
||||
|
||||
int maxCursor = limit - 6;
|
||||
loop:
|
||||
while (cursor <= maxCursor) {
|
||||
char c = filteredCharAt(text, cursor + 5);
|
||||
int digit0 = Character.digit(c, 16);
|
||||
if (digit0 < 0) {
|
||||
if (c == '\\') {
|
||||
cursor += 5;
|
||||
} else if (c == 'U' || c == 'u' || c == '+') {
|
||||
cursor += 4;
|
||||
} else {
|
||||
cursor += 6;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
int u = digit0;
|
||||
|
||||
for (int i=4; i>=2; --i) {
|
||||
c = filteredCharAt(text, cursor + i);
|
||||
int digit = Character.digit(c, 16);
|
||||
if (digit < 0) {
|
||||
if (c == 'U' || c == 'u' || c == '+') {
|
||||
cursor += i-1;
|
||||
} else {
|
||||
cursor += 6;
|
||||
}
|
||||
continue loop;
|
||||
}
|
||||
u |= digit << (4 * (5-i));
|
||||
}
|
||||
|
||||
c = filteredCharAt(text, cursor);
|
||||
char d = filteredCharAt(text, cursor + 1);
|
||||
if (((c == 'U' || c == 'u') && d == '+')
|
||||
|| (c == '\\' && (d == 'U' || d == 'u'))) {
|
||||
|
||||
// At this point, we have a match; replace cursor..cursor+5
|
||||
// with u.
|
||||
text.replace(cursor, cursor+6, String.valueOf((char) u));
|
||||
limit -= 5;
|
||||
maxCursor -= 5;
|
||||
|
||||
++cursor;
|
||||
} else {
|
||||
cursor += 6;
|
||||
}
|
||||
}
|
||||
|
||||
offsets[LIMIT] = limit;
|
||||
offsets[CURSOR] = cursor;
|
||||
}
|
||||
|
||||
private char filteredCharAt(Replaceable text, int i) {
|
||||
char c;
|
||||
UnicodeFilter filter = getFilter();
|
||||
return (filter == null) ? text.charAt(i) :
|
||||
(filter.isIn(c = text.charAt(i)) ? c : '\uFFFF');
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the length of the longest context required by this transliterator.
|
||||
* This is <em>preceding</em> context.
|
||||
* @param direction either <code>FORWARD</code> or <code>REVERSE</code>
|
||||
* @return maximum number of preceding context characters this
|
||||
* transliterator needs to examine
|
||||
*/
|
||||
protected int getMaximumContextLength() {
|
||||
return 0;
|
||||
}
|
||||
}
|
77
icu4j/src/com/ibm/text/Replaceable.java
Executable file
77
icu4j/src/com/ibm/text/Replaceable.java
Executable file
|
@ -0,0 +1,77 @@
|
|||
package com.ibm.text;
|
||||
|
||||
/**
 * <code>Replaceable</code> is the contract for text in which a
 * substring can be swapped for another piece of text.  It exists so
 * that text can be edited while its style attributes are retained.
 * For example, if range (4, 8) of the string
 * "the <b>bold</b> font" is replaced with "strong", the result is
 * "the <b>strong</b> font".
 *
 * <p>Copyright &copy; IBM Corporation 1999.  All rights reserved.
 *
 * @author Alan Liu
 * @version $RCSfile: Replaceable.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
 */
public interface Replaceable {

    /**
     * Report how many characters the text holds.
     *
     * @return number of characters in text
     */
    int length();

    /**
     * Fetch a single character of the text.
     *
     * @param offset an integer between 0 and <code>length()</code>-1
     * inclusive
     * @return character of text at given offset
     */
    char charAt(int offset);

    /**
     * Copy a run of characters out of this object into a character
     * array.  Characters <code>srcStart</code> through
     * <code>srcLimit-1</code> are copied, a total of
     * <code>srcLimit-srcStart</code> characters.  They are stored in
     * <code>dst</code> at indices <code>dstStart</code> through
     * <code>dstStart + (srcLimit-srcStart) - 1</code>.
     *
     * @param srcStart the beginning index to copy, inclusive;
     * <code>0 &lt;= srcStart &lt;= srcLimit</code>.
     * @param srcLimit the ending index to copy, exclusive;
     * <code>srcStart &lt;= srcLimit &lt;= length()</code>.
     * @param dst the destination array.
     * @param dstStart the start offset in the destination array.
     */
    void getChars(int srcStart, int srcLimit, char[] dst, int dstStart);

    /**
     * Replace characters <code>start</code> through
     * <code>limit - 1</code> of this object with the given string.
     *
     * @param start the beginning index, inclusive;
     * <code>0 &lt;= start &lt;= limit</code>.
     * @param limit the ending index, exclusive;
     * <code>start &lt;= limit &lt;= length()</code>.
     * @param text the text that takes the place of the replaced range
     */
    void replace(int start, int limit, String text);

    /**
     * Replace characters <code>start</code> through
     * <code>limit - 1</code> of this object with a run of characters
     * taken from an array.
     *
     * <p>Note: the source run is described by a start and a length,
     * rather than a start and a limit, to conform to
     * <code>StringBuffer</code> and <code>System.arraycopy</code>.
     *
     * @param start the beginning index, inclusive;
     * <code>0 &lt;= start &lt;= limit</code>.
     * @param limit the ending index, exclusive;
     * <code>start &lt;= limit &lt;= length()</code>.
     * @param chars the array supplying the replacement characters
     * @param charsStart index into <code>chars</code> of the first
     * replacement character, inclusive
     * @param charsLen the number of characters of <code>chars</code>
     * to use, beginning at <code>charsStart</code>
     */
    void replace(int start, int limit, char[] chars,
                 int charsStart, int charsLen);
}
|
159
icu4j/src/com/ibm/text/ReplaceableString.java
Executable file
159
icu4j/src/com/ibm/text/ReplaceableString.java
Executable file
|
@ -0,0 +1,159 @@
|
|||
package com.ibm.text;
|
||||
|
||||
/**
|
||||
* <code>ReplaceableString</code> is an adapter class that implements the
|
||||
* <code>Replaceable</code> API around an ordinary <code>StringBuffer</code>.
|
||||
*
|
||||
* <p><em>Note:</em> This class does not support attributes and is not
|
||||
* intended for general use. Most clients will need to implement
|
||||
* {@link Replaceable} in their text representation class.
|
||||
*
|
||||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @see Replaceable
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: ReplaceableString.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
|
||||
*/
|
||||
public class ReplaceableString implements Replaceable {
|
||||
private StringBuffer buf;
|
||||
|
||||
private static final String COPYRIGHT =
|
||||
"\u00A9 IBM Corporation 1999. All rights reserved.";
|
||||
|
||||
/**
|
||||
* Construct a new object with the given initial contents.
|
||||
* @param str initial contents
|
||||
*/
|
||||
public ReplaceableString(String str) {
|
||||
buf = new StringBuffer(str);
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct a new object using <code>buf</code> for internal
|
||||
* storage. The contents of <code>buf</code> at the time of
|
||||
* construction are used as the initial contents. <em>Note!
|
||||
* Modifications to <code>buf</code> will modify this object, and
|
||||
* vice versa.</em>
|
||||
* @param buf object to be used as internal storage
|
||||
*/
|
||||
public ReplaceableString(StringBuffer buf) {
|
||||
this.buf = buf;
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct a new empty object.
|
||||
*/
|
||||
public ReplaceableString() {
|
||||
buf = new StringBuffer();
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the contents of this object as a <code>String</code>.
|
||||
* @return string contents of this object
|
||||
*/
|
||||
public String toString() {
|
||||
return buf.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the internal storage of this object. <em>Note! Any
|
||||
* changes made to the returned object affect this object's
|
||||
* contents, and vice versa.</em>
|
||||
* @return internal buffer used by this object
|
||||
*/
|
||||
public StringBuffer getStringBuffer() {
|
||||
return buf;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the number of characters contained in this object.
|
||||
* <code>Replaceable</code> API.
|
||||
*/
|
||||
public int length() {
|
||||
return buf.length();
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the character at the given position in this object.
|
||||
* <code>Replaceable</code> API.
|
||||
* @param offset offset into the contents, from 0 to
|
||||
* <code>length()</code> - 1
|
||||
*/
|
||||
public char charAt(int offset) {
|
||||
return buf.charAt(offset);
|
||||
}
|
||||
|
||||
/**
|
||||
* Copies characters from this object into the destination
|
||||
* character array. The first character to be copied is at index
|
||||
* <code>srcStart</code>; the last character to be copied is at
|
||||
* index <code>srcLimit-1</code> (thus the total number of
|
||||
* characters to be copied is <code>srcLimit-srcStart</code>). The
|
||||
* characters are copied into the subarray of <code>dst</code>
|
||||
* starting at index <code>dstStart</code> and ending at index
|
||||
* <code>dstStart + (srcLimit-srcStart) - 1</code>.
|
||||
*
|
||||
* @param srcStart the beginning index to copy, inclusive; <code>0
|
||||
* <= start <= limit</code>.
|
||||
* @param srcLimit the ending index to copy, exclusive;
|
||||
* <code>start <= limit <= length()</code>.
|
||||
* @param dst the destination array.
|
||||
* @param dstStart the start offset in the destination array.
|
||||
*/
|
||||
public void getChars(int srcStart, int srcLimit, char dst[], int dstStart) {
|
||||
buf.getChars(srcStart, srcLimit, dst, dstStart);
|
||||
}
|
||||
|
||||
/**
|
||||
* Replace zero or more characters with new characters.
|
||||
* <code>Replaceable</code> API.
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= length()</code>.
|
||||
* @param text new text to replace characters <code>start</code> to
|
||||
* <code>limit - 1</code>
|
||||
*/
|
||||
public void replace(int start, int limit, String text) {
|
||||
if (start == limit) {
|
||||
buf.insert(start, text);
|
||||
} else {
|
||||
char[] tail = null;
|
||||
if (limit < buf.length()) {
|
||||
tail = new char[buf.length() - limit];
|
||||
buf.getChars(limit, buf.length(), tail, 0);
|
||||
}
|
||||
buf.setLength(start);
|
||||
buf.append(text);
|
||||
if (tail != null) {
|
||||
buf.append(tail);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Replace a substring of this object with the given text.
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= length()</code>.
|
||||
* @param chars the text to replace characters <code>start</code>
|
||||
* to <code>limit - 1</code>
|
||||
* @param charsStart the beginning index into <code>chars</code>,
|
||||
* inclusive; <code>0 <= start <= limit</code>.
|
||||
* @param charsLen the number of characters of <code>chars</code>.
|
||||
*/
|
||||
public void replace(int start, int limit, char[] chars,
|
||||
int charsStart, int charsLen) {
|
||||
char[] tail = null;
|
||||
if (limit < buf.length()) {
|
||||
tail = new char[buf.length() - limit];
|
||||
buf.getChars(limit, buf.length(), tail, 0);
|
||||
}
|
||||
buf.setLength(start);
|
||||
buf.append(chars, charsStart, charsLen);
|
||||
if (tail != null) {
|
||||
buf.append(tail);
|
||||
}
|
||||
}
|
||||
}
|
1187
icu4j/src/com/ibm/text/RuleBasedTransliterator.java
Executable file
1187
icu4j/src/com/ibm/text/RuleBasedTransliterator.java
Executable file
File diff suppressed because it is too large
Load diff
530
icu4j/src/com/ibm/text/TransliterationRule.java
Executable file
530
icu4j/src/com/ibm/text/TransliterationRule.java
Executable file
|
@ -0,0 +1,530 @@
|
|||
package com.ibm.text;
|
||||
|
||||
import java.util.Dictionary;
|
||||
|
||||
/**
|
||||
* A transliteration rule used by
|
||||
* <code>RuleBasedTransliterator</code>.
|
||||
* <code>TransliterationRule</code> is an immutable object.
|
||||
*
|
||||
* <p>A rule consists of an input pattern and an output string. When
|
||||
* the input pattern is matched, the output string is emitted. The
|
||||
* input pattern consists of zero or more characters which are matched
|
||||
* exactly (the key) and optional context. Context must match if it
|
||||
* is specified. Context may be specified before the key, after the
|
||||
* key, or both. The key, preceding context, and following context
|
||||
* may contain variables. Variables represent a set of Unicode
|
||||
* characters, such as the letters <i>a</i> through <i>z</i>.
|
||||
* Variables are detected by looking up each character in a supplied
|
||||
* variable list to see if it has been so defined.
|
||||
*
|
||||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: TransliterationRule.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
|
||||
*/
|
||||
class TransliterationRule {
|
||||
/**
|
||||
* Constant returned by <code>getMatchDegree()</code> indicating a mismatch
|
||||
* between the text and this rule. One or more characters of the context or
|
||||
* key do not match the text.
|
||||
* @see #getMatchDegree
|
||||
*/
|
||||
public static final int MISMATCH = 0;
|
||||
|
||||
/**
|
||||
* Constant returned by <code>getMatchDegree()</code> indicating a partial
|
||||
* match between the text and this rule. All characters of the text match
|
||||
* the corresponding context or key, but more characters are required for a
|
||||
* complete match. There are some key or context characters at the end of
|
||||
* the pattern that remain unmatched because the text isn't long enough.
|
||||
* @see #getMatchDegree
|
||||
*/
|
||||
public static final int PARTIAL_MATCH = 1;
|
||||
|
||||
/**
|
||||
* Constant returned by <code>getMatchDegree()</code> indicating a complete
|
||||
* match between the text and this rule. The text matches all context and
|
||||
* key characters.
|
||||
* @see #getMatchDegree
|
||||
*/
|
||||
public static final int FULL_MATCH = 2;
|
||||
|
||||
/**
|
||||
* The string that must be matched.
|
||||
*/
|
||||
private String key;
|
||||
|
||||
/**
|
||||
* The string that is emitted if the key, anteContext, and postContext
|
||||
* are matched.
|
||||
*/
|
||||
private String output;
|
||||
|
||||
/**
|
||||
* The string that must match before the key. Must not be the empty string.
|
||||
* May be null; if null, then there is no matching requirement before the
|
||||
* key.
|
||||
*/
|
||||
private String anteContext;
|
||||
|
||||
/**
|
||||
* The string that must match after the key. Must not be the empty string.
|
||||
* May be null; if null, then there is no matching requirement after the
|
||||
* key.
|
||||
*/
|
||||
private String postContext;
|
||||
|
||||
/**
|
||||
* The position of the cursor after emitting the output string, from 0 to
|
||||
* output.length(). For most rules with no special cursor specification,
|
||||
* the cursorPos is output.length().
|
||||
*/
|
||||
private int cursorPos;
|
||||
|
||||
/**
|
||||
* A string used to implement masks().
|
||||
*/
|
||||
private String maskKey;
|
||||
|
||||
private static final String COPYRIGHT =
|
||||
"\u00A9 IBM Corporation 1999. All rights reserved.";
|
||||
|
||||
/**
|
||||
* Construct a new rule with the given key, output text, and other
|
||||
* attributes. Zero, one, or two context strings may be specified. A
|
||||
* cursor position may be specified for the output text.
|
||||
* @param key the string to match
|
||||
* @param output the string to produce when the <code>key</code> is seen
|
||||
* @param anteContext if not null and not empty, then it must be matched
|
||||
* before the <code>key</code>
|
||||
* @param postContext if not null and not empty, then it must be matched
|
||||
* after the <code>key</code>
|
||||
* @param cursorPos a position for the cursor after the <code>output</code>
|
||||
* is emitted. If less than zero, then the cursor is placed after the
|
||||
* <code>output</code>; that is, -1 is equivalent to
|
||||
* <code>output.length()</code>. If greater than
|
||||
* <code>output.length()</code> then an exception is thrown.
|
||||
* @exception IllegalArgumentException if the cursor position is out of
|
||||
* range.
|
||||
*/
|
||||
public TransliterationRule(String key, String output,
|
||||
String anteContext, String postContext,
|
||||
int cursorPos) {
|
||||
this.key = key;
|
||||
this.output = output;
|
||||
this.anteContext = (anteContext != null && anteContext.length() > 0)
|
||||
? anteContext : null;
|
||||
this.postContext = (postContext != null && postContext.length() > 0)
|
||||
? postContext : null;
|
||||
this.cursorPos = cursorPos < 0 ? output.length() : cursorPos;
|
||||
if (this.cursorPos > output.length()) {
|
||||
throw new IllegalArgumentException("Illegal cursor position");
|
||||
}
|
||||
|
||||
/* The mask key is needed when we are adding individual rules to a rule
|
||||
* set, for performance. Here are the numbers: Without mask key, 13.0
|
||||
* seconds. With mask key, 6.2 seconds. However, once the rules have
|
||||
* been added to the set, then they can be discarded to free up space.
|
||||
* This is what the freeze() method does. After freeze() has been
|
||||
* called, the method masks() must NOT be called.
|
||||
*/
|
||||
maskKey = key;
|
||||
if (postContext != null) {
|
||||
maskKey += postContext;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the length of the key. Equivalent to <code>getKey().length()</code>.
|
||||
* @return the length of the match key.
|
||||
*/
|
||||
public int getKeyLength() {
|
||||
return key.length();
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the key.
|
||||
* @return the match key.
|
||||
*/
|
||||
public String getKey() {
|
||||
return key;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the output string.
|
||||
* @return the output string.
|
||||
*/
|
||||
public String getOutput() {
|
||||
return output;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the position of the cursor within the output string.
|
||||
* @return a value from 0 to <code>getOutput().length()</code>, inclusive.
|
||||
*/
|
||||
public int getCursorPos() {
|
||||
return cursorPos;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the preceding context length. This method is needed to
|
||||
* support the <code>Transliterator</code> method
|
||||
* <code>getMaximumContextLength()</code>.
|
||||
*/
|
||||
public int getAnteContextLength() {
|
||||
return anteContext == null ? 0 : anteContext.length();
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true if this rule masks another rule. If r1 masks r2 then
|
||||
* r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks
|
||||
* r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y".
|
||||
* "[c]a>x" masks "[dc]a>y".
|
||||
*
|
||||
* <p>This method must not be called after freeze() is called.
|
||||
*/
|
||||
public boolean masks(TransliterationRule r2) {
|
||||
/* There are three cases of masking. In each instance, rule1
|
||||
* masks rule2.
|
||||
*
|
||||
* 1. KEY mask: len(key1) < len(key2), key2 starts with key1.
|
||||
*
|
||||
* 2. PREFIX mask: key1 == key2, len(prefix1) < len(prefix2),
|
||||
* prefix2 ends with prefix1, suffix2 starts with suffix1.
|
||||
*
|
||||
* 3. SUFFIX mask: key1 == key2, len(suffix1) < len(suffix2),
|
||||
* prefix2 ends with prefix1, suffix2 starts with suffix1.
|
||||
*/
|
||||
|
||||
/* LIMITATION of the current mask algorithm: Some rule
|
||||
* maskings are currently not detected. For example,
|
||||
* "{Lu}]a>x" masks "A]a>y". To detect these sorts of masking,
|
||||
* we need a subset operator on UnicodeSet objects, which we
|
||||
* currently do not have. This can be added later.
|
||||
*/
|
||||
return ((maskKey.length() < r2.maskKey.length() &&
|
||||
r2.maskKey.startsWith(maskKey)) ||
|
||||
(r2.anteContext != null && maskKey.equals(r2.maskKey) &&
|
||||
((anteContext == null) ||
|
||||
(anteContext.length() < r2.anteContext.length() &&
|
||||
r2.anteContext.endsWith(anteContext)))));
|
||||
}
|
||||
|
||||
/**
|
||||
* Free up space. Once this method is called, masks() must NOT be called.
|
||||
* If it is called, an exception will be thrown.
|
||||
*/
|
||||
public void freeze() {
|
||||
maskKey = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a string representation of this object.
|
||||
* @return string representation of this object
|
||||
*/
|
||||
public String toString() {
|
||||
return getClass().getName() + '['
|
||||
+ escape((anteContext != null ? ("[" + anteContext + ']') : "")
|
||||
+ key
|
||||
+ (postContext != null ? ("[" + postContext + ']') : "")
|
||||
+ " -> "
|
||||
+ (cursorPos < output.length()
|
||||
? (output.substring(0, cursorPos) + '|' + output.substring(cursorPos))
|
||||
: output))
|
||||
+ ']';
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true if this rule matches the given text. The text being matched
|
||||
* occupies a virtual buffer consisting of the contents of
|
||||
* <code>result</code> concatenated to a substring of <code>text</code>.
|
||||
* The substring is specified by <code>start</code> and <code>limit</code>.
|
||||
* The value of <code>cursor</code> is an index into this virtual buffer,
|
||||
* from 0 to the length of the buffer. In terms of the parameters,
|
||||
* <code>cursor</code> must be between 0 and <code>result.length() + limit -
|
||||
* start</code>.
|
||||
* @param text the untranslated text
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param result translated text so far
|
||||
* @param cursor position at which to translate next, an offset into result.
|
||||
* If greater than or equal to result.length(), represents offset start +
|
||||
* cursor - result.length() into text.
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
*/
|
||||
public boolean matches(String text, int start, int limit,
|
||||
StringBuffer result, int cursor,
|
||||
Dictionary variables,
|
||||
UnicodeFilter filter) {
|
||||
return
|
||||
(anteContext == null
|
||||
|| regionMatches(text, start, limit, result,
|
||||
cursor - anteContext.length(),
|
||||
anteContext, variables, filter)) &&
|
||||
regionMatches(text, start, limit, result, cursor,
|
||||
key, variables, filter) &&
|
||||
(postContext == null
|
||||
|| regionMatches(text, start, limit, result,
|
||||
cursor + key.length(),
|
||||
postContext, variables, filter));
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true if this rule matches the given text.
|
||||
* @param text the text, both translated and untranslated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param cursor position at which to translate next, representing offset
|
||||
* into text. This value must be between <code>start</code> and
|
||||
* <code>limit</code>.
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
*/
|
||||
public boolean matches(Replaceable text, int start, int limit,
|
||||
int cursor, Dictionary variables,
|
||||
UnicodeFilter filter) {
|
||||
return
|
||||
(anteContext == null
|
||||
|| regionMatches(text, start, limit, cursor - anteContext.length(),
|
||||
anteContext, variables, filter)) &&
|
||||
regionMatches(text, start, limit, cursor,
|
||||
key, variables, filter) &&
|
||||
(postContext == null
|
||||
|| regionMatches(text, start, limit, cursor + key.length(),
|
||||
postContext, variables, filter));
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the degree of match between this rule and the given text. The
|
||||
* degree of match may be mismatch, a partial match, or a full match. A
|
||||
* mismatch means at least one character of the text does not match the
|
||||
* context or key. A partial match means some context and key characters
|
||||
* match, but the text is not long enough to match all of them. A full
|
||||
* match means all context and key characters match.
|
||||
* @param text the text, both translated and untranslated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param cursor position at which to translate next, representing offset
|
||||
* into text. This value must be between <code>start</code> and
|
||||
* <code>limit</code>.
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
* @return one of <code>MISMATCH</code>, <code>PARTIAL_MATCH</code>, or
|
||||
* <code>FULL_MATCH</code>.
|
||||
* @see #MISMATCH
|
||||
* @see #PARTIAL_MATCH
|
||||
* @see #FULL_MATCH
|
||||
*/
|
||||
public int getMatchDegree(Replaceable text, int start, int limit,
|
||||
int cursor, Dictionary variables,
|
||||
UnicodeFilter filter) {
|
||||
if (anteContext != null
|
||||
&& !regionMatches(text, start, limit, cursor - anteContext.length(),
|
||||
anteContext, variables, filter)) {
|
||||
return MISMATCH;
|
||||
}
|
||||
int len = getRegionMatchLength(text, start, limit, cursor,
|
||||
key, variables, filter);
|
||||
if (len < 0) {
|
||||
return MISMATCH;
|
||||
}
|
||||
if (len < key.length()) {
|
||||
return PARTIAL_MATCH;
|
||||
}
|
||||
if (postContext == null) {
|
||||
return FULL_MATCH;
|
||||
}
|
||||
len = getRegionMatchLength(text, start, limit,
|
||||
cursor + key.length(),
|
||||
postContext, variables, filter);
|
||||
return (len < 0) ? MISMATCH
|
||||
: ((len == postContext.length()) ? FULL_MATCH
|
||||
: PARTIAL_MATCH);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true if a template matches the text. The entire length of the
|
||||
* template is compared to the text at the cursor. As in
|
||||
* <code>matches()</code>, the text being matched occupies a virtual buffer
|
||||
* consisting of the contents of <code>result</code> concatenated to a
|
||||
* substring of <code>text</code>. See <code>matches()</code> for details.
|
||||
* @param text the untranslated text
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param result translated text so far
|
||||
* @param cursor position at which to translate next, an offset into result.
|
||||
* If greater than or equal to result.length(), represents offset start +
|
||||
* cursor - result.length() into text.
|
||||
* @param template the text to match against. All characters must match.
|
||||
* @param variables a dictionary of variables mapping <code>Character</code>
|
||||
* to <code>UnicodeSet</code>
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
* @return true if there is a match
|
||||
*/
|
||||
protected static boolean regionMatches(String text, int start, int limit,
|
||||
StringBuffer result, int cursor,
|
||||
String template,
|
||||
Dictionary variables,
|
||||
UnicodeFilter filter) {
|
||||
int rlen = result.length();
|
||||
if (cursor < 0
|
||||
|| (cursor + template.length()) > (rlen + limit - start)) {
|
||||
return false;
|
||||
}
|
||||
for (int i=0; i<template.length(); ++i, ++cursor) {
|
||||
if (!charMatches(template.charAt(i),
|
||||
cursor < rlen ? result.charAt(cursor)
|
||||
: text.charAt(cursor - rlen + start),
|
||||
variables, filter)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true if a template matches the text. The entire length of the
|
||||
* template is compared to the text at the cursor.
|
||||
* @param text the text, both translated and untranslated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param cursor position at which to translate next, representing offset
|
||||
* into text. This value must be between <code>start</code> and
|
||||
* <code>limit</code>.
|
||||
* @param template the text to match against. All characters must match.
|
||||
* @param variables a dictionary of variables mapping <code>Character</code>
|
||||
* to <code>UnicodeSet</code>
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
* @return true if there is a match
|
||||
*/
|
||||
protected static boolean regionMatches(Replaceable text, int start, int limit,
|
||||
int cursor,
|
||||
String template, Dictionary variables,
|
||||
UnicodeFilter filter) {
|
||||
if (cursor < start
|
||||
|| (cursor + template.length()) > limit) {
|
||||
return false;
|
||||
}
|
||||
for (int i=0; i<template.length(); ++i, ++cursor) {
|
||||
if (!charMatches(template.charAt(i), text.charAt(cursor),
|
||||
variables, filter)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the number of characters of the text that match this rule. If
|
||||
* there is a mismatch, return -1. If the text is not long enough to match
|
||||
* any characters, return 0.
|
||||
* @param text the text, both translated and untranslated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param cursor position at which to translate next, representing offset
|
||||
* into text. This value must be between <code>start</code> and
|
||||
* <code>limit</code>.
|
||||
* @param template the text to match against. All characters must match.
|
||||
* @param variables a dictionary of variables mapping <code>Character</code>
|
||||
* to <code>UnicodeSet</code>
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
* @return -1 if there is a mismatch, 0 if the text is not long enough to
|
||||
* match any characters, otherwise the number of characters of text that
|
||||
* match this rule.
|
||||
*/
|
||||
protected static int getRegionMatchLength(Replaceable text, int start,
|
||||
int limit, int cursor,
|
||||
String template,
|
||||
Dictionary variables,
|
||||
UnicodeFilter filter) {
|
||||
if (cursor < start) {
|
||||
return -1;
|
||||
}
|
||||
int i;
|
||||
for (i=0; i<template.length() && cursor<limit; ++i, ++cursor) {
|
||||
if (!charMatches(template.charAt(i), text.charAt(cursor),
|
||||
variables, filter)) {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true if the given key matches the given text. This method
|
||||
* accounts for the fact that the key character may represent a character
|
||||
* set. Note that the key and text characters may not be interchanged
|
||||
* without altering the results.
|
||||
* @param keyChar a character in the match key
|
||||
* @param textChar a character in the text being transliterated
|
||||
* @param variables a dictionary of variables mapping <code>Character</code>
|
||||
* to <code>UnicodeSet</code>
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
*/
|
||||
protected static boolean charMatches(char keyChar, char textChar,
|
||||
Dictionary variables, UnicodeFilter filter) {
|
||||
UnicodeSet set = null;
|
||||
return (filter == null || filter.isIn(textChar)) &&
|
||||
((set = (UnicodeSet) variables.get(new Character(keyChar)))
|
||||
== null) ?
|
||||
keyChar == textChar : set.contains(textChar);
|
||||
}
|
||||
|
||||
/**
|
||||
* Escape non-ASCII characters as Unicode.
|
||||
*/
|
||||
public static final String escape(String s) {
|
||||
StringBuffer buf = new StringBuffer();
|
||||
for (int i=0; i<s.length(); ++i) {
|
||||
char c = s.charAt(i);
|
||||
if (c >= ' ' && c <= 0x007F) {
|
||||
buf.append(c);
|
||||
} else {
|
||||
buf.append("\\u");
|
||||
if (c < 0x1000) {
|
||||
buf.append('0');
|
||||
if (c < 0x100) {
|
||||
buf.append('0');
|
||||
if (c < 0x10) {
|
||||
buf.append('0');
|
||||
}
|
||||
}
|
||||
}
|
||||
buf.append(Integer.toHexString(c));
|
||||
}
|
||||
}
|
||||
return buf.toString();
|
||||
}
|
||||
}
|
218
icu4j/src/com/ibm/text/TransliterationRuleSet.java
Executable file
218
icu4j/src/com/ibm/text/TransliterationRuleSet.java
Executable file
|
@ -0,0 +1,218 @@
|
|||
package com.ibm.text;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* A set of rules for a <code>RuleBasedTransliterator</code>. This set encodes
|
||||
* the transliteration in one direction from one set of characters or short
|
||||
* strings to another. A <code>RuleBasedTransliterator</code> consists of up to
|
||||
* two such sets, one for the forward direction, and one for the reverse.
|
||||
*
|
||||
* <p>A <code>TransliterationRuleSet</code> has one important operation, that of
|
||||
* finding a matching rule at a given point in the text. This is accomplished
|
||||
* by the <code>findMatch()</code> method.
|
||||
*
|
||||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: TransliterationRuleSet.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
|
||||
*/
|
||||
class TransliterationRuleSet {
|
||||
/* Note: There was an old implementation that indexed by first letter of
|
||||
* key. Problem with this is that key may not have a meaningful first
|
||||
* letter; e.g., {Lu}>*. One solution is to keep a separate vector of all
|
||||
* rules whose intial key letter is a category variable. However, the
|
||||
* problem is that they must be kept in order with respect to other rules.
|
||||
* One solution -- add a sequence number to each rule. Do the usual
|
||||
* first-letter lookup, and also a lookup from the spare bin with rules like
|
||||
* {Lu}>*. Take the lower sequence number. This seems complex and not
|
||||
* worth the trouble, but we may revisit this later. For documentation (or
|
||||
* possible resurrection) the old code is included below, commented out
|
||||
* with the remark "// OLD INDEXED IMPLEMENTATION". Under the old
|
||||
* implementation, <code>rules</code> is a Hashtable, not a Vector.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Vector of rules, in the order added.
|
||||
*/
|
||||
private Vector rules;
|
||||
|
||||
/**
|
||||
* Length of the longest preceding context
|
||||
*/
|
||||
private int maxContextLength;
|
||||
|
||||
private static final String COPYRIGHT =
|
||||
"\u00A9 IBM Corporation 1999. All rights reserved.";
|
||||
|
||||
/**
|
||||
* Construct a new empty rule set.
|
||||
*/
|
||||
public TransliterationRuleSet() {
|
||||
rules = new Vector();
|
||||
maxContextLength = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the maximum context length.
|
||||
* @return the length of the longest preceding context.
|
||||
*/
|
||||
public int getMaximumContextLength() {
|
||||
return maxContextLength;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a rule to this set. Rules are added in order, and order is
|
||||
* significant.
|
||||
*
|
||||
* <p>Once freeze() is called, this method must not be called.
|
||||
* @param rule the rule to add
|
||||
*/
|
||||
public void addRule(TransliterationRule rule) {
|
||||
|
||||
// Build time, no checking : 3562 ms
|
||||
// Build time, with checking: 6234 ms
|
||||
|
||||
for (int i=0; i<rules.size(); ++i) {
|
||||
TransliterationRule r = (TransliterationRule) rules.elementAt(i);
|
||||
if (r.masks(rule)) {
|
||||
throw new IllegalArgumentException("Rule " + rule +
|
||||
" must precede " + r);
|
||||
}
|
||||
}
|
||||
|
||||
rules.addElement(rule);
|
||||
int len;
|
||||
if ((len = rule.getAnteContextLength()) > maxContextLength) {
|
||||
maxContextLength = len;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Free up space. Once this method is called, addRule() must NOT
|
||||
* be called again.
|
||||
*/
|
||||
public void freeze() {
|
||||
for (int i=0; i<rules.size(); ++i) {
|
||||
((TransliterationRule) rules.elementAt(i)).freeze();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Attempt to find a matching rule at the specified point in the text. The
|
||||
* text being matched occupies a virtual buffer consisting of the contents
|
||||
* of <code>result</code> concatenated to a substring of <code>text</code>.
|
||||
* The substring is specified by <code>start</code> and <code>limit</code>.
|
||||
* The value of <code>cursor</code> is an index into this virtual buffer,
|
||||
* from 0 to the length of the buffer. In terms of the parameters,
|
||||
* <code>cursor</code> must be between 0 and <code>result.length() + limit -
|
||||
* start</code>.
|
||||
* @param text the untranslated text
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param result tranlated text
|
||||
* @param cursor position at which to translate next, an offset into result.
|
||||
* If greater than or equal to result.length(), represents offset start +
|
||||
* cursor - result.length() into text.
|
||||
* @param variables a dictionary mapping variables to the sets they
|
||||
* represent (maps <code>Character</code> to <code>UnicodeSet</code>)
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
* @return the matching rule, or null if none found.
|
||||
*/
|
||||
public TransliterationRule findMatch(String text, int start, int limit,
|
||||
StringBuffer result, int cursor,
|
||||
Dictionary variables,
|
||||
UnicodeFilter filter) {
|
||||
for (Enumeration e = rules.elements(); e.hasMoreElements(); ) {
|
||||
TransliterationRule rule = (TransliterationRule) e.nextElement();
|
||||
if (rule.matches(text, start, limit, result, cursor, variables, filter)) {
|
||||
return rule;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Attempt to find a matching rule at the specified point in the text.
|
||||
* @param text the text, both translated and untranslated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param cursor position at which to translate next, representing offset
|
||||
* into text. This value must be between <code>start</code> and
|
||||
* <code>limit</code>.
|
||||
* @param variables a dictionary mapping variables to the sets they
|
||||
* represent (maps <code>Character</code> to <code>UnicodeSet</code>)
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
* @return the matching rule, or null if none found.
|
||||
*/
|
||||
public TransliterationRule findMatch(Replaceable text, int start, int limit,
|
||||
int cursor,
|
||||
Dictionary variables,
|
||||
UnicodeFilter filter) {
|
||||
for (Enumeration e = rules.elements(); e.hasMoreElements(); ) {
|
||||
TransliterationRule rule = (TransliterationRule) e.nextElement();
|
||||
if (rule.matches(text, start, limit, cursor, variables, filter)) {
|
||||
return rule;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Attempt to find a matching rule at the specified point in the text.
|
||||
* Unlike <code>findMatch()</code>, this method does an incremental match.
|
||||
* An incremental match requires that there be no partial matches that might
|
||||
* pre-empt the full match that is found. If there are partial matches,
|
||||
* then null is returned. A non-null result indicates that a full match has
|
||||
* been found, and that it cannot be pre-empted by a partial match
|
||||
* regardless of what additional text is added to the translation buffer.
|
||||
* @param text the text, both translated and untranslated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param cursor position at which to translate next, representing offset
|
||||
* into text. This value must be between <code>start</code> and
|
||||
* <code>limit</code>.
|
||||
* @param variables a dictionary mapping variables to the sets they
|
||||
* represent (maps <code>Character</code> to <code>UnicodeSet</code>)
|
||||
* @param partial output parameter. <code>partial[0]</code> is set to
|
||||
* true if a partial match is returned.
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
* @return the matching rule, or null if none found, or if the text buffer
|
||||
* does not have enough text yet to unambiguously match a rule.
|
||||
*/
|
||||
public TransliterationRule findIncrementalMatch(Replaceable text, int start,
|
||||
int limit, int cursor,
|
||||
Dictionary variables,
|
||||
boolean partial[],
|
||||
UnicodeFilter filter) {
|
||||
partial[0] = false;
|
||||
for (Enumeration e = rules.elements(); e.hasMoreElements(); ) {
|
||||
TransliterationRule rule = (TransliterationRule) e.nextElement();
|
||||
int match = rule.getMatchDegree(text, start, limit, cursor,
|
||||
variables, filter);
|
||||
switch (match) {
|
||||
case TransliterationRule.FULL_MATCH:
|
||||
return rule;
|
||||
case TransliterationRule.PARTIAL_MATCH:
|
||||
partial[0] = true;
|
||||
return null;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
860
icu4j/src/com/ibm/text/Transliterator.java
Executable file
860
icu4j/src/com/ibm/text/Transliterator.java
Executable file
|
@ -0,0 +1,860 @@
|
|||
package com.ibm.text;
|
||||
|
||||
import java.util.*;
|
||||
import java.text.MessageFormat;
|
||||
|
||||
/**
|
||||
* <code>Transliterator</code> is an abstract class that
|
||||
* transliterates text from one format to another. The most common
|
||||
* kind of transliterator is a script, or alphabet, transliterator.
|
||||
* For example, a Russian to Latin transliterator changes Russian text
|
||||
* written in Cyrillic characters to phonetically equivalent Latin
|
||||
* characters. It does not <em>translate</em> Russian to English!
|
||||
* Transliteration, unlike translation, operates on characters, without
|
||||
* reference to the meanings of words and sentences.
|
||||
*
|
||||
* <p>Although script conversion is its most common use, a
|
||||
* transliterator can actually perform a more general class of tasks.
|
||||
* In fact, <code>Transliterator</code> defines a very general API
|
||||
* which specifies only that a segment of the input text is replaced
|
||||
* by new text. The particulars of this conversion are determined
|
||||
* entirely by subclasses of <code>Transliterator</code>.
|
||||
*
|
||||
* <p><b>Transliterators are stateless</b>
|
||||
*
|
||||
* <p><code>Transliterator</code> objects are <em>stateless</em>; they
|
||||
* retain no information between calls to
|
||||
* <code>transliterate()</code>. As a result, threads may share
|
||||
* transliterators without synchronizing them. This might seem to
|
||||
* limit the complexity of the transliteration operation. In
|
||||
* practice, subclasses perform complex transliterations by delaying
|
||||
* the replacement of text until it is known that no other
|
||||
* replacements are possible. In other words, although the
|
||||
* <code>Transliterator</code> objects are stateless, the source text
|
||||
* itself embodies all the needed information, and delayed operation
|
||||
* allows arbitrary complexity.
|
||||
*
|
||||
* <p><b>Batch transliteration</b>
|
||||
*
|
||||
* <p>The simplest way to perform transliteration is all at once, on a
|
||||
* string of existing text. This is referred to as <em>batch</em>
|
||||
* transliteration. For example, given a string <code>input</code>
|
||||
* and a transliterator <code>t</code>, the call
|
||||
*
|
||||
* <blockquote><code>String result = t.transliterate(input);
|
||||
* </code></blockquote>
|
||||
*
|
||||
* will transliterate it and return the result. Other methods allow
|
||||
* the client to specify a substring to be transliterated and to use
|
||||
* {@link Replaceable} objects instead of strings, in order to
|
||||
* preserve out-of-band information (such as text styles).
|
||||
*
|
||||
* <p><b>Keyboard transliteration</b>
|
||||
*
|
||||
* <p>Somewhat more involved is <em>keyboard</em>, or incremental
|
||||
* transliteration. This is the transliteration of text that is
|
||||
* arriving from some source (typically the user's keyboard) one
|
||||
* character at a time, or in some other piecemeal fashion.
|
||||
*
|
||||
* <p>In keyboard transliteration, a <code>Replaceable</code> buffer
|
||||
* stores the text. As text is inserted, as much as possible is
|
||||
* transliterated on the fly. This means a GUI that displays the
|
||||
* contents of the buffer may show text being modified as each new
|
||||
* character arrives.
|
||||
*
|
||||
* <p>Consider the simple <code>RuleBasedTransliterator</code>:
|
||||
*
|
||||
* <blockquote><code>
|
||||
* th>{theta}<br>
|
||||
* t>{tau}
|
||||
* </code></blockquote>
|
||||
*
|
||||
* When the user types 't', nothing will happen, since the
|
||||
* transliterator is waiting to see if the next character is 'h'. To
|
||||
* remedy this, we introduce the notion of a cursor, marked by a '|'
|
||||
* in the output string:
|
||||
*
|
||||
* <blockquote><code>
|
||||
* t>|{tau}<br>
|
||||
* {tau}h>{theta}
|
||||
* </code></blockquote>
|
||||
*
|
||||
* Now when the user types 't', tau appears, and if the next character
|
||||
* is 'h', the tau changes to a theta. This is accomplished by
|
||||
* maintaining a cursor position (independent of the insertion point,
|
||||
* and invisible in the GUI) across calls to
|
||||
* <code>keyboardTransliterate()</code>. Typically, the cursor will
|
||||
* be coincident with the insertion point, but in a case like the one
|
||||
* above, it will precede the insertion point.
|
||||
*
|
||||
* <p>Keyboard transliteration methods maintain a set of three indices
|
||||
* that are updated with each call to
|
||||
* <code>keyboardTransliterate()</code>, including the cursor, start,
|
||||
* and limit. Since these indices are changed by the method, they are
|
||||
* passed in an <code>int[]</code> array. The <code>START</code> index
|
||||
* marks the beginning of the substring that the transliterator will
|
||||
* look at. It is advanced as text becomes committed (but it is not
|
||||
* the committed index; that's the <code>CURSOR</code>). The
|
||||
* <code>CURSOR</code> index, described above, marks the point at
|
||||
* which the transliterator last stopped, either because it reached
|
||||
* the end, or because it required more characters to disambiguate
|
||||
* between possible inputs. The <code>CURSOR</code> can also be
|
||||
* explicitly set by rules in a <code>RuleBasedTransliterator</code>.
|
||||
* Any characters before the <code>CURSOR</code> index are frozen;
|
||||
* future keyboard transliteration calls within this input sequence
|
||||
* will not change them. New text is inserted at the
|
||||
* <code>LIMIT</code> index, which marks the end of the substring that
|
||||
* the transliterator looks at.
|
||||
*
|
||||
* <p>Because keyboard transliteration assumes that more characters
|
||||
* are to arrive, it is conservative in its operation. It only
|
||||
* transliterates when it can do so unambiguously. Otherwise it waits
|
||||
* for more characters to arrive. When the client code knows that no
|
||||
* more characters are forthcoming, perhaps because the user has
|
||||
* performed some input termination operation, then it should call
|
||||
* <code>finishKeyboardTransliteration()</code> to complete any
|
||||
* pending transliterations.
|
||||
*
|
||||
* <p><b>Inverses</b>
|
||||
*
|
||||
* <p>Pairs of transliterators may be inverses of one another. For
|
||||
* example, if transliterator <b>A</b> transliterates characters by
|
||||
* incrementing their Unicode value (so "abc" -> "def"), and
|
||||
* transliterator <b>B</b> decrements character values, then <b>A</b>
|
||||
* is an inverse of <b>B</b> and vice versa. If we compose <b>A</b>
|
||||
* with <b>B</b> in a compound transliterator, the result is the
|
||||
* identity transliterator, that is, a transliterator that does not
|
||||
* change its input text.
|
||||
*
|
||||
* The <code>Transliterator</code> method <code>getInverse()</code>
|
||||
* returns a transliterator's inverse, if one exists, or
|
||||
* <code>null</code> otherwise. However, the result of
|
||||
* <code>getInverse()</code> usually will <em>not</em> be a true
|
||||
* mathematical inverse. This is because true inverse transliterators
|
||||
* are difficult to formulate. For example, consider two
|
||||
* transliterators: <b>AB</b>, which transliterates the character 'A'
|
||||
* to 'B', and <b>BA</b>, which transliterates 'B' to 'A'. It might
|
||||
* seem that these are exact inverses, since
|
||||
*
|
||||
* <blockquote>"A" x <b>AB</b> -> "B"<br>
|
||||
* "B" x <b>BA</b> -> "A"</blockquote>
|
||||
*
|
||||
* where 'x' represents transliteration. However,
|
||||
*
|
||||
* <blockquote>"ABCD" x <b>AB</b> -> "BBCD"<br>
|
||||
* "BBCD" x <b>BA</b> -> "AACD"</blockquote>
|
||||
*
|
||||
* so <b>AB</b> composed with <b>BA</b> is not the
|
||||
* identity. Nonetheless, <b>BA</b> may be usefully considered to be
|
||||
* <b>AB</b>'s inverse, and it is on this basis that
|
||||
* <b>AB</b><code>.getInverse()</code> could legitimately return
|
||||
* <b>BA</b>.
|
||||
*
|
||||
* <p><b>IDs and display names</b>
|
||||
*
|
||||
* <p>A transliterator is designated by a short identifier string or
|
||||
* <em>ID</em>. IDs follow the format <em>source-destination</em>,
|
||||
* where <em>source</em> describes the entity being replaced, and
|
||||
* <em>destination</em> describes the entity replacing
|
||||
* <em>source</em>. The entities may be the names of scripts,
|
||||
* particular sequences of characters, or whatever else it is that the
|
||||
* transliterator converts to or from. For example, a transliterator
|
||||
* from Russian to Latin might be named "Russian-Latin". A
|
||||
* transliterator from keyboard escape sequences to Latin-1 characters
|
||||
* might be named "KeyboardEscape-Latin1". By convention, system
|
||||
* entity names are in English, with the initial letters of words
|
||||
* capitalized; user entity names may follow any format so long as
|
||||
* they do not contain dashes.
|
||||
*
|
||||
* <p>In addition to programmatic IDs, transliterator objects have
|
||||
* display names for presentation in user interfaces, returned by
|
||||
* {@link #getDisplayName}.
|
||||
*
|
||||
* <p><b>Factory methods and registration</b>
|
||||
*
|
||||
* <p>In general, client code should use the factory method
|
||||
* <code>getInstance()</code> to obtain an instance of a
|
||||
* transliterator given its ID. Valid IDs may be enumerated using
|
||||
* <code>getAvailableIDs()</code>. Since transliterators are
|
||||
* stateless, multiple calls to <code>getInstance()</code> with the
|
||||
* same ID will return the same object.
|
||||
*
|
||||
* <p>In addition to the system transliterators registered at startup,
|
||||
* user transliterators may be registered by calling
|
||||
* <code>registerInstance()</code> at run time. To register a
|
||||
* transliterator subclass without instantiating it (until it is
|
||||
* needed), users may call <code>registerClass()</code>.
|
||||
*
|
||||
* <p><b>Subclassing</b>
|
||||
*
|
||||
* <p>Subclasses must implement the abstract
|
||||
* <code>transliterate()</code> method. They should also override the
|
||||
* <code>transliterate()</code> method taking a <code>String</code>
|
||||
* and <code>StringBuffer</code> if the performance of these methods
|
||||
* can be improved over the performance obtained by the default
|
||||
* implementations in this class. Subclasses must also implement
|
||||
* <code>handleKeyboardTransliterate()</code>.
|
||||
*
|
||||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: Transliterator.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
|
||||
*/
|
||||
public abstract class Transliterator {
|
||||
/**
|
||||
* In the <code>keyboardTransliterate()</code>
|
||||
* <code>index[]</code> array, the beginning index, inclusive
|
||||
* @see #keyboardTransliterate
|
||||
*/
|
||||
public static final int START = 0;
|
||||
|
||||
/**
|
||||
* In the <code>keyboardTransliterate()</code>
|
||||
* <code>index[]</code> array, the ending index, exclusive
|
||||
* @see #keyboardTransliterate
|
||||
*/
|
||||
public static final int LIMIT = 1;
|
||||
|
||||
/**
|
||||
* In the <code>keyboardTransliterate()</code>
|
||||
* <code>index[]</code> array, the next character to be considered
|
||||
* for transliteration
|
||||
* @see #keyboardTransliterate
|
||||
*/
|
||||
public static final int CURSOR = 2;
|
||||
|
||||
/**
|
||||
* Programmatic name, e.g., "Latin-Arabic".
|
||||
*/
|
||||
private String ID;
|
||||
|
||||
/**
|
||||
* This transliterator's filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
*/
|
||||
private UnicodeFilter filter;
|
||||
|
||||
/**
|
||||
* Dictionary of known transliterators. Keys are <code>String</code>
|
||||
* names, values are one of the following:
|
||||
*
|
||||
* <ul><li><code>Transliterator</code> objects
|
||||
*
|
||||
* <li><code>Class</code> objects. Such objects must represent
|
||||
* subclasses of <code>Transliterator</code>, and must satisfy the
|
||||
* constraints described in <code>registerClass()</code>
|
||||
*
|
||||
* <li><code>RULE_BASED_PLACEHOLDER</code>, in which case the ID
|
||||
* will have its first '-' removed and be appended to
|
||||
* RB_RULE_BASED_PREFIX to form a resource bundle name from which
|
||||
* the RB_RULE key is looked up to obtain the rule.
|
||||
*
|
||||
* <li><code>REVERSE_RULE_BASED_PLACEHOLDER</code>. Like
|
||||
* <code>RULE_BASED_PLACEHOLDER</code>, except the entity names in
|
||||
* the ID are reversed, and the argument
|
||||
* RuleBasedTransliterator.REVERSE is passed to the
|
||||
* RuleBasedTransliterator constructor.
|
||||
* </ul>
|
||||
*/
|
||||
private static Hashtable cache;
|
||||
|
||||
/**
|
||||
* Internal object used to stand for instances of
|
||||
* <code>RuleBasedTransliterator</code> that have not been
|
||||
* constructed yet in the <code>cache</code>. When a
|
||||
* <code>getInstance()</code> call retrieves this object, it is
|
||||
* replaced by the actual <code>RuleBasedTransliterator</code>.
|
||||
* This allows <code>Transliterator</code> to delay instantiation
|
||||
* of such transliterators until they are needed.
|
||||
*/
|
||||
private static final Object RULE_BASED_PLACEHOLDER = new Object();
|
||||
|
||||
/**
|
||||
* Internal object used to stand for instances of
|
||||
* <code>RuleBasedTransliterator</code> that have not been
|
||||
* constructed yet in the <code>cache</code>. These instances are
|
||||
* constructed with an argument
|
||||
* <code>RuleBasedTransliterator.REVERSE</code>.
|
||||
*/
|
||||
private static final Object REVERSE_RULE_BASED_PLACEHOLDER = new Object();
|
||||
|
||||
/**
|
||||
* Prefix for resource bundle key for the display name for a
|
||||
* transliterator. The ID is appended to this to form the key.
|
||||
* The resource bundle value should be a String.
|
||||
*/
|
||||
private static final String RB_DISPLAY_NAME_PREFIX = "T:";
|
||||
|
||||
/**
|
||||
* Resource bundle key for display name pattern.
|
||||
* The resource bundle value should be a String forming a
|
||||
* MessageFormat pattern, e.g.:
|
||||
* "{0,choice,0#|1#{1} Transliterator|2#{1} to {2} Transliterator}".
|
||||
*/
|
||||
private static final String RB_DISPLAY_NAME_PATTERN = "TransliteratorNamePattern";
|
||||
|
||||
/**
|
||||
* Resource bundle key for the list of RuleBasedTransliterator IDs.
|
||||
* The resource bundle value should be a String[] with each element
|
||||
* being a valid ID. The ID will be appended to RB_RULE_BASED_PREFIX
|
||||
* to obtain the class name in which the RB_RULE key will be sought.
|
||||
*/
|
||||
private static final String RB_RULE_BASED_IDS = "RuleBasedTransliteratorIDs";
|
||||
|
||||
/**
|
||||
* Resource bundle containing display name keys and the
|
||||
* RB_RULE_BASED_IDS array.
|
||||
*
|
||||
* <p>If we ever integrate this with the Sun JDK, the resource bundle
|
||||
* root will change to java.text.resources.LocaleElements
|
||||
*/
|
||||
private static final String RB_LOCALE_ELEMENTS =
|
||||
"com.ibm.text.resources.LocaleElements";
|
||||
|
||||
/**
|
||||
* Prefix for resource bundle containing RuleBasedTransliterator
|
||||
* RB_RULE string. The ID is munged to remove the first '-' then appended
|
||||
* to this String to obtain the class name.
|
||||
*/
|
||||
private static final String RB_RULE_BASED_PREFIX =
|
||||
"com.ibm.text.resources.TransliterationRule";
|
||||
|
||||
/**
|
||||
* Resource bundle key for the RuleBasedTransliterator rule.
|
||||
*/
|
||||
private static final String RB_RULE = "Rule";
|
||||
|
||||
private static final String COPYRIGHT =
|
||||
"\u00A9 IBM Corporation 1999. All rights reserved.";
|
||||
|
||||
/**
|
||||
* Default constructor.
|
||||
* @param ID the string identifier for this transliterator
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
*/
|
||||
protected Transliterator(String ID, UnicodeFilter filter) {
    // An ID is mandatory; a filter is optional and may be null.
    if (null == ID) {
        throw new NullPointerException();
    }
    this.filter = filter;
    this.ID = ID;
}
|
||||
|
||||
/**
|
||||
* Transliterates the segment of a string that begins at the
|
||||
* character at offset <code>start</code> and extends to the
|
||||
* character at offset <code>limit - 1</code>, with optional
|
||||
* filtering. A default implementation is provided here;
|
||||
* subclasses should provide a more efficient implementation if
|
||||
* possible.
|
||||
* @param text the string to be transliterated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param result buffer to receive the transliterated text; previous
|
||||
* contents are discarded
|
||||
*/
|
||||
public void transliterate(String text, int start, int limit,
|
||||
StringBuffer result) {
|
||||
/* This is a default implementation that should be replaced by
|
||||
* a more efficient subclass implementation if possible.
|
||||
*/
|
||||
result.setLength(0);
|
||||
result.append(text.substring(start, limit));
|
||||
transliterate(new ReplaceableString(result),
|
||||
0, result.length());
|
||||
}
|
||||
|
||||
/**
|
||||
* Transliterates a segment of a string, with optional filtering.
|
||||
* Subclasses must override this abstract method.
|
||||
*
|
||||
* @param text the string to be transliterated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @param filter the filter. Any character for which
|
||||
* <tt>filter.isIn()</tt> returns <tt>false</tt> will not be
|
||||
* altered by this transliterator. If <tt>filter</tt> is
|
||||
* <tt>null</tt> then no filtering is applied.
|
||||
* @return The new limit index. The text previously occupying <code>[start,
|
||||
* limit)</code> has been transliterated, possibly to a string of a different
|
||||
* length, at <code>[start, </code><em>new-limit</em><code>)</code>, where
|
||||
* <em>new-limit</em> is the return value.
|
||||
*/
|
||||
public abstract int transliterate(Replaceable text, int start, int limit);
|
||||
|
||||
/**
|
||||
* Transliterates an entire string. Convenience method.
|
||||
* @param text the string to be transliterated
|
||||
* @param result buffer to receive the transliterated text; previous
|
||||
* contents are discarded
|
||||
*/
|
||||
public final void transliterate(String text, StringBuffer result) {
|
||||
transliterate(text, 0, text.length(), result);
|
||||
}
|
||||
|
||||
/**
|
||||
* Transliterate an entire string and returns the result. Convenience method.
|
||||
*
|
||||
* @param text the string to be transliterated
|
||||
* @return The transliterated text
|
||||
*/
|
||||
public final String transliterate(String text) {
|
||||
StringBuffer result = new StringBuffer();
|
||||
transliterate(text, 0, text.length(), result);
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Transliterates an entire string in place. Convenience method.
|
||||
* @param text the string to be transliterated
|
||||
*/
|
||||
public final void transliterate(Replaceable text) {
|
||||
transliterate(text, 0, text.length());
|
||||
}
|
||||
|
||||
/**
|
||||
* Transliterates the portion of the text buffer that can be
|
||||
* transliterated unambiguously after new text has been inserted,
|
||||
* typically as a result of a keyboard event. The new text in
|
||||
* <code>insertion</code> will be inserted into <code>text</code>
|
||||
* at <code>index[LIMIT]</code>, advancing
|
||||
* <code>index[LIMIT]</code> by <code>insertion.length()</code>.
|
||||
* Then the transliterator will try to transliterate characters of
|
||||
* <code>text</code> between <code>index[CURSOR]</code> and
|
||||
* <code>index[LIMIT]</code>. Characters before
|
||||
* <code>index[CURSOR]</code> will not be changed.
|
||||
*
|
||||
* <p>Upon return, values in <code>index[]</code> will be updated.
|
||||
* <code>index[START]</code> will be advanced to the first
|
||||
* character that future calls to this method will read.
|
||||
* <code>index[CURSOR]</code> and <code>index[LIMIT]</code> will
|
||||
* be adjusted to delimit the range of text that future calls to
|
||||
* this method may change.
|
||||
*
|
||||
* <p>Typical usage of this method begins with an initial call
|
||||
* with <code>index[START]</code> and <code>index[LIMIT]</code>
|
||||
* set to indicate the portion of <code>text</code> to be
|
||||
* transliterated, and <code>index[CURSOR] == index[START]</code>.
|
||||
* Thereafter, <code>index[]</code> can be used without
|
||||
* modification in future calls, provided that all changes to
|
||||
* <code>text</code> are made via this method.
|
||||
*
|
||||
* <p>This method assumes that future calls may be made that will
|
||||
* insert new text into the buffer. As a result, it only performs
|
||||
* unambiguous transliterations. After the last call to this
|
||||
* method, there may be untransliterated text that is waiting for
|
||||
* more input to resolve an ambiguity. In order to perform these
|
||||
* pending transliterations, clients should call {@link
|
||||
* #finishKeyboardTransliteration} after the last call to this
|
||||
* method has been made.
|
||||
*
|
||||
* @param text the buffer holding transliterated and untransliterated text
|
||||
* @param index an array of three integers.
|
||||
*
|
||||
* <ul><li><code>index[START]</code>: the beginning index,
|
||||
* inclusive; <code>0 <= index[START] <= index[LIMIT]</code>.
|
||||
*
|
||||
* <li><code>index[LIMIT]</code>: the ending index, exclusive;
|
||||
* <code>index[START] <= index[LIMIT] <= text.length()</code>.
|
||||
* <code>insertion</code> is inserted at
|
||||
* <code>index[LIMIT]</code>.
|
||||
*
|
||||
* <li><code>index[CURSOR]</code>: the next character to be
|
||||
* considered for transliteration; <code>index[START] <=
|
||||
* index[CURSOR] <= index[LIMIT]</code>. Characters before
|
||||
* <code>index[CURSOR]</code> will not be changed by future calls
|
||||
* to this method.</ul>
|
||||
*
|
||||
* @param insertion text to be inserted and possibly
|
||||
* transliterated into the translation buffer at
|
||||
* <code>index[LIMIT]</code>. If <code>null</code> then no text
|
||||
* is inserted.
|
||||
* @see #START
|
||||
* @see #LIMIT
|
||||
* @see #CURSOR
|
||||
* @see #handleKeyboardTransliterate
|
||||
* @exception IllegalArgumentException if <code>index[]</code>
|
||||
* is invalid
|
||||
*/
|
||||
public final void keyboardTransliterate(Replaceable text, int[] index,
|
||||
String insertion) {
|
||||
if (index.length < 3 ||
|
||||
index[START] < 0 ||
|
||||
index[LIMIT] > text.length() ||
|
||||
index[CURSOR] < index[START] ||
|
||||
index[CURSOR] > index[LIMIT]) {
|
||||
throw new IllegalArgumentException("Invalid index array");
|
||||
}
|
||||
|
||||
int originalStart = index[START];
|
||||
if (insertion != null) {
|
||||
text.replace(index[LIMIT], index[LIMIT], insertion);
|
||||
index[LIMIT] += insertion.length();
|
||||
}
|
||||
|
||||
handleKeyboardTransliterate(text, index);
|
||||
|
||||
index[START] = Math.max(index[CURSOR] - getMaximumContextLength(),
|
||||
originalStart);
|
||||
}
|
||||
|
||||
/**
|
||||
* Transliterates the portion of the text buffer that can be
|
||||
* transliterated unambiguously after a new character has been
|
||||
* inserted, typically as a result of a keyboard event. This is a
|
||||
* convenience method; see {@link
|
||||
* #keyboardTransliterate(Replaceable, int[], String)} for details.
|
||||
* @param text the buffer holding transliterated and
|
||||
* untransliterated text
|
||||
* @param index an array of three integers. See {@link
|
||||
* #keyboardTransliterate(Replaceable, int[], String)}.
|
||||
* @param insertion text to be inserted and possibly
|
||||
* transliterated into the translation buffer at
|
||||
* <code>index[LIMIT]</code>.
|
||||
* @see #keyboardTransliterate(Replaceable, int[], String)
|
||||
*/
|
||||
public final void keyboardTransliterate(Replaceable text, int[] index,
                                        char insertion) {
    // Convenience overload: wrap the single character in a String and
    // delegate to the (Replaceable, int[], String) variant.
    keyboardTransliterate(text, index, String.valueOf(insertion));
}
|
||||
|
||||
/**
|
||||
* Transliterates the portion of the text buffer that can be
|
||||
* transliterated unambiguously. This is a convenience method; see
|
||||
* {@link #keyboardTransliterate(Replaceable, int[], String)} for
|
||||
* details.
|
||||
* @param text the buffer holding transliterated and
|
||||
* untransliterated text
|
||||
* @param index an array of three integers. See {@link
|
||||
* #keyboardTransliterate(Replaceable, int[], String)}.
|
||||
* @see #keyboardTransliterate(Replaceable, int[], String)
|
||||
*/
|
||||
public final void keyboardTransliterate(Replaceable text, int[] index) {
    // A null insertion makes the String overload skip the insert step and
    // only transliterate what is already in the buffer.
    keyboardTransliterate(text, index, null);
}
|
||||
|
||||
/**
|
||||
* Finishes any pending transliterations that were waiting for
|
||||
* more characters. Clients should call this method as the last
|
||||
* call after a sequence of one or more calls to
|
||||
* <code>keyboardTransliterate()</code>.
|
||||
* @param text the buffer holding transliterated and
|
||||
* untransliterated text.
|
||||
* @param index the array of indices previously passed to {@link
|
||||
* #keyboardTransliterate}
|
||||
*/
|
||||
public final void finishKeyboardTransliteration(Replaceable text,
                                                int[] index) {
    // Run an ordinary (non-incremental) transliteration over START..LIMIT.
    // NOTE(review): the value returned by transliterate() is discarded and
    // index[] is not updated here -- confirm that callers do not rely on
    // index[LIMIT] or index[CURSOR] after this call.
    transliterate(text, index[START], index[LIMIT]);
}
|
||||
|
||||
/**
|
||||
* Abstract method that concrete subclasses define to implement
|
||||
* keyboard transliteration. This method should transliterate all
|
||||
* characters between <code>index[CURSOR]</code> and
|
||||
* <code>index[LIMIT]</code> that can be unambiguously
|
||||
* transliterated, regardless of future insertions of text at
|
||||
* <code>index[LIMIT]</code>. <code>index[CURSOR]</code> should
|
||||
* be advanced past committed characters (those that will not
|
||||
* change in future calls to this method).
|
||||
* <code>index[LIMIT]</code> should be updated to reflect text
|
||||
* replacements that shorten or lengthen the text between
|
||||
* <code>index[CURSOR]</code> and <code>index[LIMIT]</code>. Upon
|
||||
* return, neither <code>index[CURSOR]</code> nor
|
||||
* <code>index[LIMIT]</code> should be less than the initial value
|
||||
* of <code>index[CURSOR]</code>. <code>index[START]</code>
|
||||
* should <em>not</em> be changed.
|
||||
*
|
||||
* @param text the buffer holding transliterated and
|
||||
* untransliterated text
|
||||
* @param index an array of three integers. See {@link
|
||||
* #keyboardTransliterate(Replaceable, int[], String)}.
|
||||
* @see #keyboardTransliterate
|
||||
*/
|
||||
// Subclass hook: keyboardTransliterate(Replaceable, int[], String) calls this
// after validating index[] and applying any insertion.
protected abstract void handleKeyboardTransliterate(Replaceable text,
                                                    int[] index);
|
||||
|
||||
/**
|
||||
* Returns the length of the longest context required by this transliterator.
|
||||
* This is <em>preceding</em> context. The default implementation supplied
|
||||
* by <code>Transliterator</code> returns zero; subclasses
|
||||
* that use preceding context should override this method to return the
|
||||
* correct value. For example, if a transliterator translates "ddd" (where
|
||||
* d is any digit) to "555" when preceded by "(ddd)", then the preceding
|
||||
* context length is 5, the length of "(ddd)".
|
||||
*
|
||||
* @return The maximum number of preceding context characters this
|
||||
* transliterator needs to examine
|
||||
*/
|
||||
protected int getMaximumContextLength() {
    // Default: no preceding context. keyboardTransliterate() subtracts this
    // value from index[CURSOR] when it recomputes index[START].
    return 0;
}
|
||||
|
||||
/**
|
||||
* Returns a programmatic identifier for this transliterator.
|
||||
* If this identifier is passed to <code>getInstance()</code>, it
|
||||
* will return this object, if it has been registered.
|
||||
* @see #registerInstance
|
||||
* @see #registerClass
|
||||
* @see #getAvailableIDs
|
||||
*/
|
||||
public final String getID() {
    // ID is a field declared outside this view; presumably it is the value
    // passed to the Transliterator constructor -- confirm.
    return ID;
}
|
||||
|
||||
/**
|
||||
* Returns a name for this transliterator that is appropriate for
|
||||
* display to the user in the default locale. See {@link
|
||||
* #getDisplayName(Locale)} for details.
|
||||
*/
|
||||
public final String getDisplayName() {
    // Delegate to the Locale overload using the JVM's current default locale.
    return getDisplayName(Locale.getDefault());
}
|
||||
|
||||
/**
|
||||
* Returns a name for this transliterator that is appropriate for
|
||||
* display to the user in the given locale. This name is taken
|
||||
* from the locale resource data in the standard manner of the
|
||||
* <code>java.text</code> package.
|
||||
*
|
||||
* <p>If no localized names exist in the system resource bundles,
|
||||
* a name is synthesized using a localized
|
||||
* <code>MessageFormat</code> pattern from the resource data. The
|
||||
* arguments to this pattern are an integer followed by one or two
|
||||
* strings. The integer is the number of strings, either 1 or 2.
|
||||
* The strings are formed by splitting the ID for this
|
||||
* transliterator at the first '-'. If there is no '-', then the
|
||||
* entire ID forms the only string.
|
||||
* @param inLocale the Locale in which the display name should be
|
||||
* localized.
|
||||
* @see java.text.MessageFormat
|
||||
*/
|
||||
public String getDisplayName(Locale inLocale) {
|
||||
ResourceBundle bundle = ResourceBundle.getBundle(
|
||||
RB_LOCALE_ELEMENTS, inLocale);
|
||||
|
||||
try {
|
||||
return bundle.getString(RB_DISPLAY_NAME_PREFIX + ID);
|
||||
} catch (MissingResourceException e) {}
|
||||
|
||||
try {
|
||||
// Construct the formatter first; if getString() fails
|
||||
// we'll exit the try block
|
||||
MessageFormat format = new MessageFormat(
|
||||
bundle.getString(RB_DISPLAY_NAME_PATTERN));
|
||||
// Construct the argument array
|
||||
int i = ID.indexOf('-');
|
||||
Object[] args = (i < 0)
|
||||
? new Object[] { new Integer(1), ID }
|
||||
: new Object[] { new Integer(2), ID.substring(0, i),
|
||||
ID.substring(i+1) };
|
||||
// Format it using the pattern in the resource
|
||||
return format.format(args);
|
||||
} catch (MissingResourceException e2) {}
|
||||
|
||||
// We should not reach this point unless there is something
|
||||
// wrong with the build or the RB_DISPLAY_NAME_PATTERN has
|
||||
// been deleted from the root RB_LOCALE_ELEMENTS resource.
|
||||
throw new RuntimeException();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the filter used by this transliterator, or <tt>null</tt>
|
||||
* if this transliterator uses no filter.
|
||||
*/
|
||||
public UnicodeFilter getFilter() {
    // May be null, meaning no filtering is applied.
    return filter;
}
|
||||
|
||||
/**
|
||||
* Changes the filter used by this transliterator. If the filter
|
||||
* is set to <tt>null</tt> then no filtering will occur.
|
||||
*
|
||||
* <p>Callers must take care if a transliterator is in use by
|
||||
* multiple threads. The filter should not be changed by one
|
||||
* thread while another thread may be transliterating.
|
||||
*/
|
||||
public void setFilter(UnicodeFilter filter) {
    // Plain field write with no synchronization; see the threading caveat
    // in the method documentation.
    this.filter = filter;
}
|
||||
|
||||
/**
|
||||
* Returns this transliterator's inverse. See the class
|
||||
* documentation for details. This implementation simply inverts
|
||||
* the two entities in the ID and attempts to retrieve the
|
||||
* resulting transliterator. That is, if <code>getID()</code>
|
||||
* returns "A-B", then this method will return the result of
|
||||
* <code>getInstance("B-A")</code>, or <code>null</code> if that
|
||||
* call fails.
|
||||
*
|
||||
* <p>This method does not take filtering into account. The
|
||||
* returned transliterator will have no filter.
|
||||
*
|
||||
* <p>Subclasses with knowledge of their inverse may wish to
|
||||
* override this method.
|
||||
*
|
||||
* @return a transliterator that is an inverse, not necessarily
|
||||
* exact, of this transliterator, or <code>null</code> if no such
|
||||
* transliterator is registered.
|
||||
* @see #registerInstance
|
||||
*/
|
||||
public Transliterator getInverse() {
|
||||
int i = ID.indexOf('-');
|
||||
if (i >= 0) {
|
||||
String inverseID = ID.substring(i+1) + '-' + ID.substring(0, i);
|
||||
return internalGetInstance(inverseID);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a <code>Transliterator</code> object given its ID.
|
||||
* The ID must be either a system transliterator ID or an ID registered
|
||||
* using <code>registerInstance()</code>.
|
||||
*
|
||||
* @param ID a valid ID, as enumerated by <code>getAvailableIDs()</code>
|
||||
* @return A <code>Transliterator</code> object with the given ID
|
||||
* @exception IllegalArgumentException if the given ID is invalid.
|
||||
* @see #registerInstance
|
||||
* @see #getAvailableIDs
|
||||
* @see #getID
|
||||
*/
|
||||
public static Transliterator getInstance(String ID) {
|
||||
Transliterator t = internalGetInstance(ID);
|
||||
if (t != null) {
|
||||
return t;
|
||||
}
|
||||
throw new IllegalArgumentException("Unsupported transliterator: "
|
||||
+ ID);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a transliterator object given its ID. Unlike getInstance(),
|
||||
* this method returns null if it cannot make use of the given ID.
|
||||
*/
|
||||
private static Transliterator internalGetInstance(String ID) {
    // The cache entry decides how the transliterator is produced:
    //   - RuleBasedTransliterator.Data: rules already parsed, reuse them;
    //   - Class: instantiate via its no-argument constructor;
    //   - anything else (a placeholder, or null when the ID is not
    //     registered): try to load and parse rules from a resource bundle.
    Object obj = cache.get(ID);
    RuleBasedTransliterator.Data data = null;

    if (obj instanceof RuleBasedTransliterator.Data) {
        data = (RuleBasedTransliterator.Data) obj;
        // Fall through to construct transliterator from cached Data object.
    } else if (obj instanceof Class) {
        try {
            // NOTE(review): a registered class that is not a Transliterator
            // subclass would fail this cast with ClassCastException.
            return (Transliterator) ((Class) obj).newInstance();
        } catch (InstantiationException e) {
            // Ignored: data stays null, so the method returns null below.
        } catch (IllegalAccessException e2) {}
    } else {
        synchronized (cache) {
            // Only the reverse placeholder selects REVERSE; every other
            // value, including null, is treated as forward.
            boolean isReverse = (obj == REVERSE_RULE_BASED_PLACEHOLDER);
            // Bundle name = prefix + the two halves of the ID with the
            // '-' removed; for a reverse entry the halves are swapped.
            String resourceName = RB_RULE_BASED_PREFIX;
            int i = ID.indexOf('-');
            if (i < 0) {
                resourceName += ID;
            } else {
                String IDLeft = ID.substring(0, i);
                String IDRight = ID.substring(i+1);
                resourceName += isReverse ? (IDRight + IDLeft)
                                          : (IDLeft + IDRight);
            }
            try {
                ResourceBundle resource = ResourceBundle.getBundle(resourceName);

                data = RuleBasedTransliterator.parse(resource.getString(RB_RULE),
                                                     isReverse
                                                     ? RuleBasedTransliterator.REVERSE
                                                     : RuleBasedTransliterator.FORWARD);

                // Replace the cache entry with the parsed data so later
                // lookups take the first branch above.
                cache.put(ID, data);
                // Fall through to construct transliterator from Data object.
            } catch (MissingResourceException e) {}
            // Ignored: a missing bundle or rule leaves data null, so the
            // method returns null.
        }
    }

    if (data != null) {
        // A new transliterator is built on every call; only the Data
        // object is cached.
        return new RuleBasedTransliterator(ID, data, null);
    }

    return null;
}
|
||||
|
||||
/**
|
||||
* Registers a subclass of <code>Transliterator</code> with the
|
||||
* system. This subclass must have a public constructor taking no
|
||||
* arguments. When that constructor is called, the resulting
|
||||
* object must return the <code>ID</code> passed to this method if
|
||||
* its <code>getID()</code> method is called.
|
||||
*
|
||||
* @param ID the result of <code>getID()</code> for this
|
||||
* transliterator
|
||||
* @param transClass a subclass of <code>Transliterator</code>
|
||||
* @see #registerInstance
|
||||
* @see #unregister
|
||||
*/
|
||||
public static void registerClass(String ID, Class transClass) {
    // Stores the Class object itself; internalGetInstance() instantiates it
    // on demand. transClass is not checked here to be a Transliterator
    // subclass.
    cache.put(ID, transClass);
}
|
||||
|
||||
/**
|
||||
* Unregisters a transliterator or class. This may be either
|
||||
* a system transliterator or a user transliterator or class.
|
||||
*
|
||||
* @param ID the ID of the transliterator or class
|
||||
* @return the <code>Object</code> that was registered with
|
||||
* <code>ID</code>, or <code>null</code> if none was
|
||||
* @see #registerInstance
|
||||
* @see #registerClass
|
||||
*/
|
||||
public static Object unregister(String ID) {
    // Returns whatever the cache held for ID. Besides a Class this may be a
    // parsed RuleBasedTransliterator.Data object or one of the rule-based
    // placeholder objects.
    return cache.remove(ID);
}
|
||||
|
||||
/**
|
||||
* Returns an enumeration over the programmatic names of registered
|
||||
* <code>Transliterator</code> objects. This includes both system
|
||||
* transliterators and user transliterators registered using
|
||||
* <code>registerInstance()</code>. The enumerated names may be
|
||||
* passed to <code>getInstance()</code>.
|
||||
*
|
||||
* @return An <code>Enumeration</code> over <code>String</code> objects
|
||||
* @see #getInstance
|
||||
* @see #registerInstance
|
||||
*/
|
||||
public static final Enumeration getAvailableIDs() {
    // Enumerates the cache keys directly, so IDs whose rules have not been
    // loaded yet (placeholder entries) are included.
    return cache.keys();
}
|
||||
|
||||
static {
|
||||
ResourceBundle bundle = ResourceBundle.getBundle(RB_LOCALE_ELEMENTS);
|
||||
|
||||
try {
|
||||
String[] ruleBasedIDs = bundle.getStringArray(RB_RULE_BASED_IDS);
|
||||
|
||||
cache = new Hashtable();
|
||||
|
||||
for (int i=0; i<ruleBasedIDs.length; ++i) {
|
||||
String ID = ruleBasedIDs[i];
|
||||
boolean isReverse = (ID.charAt(0) == '*');
|
||||
if (isReverse) {
|
||||
ID = ID.substring(1);
|
||||
}
|
||||
cache.put(ID, isReverse ? REVERSE_RULE_BASED_PLACEHOLDER
|
||||
: RULE_BASED_PLACEHOLDER);
|
||||
}
|
||||
} catch (MissingResourceException e) {}
|
||||
|
||||
cache.put(HexToUnicodeTransliterator._ID,
|
||||
HexToUnicodeTransliterator.class);
|
||||
cache.put(UnicodeToHexTransliterator._ID,
|
||||
UnicodeToHexTransliterator.class);
|
||||
}
|
||||
}
|
22
icu4j/src/com/ibm/text/UnicodeFilter.java
Executable file
22
icu4j/src/com/ibm/text/UnicodeFilter.java
Executable file
|
@ -0,0 +1,22 @@
|
|||
package com.ibm.text;
|
||||
|
||||
/**
|
||||
* <code>UnicodeFilter</code> defines a protocol for selecting a
|
||||
* subset of the full range (U+0000 to U+FFFF) of Unicode characters.
|
||||
* Currently, filters are used in conjunction with classes like {@link
|
||||
* Transliterator} to only process selected characters through a
|
||||
* transformation.
|
||||
*
|
||||
* {@link UnicodeFilterLogic}
|
||||
*/
|
||||
|
||||
public interface UnicodeFilter {

    /**
     * Tests whether a character belongs to the subset selected by this
     * filter.  A result of <tt>true</tt> means the character is in the
     * subset; a character that is <b>to be filtered</b> yields
     * <b><tt>false</tt></b>.
     *
     * @param c the character to test
     * @return <tt>true</tt> if <tt>c</tt> is in the selected subset
     */
    boolean isIn(char c);
}
|
112
icu4j/src/com/ibm/text/UnicodeFilterLogic.java
Executable file
112
icu4j/src/com/ibm/text/UnicodeFilterLogic.java
Executable file
|
@ -0,0 +1,112 @@
|
|||
package com.ibm.text;
|
||||
|
||||
/**
|
||||
* <code>UnicodeFilterLogic</code> provides logical operators on
|
||||
* {@link UnicodeFilter} objects. This class cannot be instantiated;
|
||||
* it consists only of static methods. The static methods return
|
||||
* filter objects that perform logical inversion (<tt>not</tt>),
|
||||
* intersection (<tt>and</tt>), or union (<tt>or</tt>) of the given
|
||||
* filter objects.
|
||||
*/
|
||||
public final class UnicodeFilterLogic {
|
||||
|
||||
/**
|
||||
* Returns a <tt>UnicodeFilter</tt> that implements the inverse of
|
||||
* the given filter.
|
||||
*/
|
||||
public static UnicodeFilter not(final UnicodeFilter f) {
|
||||
return new UnicodeFilter() {
|
||||
public boolean isIn(char c) {
|
||||
return !f.isIn(c);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a <tt>UnicodeFilter</tt> that implements a short
|
||||
* circuit AND of the result of the two given filters. That is,
|
||||
* if <tt>f.isIn()</tt> is <tt>false</tt>, then <tt>g.isIn()</tt>
|
||||
* is not called, and <tt>isIn()</tt> returns <tt>false</tt>.
|
||||
*
|
||||
* <p>Either <tt>f</tt> or <tt>g</tt> must be non-null.
|
||||
*/
|
||||
public static UnicodeFilter and(final UnicodeFilter f,
|
||||
final UnicodeFilter g) {
|
||||
if (f == null) {
|
||||
return g;
|
||||
}
|
||||
if (g == null) {
|
||||
return f;
|
||||
}
|
||||
return new UnicodeFilter() {
|
||||
public boolean isIn(char c) {
|
||||
return f.isIn(c) && g.isIn(c);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a <tt>UnicodeFilter</tt> that implements a short
|
||||
* circuit AND of the result of the given filters. That is, if
|
||||
* <tt>f[i].isIn()</tt> is <tt>false</tt>, then
|
||||
* <tt>f[j].isIn()</tt> is not called, where <tt>j > i</tt>, and
|
||||
* <tt>isIn()</tt> returns <tt>false</tt>.
|
||||
*/
|
||||
public static UnicodeFilter and(final UnicodeFilter[] f) {
|
||||
return new UnicodeFilter() {
|
||||
public boolean isIn(char c) {
|
||||
for (int i=0; i<f.length; ++i) {
|
||||
if (!f[i].isIn(c)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a <tt>UnicodeFilter</tt> that implements a short
|
||||
* circuit OR of the result of the two given filters. That is, if
|
||||
* <tt>f.isIn()</tt> is <tt>true</tt>, then <tt>g.isIn()</tt> is
|
||||
* not called, and <tt>isIn()</tt> returns <tt>true</tt>.
|
||||
*
|
||||
* <p>Either <tt>f</tt> or <tt>g</tt> must be non-null.
|
||||
*/
|
||||
public static UnicodeFilter or(final UnicodeFilter f,
|
||||
final UnicodeFilter g) {
|
||||
if (f == null) {
|
||||
return g;
|
||||
}
|
||||
if (g == null) {
|
||||
return f;
|
||||
}
|
||||
return new UnicodeFilter() {
|
||||
public boolean isIn(char c) {
|
||||
return f.isIn(c) || g.isIn(c);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a <tt>UnicodeFilter</tt> that implements a short
|
||||
* circuit OR of the result of the given filters. That is, if
|
||||
* <tt>f[i].isIn()</tt> is <tt>false</tt>, then
|
||||
* <tt>f[j].isIn()</tt> is not called, where <tt>j > i</tt>, and
|
||||
* <tt>isIn()</tt> returns <tt>true</tt>.
|
||||
*/
|
||||
public static UnicodeFilter or(final UnicodeFilter[] f) {
|
||||
return new UnicodeFilter() {
|
||||
public boolean isIn(char c) {
|
||||
for (int i=0; i<f.length; ++i) {
|
||||
if (f[i].isIn(c)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// TODO: Add nand() & nor() for convenience, if needed.
|
||||
}
|
1354
icu4j/src/com/ibm/text/UnicodeSet.java
Executable file
1354
icu4j/src/com/ibm/text/UnicodeSet.java
Executable file
File diff suppressed because it is too large
Load diff
172
icu4j/src/com/ibm/text/UnicodeToHexTransliterator.java
Executable file
172
icu4j/src/com/ibm/text/UnicodeToHexTransliterator.java
Executable file
|
@ -0,0 +1,172 @@
|
|||
package com.ibm.text;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* A transliterator that converts from Unicode characters to
|
||||
* hexadecimal Unicode escape sequences. It outputs a
|
||||
* prefix specified in the constructor and optionally converts the hex
|
||||
* digits to uppercase.
|
||||
*
|
||||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: UnicodeToHexTransliterator.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
|
||||
*/
|
||||
public class UnicodeToHexTransliterator extends Transliterator {
|
||||
|
||||
/**
|
||||
* Package accessible ID for this transliterator.
|
||||
*/
|
||||
static String _ID = "Unicode-Hex";
|
||||
|
||||
private String prefix;
|
||||
|
||||
private boolean uppercase;
|
||||
|
||||
private static final String COPYRIGHT =
|
||||
"\u00A9 IBM Corporation 1999. All rights reserved.";
|
||||
|
||||
/**
|
||||
* Constructs a transliterator.
|
||||
* @param prefix the string that will precede the four hex
|
||||
* digits for UNICODE_HEX transliterators. Ignored
|
||||
* if direction is HEX_UNICODE.
|
||||
* @param uppercase if true, the four hex digits will be
|
||||
* converted to uppercase; otherwise they will be lowercase.
|
||||
* Ignored if direction is HEX_UNICODE.
|
||||
*/
|
||||
public UnicodeToHexTransliterator(String prefix, boolean uppercase,
|
||||
UnicodeFilter filter) {
|
||||
super(_ID, filter);
|
||||
this.prefix = prefix;
|
||||
this.uppercase = uppercase;
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a transliterator with the default prefix "\u"
|
||||
* that outputs uppercase hex digits.
|
||||
*/
|
||||
public UnicodeToHexTransliterator() {
|
||||
this("\\u", true, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the string that precedes the four hex digits.
|
||||
* @return prefix string
|
||||
*/
|
||||
public String getPrefix() {
|
||||
return prefix;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the string that precedes the four hex digits.
|
||||
*
|
||||
* <p>Callers must take care if a transliterator is in use by
|
||||
* multiple threads. The prefix should not be changed by one
|
||||
* thread while another thread may be transliterating.
|
||||
* @param prefix prefix string
|
||||
*/
|
||||
public void setPrefix(String prefix) {
|
||||
this.prefix = prefix;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if this transliterator outputs uppercase hex digits.
|
||||
*/
|
||||
public boolean isUppercase() {
|
||||
return uppercase;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets if this transliterator outputs uppercase hex digits.
|
||||
*
|
||||
* <p>Callers must take care if a transliterator is in use by
|
||||
* multiple threads. The uppercase mode should not be changed by
|
||||
* one thread while another thread may be transliterating.
|
||||
* @param outputUppercase if true, then this transliterator
|
||||
* outputs uppercase hex digits.
|
||||
*/
|
||||
public void setUppercase(boolean outputUppercase) {
|
||||
uppercase = outputUppercase;
|
||||
}
|
||||
|
||||
/**
|
||||
* Transliterates a segment of a string. <code>Transliterator</code> API.
|
||||
* @param text the string to be transliterated
|
||||
* @param start the beginning index, inclusive; <code>0 <= start
|
||||
* <= limit</code>.
|
||||
* @param limit the ending index, exclusive; <code>start <= limit
|
||||
* <= text.length()</code>.
|
||||
* @return the new limit index
|
||||
*/
|
||||
public int transliterate(Replaceable text, int start, int limit) {
|
||||
int[] offsets = { start, limit, start };
|
||||
handleKeyboardTransliterate(text, offsets);
|
||||
return offsets[LIMIT];
|
||||
}
|
||||
|
||||
/**
|
||||
* Implements {@link Transliterator#handleKeyboardTransliterate}.
|
||||
*/
|
||||
protected void handleKeyboardTransliterate(Replaceable text,
|
||||
int[] offsets) {
|
||||
/**
|
||||
* Performs transliteration changing all characters to
|
||||
* Unicode hexadecimal escapes. For example, '@' -> "U+0040",
|
||||
* assuming the prefix is "U+".
|
||||
*/
|
||||
int cursor = offsets[CURSOR];
|
||||
int limit = offsets[LIMIT];
|
||||
|
||||
UnicodeFilter filter = getFilter();
|
||||
|
||||
loop:
|
||||
while (cursor < limit) {
|
||||
char c = text.charAt(cursor);
|
||||
if (filter != null && !filter.isIn(c)) {
|
||||
++cursor;
|
||||
continue;
|
||||
}
|
||||
String hex = hex(c);
|
||||
text.replace(cursor, cursor+1, hex);
|
||||
int len = hex.length();
|
||||
cursor += len; // Advance cursor by 1 and adjust for new text
|
||||
--len;
|
||||
limit += len;
|
||||
}
|
||||
|
||||
offsets[LIMIT] = limit;
|
||||
offsets[CURSOR] = cursor;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the length of the longest context required by this transliterator.
|
||||
* This is <em>preceding</em> context.
|
||||
* @param direction either <code>FORWARD</code> or <code>REVERSE</code>
|
||||
* @return maximum number of preceding context characters this
|
||||
* transliterator needs to examine
|
||||
*/
|
||||
protected int getMaximumContextLength() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Form escape sequence.
|
||||
*/
|
||||
private final String hex(char c) {
|
||||
StringBuffer buf = new StringBuffer();
|
||||
buf.append(prefix);
|
||||
if (c < 0x1000) {
|
||||
buf.append('0');
|
||||
if (c < 0x100) {
|
||||
buf.append('0');
|
||||
if (c < 0x10) {
|
||||
buf.append('0');
|
||||
}
|
||||
}
|
||||
}
|
||||
String h = Integer.toHexString(c);
|
||||
buf.append(uppercase ? h.toUpperCase() : h);
|
||||
return buf.toString();
|
||||
}
|
||||
}
|
126
icu4j/src/com/ibm/text/components/AppletFrame.java
Executable file
126
icu4j/src/com/ibm/text/components/AppletFrame.java
Executable file
|
@ -0,0 +1,126 @@
|
|||
package com.ibm.text.components;
|
||||
import java.applet.*;
|
||||
import java.net.URL;
|
||||
import java.util.Enumeration;
|
||||
import java.awt.*;
|
||||
import java.awt.event.*;
|
||||
|
||||
/**
|
||||
* <p>A Frame that runs an Applet within itself, making it possible
|
||||
* for an applet to run as an application. Usage:
|
||||
*
|
||||
* <pre>
|
||||
* public class MyApplet extends Applet {
|
||||
* public static void main(String args[]) {
|
||||
* MyApplet applet = new MyApplet();
|
||||
* new AppletFrame("My Applet Running As An App", applet, 640, 480);
|
||||
* }
|
||||
* ...
|
||||
* }
|
||||
* </pre>
|
||||
*
|
||||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: AppletFrame.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
|
||||
*/
|
||||
public class AppletFrame extends Frame implements AppletStub, AppletContext {
|
||||
|
||||
Applet applet;
|
||||
|
||||
private static final String COPYRIGHT =
|
||||
"\u00A9 IBM Corporation 1999. All rights reserved.";
|
||||
|
||||
/**
|
||||
* Construct a Frame running the given Applet with the default size
|
||||
* of 640 by 480.
|
||||
* When the Frame is closed, the applet's stop() method is called,
|
||||
* the Frame is dispose()d of, and System.exit(0) is called.
|
||||
*
|
||||
* @param name the Frame title
|
||||
* @param applet the applet to be run
|
||||
*/
|
||||
public AppletFrame(String name, Applet applet) {
|
||||
this(name, applet, 640, 480);
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct a Frame running the given Applet with the given size.
|
||||
* When the Frame is closed, the applet's stop() method is called,
|
||||
* the Frame is dispose()d of, and System.exit(0) is called.
|
||||
*
|
||||
* @param name the Frame title
|
||||
* @param applet the applet to be run
|
||||
* @param width width of the Frame
|
||||
* @param height height of the Frame
|
||||
*/
|
||||
public AppletFrame(String name, Applet applet, int width, int height) {
|
||||
super(name);
|
||||
this.applet = applet;
|
||||
applet.setStub(this);
|
||||
|
||||
resize(width, height);
|
||||
add("Center", applet);
|
||||
show();
|
||||
addWindowListener(new WindowAdapter() {
|
||||
public void windowClosing(WindowEvent e) {
|
||||
AppletFrame.this.applet.stop();
|
||||
dispose();
|
||||
System.exit(0);
|
||||
}
|
||||
});
|
||||
|
||||
applet.init();
|
||||
applet.start();
|
||||
}
|
||||
|
||||
// AppletStub API
|
||||
public void appletResize(int width,
|
||||
int height) {
|
||||
resize(width, height);
|
||||
}
|
||||
|
||||
public AppletContext getAppletContext() {
|
||||
return this;
|
||||
}
|
||||
|
||||
public URL getCodeBase() {
|
||||
return null;
|
||||
}
|
||||
|
||||
public URL getDocumentBase() {
|
||||
return null;
|
||||
}
|
||||
|
||||
public String getParameter(String name) {
|
||||
return "PARAMETER";
|
||||
}
|
||||
|
||||
public boolean isActive() {
|
||||
return true;
|
||||
}
|
||||
|
||||
// AppletContext API
|
||||
public Applet getApplet(String name) {
|
||||
return applet;
|
||||
}
|
||||
|
||||
public Enumeration getApplets() {
|
||||
return null;
|
||||
}
|
||||
|
||||
public AudioClip getAudioClip(URL url) {
|
||||
return null;
|
||||
}
|
||||
|
||||
public Image getImage(URL url) {
|
||||
return null;
|
||||
}
|
||||
|
||||
public void showDocument(URL url) {}
|
||||
public void showDocument(URL url, String target) {}
|
||||
|
||||
public void showStatus(String status) {
|
||||
System.out.println(status);
|
||||
}
|
||||
}
|
708
icu4j/src/com/ibm/text/components/DumbTextComponent.java
Executable file
708
icu4j/src/com/ibm/text/components/DumbTextComponent.java
Executable file
|
@ -0,0 +1,708 @@
|
|||
package com.ibm.text.components;
|
||||
import java.awt.*;
|
||||
import java.awt.event.*;
|
||||
import java.text.*;
|
||||
import java.awt.datatransfer.*;
|
||||
|
||||
// LIU: Changed from final to non-final
|
||||
public class DumbTextComponent extends Canvas
|
||||
implements KeyListener, MouseListener, MouseMotionListener, FocusListener
|
||||
{
|
||||
private transient static final String copyright =
  "Copyright \u00A9 1998, Mark Davis. All Rights Reserved.";
// Enables the System.out trace lines in the event handlers.
private transient static boolean DEBUG = false;

// Non-transient state: the text, the current selection and the editable
// flag. (Selection is declared outside this view.)
private String contents = "";
private Selection selection = new Selection();
private boolean editable = true;

// Scratch selection filled in by select() while the mouse is tracked.
private transient Selection tempSelection = new Selection();
// Set in focusGained(), cleared in focusLost().
private transient boolean focus;
private transient BreakIterator lineBreaker = BreakIterator.getLineInstance();
private transient BreakIterator wordBreaker = BreakIterator.getWordInstance();
private transient BreakIterator charBreaker = BreakIterator.getCharacterInstance();
// Line metrics; presumably taken from the FontMetrics in 'fm' -- confirm
// against the code that assigns them.
private transient int lineAscent;
private transient int lineHeight;
private transient int lineLeading;
private transient int lastHeight = 10;
private transient int lastWidth = 50;
private static final int MAX_LINES = 200; // LIU: Use symbolic name
private transient int[] lineStarts = new int[MAX_LINES]; // LIU
private transient int lineCount = 1;

// Cleared by focusGained()/focusLost() just before they request a repaint.
private transient boolean valid = false;
private transient FontMetrics fm;
private transient boolean redoLines = true;
// Set by mousePressed() when the click count exceeds one.
private transient boolean doubleClick = false;
// Listener chains maintained through AWTEventMulticaster.
private transient TextListener textListener;
private transient ActionListener selectionListener;
private transient Image cacheImage;
private transient Dimension mySize;
private transient int xInset = 5;
private transient int yInset = 5;
private transient Point startPoint = new Point();
private transient Point endPoint = new Point();
private transient Point caretPoint = new Point();
private transient static String clipBoard;

// Octal 015 is the carriage-return character.
private static final char CR = '\015'; // LIU
|
||||
|
||||
// ============================================
|
||||
|
||||
public DumbTextComponent() {
    // The component is its own listener for mouse, mouse-motion, key and
    // focus events.
    addMouseListener(this);
    addMouseMotionListener(this);
    addKeyListener(this);
    addFocusListener(this);
    // Use the text cursor while the pointer is over the component.
    setCursor(Cursor.getPredefinedCursor(Cursor.TEXT_CURSOR));

}
|
||||
|
||||
// ================ Events ====================
|
||||
|
||||
// Always returns true, reporting this component as focus traversable.
public boolean isFocusTraversable() { return true; }
|
||||
|
||||
public void addActionListener(ActionListener l) {
    // Action listeners are kept in the selectionListener multicaster chain.
    selectionListener = AWTEventMulticaster.add(selectionListener, l);
}
|
||||
|
||||
public void removeActionListener(ActionListener l) {
|
||||
selectionListener = AWTEventMulticaster.remove(selectionListener, l);
|
||||
}
|
||||
|
||||
public void addTextListener(TextListener l) {
|
||||
textListener = AWTEventMulticaster.add(textListener, l);
|
||||
}
|
||||
|
||||
public void removeTextListener(TextListener l) {
|
||||
textListener = AWTEventMulticaster.remove(textListener, l);
|
||||
}
|
||||
|
||||
private transient boolean pressed;
|
||||
|
||||
public void mousePressed(MouseEvent e) {
|
||||
if (DEBUG) System.out.println("mousePressed");
|
||||
if (pressed) {
|
||||
select(e,false);
|
||||
} else {
|
||||
doubleClick = e.getClickCount() > 1;
|
||||
requestFocus();
|
||||
select(e, true);
|
||||
pressed = true;
|
||||
}
|
||||
}
|
||||
|
||||
public void mouseDragged(MouseEvent e) {
|
||||
if (DEBUG) System.out.println("mouseDragged");
|
||||
select(e, false);
|
||||
}
|
||||
|
||||
public void mouseReleased(MouseEvent e) {
|
||||
if (DEBUG) System.out.println("mouseReleased");
|
||||
pressed = false;
|
||||
}
|
||||
|
||||
public void mouseEntered(MouseEvent e) {
|
||||
//if (pressed) select(e, false);
|
||||
}
|
||||
|
||||
public void mouseExited(MouseEvent e){
|
||||
//if (pressed) select(e, false);
|
||||
}
|
||||
|
||||
/** No-op; clicks are handled through press and release events. */
public void mouseClicked(MouseEvent e) {
    // Intentionally empty.
}
|
||||
/** No-op; plain mouse movement does not affect the component. */
public void mouseMoved(MouseEvent e) {
    // Intentionally empty.
}
|
||||
|
||||
|
||||
public void focusGained(FocusEvent e) {
|
||||
if (DEBUG) System.out.println("focusGained");
|
||||
focus = true;
|
||||
valid = false;
|
||||
repaint(16);
|
||||
}
|
||||
public void focusLost(FocusEvent e) {
|
||||
if (DEBUG) System.out.println("focusLost");
|
||||
focus = false;
|
||||
valid = false;
|
||||
repaint(16);
|
||||
}
|
||||
|
||||
public void select(MouseEvent e, boolean first) {
|
||||
point2Offset(e.getPoint(), tempSelection);
|
||||
if (first) {
|
||||
if ((e.getModifiers() & InputEvent.SHIFT_MASK) == 0) {
|
||||
tempSelection.anchor = tempSelection.caret;
|
||||
}
|
||||
}
|
||||
// fix words
|
||||
if (doubleClick) {
|
||||
tempSelection.expand(wordBreaker);
|
||||
}
|
||||
select(tempSelection);
|
||||
}
|
||||
|
||||
public void keyPressed(KeyEvent e) {
|
||||
int code = e.getKeyCode();
|
||||
if (DEBUG) System.out.println("keyPressed "
|
||||
+ hex((char)code) + ", " + hex((char)e.getModifiers()));
|
||||
int start = selection.getStart();
|
||||
int end = selection.getEnd();
|
||||
boolean shift = (e.getModifiers() & KeyEvent.SHIFT_MASK) != 0;
|
||||
boolean ctrl = (e.getModifiers() & KeyEvent.CTRL_MASK) != 0;
|
||||
switch (code) {
|
||||
case KeyEvent.VK_Q:
|
||||
if (!ctrl || !editable) break;
|
||||
fixHex();
|
||||
break;
|
||||
case KeyEvent.VK_V:
|
||||
if (!ctrl || !editable) break;
|
||||
insertText(clipBoard);
|
||||
break;
|
||||
case KeyEvent.VK_C:
|
||||
if (!ctrl) break;
|
||||
clipBoard = contents.substring(selection.getStart(), selection.getEnd());
|
||||
break;
|
||||
case KeyEvent.VK_X:
|
||||
if (!ctrl) break;
|
||||
clipBoard = contents.substring(selection.getStart(), selection.getEnd());
|
||||
if (editable) break;
|
||||
insertText("");
|
||||
break;
|
||||
case KeyEvent.VK_A:
|
||||
if (!ctrl) break;
|
||||
select(Integer.MAX_VALUE, 0, false);
|
||||
break;
|
||||
case KeyEvent.VK_RIGHT:
|
||||
tempSelection.set(selection);
|
||||
tempSelection.nextBound(ctrl ? wordBreaker : charBreaker, +1, shift);
|
||||
select(tempSelection);
|
||||
break;
|
||||
case KeyEvent.VK_LEFT:
|
||||
tempSelection.set(selection);
|
||||
tempSelection.nextBound(ctrl ? wordBreaker : charBreaker, -1, shift);
|
||||
select(tempSelection);
|
||||
break;
|
||||
case KeyEvent.VK_UP: // LIU: Add support for up arrow
|
||||
tempSelection.set(selection);
|
||||
tempSelection.caret = lineDelta(tempSelection.caret, -1);
|
||||
if (!shift) {
|
||||
tempSelection.anchor = tempSelection.caret;
|
||||
}
|
||||
select(tempSelection);
|
||||
break;
|
||||
case KeyEvent.VK_DOWN: // LIU: Add support for down arrow
|
||||
tempSelection.set(selection);
|
||||
tempSelection.caret = lineDelta(tempSelection.caret, +1);
|
||||
if (!shift) {
|
||||
tempSelection.anchor = tempSelection.caret;
|
||||
}
|
||||
select(tempSelection);
|
||||
break;
|
||||
case KeyEvent.VK_DELETE: // LIU: Add delete key support
|
||||
if (!editable) break;
|
||||
if (contents.length() == 0) break;
|
||||
start = selection.getStart();
|
||||
end = selection.getEnd();
|
||||
if (start == end) {
|
||||
++end;
|
||||
if (end > contents.length()) {
|
||||
getToolkit().beep();
|
||||
return;
|
||||
}
|
||||
}
|
||||
replaceRange("", start, end);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* LIU: Given an offset into contents, moves up or down by lines,
|
||||
* according to lineStarts[].
|
||||
* @param off the offset into contents
|
||||
* @param delta how many lines to move up (< 0) or down (> 0)
|
||||
* @return the new offset into contents
|
||||
*/
|
||||
private int lineDelta(int off, int delta) {
|
||||
int line = findLine(off, false);
|
||||
int posInLine = off - lineStarts[line];
|
||||
// System.out.println("off=" + off + " at " + line + ":" + posInLine);
|
||||
line += delta;
|
||||
if (line < 0) {
|
||||
line = posInLine = 0;
|
||||
} else if (line >= lineCount) {
|
||||
return contents.length();
|
||||
}
|
||||
off = lineStarts[line] + posInLine;
|
||||
if (off >= lineStarts[line+1]) {
|
||||
off = lineStarts[line+1] - 1;
|
||||
}
|
||||
return off;
|
||||
}
|
||||
|
||||
public void keyReleased(KeyEvent e) {
|
||||
int code = e.getKeyCode();
|
||||
if (DEBUG) System.out.println("keyReleased "
|
||||
+ hex((char)code) + ", " + hex((char)e.getModifiers()));
|
||||
}
|
||||
|
||||
public void keyTyped(KeyEvent e) {
|
||||
char ch = e.getKeyChar();
|
||||
if (DEBUG) System.out.println("keyTyped "
|
||||
+ hex((char)ch) + ", " + hex((char)e.getModifiers()));
|
||||
if ((e.getModifiers() & KeyEvent.CTRL_MASK) != 0) return;
|
||||
switch (ch) {
|
||||
case KeyEvent.CHAR_UNDEFINED:
|
||||
break;
|
||||
case KeyEvent.VK_BACK_SPACE:
|
||||
if (!editable) break;
|
||||
if (contents.length() == 0) break;
|
||||
int start = selection.getStart();
|
||||
int end = selection.getEnd();
|
||||
if (start == end) {
|
||||
--start;
|
||||
if (start < 0) {
|
||||
getToolkit().beep(); // LIU: Add audio feedback of NOP
|
||||
return;
|
||||
}
|
||||
}
|
||||
replaceRange("", start, end);
|
||||
break;
|
||||
default:
|
||||
if (!editable) break;
|
||||
// LIU: Dispatch to subclass API
|
||||
handleKeyTyped(e);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// LIU: Subclass API for handling of key typing
|
||||
protected void handleKeyTyped(KeyEvent e) {
|
||||
insertText(String.valueOf(e.getKeyChar()));
|
||||
}
|
||||
|
||||
// ===================== Control ======================
|
||||
|
||||
public synchronized void setEditable(boolean b) {
|
||||
editable = b;
|
||||
}
|
||||
|
||||
public boolean isEditable() {
|
||||
return editable;
|
||||
}
|
||||
|
||||
public void select(Selection newSelection) {
|
||||
newSelection.pin(contents);
|
||||
if (!selection.equals(newSelection)) {
|
||||
selection.set(newSelection);
|
||||
if (selectionListener != null) {
|
||||
selectionListener.actionPerformed(
|
||||
new ActionEvent(this, ActionEvent.ACTION_PERFORMED,
|
||||
"Selection Changed", 0));
|
||||
}
|
||||
repaint(10);
|
||||
valid = false;
|
||||
}
|
||||
}
|
||||
|
||||
public void select(int start, int end) {
|
||||
select(start, end, false);
|
||||
}
|
||||
|
||||
public void select(int start, int end, boolean clickAfter) {
|
||||
tempSelection.set(start, end, clickAfter);
|
||||
select(tempSelection);
|
||||
}
|
||||
|
||||
public int getSelectionStart() {
|
||||
return selection.getStart();
|
||||
}
|
||||
|
||||
public int getSelectionEnd() {
|
||||
return selection.getEnd();
|
||||
}
|
||||
|
||||
public void setBounds(int x, int y, int w, int h) {
|
||||
super.setBounds(x,y,w,h);
|
||||
redoLines = true;
|
||||
}
|
||||
|
||||
public Dimension getPreferredSize() {
|
||||
return new Dimension(lastWidth,lastHeight);
|
||||
}
|
||||
|
||||
public Dimension getMaximumSize() {
|
||||
return new Dimension(lastWidth,lastHeight);
|
||||
}
|
||||
|
||||
public Dimension getMinimumSize() {
|
||||
return new Dimension(lastHeight,lastHeight);
|
||||
}
|
||||
|
||||
public void setText(String text) {
|
||||
setText2(text);
|
||||
select(tempSelection.set(selection).pin(contents));
|
||||
}
|
||||
|
||||
public void setText2(String text) {
|
||||
contents = text;
|
||||
charBreaker.setText(text);
|
||||
wordBreaker.setText(text);
|
||||
lineBreaker.setText(text);
|
||||
redoLines = true;
|
||||
if (textListener != null)
|
||||
textListener.textValueChanged(
|
||||
new TextEvent(this, TextEvent.TEXT_VALUE_CHANGED));
|
||||
repaint(16);
|
||||
}
|
||||
|
||||
public void insertText(String text) {
|
||||
replaceRange(text, selection.getStart(), selection.getEnd());
|
||||
}
|
||||
|
||||
public void replaceRange(String s, int start, int end) {
|
||||
setText2(contents.substring(0,start) + s
|
||||
+ contents.substring(end));
|
||||
select(tempSelection.set(selection).
|
||||
fixAfterReplace(start, end, s.length()));
|
||||
}
|
||||
|
||||
public String getText() {
|
||||
return contents;
|
||||
}
|
||||
|
||||
public void setFont(Font font) {
|
||||
super.setFont(font);
|
||||
redoLines = true;
|
||||
repaint(16);
|
||||
}
|
||||
|
||||
// ================== Graphics ======================
|
||||
|
||||
public void update(Graphics g) {
|
||||
if (DEBUG) System.out.println("update");
|
||||
paint(g);
|
||||
}
|
||||
|
||||
public void paint(Graphics g) {
|
||||
mySize = getSize();
|
||||
if (cacheImage == null
|
||||
|| cacheImage.getHeight(this) != mySize.height
|
||||
|| cacheImage.getWidth(this) != mySize.width) {
|
||||
cacheImage = createImage(mySize.width, mySize.height);
|
||||
valid = false;
|
||||
}
|
||||
if (!valid || redoLines) {
|
||||
if (DEBUG) System.out.println("painting");
|
||||
paint2(cacheImage.getGraphics());
|
||||
valid = true;
|
||||
}
|
||||
//getToolkit().sync();
|
||||
if (DEBUG) System.out.println("copying");
|
||||
g.drawImage(cacheImage,
|
||||
0, 0, mySize.width, mySize.height,
|
||||
0, 0, mySize.width, mySize.height,
|
||||
this);
|
||||
}
|
||||
|
||||
public void paint2(Graphics g) {
|
||||
g.clearRect(0, 0, mySize.width, mySize.height);
|
||||
if (DEBUG) System.out.println("print");
|
||||
if (focus) g.setColor(Color.black);
|
||||
else g.setColor(Color.gray);
|
||||
g.drawRect(0,0,mySize.width-1,mySize.height-1);
|
||||
g.setClip(1,1,
|
||||
mySize.width-2,mySize.height-2);
|
||||
g.setColor(Color.black);
|
||||
g.setFont(getFont());
|
||||
fm = g.getFontMetrics();
|
||||
lineAscent = fm.getAscent();
|
||||
lineLeading = fm.getLeading();
|
||||
lineHeight = lineAscent + fm.getDescent() + lineLeading;
|
||||
int y = yInset + lineAscent;
|
||||
String lastSubstring = "";
|
||||
if (redoLines) fixLineStarts(mySize.width-xInset-xInset);
|
||||
for (int i = 0; i < lineCount; y += lineHeight, ++i) {
|
||||
// LIU: Don't display terminating ^M characters
|
||||
int lim = lineStarts[i+1];
|
||||
if (lim > 0 && contents.length() > 0 &&
|
||||
contents.charAt(lim-1) == CR) --lim;
|
||||
lastSubstring = contents.substring(lineStarts[i],lim);
|
||||
g.drawString(lastSubstring, xInset, y);
|
||||
}
|
||||
drawSelection(g, lastSubstring);
|
||||
lastHeight = y + yInset - lineHeight + yInset;
|
||||
lastWidth = mySize.width-xInset-xInset;
|
||||
}
|
||||
|
||||
void paintRect(Graphics g, int x, int y, int w, int h) {
|
||||
if (focus) {
|
||||
g.fillRect(x, y, w, h);
|
||||
} else {
|
||||
g.drawRect(x, y, w-1, h-1);
|
||||
}
|
||||
}
|
||||
|
||||
public void drawSelection(Graphics g, String lastSubstring) {
|
||||
g.setXORMode(Color.black);
|
||||
if (selection.isCaret()) {
|
||||
offset2Point(selection.caret, selection.clickAfter, caretPoint);
|
||||
} else {
|
||||
if (focus) g.setColor(Color.blue);
|
||||
else g.setColor(Color.yellow);
|
||||
offset2Point(selection.getStart(), true, startPoint);
|
||||
offset2Point(selection.getEnd(), false, endPoint);
|
||||
if (selection.getStart() == selection.caret)
|
||||
caretPoint.setLocation(startPoint);
|
||||
else caretPoint.setLocation(endPoint);
|
||||
if (startPoint.y == endPoint.y) {
|
||||
paintRect(g, startPoint.x, startPoint.y,
|
||||
Math.max(1,endPoint.x-startPoint.x), lineHeight);
|
||||
} else {
|
||||
paintRect(g, startPoint.x, startPoint.y,
|
||||
(mySize.width-xInset)-startPoint.x, lineHeight);
|
||||
if (startPoint.y + lineHeight < endPoint.y)
|
||||
paintRect(g, xInset, startPoint.y + lineHeight,
|
||||
(mySize.width-xInset)-xInset, endPoint.y - startPoint.y - lineHeight);
|
||||
paintRect(g, xInset, endPoint.y, endPoint.x-xInset, lineHeight);
|
||||
}
|
||||
}
|
||||
if (focus || selection.isCaret()) {
|
||||
if (focus) g.setColor(Color.green);
|
||||
else g.setColor(Color.red);
|
||||
int line = caretPoint.x - (selection.clickAfter ? 0 : 1);
|
||||
g.fillRect(line, caretPoint.y, 1, lineHeight);
|
||||
int w = lineHeight/12 + 1;
|
||||
int braces = line - (selection.clickAfter ? -1 : w);
|
||||
g.fillRect(braces, caretPoint.y, w, 1);
|
||||
g.fillRect(braces, caretPoint.y + lineHeight - 1, w, 1);
|
||||
}
|
||||
}
|
||||
|
||||
public Point offset2Point(int off, boolean start, Point p) {
|
||||
int line = findLine(off, start);
|
||||
int width = 0;
|
||||
try {
|
||||
width = fm.stringWidth(
|
||||
contents.substring(lineStarts[line], off));
|
||||
} catch (Exception e) {
|
||||
System.out.println(e);
|
||||
}
|
||||
p.x = width + xInset;
|
||||
if (p.x > mySize.width - xInset)
|
||||
p.x = mySize.width - xInset;
|
||||
p.y = lineHeight * line + yInset;
|
||||
return p;
|
||||
}
|
||||
|
||||
private int findLine(int off, boolean start) {
|
||||
// if it is start, then go to the next line!
|
||||
if (start) ++off;
|
||||
for (int i = 1; i < lineCount; ++i) {
|
||||
// LIU: This was <= ; changed to < to make caret after
|
||||
// final CR in line appear at START of next line.
|
||||
if (off < lineStarts[i]) return i-1;
|
||||
}
|
||||
// LIU: Check for special case; after CR at end of the last line
|
||||
if (off == lineStarts[lineCount] &&
|
||||
off > 0 && contents.length() > 0 && contents.charAt(off-1) == CR) {
|
||||
return lineCount;
|
||||
}
|
||||
return lineCount-1;
|
||||
}
|
||||
|
||||
// offsets on any line will go from start,true to end,false
|
||||
// excluding start,false and end,true
|
||||
public Selection point2Offset(Point p, Selection o) {
|
||||
if (p.y < yInset) {
|
||||
o.caret = 0;
|
||||
o.clickAfter = true;
|
||||
return o;
|
||||
}
|
||||
int line = (p.y - yInset)/lineHeight;
|
||||
if (line >= lineCount) {
|
||||
o.caret = contents.length();
|
||||
o.clickAfter = false;
|
||||
return o;
|
||||
}
|
||||
int target = p.x - xInset;
|
||||
if (target <= 0) {
|
||||
o.caret = lineStarts[line];
|
||||
o.clickAfter = true;
|
||||
return o;
|
||||
}
|
||||
int lowGuess = lineStarts[line];
|
||||
int lowWidth = 0;
|
||||
int highGuess = lineStarts[line+1];
|
||||
int highWidth = fm.stringWidth(contents.substring(lineStarts[line],highGuess));
|
||||
if (target >= highWidth) {
|
||||
o.caret = lineStarts[line+1];
|
||||
o.clickAfter = false;
|
||||
return o;
|
||||
}
|
||||
while (lowGuess < highGuess - 1) {
|
||||
int guess = (lowGuess + highGuess)/2;
|
||||
int width = fm.stringWidth(contents.substring(lineStarts[line],guess));
|
||||
if (width <= target) {
|
||||
lowGuess = guess;
|
||||
lowWidth = width;
|
||||
if (width == target) break;
|
||||
} else {
|
||||
highGuess = guess;
|
||||
highWidth = width;
|
||||
}
|
||||
}
|
||||
// at end, either lowWidth < target < width(low+1), or lowWidth = target
|
||||
int highBound = charBreaker.following(lowGuess);
|
||||
int lowBound = charBreaker.previous();
|
||||
// we are now at character boundaries
|
||||
if (lowBound != lowGuess)
|
||||
lowWidth = fm.stringWidth(contents.substring(lineStarts[line],lowBound));
|
||||
if (highBound != highGuess)
|
||||
highWidth = fm.stringWidth(contents.substring(lineStarts[line],highBound));
|
||||
// we now have the right widths
|
||||
if (target - lowWidth < highWidth - target) {
|
||||
o.caret = lowBound;
|
||||
o.clickAfter = true;
|
||||
} else {
|
||||
o.caret = highBound;
|
||||
o.clickAfter = false;
|
||||
}
|
||||
// we now have the closest!
|
||||
return o;
|
||||
}
|
||||
|
||||
private void fixLineStarts(int width) {
|
||||
lineCount = 1;
|
||||
lineStarts[0] = 0;
|
||||
if (contents.length() == 0) {
|
||||
lineStarts[1] = 0;
|
||||
return;
|
||||
}
|
||||
int end = 0;
|
||||
// LIU: Add check for MAX_LINES
|
||||
for (int start = 0; start < contents.length() && lineCount < MAX_LINES;
|
||||
start = end) {
|
||||
end = nextLine(fm, start, width);
|
||||
lineStarts[lineCount++] = end;
|
||||
if (end == start) { // LIU: Assertion
|
||||
throw new RuntimeException("nextLine broken");
|
||||
}
|
||||
}
|
||||
--lineCount;
|
||||
redoLines = false;
|
||||
}
|
||||
|
||||
// LIU: Enhanced to wrap long lines. Bug with return of start fixed.
|
||||
public int nextLine(FontMetrics fm, int start, int width) {
|
||||
int len = contents.length();
|
||||
for (int i = start; i < len; ++i) {
|
||||
// check for line separator
|
||||
char ch = (contents.charAt(i));
|
||||
if (ch >= 0x000A && ch <= 0x000D || ch == 0x2028 || ch == 0x2029) {
|
||||
len = i + 1;
|
||||
if (ch == 0x000D && i+1 < len && contents.charAt(i+1) == 0x000A) // crlf
|
||||
++len; // grab extra char
|
||||
break;
|
||||
}
|
||||
}
|
||||
String subject = contents.substring(start,len);
|
||||
if (visibleWidth(fm, subject) <= width)
|
||||
return len;
|
||||
|
||||
// LIU: Remainder of this method rewritten to accomodate lines
|
||||
// longer than the component width by first trying to break
|
||||
// into lines; then words; finally chars.
|
||||
int n = findFittingBreak(fm, subject, width, lineBreaker);
|
||||
if (n == 0) {
|
||||
n = findFittingBreak(fm, subject, width, wordBreaker);
|
||||
}
|
||||
if (n == 0) {
|
||||
n = findFittingBreak(fm, subject, width, charBreaker);
|
||||
}
|
||||
return n > 0 ? start + n : len;
|
||||
}
|
||||
|
||||
/**
|
||||
* LIU: Finds the longest substring that fits a given width
|
||||
* composed of subunits returned by a BreakIterator. If the smallest
|
||||
* subunit is too long, returns 0.
|
||||
* @param fm metrics to use
|
||||
* @param line the string to be fix into width
|
||||
* @param width line.substring(0, result) must be <= width
|
||||
* @param breaker the BreakIterator that will be used to find subunits
|
||||
* @return maximum characters, at boundaries returned by breaker,
|
||||
* that fit into width, or zero on failure
|
||||
*/
|
||||
private int findFittingBreak(FontMetrics fm, String line, int width,
|
||||
BreakIterator breaker) {
|
||||
breaker.setText(line);
|
||||
int last = breaker.first();
|
||||
int end = breaker.next();
|
||||
while (end != BreakIterator.DONE &&
|
||||
visibleWidth(fm, line.substring(0, end)) <= width) {
|
||||
last = end;
|
||||
end = breaker.next();
|
||||
}
|
||||
return last;
|
||||
}
|
||||
|
||||
public int visibleWidth(FontMetrics fm, String s) {
|
||||
int i;
|
||||
for (i = s.length()-1; i >= 0; --i) {
|
||||
char ch = s.charAt(i);
|
||||
if (!(ch == ' ' || ch >= 0x000A && ch <= 0x000D || ch == 0x2028 || ch == 0x2029))
|
||||
return fm.stringWidth(s.substring(0,i+1));;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// =============== Utility ====================
|
||||
|
||||
private void fixHex() {
|
||||
if (selection.getEnd() == 0) return;
|
||||
int store = 0;
|
||||
int places = 1;
|
||||
int count = 0;
|
||||
int min = Math.min(8,selection.getEnd());
|
||||
for (int i = 0; i < min; ++i) {
|
||||
char ch = contents.charAt(selection.getEnd()-1-i);
|
||||
int value = Character.getNumericValue(ch);
|
||||
if (value < 0 || value > 15) break;
|
||||
store += places * value;
|
||||
++count;
|
||||
places *= 16;
|
||||
}
|
||||
String add = "";
|
||||
int bottom = store & 0xFFFF;
|
||||
if (store >= 0xD8000000 && store < 0xDC000000
|
||||
&& bottom >= 0xDC00 && bottom < 0xE000) { // surrogates
|
||||
add = "" + (char)(store >> 16) + (char)bottom;
|
||||
} else if (store > 0xFFFF && store <= 0x10FFFF) {
|
||||
store -= 0x10000;
|
||||
add = "" + (char)(((store >> 10) & 0x3FF) + 0xD800)
|
||||
+ (char)((store & 0x3FF) + 0xDC00);
|
||||
|
||||
} else if (count >= 4) {
|
||||
count = 4;
|
||||
add = ""+(char)(store & 0xFFFF);
|
||||
} else {
|
||||
count = 1;
|
||||
char ch = contents.charAt(selection.getEnd()-1);
|
||||
add = hex(ch);
|
||||
if (ch >= 0xDC00 && ch <= 0xDFFF && selection.getEnd() > 1) {
|
||||
ch = contents.charAt(selection.getEnd()-2);
|
||||
if (ch >= 0xD800 && ch <= 0xDBFF) {
|
||||
count = 2;
|
||||
add = hex(ch) + add;
|
||||
}
|
||||
}
|
||||
}
|
||||
replaceRange(add, selection.getEnd()-count, selection.getEnd());
|
||||
}
|
||||
|
||||
public static String hex(char ch) {
|
||||
String result = Integer.toString(ch,16).toUpperCase();
|
||||
result = "0000".substring(result.length(),4) + result;
|
||||
return result;
|
||||
}
|
||||
}
|
155
icu4j/src/com/ibm/text/components/Selection.java
Executable file
155
icu4j/src/com/ibm/text/components/Selection.java
Executable file
|
@ -0,0 +1,155 @@
|
|||
package com.ibm.text.components;
|
||||
import java.text.*;
|
||||
|
||||
public final class Selection {
|
||||
|
||||
public int anchor;
|
||||
public int caret;
|
||||
public boolean clickAfter;
|
||||
|
||||
public int getStart() {
|
||||
return anchor < caret ? anchor : caret;
|
||||
}
|
||||
|
||||
public int getEnd() {
|
||||
return anchor > caret ? anchor : caret;
|
||||
}
|
||||
|
||||
public boolean isCaret() {
|
||||
return anchor == caret;
|
||||
}
|
||||
|
||||
public Selection set(Selection other) {
|
||||
anchor = other.anchor;
|
||||
caret = other.caret;
|
||||
clickAfter = other.clickAfter;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Selection set(int anchor, int caret, boolean clickAfter) {
|
||||
this.anchor = anchor;
|
||||
this.caret = caret;
|
||||
this.clickAfter = clickAfter;
|
||||
return this;
|
||||
}
|
||||
|
||||
public boolean equals(Object other) {
|
||||
Selection other2 = (Selection)other;
|
||||
return anchor == other2.anchor
|
||||
&& caret == other2.caret
|
||||
&& clickAfter == other2.clickAfter;
|
||||
}
|
||||
|
||||
public boolean isLessThan(Selection other) {
|
||||
return getStart() < other.getEnd();
|
||||
}
|
||||
|
||||
public Selection pin(String text) {
|
||||
if (anchor > text.length()) {
|
||||
anchor = text.length();
|
||||
} else if (anchor < 0) {
|
||||
anchor = 0;
|
||||
}
|
||||
if (caret > text.length()) {
|
||||
caret = text.length();
|
||||
clickAfter = true;
|
||||
} else if (caret < 0) {
|
||||
caret = 0;
|
||||
clickAfter = false;
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
public Selection swap(Selection after) {
|
||||
int temp = anchor;
|
||||
anchor = after.anchor;
|
||||
after.anchor = temp;
|
||||
temp = caret;
|
||||
caret = after.caret;
|
||||
after.caret = temp;
|
||||
boolean b = clickAfter;
|
||||
clickAfter = after.clickAfter;
|
||||
after.clickAfter = b;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Selection fixAfterReplace(int start, int end, int len) {
|
||||
if (anchor >= start) {
|
||||
if (anchor < end) anchor = end;
|
||||
anchor = start + len + anchor - end;
|
||||
}
|
||||
if (caret >= start) {
|
||||
if (caret < end) caret = end;
|
||||
caret = start + len + caret - end;
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
// Mac & Windows considerably different
|
||||
// Mac: end++. If start!=end, start=end
|
||||
// SHIFT: move end right
|
||||
// CTL: no different
|
||||
// Windows:
|
||||
// UNSHIFTED: if start!=end, start = end, else start=end=end+1;
|
||||
// anchor = tip = start
|
||||
// SHIFT: tip++
|
||||
// CTL: if start!=end, start = end = nextbound(end-1),
|
||||
// else start=end=nextbound(end)
|
||||
// anchor = tip = start
|
||||
// CTL/SHIFT: tip = nextbound(tip)
|
||||
|
||||
public Selection nextBound(BreakIterator breaker,
|
||||
int direction, boolean extend) {
|
||||
if (!extend && anchor != caret) caret -= direction;
|
||||
caret = next(caret, breaker, direction, true);
|
||||
if (!extend) anchor = caret;
|
||||
clickAfter = false;
|
||||
return this;
|
||||
}
|
||||
|
||||
// expand start and end to word breaks--if they are not already on one
|
||||
public void expand(BreakIterator breaker) {
|
||||
if (anchor <= caret) {
|
||||
anchor = next(anchor,breaker,-1,false);
|
||||
caret = next(caret,breaker,1,false);
|
||||
/*
|
||||
try {
|
||||
breaker.following(anchor);
|
||||
anchor = breaker.previous();
|
||||
} catch (Exception e) {}
|
||||
try {
|
||||
caret = breaker.following(caret-1);
|
||||
} catch (Exception e) {}
|
||||
*/
|
||||
} else {
|
||||
anchor = next(anchor,breaker,1,false);
|
||||
caret = next(caret,breaker,-1,false);
|
||||
/*
|
||||
try {
|
||||
breaker.following(caret);
|
||||
caret = breaker.previous();
|
||||
} catch (Exception e) {}
|
||||
try {
|
||||
anchor = breaker.following(anchor-1);
|
||||
} catch (Exception e) {}
|
||||
*/
|
||||
}
|
||||
}
|
||||
|
||||
// different = false - move to next boundary, unless on one
|
||||
// true - move to next boundary, even if on one
|
||||
public static int next(int position, BreakIterator breaker,
|
||||
int direction, boolean different) {
|
||||
if (!different) position -= direction;
|
||||
try {
|
||||
if (direction > 0) {
|
||||
position = breaker.following(position);
|
||||
} else {
|
||||
breaker.following(position-1);
|
||||
position = breaker.previous();
|
||||
}
|
||||
} catch (Exception e) {}
|
||||
return position;
|
||||
}
|
||||
}
|
||||
|
191
icu4j/src/com/ibm/text/components/TransliteratingTextComponent.java
Executable file
191
icu4j/src/com/ibm/text/components/TransliteratingTextComponent.java
Executable file
|
@ -0,0 +1,191 @@
|
|||
package com.ibm.text.components;
|
||||
|
||||
import java.awt.*;
|
||||
import java.awt.event.*;
|
||||
import java.text.*;
|
||||
import java.awt.datatransfer.*;
|
||||
import com.ibm.text.*;
|
||||
|
||||
/**
|
||||
* A subclass of {@link DumbTextComponent} that passes key events through
|
||||
* a {@link com.ibm.text.Transliterator}.
|
||||
*
|
||||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: TransliteratingTextComponent.java,v $ $Revision: 1.1 $ $Date: 1999/12/20 18:29:21 $
|
||||
*/
|
||||
public class TransliteratingTextComponent extends DumbTextComponent {
|
||||
|
||||
private static boolean DEBUG = false;
|
||||
|
||||
private Transliterator translit = null;
|
||||
|
||||
// Index into getText() where the start of transliteration is.
|
||||
// As we commit text during keyboardTransliteration, we advance
|
||||
// this.
|
||||
private int start = 0;
|
||||
|
||||
// Index into getText() where the cursor is; cursor >= start
|
||||
private int cursor = 0;
|
||||
|
||||
private static final String COPYRIGHT =
|
||||
"\u00A9 IBM Corporation 1999. All rights reserved.";
|
||||
|
||||
/**
|
||||
* Constructor.
|
||||
*/
|
||||
public TransliteratingTextComponent() {
|
||||
super();
|
||||
addActionListener(new ActionListener() {
|
||||
public void actionPerformed(ActionEvent e) {
|
||||
// We get an ActionEvent only when the selection changes
|
||||
resetTransliterationStart();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* {@link DumbTextComponent} API. Framework method that is called
|
||||
* when a <code>KeyEvent</code> is received. This implementation
|
||||
* runs the new character through the current
|
||||
* <code>Transliterator</code>, if one is set, and inserts the
|
||||
* transliterated text into the buffer.
|
||||
*/
|
||||
protected void handleKeyTyped(KeyEvent e) {
|
||||
char ch = e.getKeyChar();
|
||||
|
||||
if (translit == null) {
|
||||
super.handleKeyTyped(e);
|
||||
return;
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------
|
||||
// The following case motivates the two lines that recompute
|
||||
// start and cursor below.
|
||||
|
||||
// " "
|
||||
// a b c q r|s t u m m
|
||||
// 0 1 2 3 4 5 6 7 8 9
|
||||
// 0 1 2
|
||||
|
||||
// start 3, cursor 5, sel 6 -> { 0, 3, 2 }
|
||||
// : new int[] { 0, sel - start, cursor - start };
|
||||
|
||||
// sz>99|9
|
||||
|
||||
// " { "
|
||||
// a b c q r 9 9|9 t u m m
|
||||
// 0 1 2 3 4 5 6 7 8 9 a b
|
||||
// 0 1 2 3 4
|
||||
|
||||
// { 3, 5, 4 } -> start 6, cursor 7, sel 8
|
||||
// : start += index[0];
|
||||
// : cursor = start + index[2] - index[0];
|
||||
// ------------------------------------------------------------
|
||||
|
||||
// Need to save start because calls to replaceRange will update
|
||||
// start and cursor.
|
||||
int saveStart = start;
|
||||
|
||||
ReplaceableString buf = new ReplaceableString();
|
||||
buf.getStringBuffer().append(getText().substring(start,
|
||||
getSelectionStart()));
|
||||
|
||||
int[] index = new int[] { 0, getSelectionStart() - start,
|
||||
cursor - start};
|
||||
|
||||
StringBuffer log = null;
|
||||
if (DEBUG) {
|
||||
log = new StringBuffer();
|
||||
log.append("start " + start + ", cursor " + cursor);
|
||||
log.append(", sel " + getSelectionStart());
|
||||
log.append(", {" + index[0] + ", " + index[1] + ", " + index[2] + "}, ");
|
||||
log.append('"' + buf.toString() + "\" + '" + ch + "' -> \"");
|
||||
}
|
||||
|
||||
translit.keyboardTransliterate(buf, index, ch);
|
||||
replaceRange(buf.toString(), start, getSelectionEnd());
|
||||
// At this point start has been changed by the callback to
|
||||
// resetTransliteratorStart() via replaceRange() -- so use our
|
||||
// local copy, saveStart.
|
||||
|
||||
// The START index is zero-based. On entry to keyboardTransliterate(),
|
||||
// it was zero. We can therefore just add it to our original
|
||||
// getText()-based index value of start (in saveStart) to get
|
||||
// the new getText()-based start.
|
||||
start = saveStart + index[Transliterator.START];
|
||||
|
||||
// Make the cursor getText()-based. The CURSOR index is zero-based.
|
||||
cursor = start + index[Transliterator.CURSOR]
|
||||
- index[Transliterator.START];
|
||||
|
||||
if (DEBUG) {
|
||||
String out = buf.toString();
|
||||
log.append(out.substring(0, index[Transliterator.START])).
|
||||
append('{').
|
||||
append(out.substring(index[Transliterator.START],
|
||||
index[Transliterator.CURSOR])).
|
||||
append('|').
|
||||
append(out.substring(index[Transliterator.CURSOR])).
|
||||
append('"');
|
||||
log.append(", {" + index[0] + ", " + index[1] + ", " + index[2] + "}, ");
|
||||
log.append("start " + start + ", cursor " + cursor);
|
||||
log.append(", sel " + getSelectionStart());
|
||||
System.out.println(escape(log.toString()));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the {@link com.ibm.text.Transliterator} and direction to
|
||||
* use to process incoming <code>KeyEvent</code>s.
|
||||
* @param t the {@link com.ibm.text.Transliterator} to use
|
||||
*/
|
||||
public void setTransliterator(Transliterator t) {
|
||||
if (translit != t) { // [sic] pointer compare ok; singletons
|
||||
resetTransliterationStart();
|
||||
}
|
||||
translit = t;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reset the start point at which transliteration begins. This
|
||||
* needs to be done when the user moves the cursor or when the
|
||||
* current {@link com.ibm.text.Transliterator} is changed.
|
||||
*/
|
||||
private void resetTransliterationStart() {
|
||||
start = getSelectionStart();
|
||||
cursor = start;
|
||||
}
|
||||
|
||||
/**
|
||||
* Escape non-ASCII characters as Unicode.
|
||||
* JUST FOR DEBUGGING OUTPUT.
|
||||
*/
|
||||
public static final String escape(String s) {
|
||||
StringBuffer buf = new StringBuffer();
|
||||
for (int i=0; i<s.length(); ++i) {
|
||||
char c = s.charAt(i);
|
||||
if (c >= ' ' && c <= 0x007F) {
|
||||
if (c == '\\') {
|
||||
buf.append("\\\\"); // That is, "\\"
|
||||
} else {
|
||||
buf.append(c);
|
||||
}
|
||||
} else {
|
||||
buf.append("\\u");
|
||||
if (c < 0x1000) {
|
||||
buf.append('0');
|
||||
if (c < 0x100) {
|
||||
buf.append('0');
|
||||
if (c < 0x10) {
|
||||
buf.append('0');
|
||||
}
|
||||
}
|
||||
}
|
||||
buf.append(Integer.toHexString(c));
|
||||
}
|
||||
}
|
||||
return buf.toString();
|
||||
}
|
||||
}
|
132
icu4j/src/com/ibm/text/resources/TransliterationRule$KeyboardEscape$Latin1.java
Executable file
132
icu4j/src/com/ibm/text/resources/TransliterationRule$KeyboardEscape$Latin1.java
Executable file
|
@ -0,0 +1,132 @@
|
|||
package com.ibm.text.resources;
|
||||
|
||||
import java.util.ListResourceBundle;
|
||||
|
||||
/**
 * Resource bundle carrying the rule text of the keyboard transliterator for
 * the Latin-1 block.  The bundle has two entries: a "Description" and a
 * "Rule" string.  The rule string is a newline-separated list in which
 * <code>name=value</code> lines define variables (referenced as
 * <code>{name}</code>) and <code>key&gt;result</code> lines map a typed key
 * sequence to a Latin-1 character.  Every key sequence ends with
 * <code>{esc}</code>, which is defined at the top of the rule text.
 */
public class TransliterationRuleKeyboardEscapeLatin1 extends ListResourceBundle {
    /**
     * Overrides ListResourceBundle
     *
     * @return the "Description" and "Rule" entries of this bundle
     */
    public Object[][] getContents() {
        return new Object[][] {
            { "Description",
              "Keyboard transliterator for Latin-1 block" },

            { "Rule",
              "esc=''\n"
              + "grave=`\n"
              + "acute=''\n"
              + "hat=^\n"
              + "tilde=~\n"
              + "umlaut=:\n"
              + "ring=.\n"
              + "cedilla=,\n"
              + "slash=/\n"
              + "super=^\n"

              // Make keyboard entry of {esc} possible
              // and of backslash
              + "'\\'{esc}>{esc}\n"
              + "'\\\\'>'\\'\n"

              // Long keys
              + "cur{esc}>\u00A4\n"
              + "sec{esc}>\u00A7\n"
              + "not{esc}>\u00AC\n"
              + "mul{esc}>\u00D7\n"
              + "div{esc}>\u00F7\n"

              + " {esc}>\u00A0\n" // non-breaking space
              + "!{esc}>\u00A1\n" // inverted exclamation
              + "c/{esc}>\u00A2\n" // cent sign
              + "lb{esc}>\u00A3\n" // pound sign
              + "'|'{esc}>\u00A6\n" // broken vertical bar
              + ":{esc}>\u00A8\n" // umlaut
              + "{super}a{esc}>\u00AA\n" // feminine ordinal
              + "'<<'{esc}>\u00AB\n"
              + "r{esc}>\u00AE\n"
              + "--{esc}>\u00AF\n"
              + "-{esc}>\u00AD\n"
              + "+-{esc}>\u00B1\n"
              + "{super}2{esc}>\u00B2\n"
              + "{super}3{esc}>\u00B3\n"
              + "{acute}{esc}>\u00B4\n"
              + "m{esc}>\u00B5\n"
              + "para{esc}>\u00B6\n"
              + "dot{esc}>\u00B7\n"
              + "{cedilla}{esc}>\u00B8\n"
              + "{super}1{esc}>\u00B9\n"
              + "{super}o{esc}>\u00BA\n" // masculine ordinal
              + "'>>'{esc}>\u00BB\n"
              + "1/4{esc}>\u00BC\n"
              + "1/2{esc}>\u00BD\n"
              + "3/4{esc}>\u00BE\n"
              + "?{esc}>\u00BF\n"
              + "A{grave}{esc}>\u00C0\n"
              + "A{acute}{esc}>\u00C1\n"
              + "A{hat}{esc}>\u00C2\n"
              + "A{tilde}{esc}>\u00C3\n"
              + "A{umlaut}{esc}>\u00C4\n"
              + "A{ring}{esc}>\u00C5\n"
              + "AE{esc}>\u00C6\n"
              + "C{cedilla}{esc}>\u00C7\n"
              + "E{grave}{esc}>\u00C8\n"
              + "E{acute}{esc}>\u00C9\n"
              + "E{hat}{esc}>\u00CA\n"
              + "E{umlaut}{esc}>\u00CB\n"
              + "I{grave}{esc}>\u00CC\n"
              + "I{acute}{esc}>\u00CD\n"
              + "I{hat}{esc}>\u00CE\n"
              + "I{umlaut}{esc}>\u00CF\n"
              + "D-{esc}>\u00D0\n"
              + "N{tilde}{esc}>\u00D1\n"
              + "O{grave}{esc}>\u00D2\n"
              + "O{acute}{esc}>\u00D3\n"
              + "O{hat}{esc}>\u00D4\n"
              + "O{tilde}{esc}>\u00D5\n"
              + "O{umlaut}{esc}>\u00D6\n"
              + "O{slash}{esc}>\u00D8\n"
              + "U{grave}{esc}>\u00D9\n"
              + "U{acute}{esc}>\u00DA\n"
              + "U{hat}{esc}>\u00DB\n"
              + "U{umlaut}{esc}>\u00DC\n"
              + "Y{acute}{esc}>\u00DD\n"
              + "TH{esc}>\u00DE\n"
              + "ss{esc}>\u00DF\n"
              + "a{grave}{esc}>\u00E0\n"
              + "a{acute}{esc}>\u00E1\n"
              + "a{hat}{esc}>\u00E2\n"
              + "a{tilde}{esc}>\u00E3\n"
              + "a{umlaut}{esc}>\u00E4\n"
              + "a{ring}{esc}>\u00E5\n"
              + "ae{esc}>\u00E6\n"
              + "c{cedilla}{esc}>\u00E7\n"
              + "c{esc}>\u00A9\n" // copyright - after c{cedilla}
              + "e{grave}{esc}>\u00E8\n"
              + "e{acute}{esc}>\u00E9\n"
              + "e{hat}{esc}>\u00EA\n"
              + "e{umlaut}{esc}>\u00EB\n"
              + "i{grave}{esc}>\u00EC\n"
              + "i{acute}{esc}>\u00ED\n"
              + "i{hat}{esc}>\u00EE\n"
              + "i{umlaut}{esc}>\u00EF\n"
              + "d-{esc}>\u00F0\n"
              + "n{tilde}{esc}>\u00F1\n"
              + "o{grave}{esc}>\u00F2\n"
              + "o{acute}{esc}>\u00F3\n"
              + "o{hat}{esc}>\u00F4\n"
              + "o{tilde}{esc}>\u00F5\n"
              + "o{umlaut}{esc}>\u00F6\n"
              + "o{slash}{esc}>\u00F8\n"
              + "o{esc}>\u00B0\n"
              + "u{grave}{esc}>\u00F9\n"
              + "u{acute}{esc}>\u00FA\n"
              + "u{hat}{esc}>\u00FB\n"
              + "u{umlaut}{esc}>\u00FC\n"
              + "y{acute}{esc}>\u00FD\n"
              + "y{esc}>\u00A5\n" // yen sign
              + "th{esc}>\u00FE\n"
              // U+00FF is y with diaeresis.  This rule was previously keyed
              // on "ss", duplicating the sharp-s rule above and therefore
              // never reachable.
              + "y{umlaut}{esc}>\u00FF\n"
            }
        };
    }
}
|
243
icu4j/src/com/ibm/text/resources/TransliterationRule$Latin$Arabic.java
Executable file
243
icu4j/src/com/ibm/text/resources/TransliterationRule$Latin$Arabic.java
Executable file
|
@ -0,0 +1,243 @@
|
|||
package com.ibm.text.resources;
|
||||
|
||||
import java.util.ListResourceBundle;
|
||||
|
||||
/**
 * Resource bundle carrying the rule text of the Latin-Arabic transliterator.
 * The bundle has a "HasInverse" entry and a "Rule" string.  The rule string
 * is a newline-separated list: <code>name=value</code> lines define
 * variables (referenced as <code>{name}</code>), lines containing
 * <code>&gt;</code> form the Latin-to-Arabic section and lines containing
 * <code>&lt;</code> form the Arabic-to-Latin section, as the section
 * comments below indicate.
 */
public class TransliterationRuleLatinArabic extends ListResourceBundle {
    /**
     * Overrides ListResourceBundle
     *
     * @return the "HasInverse" and "Rule" entries of this bundle
     */
    public Object[][] getContents() {
        return new Object[][] {
            { "HasInverse", "1" },

            { "Rule",
              // To Do: finish adding shadda, add sokoon

              "alefmadda=\u0622\n"+
              "alefuhamza=\u0623\n"+
              "wauuhamza=\u0624\n"+
              "alefhamza=\u0625\n"+
              "yehuhamza=\u0626\n"+
              "alef=\u0627\n"+
              "beh=\u0628\n"+
              "tehmarbuta=\u0629\n"+
              "teh=\u062A\n"+
              "theh=\u062B\n"+
              "geem=\u062C\n"+
              "hah=\u062D\n"+
              "kha=\u062E\n"+
              "dal=\u062F\n"+
              "dhal=\u0630\n"+
              "reh=\u0631\n"+
              "zain=\u0632\n"+
              "seen=\u0633\n"+
              "sheen=\u0634\n"+
              "sad=\u0635\n"+
              "dad=\u0636\n"+
              "tah=\u0637\n"+
              "zah=\u0638\n"+
              "ein=\u0639\n"+
              "ghein=\u063A\n"+
              "feh=\u0641\n"+
              "qaaf=\u0642\n"+
              "kaf=\u0643\n"+
              "lam=\u0644\n"+
              "meem=\u0645\n"+
              "noon=\u0646\n"+
              "heh=\u0647\n"+
              "wau=\u0648\n"+
              "yehmaqsura=\u0649\n"+
              "yeh=\u064A\n"+
              // PEH is U+067E; the former value U+06A4 is the letter VEH.
              "peh=\u067E\n"+

              "hamza=\u0621\n"+
              "fathatein=\u064B\n"+
              "dammatein=\u064C\n"+
              "kasratein=\u064D\n"+
              "fatha=\u064E\n"+
              "damma=\u064F\n"+
              "kasra=\u0650\n"+
              "shadda=\u0651\n"+
              "sokoon=\u0652\n"+

              // convert English to Arabic
              "Arabic>"+
              "\u062a\u062a\u0645\u062a\u0639\u0020"+
              "\u0627\u0644\u0644\u063a\u0629\u0020"+
              "\u0627\u0644\u0639\u0631\u0628\u0628\u064a\u0629\u0020"+
              "\u0628\u0628\u0646\u0638\u0645\u0020"+
              "\u0643\u062a\u0627\u0628\u0628\u064a\u0629\u0020"+
              "\u062c\u0645\u064a\u0644\u0629\n"+

              "ai>{alefmadda}\n"+
              "ae>{alefuhamza}\n"+
              "ao>{alefhamza}\n"+
              "aa>{alef}\n"+
              "an>{fathatein}\n"+
              "a>{fatha}\n"+
              "b>{beh}\n"+
              "c>{kaf}\n"+
              "{dhal}]dh>{shadda}\n"+
              "dh>{dhal}\n"+
              "{dad}]dd>{shadda}\n"+
              "dd>{dad}\n"+
              "{dal}]d>{shadda}\n"+
              "d>{dal}\n"+
              "e>{ein}\n"+
              "f>{feh}\n"+
              "gh>{ghein}\n"+
              "g>{geem}\n"+
              "hh>{hah}\n"+
              "h>{heh}\n"+
              "ii>{kasratein}\n"+
              "i>{kasra}\n"+
              "j>{geem}\n"+
              "kh>{kha}\n"+
              "k>{kaf}\n"+
              "l>{lam}\n"+
              "m>{meem}\n"+
              "n>{noon}\n"+
              "o>{hamza}\n"+
              "p>{peh}\n"+
              "q>{qaaf}\n"+
              "r>{reh}\n"+
              "sh>{sheen}\n"+
              "ss>{sad}\n"+
              "s>{seen}\n"+
              "th>{theh}\n"+
              "tm>{tehmarbuta}\n"+
              "tt>{tah}\n"+
              "t>{teh}\n"+
              "uu>{dammatein}\n"+
              "u>{damma}\n"+
              "v>{beh}\n"+
              "we>{wauuhamza}\n"+
              "w>{wau}\n"+
              "x>{kaf}{shadda}{seen}\n"+
              "ye>{yehuhamza}\n"+
              "ym>{yehmaqsura}\n"+
              "y>{yeh}\n"+
              "zz>{zah}\n"+
              "z>{zain}\n"+

              "0>\u0660\n"+ // Arabic digit 0
              "1>\u0661\n"+ // Arabic digit 1
              "2>\u0662\n"+ // Arabic digit 2
              "3>\u0663\n"+ // Arabic digit 3
              "4>\u0664\n"+ // Arabic digit 4
              "5>\u0665\n"+ // Arabic digit 5
              "6>\u0666\n"+ // Arabic digit 6
              "7>\u0667\n"+ // Arabic digit 7
              "8>\u0668\n"+ // Arabic digit 8
              "9>\u0669\n"+ // Arabic digit 9
              "%>\u066A\n"+ // Arabic %
              ".>\u066B\n"+ // Arabic decimal separator
              ",>\u066C\n"+ // Arabic thousands separator
              "*>\u066D\n"+ // Arabic five-pointed star

              "`0>0\n"+ // Escaped forms of the above
              "`1>1\n"+
              "`2>2\n"+
              "`3>3\n"+
              "`4>4\n"+
              "`5>5\n"+
              "`6>6\n"+
              "`7>7\n"+
              "`8>8\n"+
              "`9>9\n"+
              "`%>%\n"+
              "`.>.\n"+
              "`,>,\n"+
              "`*>*\n"+
              "``>`\n"+

              "''>\n"+

              // now Arabic to English

              "''ai<a]{alefmadda}\n"+
              "ai<{alefmadda}\n"+
              "''ae<a]{alefuhamza}\n"+
              "ae<{alefuhamza}\n"+
              "''ao<a]{alefhamza}\n"+
              "ao<{alefhamza}\n"+
              "''aa<a]{alef}\n"+
              "aa<{alef}\n"+
              "''an<a]{fathatein}\n"+
              "an<{fathatein}\n"+
              "''a<a]{fatha}\n"+
              "a<{fatha}\n"+
              "b<{beh}\n"+
              "''dh<d]{dhal}\n"+
              "dh<{dhal}\n"+
              "''dd<d]{dad}\n"+
              "dd<{dad}\n"+
              "''d<d]{dal}\n"+
              "d<{dal}\n"+
              "''e<a]{ein}\n"+
              "''e<w]{ein}\n"+
              "''e<y]{ein}\n"+
              "e<{ein}\n"+
              "f<{feh}\n"+
              "gh<{ghein}\n"+
              "''hh<d]{hah}\n"+
              "''hh<t]{hah}\n"+
              "''hh<k]{hah}\n"+
              "''hh<s]{hah}\n"+
              "hh<{hah}\n"+
              "''h<d]{heh}\n"+
              "''h<t]{heh}\n"+
              "''h<k]{heh}\n"+
              "''h<s]{heh}\n"+
              "h<{heh}\n"+
              "''ii<i]{kasratein}\n"+
              "ii<{kasratein}\n"+
              "''i<i]{kasra}\n"+
              "i<{kasra}\n"+
              "j<{geem}\n"+
              "kh<{kha}\n"+
              "x<{kaf}{shadda}{seen}\n"+
              "k<{kaf}\n"+
              "l<{lam}\n"+
              "''m<y]{meem}\n"+
              "''m<t]{meem}\n"+
              "m<{meem}\n"+
              "n<{noon}\n"+
              "''o<a]{hamza}\n"+
              "o<{hamza}\n"+
              "p<{peh}\n"+
              "q<{qaaf}\n"+
              "r<{reh}\n"+
              "sh<{sheen}\n"+
              "''ss<s]{sad}\n"+
              "ss<{sad}\n"+
              "''s<s]{seen}\n"+
              "s<{seen}\n"+
              "th<{theh}\n"+
              "tm<{tehmarbuta}\n"+
              "''tt<t]{tah}\n"+
              "tt<{tah}\n"+
              "''t<t]{teh}\n"+
              "t<{teh}\n"+
              "''uu<u]{dammatein}\n"+
              "uu<{dammatein}\n"+
              "''u<u]{damma}\n"+
              "u<{damma}\n"+
              "we<{wauuhamza}\n"+
              "w<{wau}\n"+
              "ye<{yehuhamza}\n"+
              "ym<{yehmaqsura}\n"+
              "''y<y]{yeh}\n"+
              "y<{yeh}\n"+
              "''zz<z]{zah}\n"+
              "zz<{zah}\n"+
              "''z<z]{zain}\n"+
              "z<{zain}\n"+

              "dh<dh]{shadda}\n"+
              "dd<dd]{shadda}\n"+
              "''d<d]{shadda}\n"
            }
        };
    }
}
|
367
icu4j/src/com/ibm/text/resources/TransliterationRule$Latin$Cyrillic.java
Executable file
367
icu4j/src/com/ibm/text/resources/TransliterationRule$Latin$Cyrillic.java
Executable file
|
@ -0,0 +1,367 @@
|
|||
package com.ibm.text.resources;
|
||||
|
||||
import java.util.ListResourceBundle;
|
||||
|
||||
/**
 * Resource bundle carrying the rule text of the Latin-Russian transliterator.
 * The bundle has a "Description" and a "Rule" string.  The rule string is a
 * newline-separated list: <code>name=value</code> lines define variables
 * (referenced as <code>{name}</code>), lines containing <code>&gt;</code>
 * form the Latin-to-Russian section and lines containing <code>&lt;</code>
 * form the Russian-to-Latin section, as the section comments below indicate.
 */
public class TransliterationRuleLatinRussian extends ListResourceBundle {
    /**
     * Overrides ListResourceBundle
     *
     * @return the "Description" and "Rule" entries of this bundle
     */
    public Object[][] getContents() {
        return new Object[][] {
            { "Description",
              "Latin to Russian" },

            { "Rule",
              // Russian Letters

              "cyA=\u0410\n" +
              "cyBe=\u0411\n" +
              "cyVe=\u0412\n" +
              "cyGe=\u0413\n" +
              "cyDe=\u0414\n" +
              "cyYe=\u0415\n" +
              // Capital IO lives at U+0401, outside the main alphabet run.
              // It used to be given U+0416, which pushed the next three
              // capitals one code point too high and made cyYi collide
              // with cyY.  The values below mirror the lowercase table.
              "cyYo=\u0401\n" +
              "cyZhe=\u0416\n" +
              "cyZe=\u0417\n" +
              "cyYi=\u0418\n" +
              "cyY=\u0419\n" +
              "cyKe=\u041a\n" +
              "cyLe=\u041b\n" +
              "cyMe=\u041c\n" +
              "cyNe=\u041d\n" +
              "cyO=\u041e\n" +
              "cyPe=\u041f\n" +

              "cyRe=\u0420\n" +
              "cySe=\u0421\n" +
              "cyTe=\u0422\n" +
              "cyU=\u0423\n" +
              "cyFe=\u0424\n" +
              "cyKhe=\u0425\n" +
              "cyTse=\u0426\n" +
              "cyChe=\u0427\n" +
              "cyShe=\u0428\n" +
              "cyShche=\u0429\n" +
              "cyHard=\u042a\n" +
              "cyI=\u042b\n" +
              "cySoft=\u042c\n" +
              "cyE=\u042d\n" +
              "cyYu=\u042e\n" +
              "cyYa=\u042f\n" +

              "cya=\u0430\n" +
              "cybe=\u0431\n" +
              "cyve=\u0432\n" +
              "cyge=\u0433\n" +
              "cyde=\u0434\n" +
              "cyye=\u0435\n" +
              "cyzhe=\u0436\n" +
              "cyze=\u0437\n" +
              "cyyi=\u0438\n" +
              "cyy=\u0439\n" +
              "cyke=\u043a\n" +
              "cyle=\u043b\n" +
              "cyme=\u043c\n" +
              "cyne=\u043d\n" +
              "cyo=\u043e\n" +
              "cype=\u043f\n" +

              "cyre=\u0440\n" +
              "cyse=\u0441\n" +
              "cyte=\u0442\n" +
              "cyu=\u0443\n" +
              "cyfe=\u0444\n" +
              "cykhe=\u0445\n" +
              "cytse=\u0446\n" +
              "cyche=\u0447\n" +
              "cyshe=\u0448\n" +
              "cyshche=\u0449\n" +
              "cyhard=\u044a\n" +
              "cyi=\u044b\n" +
              "cysoft=\u044c\n" +
              "cye=\u044d\n" +
              "cyyu=\u044e\n" +
              "cyya=\u044f\n" +

              "cyyo=\u0451\n" +

              "a=[aA]\n" +
              "c=[cC]\n" +
              "e=[eE]\n" +
              "h=[hH]\n" +
              "i=[iI]\n" +
              "o=[oO]\n" +
              "s=[sS]\n" +
              "t=[tT]\n" +
              "u=[uU]\n" +
              "iey=[ieyIEY]\n" +
              // Lowercase letters (category Ll).  The reverse rules use this
              // as following context to pick "Zh" over "ZH" and so on; it
              // was previously defined as Lu, which inverted that choice.
              "lower=[:Ll:]\n" +

              // convert English to Russian
              "Russian>\u041f\u0420\u0410\u0412\u0414\u0410\u00D1\u0020\u0411\u044d\u043b\u0430\u0440\u0443\u0441\u043a\u0430\u044f\u002c\u0020\u043a\u044b\u0440\u0433\u044b\u0437\u002c\u0020\u041c\u043e\u043b\u0434\u043e\u0432\u044d\u043d\u044f\u0441\u043a\u044d\u002e\n" +

              //special equivs for ay, oy, ...
              "Y{a}{i}>{cyYa}{cyY}\n" +
              "Y{e}{i}>{cyYe}{cyY}\n" +
              "Y{i}{i}>{cyYi}{cyY}\n" +
              "Y{o}{i}>{cyYo}{cyY}\n" +
              "Y{u}{i}>{cyYu}{cyY}\n" +
              "A{i}>{cyA}{cyY}\n" +
              "E{i}>{cyE}{cyY}\n" +
              //skip II, since it is the soft sign
              "O{i}>{cyO}{cyY}\n" +
              "U{i}>{cyU}{cyY}\n" +

              "A>{cyA}\n" +
              "B>{cyBe}\n" +
              "C{h}>{cyChe}\n" +
              "C[{iey}>{cySe}\n" +
              "C>{cyKe}\n" +
              "D>{cyDe}\n" +
              "E>{cyE}\n" +
              "F>{cyFe}\n" +
              "G>{cyGe}\n" +
              "H>{cyHard}\n" +
              "I{i}>{cySoft}\n" +
              "I>{cyI}\n" +
              "J>{cyDe}{cyZhe}\n" +
              "K{h}>{cyKhe}\n" +
              "K>{cyKe}\n" +
              "L>{cyLe}\n" +
              "M>{cyMe}\n" +
              "N>{cyNe}\n" +
              "O>{cyO}\n" +
              "P>{cyPe}\n" +
              "Q{u}>{cyKe}{cyVe}\n" +
              "R>{cyRe}\n" +
              "S{h}{t}{c}{h}>{cyShche}\n" +
              "S{h}{c}{h}>{cyShche}\n" +
              "S{h}>{cyShe}\n" +
              "S>{cySe}\n" +
              "T{c}{h}>{cyChe}\n" +
              "T{h}>{cyZe}\n" +
              "T{s}>{cyTse}\n" +
              "T>{cyTe}\n" +
              "U>{cyU}\n" +
              "V>{cyVe}\n" +
              "W{h}>{cyVe}\n" +
              "W>{cyVe}\n" +
              "X>{cyKe}{cySe}\n" +
              "Y{e}>{cyYe}\n" +
              "Y{o}>{cyYo}\n" +
              "Y{u}>{cyYu}\n" +
              "Y{a}>{cyYa}\n" +
              "Y{i}>{cyYi}\n" +
              "Y>{cyY}\n" +
              "Z{h}>{cyZhe}\n" +
              "Z>{cyZe}\n" +
              "X>{cyKe}{cySe}\n" +

              //lower case: doesn''t solve join bug
              "y{a}{i}>{cyya}{cyy}\n" +
              "y{e}{i}>{cyye}{cyy}\n" +
              "y{i}{i}>{cyyi}{cyy}\n" +
              "y{o}{i}>{cyyo}{cyy}\n" +
              "y{u}{i}>{cyyu}{cyy}\n" +
              "a{i}>{cya}{cyy}\n" +
              "e{i}>{cye}{cyy}\n" +
              //skip ii, since it is the soft sign
              "o{i}>{cyo}{cyy}\n" +
              "u{i}>{cyu}{cyy}\n" +

              "a>{cya}\n" +
              "b>{cybe}\n" +
              "c{h}>{cyche}\n" +
              "c[{iey}>{cyse}\n" +
              "c>{cyke}\n" +
              "d>{cyde}\n" +
              "e>{cye}\n" +
              "f>{cyfe}\n" +
              "g>{cyge}\n" +
              "h>{cyhard}\n" +
              "i{i}>{cysoft}\n" +
              "i>{cyi}\n" +
              "j>{cyde}{cyzhe}\n" +
              "k{h}>{cykhe}\n" +
              "k>{cyke}\n" +
              "l>{cyle}\n" +
              "m>{cyme}\n" +
              "n>{cyne}\n" +
              "o>{cyo}\n" +
              "p>{cype}\n" +
              "q{u}>{cyke}{cyve}\n" +
              "r>{cyre}\n" +
              "s{h}{t}{c}{h}>{cyshche}\n" +
              "s{h}{c}{h}>{cyshche}\n" +
              "s{h}>{cyshe}\n" +
              "s>{cyse}\n" +
              "t{c}{h}>{cyche}\n" +
              "t{h}>{cyze}\n" +
              "t{s}>{cytse}\n" +
              "t>{cyte}\n" +
              "u>{cyu}\n" +
              "v>{cyve}\n" +
              "w{h}>{cyve}\n" +
              "w>{cyve}\n" +
              "x>{cyke}{cyse}\n" +
              "y{e}>{cyye}\n" +
              "y{o}>{cyyo}\n" +
              "y{u}>{cyyu}\n" +
              "y{a}>{cyya}\n" +
              "y{i}>{cyyi}\n" +
              "y>{cyy}\n" +
              "z{h}>{cyzhe}\n" +
              "z>{cyze}\n" +
              "x>{cyke}{cyse}\n" +

              //generally the last rule
              "''>\n" +

              //now Russian to English

              "Y''<{cyY}[{cyA}\n" +
              "Y''<{cyY}[{cyE}\n" +
              "Y''<{cyY}[{cyI}\n" +
              "Y''<{cyY}[{cyO}\n" +
              "Y''<{cyY}[{cyU}\n" +
              "Y''<{cyY}[{cya}\n" +
              "Y''<{cyY}[{cye}\n" +
              "Y''<{cyY}[{cyi}\n" +
              "Y''<{cyY}[{cyo}\n" +
              "Y''<{cyY}[{cyu}\n" +
              "A<{cyA}\n" +
              "B<{cyBe}\n" +
              "J<{cyDe}{cyZhe}\n" +
              "J<{cyDe}{cyzhe}\n" +
              "D<{cyDe}\n" +
              "V<{cyVe}\n" +
              "G<{cyGe}\n" +
              "Zh<{cyZhe}[{lower}\n" +
              "ZH<{cyZhe}\n" +
              "Z''<{cyZe}[{cyHard}\n" +
              "Z''<{cyZe}[{cyhard}\n" +
              "Z<{cyZe}\n" +
              "Ye<{cyYe}[{lower}\n" +
              "YE<{cyYe}\n" +
              "Yo<{cyYo}[{lower}\n" +
              "YO<{cyYo}\n" +
              "Yu<{cyYu}[{lower}\n" +
              "YU<{cyYu}\n" +
              "Ya<{cyYa}[{lower}\n" +
              "YA<{cyYa}\n" +
              "Yi<{cyYi}[{lower}\n" +
              "YI<{cyYi}\n" +
              "Y<{cyY}\n" +
              "Kh<{cyKhe}[{lower}\n" +
              "KH<{cyKhe}\n" +
              "K''<{cyKe}[{cyHard}\n" +
              "K''<{cyKe}[{cyhard}\n" +
              "X<{cyKe}{cySe}\n" +
              "X<{cyKe}{cyse}\n" +
              "K<{cyKe}\n" +
              "L<{cyLe}\n" +
              "M<{cyMe}\n" +
              "N<{cyNe}\n" +
              "O<{cyO}\n" +
              "P<{cyPe}\n" +

              "R<{cyRe}\n" +
              "Shch<{cyShche}[{lower}\n" +
              "SHCH<{cyShche}\n" +
              "Sh''<{cyShe}[{cyche}\n" +
              "SH''<{cyShe}[{cyChe}\n" +
              "Sh<{cyShe}[{lower}\n" +
              "SH<{cyShe}\n" +
              "S''<{cySe}[{cyHard}\n" +
              "S''<{cySe}[{cyhard}\n" +
              "S<{cySe}\n" +
              "Ts<{cyTse}[{lower}\n" +
              "TS<{cyTse}\n" +
              "T''<{cyTe}[{cySe}\n" +
              "T''<{cyTe}[{cyse}\n" +
              "T''<{cyTe}[{cyHard}\n" +
              "T''<{cyTe}[{cyhard}\n" +
              "T<{cyTe}\n" +
              "U<{cyU}\n" +
              "F<{cyFe}\n" +
              "Ch<{cyChe}[{lower}\n" +
              "CH<{cyChe}\n" +
              "H<{cyHard}\n" +
              "I''<{cyI}[{cyI}\n" +
              "I''<{cyI}[{cyi}\n" +
              "I<{cyI}\n" +
              "Ii<{cySoft}[{lower}\n" +
              "II<{cySoft}\n" +
              "E<{cyE}\n" +

              //lowercase
              "y''<{cyy}[{cya}\n" +
              "y''<{cyy}[{cye}\n" +
              "y''<{cyy}[{cyi}\n" +
              "y''<{cyy}[{cyo}\n" +
              "y''<{cyy}[{cyu}\n" +
              "y''<{cyy}[{cyA}\n" +
              "y''<{cyy}[{cyE}\n" +
              "y''<{cyy}[{cyI}\n" +
              "y''<{cyy}[{cyO}\n" +
              "y''<{cyy}[{cyU}\n" +
              "a<{cya}\n" +
              "b<{cybe}\n" +
              "j<{cyde}{cyzhe}\n" +
              "j<{cyde}{cyZhe}\n" +
              "d<{cyde}\n" +
              "v<{cyve}\n" +
              "g<{cyge}\n" +
              "zh<{cyzhe}\n" +
              "z''<{cyze}[{cyhard}\n" +
              "z''<{cyze}[{cyHard}\n" +
              "z<{cyze}\n" +
              "ye<{cyye}\n" +
              "yo<{cyyo}\n" +
              "yu<{cyyu}\n" +
              "ya<{cyya}\n" +
              "yi<{cyyi}\n" +
              "y<{cyy}\n" +
              "kh<{cykhe}\n" +
              "k''<{cyke}[{cyhard}\n" +
              "k''<{cyke}[{cyHard}\n" +
              "x<{cyke}{cyse}\n" +
              "x<{cyke}{cySe}\n" +
              "k<{cyke}\n" +
              "l<{cyle}\n" +
              "m<{cyme}\n" +
              "n<{cyne}\n" +
              "o<{cyo}\n" +
              "p<{cype}\n" +

              "r<{cyre}\n" +
              "shch<{cyshche}\n" +
              "sh''<{cyshe}[{cyche}\n" +
              "sh''<{cyshe}[{cyChe}\n" +
              "sh<{cyshe}\n" +
              "s''<{cyse}[{cyhard}\n" +
              "s''<{cyse}[{cyHard}\n" +
              "s<{cyse}\n" +
              "ts<{cytse}\n" +
              "t''<{cyte}[{cyse}\n" +
              "t''<{cyte}[{cySe}\n" +
              "t''<{cyte}[{cyhard}\n" +
              "t''<{cyte}[{cyHard}\n" +
              "t<{cyte}\n" +
              "u<{cyu}\n" +
              "f<{cyfe}\n" +
              "ch<{cyche}\n" +
              "h<{cyhard}\n" +
              "i''<{cyi}[{cyI}\n" +
              "i''<{cyi}[{cyi}\n" +
              "i<{cyi}\n" +
              "ii<{cysoft}\n" +
              "e<{cye}\n" +

              //generally the last rule
              "''>\n"
              //the end
            }
        };
    }
}
|
412
icu4j/src/com/ibm/text/resources/TransliterationRule$Latin$Devanagari.java
Executable file
412
icu4j/src/com/ibm/text/resources/TransliterationRule$Latin$Devanagari.java
Executable file
|
@ -0,0 +1,412 @@
|
|||
package com.ibm.text.resources;
|
||||
|
||||
import java.util.ListResourceBundle;
|
||||
|
||||
/**
 * Resource bundle carrying the rule text of the Latin-Devanagari
 * transliterator.  The bundle has a "Description" and a "Rule" string.  The
 * rule string is a newline-separated list: <code>name=value</code> lines
 * define variables (referenced as <code>{name}</code>, or directly for the
 * single-character names <code>&amp;</code> and <code>%</code>), lines
 * containing <code>&gt;</code> form the Latin-to-native section and lines
 * containing <code>&lt;</code> form the native-to-Latin section, as the
 * section comments below indicate.
 */
public class TransliterationRuleLatinDevanagari extends ListResourceBundle {
    /**
     * Overrides ListResourceBundle
     *
     * @return the "Description" and "Rule" entries of this bundle
     */
    public Object[][] getContents() {
        return new Object[][] {
            { "Description",
              "Latin to Devanagari" },

            { "Rule",
              //#####################################################################
              // Keyboard Transliteration Table
              //#####################################################################
              // Conversions should be:
              // 1. complete
              // * convert every sequence of Latin letters (a to z plus apostrophe)
              // to a sequence of Native letters
              // * convert every sequence of Native letters to Latin letters
              // 2. reversible
              // * any string of Native converted to Latin and back should be the same
              // * this is not true for English converted to Native & back, e.g.:
              // k -> {kaf} -> k
              // c -> {kaf} -> k
              //#####################################################################
              // Sequences of Latin letters may convert to a single Native letter.
              // When this is the case, an apostrophe can be used to indicate separate
              // letters.
              // E.g. sh -> {shin}
              // s'h -> {sin}{heh}
              // ss -> {sad}
              // s's -> {sin}{shadda}
              //#####################################################################
              // To Do:
              // finish adding shadda, add sokoon, fix uppercase
              // make two transliteration tables: one with vowels, one without
              //#####################################################################
              // Modifications
              // Devanagari Transliterator: broken up with consonants/vowels
              //#####################################################################
              // Unicode character name definitions
              //#####################################################################

              // signs
              "candrabindu=\u0901\n"
              + "bindu=\u0902\n"
              + "visarga=\u0903\n"

              // w<vowel> represents the stand-alone form
              + "wa=\u0905\n"
              + "waa=\u0906\n"
              + "wi=\u0907\n"
              + "wii=\u0908\n"
              + "wu=\u0909\n"
              + "wuu=\u090A\n"
              + "wr=\u090B\n"
              + "wl=\u090C\n"
              + "we=\u090F\n"
              + "wai=\u0910\n"
              + "wo=\u0913\n"
              + "wau=\u0914\n"

              // consonants
              + "ka=\u0915\n"
              + "kha=\u0916\n"
              + "ga=\u0917\n"
              + "gha=\u0918\n"
              + "nga=\u0919\n"

              + "ca=\u091A\n"
              + "cha=\u091B\n"
              + "ja=\u091C\n"
              + "jha=\u091D\n"
              + "nya=\u091E\n"

              + "tta=\u091F\n"
              + "ttha=\u0920\n"
              + "dda=\u0921\n"
              + "ddha=\u0922\n"
              + "nna=\u0923\n"

              + "ta=\u0924\n"
              + "tha=\u0925\n"
              + "da=\u0926\n"
              + "dha=\u0927\n"
              + "na=\u0928\n"

              + "pa=\u092A\n"
              + "pha=\u092B\n"
              + "ba=\u092C\n"
              + "bha=\u092D\n"
              + "ma=\u092E\n"

              + "ya=\u092F\n"
              + "ra=\u0930\n"
              + "rra=\u0931\n"
              // LA is U+0932; the former value U+0933 is the letter LLA.
              + "la=\u0932\n"
              + "va=\u0935\n"

              + "sha=\u0936\n"
              + "ssa=\u0937\n"
              + "sa=\u0938\n"
              + "ha=\u0939\n"

              // <vowel> represents the dependent form
              + "aa=\u093E\n"
              + "i=\u093F\n"
              + "ii=\u0940\n"
              + "u=\u0941\n"
              + "uu=\u0942\n"
              + "rh=\u0943\n"
              // Vowel sign VOCALIC L is U+0962 (pairs with wl, U+090C).
              + "lh=\u0962\n"
              + "e=\u0947\n"
              + "ai=\u0948\n"
              + "o=\u094B\n"
              + "au=\u094C\n"

              + "virama=\u094D\n"

              + "wrr=\u0960\n"
              // Vowel sign VOCALIC RR is U+0944 (pairs with wrr, U+0960).
              // lh and rrh previously had each other's code points.
              + "rrh=\u0944\n"

              + "danda=\u0964\n"
              + "doubleDanda=\u0965\n"
              + "depVowelAbove=[\u093E-\u0940\u0945-\u094C]\n"
              // U+0962 is listed explicitly so that the sign produced by
              // "lh" is still matched by the below-vowel tilde rule.
              + "depVowelBelow=[\u0941-\u0944\u0962]\n"
              + "endThing=[{danda}{doubleDanda}\u0000-\u08FF\u0980-\uFFFF]\n"

              + "&=[{virama}{aa}{ai}{au}{ii}{i}{uu}{u}{rrh}{rh}{lh}{e}{o}]\n"
              + "%=[bcdfghjklmnpqrstvwxyz]\n"

              //#####################################################################
              // convert from Latin letters to Native letters
              //#####################################################################
              //Hindi>\u092d\u093e\u0930\u0924--\u0020\u0926\u0947\u0936\u0020\u092c\u0928\u094d\u0927\u0941\u002e

              // special forms with no good conversion

              + "mm>{bindu}\n"
              + "x>{visarga}\n"

              // convert to independent forms at start of word or syllable:
              // e.g. keai -> {ka}{e}{wai}; k'ai -> {ka}{wai}; (ai) -> ({wai})
              // Moved up [LIU]

              + "aa>{waa}\n"
              + "ai>{wai}\n"
              + "au>{wau}\n"
              + "ii>{wii}\n"
              + "i>{wi}\n"
              + "uu>{wuu}\n"
              + "u>{wu}\n"
              + "rrh>{wrr}\n"
              + "rh>{wr}\n"
              + "lh>{wl}\n"
              + "e>{we}\n"
              + "o>{wo}\n"
              + "a>{wa}\n"

              // normal consonants

              + "kh>{kha}|{virama}\n"
              + "k>{ka}|{virama}\n"
              + "q>{ka}|{virama}\n"
              + "gh>{gha}|{virama}\n"
              + "g>{ga}|{virama}\n"
              + "ng>{nga}|{virama}\n"
              + "ch>{cha}|{virama}\n"
              + "c>{ca}|{virama}\n"
              + "jh>{jha}|{virama}\n"
              + "j>{ja}|{virama}\n"
              + "ny>{nya}|{virama}\n"
              + "tth>{ttha}|{virama}\n"
              + "tt>{tta}|{virama}\n"
              + "ddh>{ddha}|{virama}\n"
              + "dd>{dda}|{virama}\n"
              + "nn>{nna}|{virama}\n"
              + "th>{tha}|{virama}\n"
              + "t>{ta}|{virama}\n"
              + "dh>{dha}|{virama}\n"
              + "d>{da}|{virama}\n"
              + "n>{na}|{virama}\n"
              + "ph>{pha}|{virama}\n"
              + "p>{pa}|{virama}\n"
              + "bh>{bha}|{virama}\n"
              + "b>{ba}|{virama}\n"
              + "m>{ma}|{virama}\n"
              + "y>{ya}|{virama}\n"
              + "r>{ra}|{virama}\n"
              + "l>{la}|{virama}\n"
              + "v>{va}|{virama}\n"
              + "f>{va}|{virama}\n"
              + "w>{va}|{virama}\n"
              + "sh>{sha}|{virama}\n"
              + "ss>{ssa}|{virama}\n"
              + "s>{sa}|{virama}\n"
              + "z>{sa}|{virama}\n"
              + "h>{ha}|{virama}\n"

              + ".>{danda}\n"
              + "{danda}.>{doubleDanda}\n"
              + "{depVowelAbove}]~>{bindu}\n"
              + "{depVowelBelow}]~>{candrabindu}\n"

              // convert to dependent forms after consonant with no vowel:
              // e.g. kai -> {ka}{virama}ai -> {ka}{ai}

              + "{virama}aa>{aa}\n"
              + "{virama}ai>{ai}\n"
              + "{virama}au>{au}\n"
              + "{virama}ii>{ii}\n"
              + "{virama}i>{i}\n"
              + "{virama}uu>{uu}\n"
              + "{virama}u>{u}\n"
              + "{virama}rrh>{rrh}\n"
              + "{virama}rh>{rh}\n"
              + "{virama}lh>{lh}\n"
              + "{virama}e>{e}\n"
              + "{virama}o>{o}\n"
              + "{virama}a>\n"

              // otherwise convert independent forms when separated by ': k'ai -> {ka}{virama}{wai}

              + "{virama}''aa>{waa}\n"
              + "{virama}''ai>{wai}\n"
              + "{virama}''au>{wau}\n"
              + "{virama}''ii>{wii}\n"
              + "{virama}''i>{wi}\n"
              + "{virama}''uu>{wuu}\n"
              + "{virama}''u>{wu}\n"
              + "{virama}''rrh>{wrr}\n"
              + "{virama}''rh>{wr}\n"
              + "{virama}''lh>{wl}\n"
              + "{virama}''e>{we}\n"
              + "{virama}''o>{wo}\n"
              + "{virama}''a>{wa}\n"

              + "{virama}[{endThing}>\n"

              // convert any left-over apostrophes used for separation

              + "''>\n"

              //#####################################################################
              // convert from Native letters to Latin letters
              //#####################################################################

              // special forms with no good conversion

              + "mm<{bindu}\n"
              + "x<{visarga}\n"

              // normal consonants

              + "kh<{kha}[&\n"
              + "kha<{kha}\n"
              + "k''<{ka}{virama}[{ha}\n"
              + "k<{ka}[&\n"
              + "ka<{ka}\n"
              + "gh<{gha}[&\n"
              + "gha<{gha}\n"
              + "g''<{ga}{virama}[{ha}\n"
              + "g<{ga}[&\n"
              + "ga<{ga}\n"
              + "ng<{nga}[&\n"
              + "nga<{nga}\n"
              + "ch<{cha}[&\n"
              + "cha<{cha}\n"
              + "c''<{ca}{virama}[{ha}\n"
              + "c<{ca}[&\n"
              + "ca<{ca}\n"
              + "jh<{jha}[&\n"
              + "jha<{jha}\n"
              + "j''<{ja}{virama}[{ha}\n"
              + "j<{ja}[&\n"
              + "ja<{ja}\n"
              + "ny<{nya}[&\n"
              + "nya<{nya}\n"
              + "tth<{ttha}[&\n"
              + "ttha<{ttha}\n"
              + "tt''<{tta}{virama}[{ha}\n"
              + "tt<{tta}[&\n"
              + "tta<{tta}\n"
              + "ddh<{ddha}[&\n"
              + "ddha<{ddha}\n"
              // Same shape as the sibling apostrophe rules: consonant plus
              // virama, followed by {ha}.  Was "{dda}[&{ha}".
              + "dd''<{dda}{virama}[{ha}\n"
              + "dd<{dda}[&\n"
              + "dda<{dda}\n"
              + "dh<{dha}[&\n"
              + "dha<{dha}\n"
              + "d''<{da}{virama}[{ha}\n"
              + "d''<{da}{virama}[{ddha}\n"
              + "d''<{da}{virama}[{dda}\n"
              + "d''<{da}{virama}[{dha}\n"
              + "d''<{da}{virama}[{da}\n"
              + "d<{da}[&\n"
              + "da<{da}\n"
              + "th<{tha}[&\n"
              + "tha<{tha}\n"
              + "t''<{ta}{virama}[{ha}\n"
              + "t''<{ta}{virama}[{ttha}\n"
              + "t''<{ta}{virama}[{tta}\n"
              + "t''<{ta}{virama}[{tha}\n"
              + "t''<{ta}{virama}[{ta}\n"
              + "t<{ta}[&\n"
              + "ta<{ta}\n"
              + "n''<{na}{virama}[{ga}\n"
              + "n''<{na}{virama}[{ya}\n"
              + "n<{na}[&\n"
              + "na<{na}\n"
              + "ph<{pha}[&\n"
              + "pha<{pha}\n"
              + "p''<{pa}{virama}[{ha}\n"
              + "p<{pa}[&\n"
              + "pa<{pa}\n"
              + "bh<{bha}[&\n"
              + "bha<{bha}\n"
              + "b''<{ba}{virama}[{ha}\n"
              + "b<{ba}[&\n"
              + "ba<{ba}\n"
              + "m''<{ma}{virama}[{ma}\n"
              + "m''<{ma}{virama}[{bindu}\n"
              + "m<{ma}[&\n"
              + "ma<{ma}\n"
              + "y<{ya}[&\n"
              + "ya<{ya}\n"
              + "r''<{ra}{virama}[{ha}\n"
              + "r<{ra}[&\n"
              + "ra<{ra}\n"
              + "l''<{la}{virama}[{ha}\n"
              + "l<{la}[&\n"
              + "la<{la}\n"
              + "v<{va}[&\n"
              + "va<{va}\n"
              + "sh<{sha}[&\n"
              + "sha<{sha}\n"
              + "ss<{ssa}[&\n"
              + "ssa<{ssa}\n"
              + "s''<{sa}{virama}[{ha}\n"
              + "s''<{sa}{virama}[{sha}\n"
              + "s''<{sa}{virama}[{ssa}\n"
              + "s''<{sa}{virama}[{sa}\n"
              + "s<{sa}[&\n"
              + "sa<{sa}\n"
              + "h<{ha}[&\n"
              + "ha<{ha}\n"

              // dependent vowels (should never occur except following consonants)

              + "aa<{aa}\n"
              + "ai<{ai}\n"
              + "au<{au}\n"
              + "ii<{ii}\n"
              + "i<{i}\n"
              + "uu<{uu}\n"
              + "u<{u}\n"
              + "rrh<{rrh}\n"
              + "rh<{rh}\n"
              + "lh<{lh}\n"
              + "e<{e}\n"
              + "o<{o}\n"

              // independent vowels (when following consonants)

              + "''aa<a]{waa}\n"
              + "''aa<%]{waa}\n"
              + "''ai<a]{wai}\n"
              + "''ai<%]{wai}\n"
              + "''au<a]{wau}\n"
              + "''au<%]{wau}\n"
              + "''ii<a]{wii}\n"
              + "''ii<%]{wii}\n"
              + "''i<a]{wi}\n"
              + "''i<%]{wi}\n"
              + "''uu<a]{wuu}\n"
              + "''uu<%]{wuu}\n"
              + "''u<a]{wu}\n"
              + "''u<%]{wu}\n"
              + "''rrh<%]{wrr}\n"
              + "''rh<%]{wr}\n"
              + "''lh<%]{wl}\n"
              + "''e<%]{we}\n"
              + "''o<%]{wo}\n"
              + "''a<a]{wa}\n"
              + "''a<%]{wa}\n"


              // independent vowels (otherwise)

              + "aa<{waa}\n"
              + "ai<{wai}\n"
              + "au<{wau}\n"
              + "ii<{wii}\n"
              + "i<{wi}\n"
              + "uu<{wuu}\n"
              + "u<{wu}\n"
              + "rrh<{wrr}\n"
              + "rh<{wr}\n"
              + "lh<{wl}\n"
              + "e<{we}\n"
              + "o<{wo}\n"
              + "a<{wa}\n"

              // blow away any remaining viramas

              + "<{virama}\n"
            }
        };
    }
}
|
384
icu4j/src/com/ibm/text/resources/TransliterationRule$Latin$Greek.java
Executable file
384
icu4j/src/com/ibm/text/resources/TransliterationRule$Latin$Greek.java
Executable file
|
@ -0,0 +1,384 @@
|
|||
package com.ibm.text.resources;
|
||||
|
||||
import java.util.ListResourceBundle;
|
||||
|
||||
/**
 * Resource bundle carrying the rule text for the "Latin to Greek"
 * transliterator.
 *
 * <p>The bundle exposes two entries: "Description" (a display name) and
 * "Rule" (one newline-terminated rule per line).  Going by the section
 * comments of the original listing, rules written with {@code >} convert
 * Roman text to Greek and rules written with {@code <} convert Greek text
 * back to Roman.
 *
 * <p>NOTE(review): the rule syntax itself is interpreted by the
 * transliterator that loads this bundle, which is not visible from this
 * file.  The readings used in the comments below ({@code [} introducing a
 * following-context condition, {@code ''} standing for a literal
 * apostrophe, {@code |} marking a cursor position in the output) are
 * presumed and should be confirmed against that parser.
 */
public class TransliterationRuleLatinGreek extends ListResourceBundle {

    /** Value stored under the "Description" key. */
    private static final String DESCRIPTION = "Latin to Greek";

    /**
     * Variable definitions: one name per Greek letter.  Names starting
     * with "gr" plus a capitalised abbreviation are the capital letters,
     * all-lowercase names are the small letters.  The "Ac" / "ac" names
     * are the accented (tonos) forms and the "Di" / "di" names the
     * diaeresis forms.  The combined accent-plus-diaeresis forms
     * (gracdiio, gracdiup) were commented out in the original listing and
     * remain undefined.
     */
    private static final String VARIABLES =
        // capital letters
          "grAl=\u0391\n"
        + "grBe=\u0392\n"
        + "grGa=\u0393\n"
        + "grDe=\u0394\n"
        + "grEp=\u0395\n"
        + "grZe=\u0396\n"
        + "grEt=\u0397\n"
        + "grTh=\u0398\n"
        + "grIo=\u0399\n"
        + "grKa=\u039A\n"
        + "grLa=\u039B\n"
        + "grMu=\u039C\n"
        + "grNu=\u039D\n"
        + "grKs=\u039E\n"
        + "grOm=\u039F\n"
        + "grPi=\u03A0\n"
        + "grRh=\u03A1\n"
        + "grSi=\u03A3\n"
        + "grTa=\u03A4\n"
        + "grUp=\u03A5\n"
        + "grPh=\u03A6\n"
        + "grKh=\u03A7\n"
        + "grPs=\u03A8\n"
        + "grOme=\u03A9\n"
        // small letters
        + "gral=\u03B1\n"
        + "grbe=\u03B2\n"
        + "grga=\u03B3\n"
        + "grde=\u03B4\n"
        + "grep=\u03B5\n"
        + "grze=\u03B6\n"
        + "gret=\u03B7\n"
        + "grth=\u03B8\n"
        + "grio=\u03B9\n"
        + "grka=\u03BA\n"
        + "grla=\u03BB\n"
        + "grmu=\u03BC\n"
        + "grnu=\u03BD\n"
        + "grks=\u03BE\n"
        + "grom=\u03BF\n"
        + "grpi=\u03C0\n"
        + "grrh=\u03C1\n"
        + "grsi=\u03C3\n"
        + "grta=\u03C4\n"
        + "grup=\u03C5\n"
        + "grph=\u03C6\n"
        + "grkh=\u03C7\n"
        + "grps=\u03C8\n"
        + "grome=\u03C9\n"
        // word-final small sigma
        + "grfinal=\u03C2\n"
        // accented and diaeresis capitals
        + "grAcAl=\u0386\n"
        + "grAcEp=\u0388\n"
        + "grAcEt=\u0389\n"
        + "grAcIo=\u038A\n"
        + "grAcOm=\u038C\n"
        + "grAcUp=\u038E\n"
        + "grAcOme=\u038F\n"
        + "grDiIo=\u03AA\n"
        + "grDiUp=\u03AB\n"
        // accented and diaeresis small letters
        + "gracal=\u03AC\n"
        + "gracep=\u03AD\n"
        + "gracet=\u03AE\n"
        + "gracio=\u03AF\n"
        + "gracom=\u03CC\n"
        + "gracup=\u03CD\n"
        + "gracome=\u03CE\n"
        + "grdiio=\u03CA\n"
        + "grdiup=\u03CB\n"
        // any upper- or lowercase letter; used as context by the
        // final-sigma rule in LOWER_TO_GREEK
        + "letter=[[:Lu:][:Ll:]]\n";

    /**
     * A single Roman-to-Greek rule that replaces the literal input
     * "Greek" with a fixed phrase in Greek capitals (the opening words of
     * the Iliad).  It is listed ahead of every letter rule.
     */
    private static final String SAMPLE_PHRASE_RULE =
          "Greek>\u039c\u0397\u039d\u0399\u039d\u0020\u0391\u0395\u0399\u0394\u0395\u002c\u0020\u0398\u0395\u0391\u002c\u0020--\u0397\u039b\u0397\u0399\u0391\u0394\u0395\u03a9\u0020\u0391\u03a7\u0399\u039b\u0397\u039f\u03a3\n";

    /**
     * Uppercase Roman to Greek capitals.  Multi-letter keys are listed
     * before the single-letter keys they start with.  A definition named
     * "final" once sat after the NCH rule; the original author disabled
     * it as a syntax error and noted it was unused.
     */
    private static final String UPPER_TO_GREEK =
          "AV`>{grAl}{grAcUp}\n"
        + "EV`>{grEp}{grAcUp}\n"
        + "AV>{grAl}{grUp}\n"
        + "EV>{grEp}{grUp}\n"
        + "NG>{grGa}{grGa}\n"
        + "NK>{grGa}{grKa}\n"
        + "NX>{grGa}{grKs}\n"
        + "NCH>{grGa}{grKh}\n"
        // accented and diaeresis vowels
        + "A`>{grAcAl}\n"
        + "EE`>{grAcEt}\n"
        + "E`>{grAcEp}\n"
        + "I`>{grAcIo}\n"
        + "U`>{grAcUp}\n"
        + "OO`>{grAcOme}\n"
        + "O`>{grAcOm}\n"
        + "''I>{grDiIo}\n"
        + "''U>{grDiUp}\n"
        // plain letters
        + "A>{grAl}\n"
        + "B>{grBe}\n"
        + "C[I>{grSi}\n"
        + "C[E>{grSi}\n"
        + "C[Y>{grSi}\n"
        + "CH>{grKh}\n"
        + "C>{grKa}\n"
        + "D>{grDe}\n"
        + "EE>{grEt}\n"
        + "E>{grEp}\n"
        + "F>{grPh}\n"
        + "G>{grGa}\n"
        + "H>{grKh}\n"
        + "I>{grIo}\n"
        + "J>{grIo}\n"
        + "KS>{grKs}\n"
        + "KH>{grKh}\n"
        + "K>{grKa}\n"
        + "L>{grLa}\n"
        + "M>{grMu}\n"
        + "N>{grNu}\n"
        + "OO>{grOme}\n"
        + "O>{grOm}\n"
        + "PS>{grPs}\n"
        + "PH>{grPh}\n"
        + "P>{grPi}\n"
        + "Q>{grKa}\n"
        + "R>{grRh}\n"
        + "S>{grSi}\n"
        + "TH>{grTh}\n"
        + "T>{grTa}\n"
        + "W>{grUp}{grUp}\n"
        + "U>{grUp}\n"
        + "V>{grUp}\n"
        + "X>{grKs}\n"
        + "Y>{grUp}\n"
        + "Z>{grZe}\n";

    /**
     * Greek capitals back to uppercase Roman.  The rules whose Roman side
     * ends in {@code ''} apply only before the listed following letter
     * and insert an apostrophe, presumably so that the Roman output is
     * not re-read as a digraph (NG, EE, OO, PS) on the way back.  No
     * rules exist for the accent-plus-diaeresis forms, and the capital
     * section has no rule of its own for the final sigma.
     */
    private static final String UPPER_FROM_GREEK =
          "AV<{grAl}{grUp}\n"
        + "EV<{grEp}{grUp}\n"
        + "AV`<{grAl}{grAcUp}\n"
        + "EV`<{grEp}{grAcUp}\n"
        + "N''<{grNu}[{grGa}\n"
        + "NG<{grGa}{grGa}\n"
        + "N''<{grNu}[{grKa}\n"
        + "NK<{grGa}{grKa}\n"
        + "N''<{grNu}[{grKs}\n"
        + "NX<{grGa}{grKs}\n"
        + "N''<{grNu}[{grKh}\n"
        + "NCH<{grGa}{grKh}\n"
        // plain letters
        + "A<{grAl}\n"
        + "B<{grBe}\n"
        + "G<{grGa}\n"
        + "D<{grDe}\n"
        + "E''<{grEp}[{grEp}\n"
        + "E''<{grEp}[{grEt}\n"
        + "E''<{grEp}[{grAcEp}\n"
        + "E''<{grEp}[{grAcEt}\n"
        + "E<{grEp}\n"
        + "Z<{grZe}\n"
        + "EE<{grEt}\n"
        + "TH<{grTh}\n"
        + "I<{grIo}\n"
        + "K<{grKa}\n"
        + "L<{grLa}\n"
        + "M<{grMu}\n"
        + "N<{grNu}\n"
        + "X<{grKs}\n"
        + "O''<{grOm}[{grOm}\n"
        + "O''<{grOm}[{grOme}\n"
        + "O''<{grOm}[{grAcOm}\n"
        + "O''<{grOm}[{grAcOme}\n"
        + "O<{grOm}\n"
        + "P''<{grPi}[{grSi}\n"
        + "P''<{grPi}[{grfinal}\n"
        + "P<{grPi}\n"
        + "R<{grRh}\n"
        + "S<{grSi}\n"
        + "T<{grTa}\n"
        + "W<{grUp}{grUp}\n"
        // upsilon becomes V before an accented vowel ...
        + "V<{grUp}[{grAcAl}\n"
        + "V<{grUp}[{grAcEp}\n"
        + "V<{grUp}[{grAcEt}\n"
        + "V<{grUp}[{grAcIo}\n"
        + "V<{grUp}[{grAcOm}\n"
        + "V<{grUp}[{grAcUp}\n"
        + "V<{grUp}[{grAcOme}\n"
        // ... and before a plain vowel (the upsilon-before-upsilon case
        // was left disabled in the original listing) ...
        + "V<{grUp}[{grAl}\n"
        + "V<{grUp}[{grEp}\n"
        + "V<{grUp}[{grEt}\n"
        + "V<{grUp}[{grIo}\n"
        + "V<{grUp}[{grOm}\n"
        + "V<{grUp}[{grOme}\n"
        // ... and U everywhere else
        + "U<{grUp}\n"
        + "PH<{grPh}\n"
        + "CH<{grKh}\n"
        + "PS<{grPs}\n"
        + "OO<{grOme}\n"
        // accented and diaeresis vowels
        + "A`<{grAcAl}\n"
        + "E`<{grAcEp}\n"
        + "EE`<{grAcEt}\n"
        + "I`<{grAcIo}\n"
        + "O`<{grAcOm}\n"
        + "U`<{grAcUp}\n"
        + "OO`<{grAcOme}\n"
        + "''I<{grDiIo}\n"
        + "''U<{grDiUp}\n";

    /**
     * Lowercase Roman to small Greek letters; mirrors
     * {@link #UPPER_TO_GREEK}.  The difference is sigma: "s" first yields
     * the final-sigma form, and a follow-up rule turns that into the
     * ordinary sigma when a letter comes next.  The last rule has an
     * empty right-hand side, i.e. it deletes apostrophes.
     */
    private static final String LOWER_TO_GREEK =
          "av`>{gral}{gracup}\n"
        + "ev`>{grep}{gracup}\n"
        + "av>{gral}{grup}\n"
        + "ev>{grep}{grup}\n"
        + "ng>{grga}{grga}\n"
        + "nk>{grga}{grka}\n"
        + "nx>{grga}{grks}\n"
        + "nch>{grga}{grkh}\n"
        // accented and diaeresis vowels
        + "a`>{gracal}\n"
        + "ee`>{gracet}\n"
        + "e`>{gracep}\n"
        + "i`>{gracio}\n"
        + "u`>{gracup}\n"
        + "oo`>{gracome}\n"
        + "o`>{gracom}\n"
        + "''i>{grdiio}\n"
        + "''u>{grdiup}\n"
        // plain letters
        + "a>{gral}\n"
        + "b>{grbe}\n"
        + "c[i>{grsi}\n"
        + "c[e>{grsi}\n"
        + "c[y>{grsi}\n"
        + "ch>{grkh}\n"
        + "c>{grka}\n"
        + "d>{grde}\n"
        + "ee>{gret}\n"
        + "e>{grep}\n"
        + "f>{grph}\n"
        + "g>{grga}\n"
        + "h>{grkh}\n"
        + "i>{grio}\n"
        + "j>{grio}\n"
        + "ks>{grks}\n"
        + "kh>{grkh}\n"
        + "k>{grka}\n"
        + "l>{grla}\n"
        + "m>{grmu}\n"
        + "n>{grnu}\n"
        + "oo>{grome}\n"
        + "o>{grom}\n"
        + "ps>{grps}\n"
        + "ph>{grph}\n"
        + "p>{grpi}\n"
        + "q>{grka}\n"
        + "r>{grrh}\n"
        + "s>|{grfinal}\n"
        + "{grfinal}[{letter}>{grsi}\n"
        + "th>{grth}\n"
        + "t>{grta}\n"
        + "w>{grup}{grup}\n"
        + "u>{grup}\n"
        + "v>{grup}\n"
        + "x>{grks}\n"
        + "y>{grup}\n"
        + "z>{grze}\n"
        // drop any apostrophe that reaches this point
        + "''>\n";

    /**
     * Small Greek letters back to lowercase Roman; mirrors
     * {@link #UPPER_FROM_GREEK}, with an extra rule so that the final
     * sigma also comes back as "s".  The last rule has an empty left-hand
     * side and an apostrophe on the right.  No rules exist for the
     * accent-plus-diaeresis forms.
     */
    private static final String LOWER_FROM_GREEK =
          "av<{gral}{grup}\n"
        + "ev<{grep}{grup}\n"
        + "av`<{gral}{gracup}\n"
        + "ev`<{grep}{gracup}\n"
        + "n''<{grnu}[{grga}\n"
        + "ng<{grga}{grga}\n"
        + "n''<{grnu}[{grka}\n"
        + "nk<{grga}{grka}\n"
        + "n''<{grnu}[{grks}\n"
        + "nx<{grga}{grks}\n"
        + "n''<{grnu}[{grkh}\n"
        + "nch<{grga}{grkh}\n"
        // plain letters
        + "a<{gral}\n"
        + "b<{grbe}\n"
        + "g<{grga}\n"
        + "d<{grde}\n"
        + "e''<{grep}[{grep}\n"
        + "e''<{grep}[{gret}\n"
        + "e''<{grep}[{gracep}\n"
        + "e''<{grep}[{gracet}\n"
        + "e<{grep}\n"
        + "z<{grze}\n"
        + "ee<{gret}\n"
        + "th<{grth}\n"
        + "i<{grio}\n"
        + "k<{grka}\n"
        + "l<{grla}\n"
        + "m<{grmu}\n"
        + "n<{grnu}\n"
        + "x<{grks}\n"
        + "o''<{grom}[{grom}\n"
        + "o''<{grom}[{grome}\n"
        + "o''<{grom}[{gracom}\n"
        + "o''<{grom}[{gracome}\n"
        + "o<{grom}\n"
        + "p''<{grpi}[{grsi}\n"
        + "p''<{grpi}[{grfinal}\n"
        + "p<{grpi}\n"
        + "r<{grrh}\n"
        + "s<{grsi}\n"
        + "s<{grfinal}\n"
        + "t<{grta}\n"
        + "w<{grup}{grup}\n"
        // upsilon becomes v before an accented vowel ...
        + "v<{grup}[{gracal}\n"
        + "v<{grup}[{gracep}\n"
        + "v<{grup}[{gracet}\n"
        + "v<{grup}[{gracio}\n"
        + "v<{grup}[{gracom}\n"
        + "v<{grup}[{gracup}\n"
        + "v<{grup}[{gracome}\n"
        // ... and before a plain vowel (the upsilon-before-upsilon case
        // was left disabled in the original listing) ...
        + "v<{grup}[{gral}\n"
        + "v<{grup}[{grep}\n"
        + "v<{grup}[{gret}\n"
        + "v<{grup}[{grio}\n"
        + "v<{grup}[{grom}\n"
        + "v<{grup}[{grome}\n"
        // ... and u everywhere else
        + "u<{grup}\n"
        + "ph<{grph}\n"
        + "ch<{grkh}\n"
        + "ps<{grps}\n"
        + "oo<{grome}\n"
        // accented and diaeresis vowels
        + "a`<{gracal}\n"
        + "e`<{gracep}\n"
        + "ee`<{gracet}\n"
        + "i`<{gracio}\n"
        + "o`<{gracom}\n"
        + "u`<{gracup}\n"
        + "oo`<{gracome}\n"
        + "''i<{grdiio}\n"
        + "''u<{grdiup}\n"
        + "<''\n";

    /**
     * Complete rule text.  The section order is significant and matches
     * the original single-expression listing exactly.
     */
    private static final String RULES =
          VARIABLES
        + SAMPLE_PHRASE_RULE
        + UPPER_TO_GREEK
        + UPPER_FROM_GREEK
        + LOWER_TO_GREEK
        + LOWER_FROM_GREEK;

    /**
     * Overrides ListResourceBundle.
     *
     * @return a fresh two-row table: {"Description", display name} and
     *         {"Rule", rule text}
     */
    public Object[][] getContents() {
        Object[] descriptionEntry = { "Description", DESCRIPTION };
        Object[] ruleEntry = { "Rule", RULES };
        return new Object[][] { descriptionEntry, ruleEntry };
    }
}
|
283
icu4j/src/com/ibm/text/resources/TransliterationRule$Latin$Hebrew.java
Executable file
283
icu4j/src/com/ibm/text/resources/TransliterationRule$Latin$Hebrew.java
Executable file
|
@ -0,0 +1,283 @@
|
|||
package com.ibm.text.resources;
|
||||
|
||||
import java.util.ListResourceBundle;
|
||||
|
||||
/**
 * Resource bundle carrying the rule text for the "Latin to Hebrew"
 * transliterator.
 *
 * <p>The bundle exposes two entries: "Description" (a display name) and
 * "Rule" (one newline-terminated rule per line).  Rules written with
 * {@code >} have Latin text on the left and Hebrew on the right; rules
 * written with {@code <} have Latin on the left and the Hebrew source on
 * the right.
 *
 * <p>NOTE(review): the rule syntax is interpreted by the transliterator
 * that loads this bundle, which is not visible from this file.  The
 * readings used in the comments below ({@code [} introducing a
 * following-context condition, {@code ''} standing for a literal
 * apostrophe) are presumed and should be confirmed against that parser.
 */
public class TransliterationRuleLatinHebrew extends ListResourceBundle {
    /**
     * Overrides ListResourceBundle.
     *
     * @return a two-row table: {"Description", display name} and
     *         {"Rule", rule text}
     */
    public Object[][] getContents() {
        return new Object[][] {
            { "Description",
              "Latin to Hebrew" },

            { "Rule",
              // Variable names, derived from the Unicode character names.
              // The POINT_*, PUNCTUATION_* and YIDDISH_* variables are
              // defined but no rule in either direction uses them yet.

                "POINT_SHEVA=\u05B0\n"
              + "POINT_HATAF_SEGOL=\u05B1\n"
              + "POINT_HATAF_PATAH=\u05B2\n"
              + "POINT_HATAF_QAMATS=\u05B3\n"
              + "POINT_HIRIQ=\u05B4\n"
              + "POINT_TSERE=\u05B5\n"
              + "POINT_SEGOL=\u05B6\n"
              + "POINT_PATAH=\u05B7\n"
              + "POINT_QAMATS=\u05B8\n"
              + "POINT_HOLAM=\u05B9\n"
              + "POINT_QUBUTS=\u05BB\n"
              + "POINT_DAGESH_OR_MAPIQ=\u05BC\n"
              + "POINT_METEG=\u05BD\n"
              + "PUNCTUATION_MAQAF=\u05BE\n"
              + "POINT_RAFE=\u05BF\n"
              + "PUNCTUATION_PASEQ=\u05C0\n"
              + "POINT_SHIN_DOT=\u05C1\n"
              + "POINT_SIN_DOT=\u05C2\n"
              + "PUNCTUATION_SOF_PASUQ=\u05C3\n"
              + "ALEF=\u05D0\n"
              + "BET=\u05D1\n"
              + "GIMEL=\u05D2\n"
              + "DALET=\u05D3\n"
              + "HE=\u05D4\n"
              + "VAV=\u05D5\n"
              + "ZAYIN=\u05D6\n"
              + "HET=\u05D7\n"
              + "TET=\u05D8\n"
              + "YOD=\u05D9\n"
              + "FINAL_KAF=\u05DA\n"
              + "KAF=\u05DB\n"
              + "LAMED=\u05DC\n"
              + "FINAL_MEM=\u05DD\n"
              + "MEM=\u05DE\n"
              + "FINAL_NUN=\u05DF\n"
              + "NUN=\u05E0\n"
              + "SAMEKH=\u05E1\n"
              + "AYIN=\u05E2\n"
              + "FINAL_PE=\u05E3\n"
              + "PE=\u05E4\n"
              + "FINAL_TSADI=\u05E5\n"
              + "TSADI=\u05E6\n"
              + "QOF=\u05E7\n"
              + "RESH=\u05E8\n"
              + "SHIN=\u05E9\n"
              + "TAV=\u05EA\n"
              + "YIDDISH_DOUBLE_VAV=\u05F0\n"
              + "YIDDISH_VAV_YOD=\u05F1\n"
              + "YIDDISH_DOUBLE_YOD=\u05F2\n"
              + "PUNCTUATION_GERESH=\u05F3\n"
              + "PUNCTUATION_GERSHAYIM=\u05F4\n"

              // Character classes used as rule context.

              // any unaccented Latin letter
              + "letter=[abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ]\n"

              // Latin vowels before which "c" maps to SAMEKH
              + "softvowel=[eiyEIY]\n"

              // Hebrew letters before which VAV reverses to "v"
              + "vowellike=[{ALEF}{AYIN}{YOD}{VAV}]\n"

              // ---- Latin to Hebrew -------------------------------------
              // Where a Hebrew letter has a separate final form, the
              // ordinary form is produced when another Latin letter
              // follows and the final form otherwise.

              + "a>{ALEF}\n"
              + "A>{ALEF}\n"

              + "b>{BET}\n"
              + "B>{BET}\n"

              + "c[{softvowel}>{SAMEKH}\n"
              + "C[{softvowel}>{SAMEKH}\n"
              + "c[{letter}>{KAF}\n"
              + "C[{letter}>{KAF}\n"
              + "c>{FINAL_KAF}\n"
              + "C>{FINAL_KAF}\n"

              + "d>{DALET}\n"
              + "D>{DALET}\n"

              + "e>{AYIN}\n"
              + "E>{AYIN}\n"

              + "f[{letter}>{PE}\n"
              + "f>{FINAL_PE}\n"
              + "F[{letter}>{PE}\n"
              + "F>{FINAL_PE}\n"

              + "g>{GIMEL}\n"
              + "G>{GIMEL}\n"

              + "h>{HE}\n"
              + "H>{HE}\n"

              + "i>{YOD}\n"
              + "I>{YOD}\n"

              + "j>{DALET}{SHIN}\n"
              + "J>{DALET}{SHIN}\n"

              // every case combination of "kh" is spelled out
              + "kH>{HET}\n"
              + "kh>{HET}\n"
              + "KH>{HET}\n"
              + "Kh>{HET}\n"
              + "k[{letter}>{KAF}\n"
              + "K[{letter}>{KAF}\n"
              + "k>{FINAL_KAF}\n"
              + "K>{FINAL_KAF}\n"

              + "l>{LAMED}\n"
              + "L>{LAMED}\n"

              + "m[{letter}>{MEM}\n"
              + "m>{FINAL_MEM}\n"
              + "M[{letter}>{MEM}\n"
              + "M>{FINAL_MEM}\n"

              + "n[{letter}>{NUN}\n"
              + "n>{FINAL_NUN}\n"
              + "N[{letter}>{NUN}\n"
              + "N>{FINAL_NUN}\n"

              + "o>{VAV}\n"
              + "O>{VAV}\n"

              + "p[{letter}>{PE}\n"
              + "p>{FINAL_PE}\n"
              + "P[{letter}>{PE}\n"
              + "P>{FINAL_PE}\n"

              + "q>{QOF}\n"
              + "Q>{QOF}\n"

              + "r>{RESH}\n"
              + "R>{RESH}\n"

              + "sH>{SHIN}\n"
              + "sh>{SHIN}\n"
              + "SH>{SHIN}\n"
              + "Sh>{SHIN}\n"
              + "s>{SAMEKH}\n"
              + "S>{SAMEKH}\n"

              + "th>{TAV}\n"
              + "tH>{TAV}\n"
              + "TH>{TAV}\n"
              + "Th>{TAV}\n"
              + "tS[{letter}>{TSADI}\n"
              + "ts[{letter}>{TSADI}\n"
              + "Ts[{letter}>{TSADI}\n"
              + "TS[{letter}>{TSADI}\n"
              + "tS>{FINAL_TSADI}\n"
              + "ts>{FINAL_TSADI}\n"
              + "Ts>{FINAL_TSADI}\n"
              + "TS>{FINAL_TSADI}\n"
              + "t>{TET}\n"
              + "T>{TET}\n"

              + "u>{VAV}\n"
              + "U>{VAV}\n"

              + "v>{VAV}\n"
              + "V>{VAV}\n"

              + "w>{VAV}\n"
              + "W>{VAV}\n"

              + "x>{KAF}{SAMEKH}\n"
              + "X>{KAF}{SAMEKH}\n"

              + "y>{YOD}\n"
              + "Y>{YOD}\n"

              + "z>{ZAYIN}\n"
              + "Z>{ZAYIN}\n"

              // empty right-hand side: apostrophes are dropped
              + "''>\n"

              // ---- Hebrew to Latin (lowercase only) --------------------
              // Rules whose Latin side ends in '' apply only before the
              // listed following letter and insert an apostrophe,
              // presumably so the output is not re-read as one of the
              // digraphs kh, sh, th or ts on the way back.

              + "a<{ALEF}\n"
              + "e<{AYIN}\n"
              + "b<{BET}\n"
              + "d<{DALET}\n"
              + "k<{FINAL_KAF}\n"
              + "m<{FINAL_MEM}\n"
              + "n<{FINAL_NUN}\n"
              + "p<{FINAL_PE}\n"
              + "ts<{FINAL_TSADI}\n"
              + "g<{GIMEL}\n"
              + "kh<{HET}\n"
              + "h<{HE}\n"
              + "k''<{KAF}[{HE}\n"
              + "k<{KAF}\n"
              + "l<{LAMED}\n"
              + "m<{MEM}\n"
              + "n<{NUN}\n"
              + "p<{PE}\n"
              + "q<{QOF}\n"
              + "r<{RESH}\n"
              + "s''<{SAMEKH}[{HE}\n"
              + "s<{SAMEKH}\n"
              + "sh<{SHIN}\n"
              + "th<{TAV}\n"
              // The TET-before-HE rule used to be listed twice in a row;
              // the second copy could never match and has been removed.
              + "t''<{TET}[{HE}\n"
              + "t''<{TET}[{SAMEKH}\n"
              + "t''<{TET}[{SHIN}\n"
              + "t<{TET}\n"
              + "ts<{TSADI}\n"
              + "v<{VAV}[{vowellike}\n"
              + "u<{VAV}\n"
              + "y<{YOD}\n"
              + "z<{ZAYIN}\n"

              // empty left-hand side with an apostrophe on the right
              + "<''\n"
            }
        };
    }
}
|
883
icu4j/src/com/ibm/text/resources/TransliterationRule$Latin$Kana.java
Executable file
883
icu4j/src/com/ibm/text/resources/TransliterationRule$Latin$Kana.java
Executable file
|
@ -0,0 +1,883 @@
|
|||
package com.ibm.text.resources;
|
||||
|
||||
import java.util.ListResourceBundle;
|
||||
|
||||
/**
|
||||
* Rewritten April 1999 to implement Hepburn (Hebon-shiki)
|
||||
* transliteration. Reference: CJKV Information Processing, Lunde,
|
||||
* 1999, pp. 30-35.
|
||||
* @author Alan Liu
|
||||
*/
|
||||
public class TransliterationRuleLatinKana extends ListResourceBundle {
|
||||
/**
|
||||
* Overrides ListResourceBundle
|
||||
*/
|
||||
public Object[][] getContents() {
|
||||
return new Object[][] {
|
||||
{ "Description",
|
||||
"Lowercase Latin to Hiragana; Uppercase Latin to Katakana" },
|
||||
|
||||
{ "Rule",
|
||||
|
||||
//------------------------------------------------------------
|
||||
// Variables
|
||||
//------------------------------------------------------------
|
||||
|
||||
// Hiragana. These are named according to the
|
||||
// regularized Nippon romanization (the naming system
|
||||
// used by Unicode). Thus \u3062 is called "di", not
|
||||
// "ji". "x_" is the small form of "_", e.g. "xa" is
|
||||
// small "a".
|
||||
|
||||
"xa=\u3041\n"
|
||||
+ "a=\u3042\n"
|
||||
+ "xi=\u3043\n"
|
||||
+ "i=\u3044\n"
|
||||
+ "xu=\u3045\n"
|
||||
+ "u=\u3046\n"
|
||||
+ "xe=\u3047\n"
|
||||
+ "e=\u3048\n"
|
||||
+ "xo=\u3049\n"
|
||||
+ "o=\u304A\n"
|
||||
|
||||
+ "ka=\u304B\n"
|
||||
+ "ga=\u304C\n"
|
||||
+ "ki=\u304D\n"
|
||||
+ "gi=\u304E\n"
|
||||
+ "ku=\u304F\n"
|
||||
+ "gu=\u3050\n"
|
||||
+ "ke=\u3051\n"
|
||||
+ "ge=\u3052\n"
|
||||
+ "ko=\u3053\n"
|
||||
+ "go=\u3054\n"
|
||||
|
||||
+ "sa=\u3055\n"
|
||||
+ "za=\u3056\n"
|
||||
+ "si=\u3057\n"
|
||||
+ "zi=\u3058\n"
|
||||
+ "su=\u3059\n"
|
||||
+ "zu=\u305A\n"
|
||||
+ "se=\u305B\n"
|
||||
+ "ze=\u305C\n"
|
||||
+ "so=\u305D\n"
|
||||
+ "zo=\u305E\n"
|
||||
|
||||
+ "ta=\u305F\n"
|
||||
+ "da=\u3060\n"
|
||||
+ "ti=\u3061\n"
|
||||
+ "di=\u3062\n"
|
||||
+ "xtu=\u3063\n"
|
||||
+ "tu=\u3064\n"
|
||||
+ "du=\u3065\n"
|
||||
+ "te=\u3066\n"
|
||||
+ "de=\u3067\n"
|
||||
+ "to=\u3068\n"
|
||||
+ "do=\u3069\n"
|
||||
|
||||
+ "na=\u306A\n"
|
||||
+ "ni=\u306B\n"
|
||||
+ "nu=\u306C\n"
|
||||
+ "ne=\u306D\n"
|
||||
+ "no=\u306E\n"
|
||||
|
||||
+ "ha=\u306F\n"
|
||||
+ "ba=\u3070\n"
|
||||
+ "pa=\u3071\n"
|
||||
+ "hi=\u3072\n"
|
||||
+ "bi=\u3073\n"
|
||||
+ "pi=\u3074\n"
|
||||
+ "hu=\u3075\n"
|
||||
+ "bu=\u3076\n"
|
||||
+ "pu=\u3077\n"
|
||||
+ "he=\u3078\n"
|
||||
+ "be=\u3079\n"
|
||||
+ "pe=\u307A\n"
|
||||
+ "ho=\u307B\n"
|
||||
+ "bo=\u307C\n"
|
||||
+ "po=\u307D\n"
|
||||
|
||||
+ "ma=\u307E\n"
|
||||
+ "mi=\u307F\n"
|
||||
+ "mu=\u3080\n"
|
||||
+ "me=\u3081\n"
|
||||
+ "mo=\u3082\n"
|
||||
|
||||
+ "xya=\u3083\n"
|
||||
+ "ya=\u3084\n"
|
||||
+ "xyu=\u3085\n"
|
||||
+ "yu=\u3086\n"
|
||||
+ "xyo=\u3087\n"
|
||||
+ "yo=\u3088\n"
|
||||
|
||||
+ "ra=\u3089\n"
|
||||
+ "ri=\u308A\n"
|
||||
+ "ru=\u308B\n"
|
||||
+ "re=\u308C\n"
|
||||
+ "ro=\u308D\n"
|
||||
|
||||
+ "xwa=\u308E\n"
|
||||
+ "wa=\u308F\n"
|
||||
+ "wi=\u3090\n"
|
||||
+ "we=\u3091\n"
|
||||
+ "wo=\u3092\n"
|
||||
|
||||
+ "n=\u3093\n"
|
||||
+ "vu=\u3094\n"
|
||||
|
||||
// Katakana. "X_" is the small form of "_", e.g. "XA"
|
||||
// is small "A".
|
||||
|
||||
+ "XA=\u30A1\n"
|
||||
+ "A=\u30A2\n"
|
||||
+ "XI=\u30A3\n"
|
||||
+ "I=\u30A4\n"
|
||||
+ "XU=\u30A5\n"
|
||||
+ "U=\u30A6\n"
|
||||
+ "XE=\u30A7\n"
|
||||
+ "E=\u30A8\n"
|
||||
+ "XO=\u30A9\n"
|
||||
+ "O=\u30AA\n"
|
||||
|
||||
+ "KA=\u30AB\n"
|
||||
+ "GA=\u30AC\n"
|
||||
+ "KI=\u30AD\n"
|
||||
+ "GI=\u30AE\n"
|
||||
+ "KU=\u30AF\n"
|
||||
+ "GU=\u30B0\n"
|
||||
+ "KE=\u30B1\n"
|
||||
+ "GE=\u30B2\n"
|
||||
+ "KO=\u30B3\n"
|
||||
+ "GO=\u30B4\n"
|
||||
|
||||
+ "SA=\u30B5\n"
|
||||
+ "ZA=\u30B6\n"
|
||||
+ "SI=\u30B7\n"
|
||||
+ "ZI=\u30B8\n"
|
||||
+ "SU=\u30B9\n"
|
||||
+ "ZU=\u30BA\n"
|
||||
+ "SE=\u30BB\n"
|
||||
+ "ZE=\u30BC\n"
|
||||
+ "SO=\u30BD\n"
|
||||
+ "ZO=\u30BE\n"
|
||||
|
||||
+ "TA=\u30BF\n"
|
||||
+ "DA=\u30C0\n"
|
||||
+ "TI=\u30C1\n"
|
||||
+ "DI=\u30C2\n"
|
||||
+ "XTU=\u30C3\n"
|
||||
+ "TU=\u30C4\n"
|
||||
+ "DU=\u30C5\n"
|
||||
+ "TE=\u30C6\n"
|
||||
+ "DE=\u30C7\n"
|
||||
+ "TO=\u30C8\n"
|
||||
+ "DO=\u30C9\n"
|
||||
|
||||
+ "NA=\u30CA\n"
|
||||
+ "NI=\u30CB\n"
|
||||
+ "NU=\u30CC\n"
|
||||
+ "NE=\u30CD\n"
|
||||
+ "NO=\u30CE\n"
|
||||
|
||||
+ "HA=\u30CF\n"
|
||||
+ "BA=\u30D0\n"
|
||||
+ "PA=\u30D1\n"
|
||||
+ "HI=\u30D2\n"
|
||||
+ "BI=\u30D3\n"
|
||||
+ "PI=\u30D4\n"
|
||||
+ "HU=\u30D5\n"
|
||||
+ "BU=\u30D6\n"
|
||||
+ "PU=\u30D7\n"
|
||||
+ "HE=\u30D8\n"
|
||||
+ "BE=\u30D9\n"
|
||||
+ "PE=\u30DA\n"
|
||||
+ "HO=\u30DB\n"
|
||||
+ "BO=\u30DC\n"
|
||||
+ "PO=\u30DD\n"
|
||||
|
||||
+ "MA=\u30DE\n"
|
||||
+ "MI=\u30DF\n"
|
||||
+ "MU=\u30E0\n"
|
||||
+ "ME=\u30E1\n"
|
||||
+ "MO=\u30E2\n"
|
||||
|
||||
+ "XYA=\u30E3\n"
|
||||
+ "YA=\u30E4\n"
|
||||
+ "XYU=\u30E5\n"
|
||||
+ "YU=\u30E6\n"
|
||||
+ "XYO=\u30E7\n"
|
||||
+ "YO=\u30E8\n"
|
||||
|
||||
+ "RA=\u30E9\n"
|
||||
+ "RI=\u30EA\n"
|
||||
+ "RU=\u30EB\n"
|
||||
+ "RE=\u30EC\n"
|
||||
+ "RO=\u30ED\n"
|
||||
|
||||
+ "XWA=\u30EE\n"
|
||||
+ "WA=\u30EF\n"
|
||||
+ "WI=\u30F0\n"
|
||||
+ "WE=\u30F1\n"
|
||||
+ "WO=\u30F2\n"
|
||||
|
||||
+ "N=\u30F3\n"
|
||||
+ "VU=\u30F4\n"
|
||||
|
||||
+ "XKA=\u30F5\n"
|
||||
+ "XKE=\u30F6\n"
|
||||
|
||||
+ "VA=\u30F7\n"
|
||||
+ "VI=\u30F8\n"
|
||||
+ "VE=\u30F9\n"
|
||||
+ "VO=\u30FA\n"
|
||||
|
||||
+ "DOT=\u30FB\n" // Middle dot
|
||||
+ "LONG=\u30FC\n" // Prolonged sound mark
|
||||
|
||||
// Categories and programmatic variables
|
||||
|
||||
+ "vowel=[aiueo]\n"
|
||||
+ "small=\uE000\n"
|
||||
+ "hvr=\uE001\n"
|
||||
+ "hv=[{xya}{xi}{xyu}{xe}{xyo}]\n"
|
||||
|
||||
//------------------------------------------------------------
|
||||
// Rules
|
||||
//------------------------------------------------------------
|
||||
/*
|
||||
// Hepburn equivalents
|
||||
|
||||
shi>|si
|
||||
ji>|zi
|
||||
chi>|ti
|
||||
// ji>|di // By default we use the ji-zi mapping
|
||||
tsu>|tu
|
||||
fu>|hu
|
||||
|
||||
sh[{vowel}>|sy
|
||||
ja>|zya
|
||||
// ji = zi
|
||||
ju>|zyu
|
||||
je>|zye
|
||||
jo>|zyo
|
||||
cha>|tya
|
||||
// chi = ti
|
||||
chu>|tyu
|
||||
che>|tye
|
||||
cho>|tyo
|
||||
// j[{vowel} = dy{vowel}, but we use zy{vowel} by default
|
||||
|
||||
// Historically, m preceded b, p, or m; now n is used
|
||||
// in all cases
|
||||
m[b>n
|
||||
m[p>n
|
||||
m[m>n
|
||||
|
||||
// Compatibility
|
||||
|
||||
// 'f' group
|
||||
fa>{fu}{xa}
|
||||
fi>{fu}{xi}
|
||||
// fu = hu
|
||||
fe>{fu}{xe}
|
||||
fo>{fu}{xo}
|
||||
|
||||
// 'jy' group; these will not round-trip, except for "jyi"
|
||||
// See also the 'j' group.
|
||||
jya>|zya
|
||||
jyi>{zi}{xyi}
|
||||
jyu>|zyu
|
||||
jye>|zye
|
||||
jyo>|zyo
|
||||
|
||||
// Nippon romanized forms
|
||||
|
||||
a>{a}
|
||||
i>{i}
|
||||
u>{u}
|
||||
e>{e}
|
||||
o>{o}
|
||||
ka>{ka}
|
||||
ki>{ki}
|
||||
ku>{ku}
|
||||
ke>{ke}
|
||||
ko>{ko}
|
||||
ga>{ga}
|
||||
gi>{gi}
|
||||
gu>{gu}
|
||||
ge>{ge}
|
||||
go>{go}
|
||||
sa>{sa}
|
||||
si>{si}
|
||||
su>{su}
|
||||
se>{se}
|
||||
so>{so}
|
||||
za>{za}
|
||||
zi>{zi}
|
||||
zu>{zu}
|
||||
ze>{ze}
|
||||
zo>{zo}
|
||||
ta>{ta}
|
||||
ti>{ti}
|
||||
tu>{tu}
|
||||
te>{te}
|
||||
to>{to}
|
||||
da>{da}
|
||||
di>{di}
|
||||
du>{du}
|
||||
de>{de}
|
||||
do>{do}
|
||||
na>{na}
|
||||
ni>{ni}
|
||||
nu>{nu}
|
||||
ne>{ne}
|
||||
no>{no}
|
||||
ha>{ha}
|
||||
hi>{hi}
|
||||
hu>{hu}
|
||||
he>{he}
|
||||
ho>{ho}
|
||||
ba>{ba}
|
||||
bi>{bi}
|
||||
bu>{bu}
|
||||
be>{be}
|
||||
bo>{bo}
|
||||
pa>{pa}
|
||||
pi>{pi}
|
||||
pu>{pu}
|
||||
pe>{pe}
|
||||
po>{po}
|
||||
ma>{ma}
|
||||
mi>{mi}
|
||||
mu>{mu}
|
||||
me>{me}
|
||||
mo>{mo}
|
||||
ya>{ya}
|
||||
yu>{yu}
|
||||
yo>{yo}
|
||||
ra>{ra}
|
||||
ri>{ri}
|
||||
ru>{ru}
|
||||
re>{re}
|
||||
ro>{ro}
|
||||
wa>{wa}
|
||||
wi>{wi}
|
||||
// No "wu"
|
||||
we>{we}
|
||||
wo>{wo} // Reverse {wo} to "o", not "wo"
|
||||
n''>{n}
|
||||
n>{n}
|
||||
|
||||
// Palatalized Nippon romanized syllables
|
||||
|
||||
ky[{vowel}>{ki}|{small}
|
||||
gy[{vowel}>{gi}|{small}
|
||||
sy[{vowel}>{si}|{small}
|
||||
zy[{vowel}>{zi}|{small}
|
||||
ty[{vowel}>{ti}|{small}
|
||||
dy[{vowel}>{di}|{small}
|
||||
ny[{vowel}>{ni}|{small}
|
||||
my[{vowel}>{mi}|{small}
|
||||
hy[{vowel}>{hi}|{small}
|
||||
by[{vowel}>{bi}|{small}
|
||||
py[{vowel}>{pi}|{small}
|
||||
ry[{vowel}>{ri}|{small}
|
||||
|
||||
// Doubled consonants
|
||||
|
||||
c[c>{xtu}
|
||||
k[k>{xtu}
|
||||
g[g>{xtu}
|
||||
s[s>{xtu}
|
||||
z[z>{xtu}
|
||||
j[j>{xtu}
|
||||
t[t>{xtu}
|
||||
d[d>{xtu}
|
||||
h[h>{xtu}
|
||||
f[f>{xtu}
|
||||
p[p>{xtu}
|
||||
b[b>{xtu}
|
||||
m[m>{xtu}
|
||||
y[y>{xtu}
|
||||
r[r>{xtu}
|
||||
w[w>{xtu}
|
||||
*/
|
||||
|
||||
+ "a>{a}\n"
|
||||
|
||||
+ "ba>{ba}\n"
|
||||
+ "bi>{bi}\n"
|
||||
+ "bu>{bu}\n"
|
||||
+ "be>{be}\n"
|
||||
+ "bo>{bo}\n"
|
||||
+ "by[{vowel}>{bi}|{small}\n"
|
||||
+ "b[b>{xtu}\n"
|
||||
|
||||
+ "da>{da}\n"
|
||||
+ "di>{di}\n"
|
||||
+ "du>{du}\n"
|
||||
+ "de>{de}\n"
|
||||
+ "do>{do}\n"
|
||||
+ "dy[{vowel}>{di}|{small}\n"
|
||||
+ "dh[{vowel}>{de}|{small}\n"
|
||||
+ "d[d>{xtu}\n"
|
||||
|
||||
+ "e>{e}\n"
|
||||
|
||||
+ "fa>{hu}{xa}\n"
|
||||
+ "fi>{hu}{xi}\n"
|
||||
+ "fe>{hu}{xe}\n"
|
||||
+ "fo>{hu}{xo}\n"
|
||||
+ "fya>{hu}{xya}\n"
|
||||
+ "fyu>{hu}{xyu}\n"
|
||||
+ "fyo>{hu}{xyo}\n"
|
||||
+ "f[f>{xtu}\n"
|
||||
|
||||
+ "ga>{ga}\n"
|
||||
+ "gi>{gi}\n"
|
||||
+ "gu>{gu}\n"
|
||||
+ "ge>{ge}\n"
|
||||
+ "go>{go}\n"
|
||||
+ "gy[{vowel}>{gi}|{small}\n"
|
||||
+ "gwa>{gu}{xwa}\n"
|
||||
+ "gwi>{gu}{xi}\n"
|
||||
+ "gwu>{gu}{xu}\n"
|
||||
+ "gwe>{gu}{xe}\n"
|
||||
+ "gwo>{gu}{xo}\n"
|
||||
+ "g[g>{xtu}\n"
|
||||
|
||||
+ "ha>{ha}\n"
|
||||
+ "hi>{hi}\n"
|
||||
+ "hu>{hu}\n"
|
||||
+ "he>{he}\n"
|
||||
+ "ho>{ho}\n"
|
||||
+ "hy[{vowel}>{hi}|{small}\n"
|
||||
+ "h[h>{xtu}\n"
|
||||
|
||||
+ "i>{i}\n"
|
||||
|
||||
+ "ka>{ka}\n"
|
||||
+ "ki>{ki}\n"
|
||||
+ "ku>{ku}\n"
|
||||
+ "ke>{ke}\n"
|
||||
+ "ko>{ko}\n"
|
||||
+ "kwa>{ku}{xwa}\n"
|
||||
+ "kwi>{ku}{xi}\n"
|
||||
+ "kwu>{ku}{xu}\n"
|
||||
+ "kwe>{ku}{xe}\n"
|
||||
+ "kwo>{ku}{xo}\n"
|
||||
+ "ky[{vowel}>{ki}|{small}\n"
|
||||
+ "k[k>{xtu}\n"
|
||||
|
||||
+ "ma>{ma}\n"
|
||||
+ "mi>{mi}\n"
|
||||
+ "mu>{mu}\n"
|
||||
+ "me>{me}\n"
|
||||
+ "mo>{mo}\n"
|
||||
+ "my[{vowel}>{mi}|{small}\n"
|
||||
+ "m[b>{n}\n"
|
||||
+ "m[f>{n}\n"
|
||||
+ "m[m>{n}\n"
|
||||
+ "m[p>{n}\n"
|
||||
+ "m[v>{n}\n"
|
||||
+ "m''>{n}\n"
|
||||
|
||||
+ "na>{na}\n"
|
||||
+ "ni>{ni}\n"
|
||||
+ "nu>{nu}\n"
|
||||
+ "ne>{ne}\n"
|
||||
+ "no>{no}\n"
|
||||
+ "ny[{vowel}>{ni}|{small}\n"
|
||||
+ "nn>{n}\n"
|
||||
+ "n''>{n}\n"
|
||||
+ "n>{n}\n"
|
||||
|
||||
+ "o>{o}\n"
|
||||
|
||||
+ "pa>{pa}\n"
|
||||
+ "pi>{pi}\n"
|
||||
+ "pu>{pu}\n"
|
||||
+ "pe>{pe}\n"
|
||||
+ "po>{po}\n"
|
||||
+ "py[{vowel}>{pi}|{small}\n"
|
||||
+ "p[p>{xtu}\n"
|
||||
|
||||
+ "qa>{ku}{xa}\n"
|
||||
+ "qi>{ku}{xi}\n"
|
||||
+ "qu>{ku}{xu}\n"
|
||||
+ "qe>{ku}{xe}\n"
|
||||
+ "qo>{ku}{xo}\n"
|
||||
+ "qy[{vowel}>{ku}|{small}\n"
|
||||
+ "q[q>{xtu}\n"
|
||||
|
||||
+ "ra>{ra}\n"
|
||||
+ "ri>{ri}\n"
|
||||
+ "ru>{ru}\n"
|
||||
+ "re>{re}\n"
|
||||
+ "ro>{ro}\n"
|
||||
+ "ry[{vowel}>{ri}|{small}\n"
|
||||
+ "r[r>{xtu}\n"
|
||||
|
||||
+ "sa>{sa}\n"
|
||||
+ "si>{si}\n"
|
||||
+ "su>{su}\n"
|
||||
+ "se>{se}\n"
|
||||
+ "so>{so}\n"
|
||||
+ "sy[{vowel}>{si}|{small}\n"
|
||||
+ "s[sh>{xtu}\n"
|
||||
+ "s[s>{xtu}\n"
|
||||
|
||||
+ "ta>{ta}\n"
|
||||
+ "ti>{ti}\n"
|
||||
+ "tu>{tu}\n"
|
||||
+ "te>{te}\n"
|
||||
+ "to>{to}\n"
|
||||
+ "th[{vowel}>{te}|{small}\n"
|
||||
+ "tsa>{tu}{xa}\n"
|
||||
+ "tsi>{tu}{xi}\n"
|
||||
+ "tse>{tu}{xe}\n"
|
||||
+ "tso>{tu}{xo}\n"
|
||||
+ "ty[{vowel}>{ti}|{small}\n"
|
||||
+ "t[ts>{xtu}\n"
|
||||
+ "t[ch>{xtu}\n"
|
||||
+ "t[t>{xtu}\n"
|
||||
|
||||
+ "u>{u}\n"
|
||||
|
||||
+ "va>{VA}\n"
|
||||
+ "vi>{VI}\n"
|
||||
+ "vu>{vu}\n"
|
||||
+ "ve>{VE}\n"
|
||||
+ "vo>{VO}\n"
|
||||
+ "vy[{vowel}>{VI}|{small}\n"
|
||||
+ "v[v>{xtu}\n"
|
||||
|
||||
+ "wa>{wa}\n"
|
||||
+ "wi>{wi}\n"
|
||||
+ "we>{we}\n"
|
||||
+ "wo>{wo}\n"
|
||||
+ "w[w>{xtu}\n"
|
||||
|
||||
+ "ya>{ya}\n"
|
||||
+ "yu>{yu}\n"
|
||||
+ "ye>{i}{xe}\n"
|
||||
+ "yo>{yo}\n"
|
||||
+ "y[y>{xtu}\n"
|
||||
|
||||
+ "za>{za}\n"
|
||||
+ "zi>{zi}\n"
|
||||
+ "zu>{zu}\n"
|
||||
+ "ze>{ze}\n"
|
||||
+ "zo>{zo}\n"
|
||||
+ "zy[{vowel}>{zi}|{small}\n"
|
||||
+ "z[z>{xtu}\n"
|
||||
|
||||
+ "xa>{xa}\n"
|
||||
+ "xi>{xi}\n"
|
||||
+ "xu>{xu}\n"
|
||||
+ "xe>{xe}\n"
|
||||
+ "xo>{xo}\n"
|
||||
+ "xka>{XKA}\n"
|
||||
+ "xke>{XKE}\n"
|
||||
+ "xtu>{xtu}\n"
|
||||
+ "xwa>{xwa}\n"
|
||||
+ "xya>{xya}\n"
|
||||
+ "xyu>{xyu}\n"
|
||||
+ "xyo>{xyo}\n"
|
||||
|
||||
// optional mappings
|
||||
+ "wu>{u}\n"
|
||||
|
||||
+ "ca>{ka}\n"
|
||||
+ "ci>{si}\n"
|
||||
+ "cu>{ku}\n"
|
||||
+ "ce>{se}\n"
|
||||
+ "co>{ko}\n"
|
||||
+ "cha>{ti}{xya}\n"
|
||||
+ "chi>{ti}\n"
|
||||
+ "chu>{ti}{xyu}\n"
|
||||
+ "che>{ti}{xe}\n"
|
||||
+ "cho>{ti}{xyo}\n"
|
||||
+ "cy[{vowel}>{ti}|{small}\n"
|
||||
+ "c[k>{xtu}\n"
|
||||
+ "c[c>{xtu}\n"
|
||||
|
||||
+ "fu>{hu}\n"
|
||||
|
||||
+ "ja>{zi}{xya}\n"
|
||||
+ "ji>{zi}\n"
|
||||
+ "ju>{zi}{xyu}\n"
|
||||
+ "je>{zi}{xe}\n"
|
||||
+ "jo>{zi}{xyo}\n"
|
||||
+ "jy[{vowel}>{zi}|{small}\n"
|
||||
+ "j[j>{xtu}\n"
|
||||
|
||||
+ "la>{ra}\n"
|
||||
+ "li>{ri}\n"
|
||||
+ "lu>{ru}\n"
|
||||
+ "le>{re}\n"
|
||||
+ "lo>{ro}\n"
|
||||
+ "ly[{vowel}>{ri}|{small}\n"
|
||||
+ "l[l>{xtu}\n"
|
||||
|
||||
+ "sha>{si}{xya}\n"
|
||||
+ "shi>{si}\n"
|
||||
+ "shu>{si}{xyu}\n"
|
||||
+ "she>{si}{xe}\n"
|
||||
+ "sho>{si}{xyo}\n"
|
||||
|
||||
+ "tsu>{tu}\n"
|
||||
|
||||
+ "yi>{i}\n"
|
||||
|
||||
+ "xtsu>{xtu}\n"
|
||||
+ "xyi>{xi}\n"
|
||||
+ "xye>{xe}\n"
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// Convert vowels to small form
|
||||
+ "{small}a>{xya}\n"
|
||||
+ "{small}i>{xi}\n"
|
||||
+ "{small}u>{xyu}\n"
|
||||
+ "{small}e>{xe}\n"
|
||||
+ "{small}o>{xyo}\n"
|
||||
|
||||
|
||||
|
||||
|
||||
+ "gy|{hvr}<{gi}[{hv}\n"
|
||||
+ "gwa<{gu}{xwa}\n"
|
||||
+ "gwi<{gu}{xi}\n"
|
||||
+ "gwu<{gu}{xu}\n"
|
||||
+ "gwe<{gu}{xe}\n"
|
||||
+ "gwo<{gu}{xo}\n"
|
||||
+ "ga<{ga}\n"
|
||||
+ "gi<{gi}\n"
|
||||
+ "gu<{gu}\n"
|
||||
+ "ge<{ge}\n"
|
||||
+ "go<{go}\n"
|
||||
|
||||
+ "ky|{hvr}<{ki}[{hv}\n"
|
||||
+ "kwa<{ku}{xwa}\n"
|
||||
+ "kwi<{ku}{xi}\n"
|
||||
+ "kwu<{ku}{xu}\n"
|
||||
+ "kwe<{ku}{xe}\n"
|
||||
+ "kwo<{ku}{xo}\n"
|
||||
+ "qa<{ku}{xa}\n"
|
||||
+ "qya<{ku}{xya}\n"
|
||||
+ "qyu<{ku}{xyu}\n"
|
||||
+ "qyo<{ku}{xyo}\n"
|
||||
+ "ka<{ka}\n"
|
||||
+ "ki<{ki}\n"
|
||||
+ "ku<{ku}\n"
|
||||
+ "ke<{ke}\n"
|
||||
+ "ko<{ko}\n"
|
||||
|
||||
+ "j|{hvr}<{zi}[{hv}\n" // Hepburn
|
||||
+ "za<{za}\n"
|
||||
+ "ji<{zi}\n" // Hepburn
|
||||
+ "zu<{zu}\n"
|
||||
+ "ze<{ze}\n"
|
||||
+ "zo<{zo}\n"
|
||||
|
||||
+ "sh|{hvr}<{si}[{hv}\n" // Hepburn
|
||||
+ "sa<{sa}\n"
|
||||
+ "shi<{si}\n"
|
||||
+ "su<{su}\n"
|
||||
+ "se<{se}\n"
|
||||
+ "so<{so}\n"
|
||||
|
||||
+ "j|{hvr}<{di}[{hv}\n" // Hepburn
|
||||
+ "dh|{hvr}<{de}[{hv}\n"
|
||||
+ "da<{da}\n"
|
||||
+ "ji<{di}\n" // Hepburn
|
||||
+ "de<{de}\n"
|
||||
+ "do<{do}\n"
|
||||
+ "zu<{du}\n" // Hepburn
|
||||
|
||||
+ "ch|{hvr}<{ti}[{hv}\n" // Hepburn
|
||||
+ "tsa<{tu}{xa}\n"
|
||||
+ "tsi<{tu}{xi}\n"
|
||||
+ "tse<{tu}{xe}\n"
|
||||
+ "tso<{tu}{xo}\n"
|
||||
+ "th|{hvr}<{te}[{hv}\n"
|
||||
+ "ta<{ta}\n"
|
||||
+ "chi<{ti}\n" // Hepburn
|
||||
+ "tsu<{tu}\n" // Hepburn
|
||||
+ "te<{te}\n"
|
||||
+ "to<{to}\n"
|
||||
|
||||
+ "ny|{hvr}<{ni}[{hv}\n"
|
||||
+ "na<{na}\n"
|
||||
+ "ni<{ni}\n"
|
||||
+ "nu<{nu}\n"
|
||||
+ "ne<{ne}\n"
|
||||
+ "no<{no}\n"
|
||||
|
||||
+ "by|{hvr}<{bi}[{hv}\n"
|
||||
+ "ba<{ba}\n"
|
||||
+ "bi<{bi}\n"
|
||||
+ "bu<{bu}\n"
|
||||
+ "be<{be}\n"
|
||||
+ "bo<{bo}\n"
|
||||
|
||||
+ "py|{hvr}<{pi}[{hv}\n"
|
||||
+ "pa<{pa}\n"
|
||||
+ "pi<{pi}\n"
|
||||
+ "pu<{pu}\n"
|
||||
+ "pe<{pe}\n"
|
||||
+ "po<{po}\n"
|
||||
|
||||
+ "hy|{hvr}<{hi}[{hv}\n"
|
||||
+ "fa<{hu}{xa}\n"
|
||||
+ "fi<{hu}{xi}\n"
|
||||
+ "fe<{hu}{xe}\n"
|
||||
+ "fo<{hu}{xo}\n"
|
||||
+ "fya<{hu}{xya}\n"
|
||||
+ "fyu<{hu}{xyu}\n"
|
||||
+ "fyo<{hu}{xyo}\n"
|
||||
+ "ha<{ha}\n"
|
||||
+ "hi<{hi}\n"
|
||||
+ "fu<{hu}\n" // Hepburn
|
||||
+ "he<{he}\n"
|
||||
+ "ho<{ho}\n"
|
||||
|
||||
+ "my|{hvr}<{mi}[{hv}\n"
|
||||
+ "ma<{ma}\n"
|
||||
+ "mi<{mi}\n"
|
||||
+ "mu<{mu}\n"
|
||||
+ "me<{me}\n"
|
||||
+ "mo<{mo}\n"
|
||||
|
||||
+ "ya<{ya}\n"
|
||||
+ "yu<{yu}\n"
|
||||
+ "ye<{i}{xe}\n"
|
||||
+ "yo<{yo}\n"
|
||||
+ "xya<{xya}\n"
|
||||
+ "xyu<{xyu}\n"
|
||||
+ "xyo<{xyo}\n"
|
||||
|
||||
+ "ry|{hvr}<{ri}[{hv}\n"
|
||||
+ "ra<{ra}\n"
|
||||
+ "ri<{ri}\n"
|
||||
+ "ru<{ru}\n"
|
||||
+ "re<{re}\n"
|
||||
+ "ro<{ro}\n"
|
||||
|
||||
+ "wa<{wa}\n"
|
||||
+ "wi<{wi}\n"
|
||||
+ "we<{we}\n"
|
||||
+ "wo<{wo}\n"
|
||||
|
||||
+ "vu<{vu}\n"
|
||||
+ "vy|{hvr}<{VI}[{hv}\n"
|
||||
+ "v<{xtu}[{vu}\n"
|
||||
|
||||
+ "xa<{xa}\n"
|
||||
+ "xi<{xi}\n"
|
||||
+ "xu<{xu}\n"
|
||||
+ "xe<{xe}\n"
|
||||
+ "xo<{xo}\n"
|
||||
|
||||
+ "n''<{n}[{a}\n"
|
||||
+ "n''<{n}[{i}\n"
|
||||
+ "n''<{n}[{u}\n"
|
||||
+ "n''<{n}[{e}\n"
|
||||
+ "n''<{n}[{o}\n"
|
||||
+ "n''<{n}[{na}\n"
|
||||
+ "n''<{n}[{ni}\n"
|
||||
+ "n''<{n}[{nu}\n"
|
||||
+ "n''<{n}[{ne}\n"
|
||||
+ "n''<{n}[{no}\n"
|
||||
+ "n''<{n}[{ya}\n"
|
||||
+ "n''<{n}[{yu}\n"
|
||||
+ "n''<{n}[{yo}\n"
|
||||
+ "n''<{n}[{n}\n"
|
||||
+ "n<{n}\n"
|
||||
|
||||
|
||||
+ "g<{xtu}[{ga}\n"
|
||||
+ "g<{xtu}[{gi}\n"
|
||||
+ "g<{xtu}[{gu}\n"
|
||||
+ "g<{xtu}[{ge}\n"
|
||||
+ "g<{xtu}[{go}\n"
|
||||
+ "k<{xtu}[{ka}\n"
|
||||
+ "k<{xtu}[{ki}\n"
|
||||
+ "k<{xtu}[{ku}\n"
|
||||
+ "k<{xtu}[{ke}\n"
|
||||
+ "k<{xtu}[{ko}\n"
|
||||
|
||||
+ "z<{xtu}[{za}\n"
|
||||
+ "z<{xtu}[{zi}\n"
|
||||
+ "z<{xtu}[{zu}\n"
|
||||
+ "z<{xtu}[{ze}\n"
|
||||
+ "z<{xtu}[{zo}\n"
|
||||
+ "s<{xtu}[{sa}\n"
|
||||
+ "s<{xtu}[{si}\n"
|
||||
+ "s<{xtu}[{su}\n"
|
||||
+ "s<{xtu}[{se}\n"
|
||||
+ "s<{xtu}[{so}\n"
|
||||
|
||||
+ "d<{xtu}[{da}\n"
|
||||
+ "d<{xtu}[{di}\n"
|
||||
+ "d<{xtu}[{du}\n"
|
||||
+ "d<{xtu}[{de}\n"
|
||||
+ "d<{xtu}[{do}\n"
|
||||
+ "t<{xtu}[{ta}\n"
|
||||
+ "t<{xtu}[{ti}\n"
|
||||
+ "t<{xtu}[{tu}\n"
|
||||
+ "t<{xtu}[{te}\n"
|
||||
+ "t<{xtu}[{to}\n"
|
||||
|
||||
|
||||
+ "b<{xtu}[{ba}\n"
|
||||
+ "b<{xtu}[{bi}\n"
|
||||
+ "b<{xtu}[{bu}\n"
|
||||
+ "b<{xtu}[{be}\n"
|
||||
+ "b<{xtu}[{bo}\n"
|
||||
+ "p<{xtu}[{pa}\n"
|
||||
+ "p<{xtu}[{pi}\n"
|
||||
+ "p<{xtu}[{pu}\n"
|
||||
+ "p<{xtu}[{pe}\n"
|
||||
+ "p<{xtu}[{po}\n"
|
||||
+ "h<{xtu}[{ha}\n"
|
||||
+ "h<{xtu}[{hi}\n"
|
||||
+ "h<{xtu}[{hu}\n"
|
||||
+ "h<{xtu}[{he}\n"
|
||||
+ "h<{xtu}[{ho}\n"
|
||||
|
||||
|
||||
+ "r<{xtu}[{ra}\n"
|
||||
+ "r<{xtu}[{ri}\n"
|
||||
+ "r<{xtu}[{ru}\n"
|
||||
+ "r<{xtu}[{re}\n"
|
||||
+ "r<{xtu}[{ro}\n"
|
||||
|
||||
+ "w<{xtu}[{wa}\n"
|
||||
+ "xtu<{xtu}\n"
|
||||
|
||||
+ "a<{a}\n"
|
||||
+ "i<{i}\n"
|
||||
+ "u<{u}\n"
|
||||
+ "e<{e}\n"
|
||||
+ "o<{o}\n"
|
||||
|
||||
|
||||
|
||||
// Convert small forms to vowels
|
||||
+ "a<{hvr}{xya}\n"
|
||||
+ "i<{hvr}{xi}\n"
|
||||
+ "u<{hvr}{xyu}\n"
|
||||
+ "e<{hvr}{xe}\n"
|
||||
+ "o<{hvr}{xyo}\n"
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,87 @@
|
|||
package com.ibm.text.resources;
|
||||
|
||||
import java.util.ListResourceBundle;
|
||||
|
||||
public class TransliterationRuleStraightQuotesCurlyQuotes extends ListResourceBundle {

    /** Text stored under the "Description" key. */
    private static final String DESCRIPTION =
        "Use left and right double quotes";

    /**
     * Variable definitions, one <code>name=value</code> entry per line.
     * The rules further down refer to them as <code>{name}</code>.
     * Rewritten using character codes [LIU].
     *
     * <p>Note: <code>lAng</code>, <code>ldAng</code>, <code>lBrk</code> and
     * <code>lBrc</code> are defined here but no rule in this bundle
     * references them.
     */
    private static final String VARIABLES =
          "white=[[:Zs:][:Zl:][:Zp:]]\n"
        + "black=[^[:Zs:][:Zl:][:Zp:]]\n"
        + "open=[[:Ps:]]\n"
        + "dquote=\"\n"

        + "lAng=\u3008\n"
        + "ldAng=\u300A\n"
        + "lBrk='['\n"
        + "lBrc='{'\n"

        + "lquote=\u2018\n"
        + "rquote=\u2019\n"
        + "ldquote=\u201C\n"
        + "rdquote=\u201D\n"

        + "ldguill=\u00AB\n"
        + "rdguill=\u00BB\n"
        + "lguill=\u2039\n"
        + "rguill=\u203A\n"

        + "mdash=\u2014\n";

    /**
     * Conversions from input (the <code>&gt;</code> rules).
     * NOTE(review): text before <code>]</code> appears to be preceding
     * context rather than replaced text -- confirm against the rule parser.
     */
    private static final String FROM_INPUT_RULES =
          // join single quotes
          "{lquote}''>{ldquote}\n"
        + "{lquote}{lquote}>{ldquote}\n"
        + "{rquote}''>{rdquote}\n"
        + "{rquote}{rquote}>{rdquote}\n"

          // smart single quotes
        + "{white}]''>{lquote}\n"
        + "{open}]''>{lquote}\n"
        + "{black}]''>{rquote}\n"
        + "''>{lquote}\n"

          // smart doubles
        + "{white}]{dquote}>{ldquote}\n"
        + "{open}]{dquote}>{ldquote}\n"
        + "{black}]{dquote}>{rdquote}\n"
        + "{dquote}>{ldquote}\n"

          // join single guillemets
        + "{rguill}{rguill}>{rdguill}\n"
        + "'>>'>{rdguill}\n"
        + "{lguill}{lguill}>{ldguill}\n"
        + "'<<'>{ldguill}\n"

          // prevent double spaces
        + " ] >\n"

          // join hyphens into dash
        + "-->{mdash}\n";

    /** Conversions back to input (the <code>&lt;</code> rules). */
    private static final String TO_INPUT_RULES =
          // smart quotes
          "''<{lquote}\n"
        + "''<{rquote}\n"
        + "{dquote}<{ldquote}\n"
        + "{dquote}<{rdquote}\n"

          // hyphens
        + "--<{mdash}\n";

    /**
     * Overrides ListResourceBundle.
     *
     * @return a newly allocated table with two rows: the "Description"
     *         entry followed by the "Rule" entry, whose value is the
     *         variable definitions, then the from-input rules, then the
     *         back-to-input rules, concatenated in that order
     */
    public Object[][] getContents() {
        final Object[] descriptionEntry = { "Description", DESCRIPTION };
        final Object[] ruleEntry = {
            "Rule", VARIABLES + FROM_INPUT_RULES + TO_INPUT_RULES
        };
        return new Object[][] { descriptionEntry, ruleEntry };
    }
}
|
Loading…
Add table
Reference in a new issue