mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 06:53:45 +00:00
ICU-1532 initial implementation of AnyTransliterator
X-SVN-Rev: 8861
This commit is contained in:
parent
361f87b9c2
commit
d7bacfbd2b
1 changed files with 338 additions and 0 deletions
338
icu4j/src/com/ibm/icu/text/AnyTransliterator.java
Normal file
338
icu4j/src/com/ibm/icu/text/AnyTransliterator.java
Normal file
|
@ -0,0 +1,338 @@
|
|||
/*
|
||||
*****************************************************************
|
||||
* Copyright (c) 2002, International Business Machines Corporation
|
||||
* and others. All Rights Reserved.
|
||||
*****************************************************************
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/AnyTransliterator.java,v $
|
||||
* $Revision: 1.1 $
|
||||
*****************************************************************
|
||||
* Date Name Description
|
||||
* 06/06/2002 aliu Creation.
|
||||
*****************************************************************
|
||||
*/
|
||||
package com.ibm.icu.text;
|
||||
import com.ibm.icu.lang.UScript;
|
||||
import java.lang.Math;
|
||||
import java.util.Enumeration;
|
||||
import java.util.HashSet;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* A transliterator that translates multiple input scripts to a single
|
||||
* output script. It is named Any-T or Any-T/V, where T is the target
|
||||
* and V is the optional variant. The target T is a script.
|
||||
*
|
||||
* <p>An AnyTransliterator partitions text into runs of the same
|
||||
* script, together with adjacent COMMON or INHERITED characters.
|
||||
* After determining the script of each run, it transliterates from
|
||||
* that script to the given target/variant. It does so by
|
||||
* instantiating a transliterator from the source script to the
|
||||
* target/variant. If a run consists only of the target script,
|
||||
* COMMON, or INHERITED characters, then the run is not changed.
|
||||
*
|
||||
* <p>At startup, all possible AnyTransliterators are registered with
|
||||
* the system, as determined by examining the registered script
|
||||
* transliterators.
|
||||
*
|
||||
* @since ICU 2.2
|
||||
* @author Alan Liu
|
||||
*/
|
||||
class AnyTransliterator extends Transliterator {
|
||||
|
||||
//------------------------------------------------------------
|
||||
// Constants
|
||||
|
||||
static final char TARGET_SEP = '-';
|
||||
static final char VARIANT_SEP = '/';
|
||||
static final String ANY = "Any";
|
||||
static final String NULL_ID = "Null";
|
||||
static final String LATIN_PIVOT = "-Latin;Latin-";
|
||||
|
||||
/**
|
||||
* Cache mapping UScriptCode values to Transliterator*.
|
||||
*/
|
||||
private Map cache;
|
||||
|
||||
/**
|
||||
* The target or target/variant string.
|
||||
*/
|
||||
private String target;
|
||||
|
||||
/**
|
||||
* The target script code. Never USCRIPT_INVALID_CODE.
|
||||
*/
|
||||
private int targetScript;
|
||||
|
||||
/**
|
||||
* Implements {@link Transliterator#handleTransliterate}.
|
||||
*/
|
||||
protected void handleTransliterate(Replaceable text,
|
||||
Position pos, boolean isIncremental) {
|
||||
int allStart = pos.start;
|
||||
int allLimit = pos.limit;
|
||||
|
||||
ScriptRunIterator it =
|
||||
new ScriptRunIterator(text, pos.contextStart, pos.contextLimit);
|
||||
|
||||
while (it.next()) {
|
||||
// Ignore runs in the ante context
|
||||
if (it.limit <= allStart) continue;
|
||||
|
||||
// Try to instantiate transliterator from it.scriptCode to
|
||||
// our target or target/variant
|
||||
Transliterator t = getTransliterator(it.scriptCode);
|
||||
|
||||
if (t == null) {
|
||||
// We have no transliterator. Do nothing, but keep
|
||||
// pos.start up to date.
|
||||
pos.start = it.limit;
|
||||
continue;
|
||||
}
|
||||
|
||||
// If the run end is before the transliteration limit, do
|
||||
// a non-incremental transliteration. Otherwise do an
|
||||
// incremental one.
|
||||
boolean incremental = isIncremental && (it.limit >= allLimit);
|
||||
|
||||
pos.start = Math.max(allStart, it.start);
|
||||
pos.limit = Math.min(allLimit, it.limit);
|
||||
int limit = pos.limit;
|
||||
t.filteredTransliterate(text, pos, incremental);
|
||||
int delta = pos.limit - limit;
|
||||
allLimit += delta;
|
||||
it.adjustLimit(delta);
|
||||
|
||||
// We're done if we enter the post context
|
||||
if (it.limit >= allLimit) break;
|
||||
}
|
||||
|
||||
// Restore limit. pos.start is fine where the last transliterator
|
||||
// left it, or at the end of the last run.
|
||||
pos.limit = allLimit;
|
||||
}
|
||||
|
||||
/**
|
||||
* Private constructor
|
||||
* @param id the ID of the form S-T or S-T/V, where T is theTarget
|
||||
* and V is theVariant. Must not be empty.
|
||||
* @param theTarget the target name. Must not be empty, and must
|
||||
* name a script corresponding to theTargetScript.
|
||||
* @param theVariant the variant name, or the empty string if
|
||||
* there is no variant
|
||||
* @param theTargetScript the script code corresponding to
|
||||
* theTarget.
|
||||
* @param ec error code, fails if the internal hashtable cannot be
|
||||
* allocated
|
||||
*/
|
||||
private AnyTransliterator(String id,
|
||||
String theTarget,
|
||||
String theVariant,
|
||||
int theTargetScript) {
|
||||
super(id, null);
|
||||
targetScript = theTargetScript;
|
||||
cache = new HashMap();
|
||||
|
||||
target = theTarget;
|
||||
if (theVariant.length() > 0) {
|
||||
target = theTarget + VARIANT_SEP + theVariant;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a transliterator from the given source to our target or
|
||||
* target/variant. Returns NULL if the source is the same as our
|
||||
* target script, or if the source is USCRIPT_INVALID_CODE.
|
||||
* Caches the result and returns the same transliterator the next
|
||||
* time. The caller does NOT own the result and must not delete
|
||||
* it.
|
||||
*/
|
||||
private Transliterator getTransliterator(int source) {
|
||||
if (source == targetScript || source == UScript.INVALID_CODE) {
|
||||
return null;
|
||||
}
|
||||
|
||||
Integer key = new Integer(source);
|
||||
Transliterator t = (Transliterator) cache.get(key);
|
||||
if (t == null) {
|
||||
String sourceName = UScript.getName(source);
|
||||
String id = sourceName + TARGET_SEP + target;
|
||||
|
||||
t = Transliterator.getInstance(id, FORWARD);
|
||||
if (t == null) {
|
||||
|
||||
// Try to pivot around Latin, our most common script
|
||||
id = sourceName + LATIN_PIVOT + target;
|
||||
t = Transliterator.getInstance(id, FORWARD);
|
||||
}
|
||||
|
||||
if (t != null) {
|
||||
cache.put(key, t);
|
||||
}
|
||||
}
|
||||
|
||||
return t;
|
||||
}
|
||||
|
||||
/**
|
||||
* Registers standard transliterators with the system. Called by
|
||||
* Transliterator during initialization. Scan all current targets
|
||||
* and register those that are scripts T as Any-T/V.
|
||||
*/
|
||||
static void register() {
|
||||
|
||||
HashSet seen = new HashSet();
|
||||
|
||||
for (Enumeration s=Transliterator.getAvailableSources(); s.hasMoreElements(); ) {
|
||||
String source = (String) s.nextElement();
|
||||
|
||||
// Ignore the "Any" source
|
||||
if (source.equalsIgnoreCase(ANY)) continue;
|
||||
|
||||
for (Enumeration t=Transliterator.getAvailableTargets(source);
|
||||
t.hasMoreElements(); ) {
|
||||
String target = (String) t.nextElement();
|
||||
|
||||
// Only process each target once
|
||||
if (seen.contains(target)) continue;
|
||||
seen.add(target);
|
||||
|
||||
// Get the script code for the target. If not a script, ignore.
|
||||
int targetScript = scriptNameToCode(target);
|
||||
if (targetScript == UScript.INVALID_CODE) continue;
|
||||
|
||||
for (Enumeration v=Transliterator.getAvailableVariants(source, target);
|
||||
v.hasMoreElements(); ) {
|
||||
String variant = (String) v.nextElement();
|
||||
|
||||
String id;
|
||||
id = TransliteratorIDParser.STVtoID(ANY, target, variant);
|
||||
AnyTransliterator trans = new AnyTransliterator(id, target, variant,
|
||||
targetScript);
|
||||
Transliterator.registerInstance(trans);
|
||||
Transliterator.registerSpecialInverse(target, NULL_ID, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the script code for a given name, or
|
||||
* UScript.INVALID_CODE if not found.
|
||||
*/
|
||||
private static int scriptNameToCode(String name) {
|
||||
int[] codes = UScript.getCode(name);
|
||||
return codes != null ? codes[0] : UScript.INVALID_CODE;
|
||||
}
|
||||
|
||||
//------------------------------------------------------------
|
||||
// ScriptRunIterator
|
||||
|
||||
/**
|
||||
* Returns a series of ranges corresponding to scripts. They will be
|
||||
* of the form:
|
||||
*
|
||||
* ccccSScSSccccTTcTcccc - c = common, S = first script, T = second
|
||||
* | | - first run (start, limit)
|
||||
* | | - second run (start, limit)
|
||||
*
|
||||
* That is, the runs will overlap. The reason for this is so that a
|
||||
* transliterator can consider common characters both before and after
|
||||
* the scripts.
|
||||
*/
|
||||
private static class ScriptRunIterator {
|
||||
|
||||
private Replaceable text;
|
||||
private int textStart;
|
||||
private int textLimit;
|
||||
|
||||
/**
|
||||
* The code of the current run, valid after next() returns. May
|
||||
* be UScript.INVALID_CODE if and only if the entire text is
|
||||
* COMMON/INHERITED.
|
||||
*/
|
||||
public int scriptCode;
|
||||
|
||||
/**
|
||||
* The start of the run, inclusive, valid after next() returns.
|
||||
*/
|
||||
public int start;
|
||||
|
||||
/**
|
||||
* The end of the run, exclusive, valid after next() returns.
|
||||
*/
|
||||
public int limit;
|
||||
|
||||
/**
|
||||
* Constructs a run iterator over the given text from start
|
||||
* (inclusive) to limit (exclusive).
|
||||
*/
|
||||
public ScriptRunIterator(Replaceable text, int start, int limit) {
|
||||
this.text = text;
|
||||
this.textStart = start;
|
||||
this.textLimit = limit;
|
||||
this.limit = start;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns TRUE if there are any more runs. TRUE is always
|
||||
* returned at least once. Upon return, the caller should
|
||||
* examine scriptCode, start, and limit.
|
||||
*/
|
||||
public boolean next() {
|
||||
int ch;
|
||||
int s;
|
||||
|
||||
scriptCode = UScript.INVALID_CODE; // don't know script yet
|
||||
start = limit;
|
||||
|
||||
// Are we done?
|
||||
if (start == textLimit) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Move start back to include adjacent COMMON or INHERITED
|
||||
// characters
|
||||
while (start > textStart) {
|
||||
ch = text.char32At(start - 1); // look back
|
||||
s = UScript.getScript(ch);
|
||||
if (s == UScript.COMMON || s == UScript.INHERITED) {
|
||||
--start;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Move limit ahead to include COMMON, INHERITED, and characters
|
||||
// of the current script.
|
||||
while (limit < textLimit) {
|
||||
ch = text.char32At(limit); // look ahead
|
||||
s = UScript.getScript(ch);
|
||||
if (s != UScript.COMMON && s != UScript.INHERITED) {
|
||||
if (scriptCode == UScript.INVALID_CODE) {
|
||||
scriptCode = s;
|
||||
} else if (s != scriptCode) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
++limit;
|
||||
}
|
||||
|
||||
// Return TRUE even if the entire text is COMMON / INHERITED, in
|
||||
// which case scriptCode will be UScript.INVALID_CODE.
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adjusts internal indices for a change in the limit index of the
|
||||
* given delta. A positive delta means the limit has increased.
|
||||
*/
|
||||
public void adjustLimit(int delta) {
|
||||
limit += delta;
|
||||
textLimit += delta;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//eof
|
Loading…
Add table
Reference in a new issue