/*
*****************************************************************
* Copyright (c) 2002, International Business Machines Corporation
* and others.  All Rights Reserved.
*****************************************************************
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/AnyTransliterator.java,v $
* $Revision: 1.1 $
*****************************************************************
* Date        Name        Description
* 06/06/2002  aliu        Creation.
*****************************************************************
*/
package com.ibm.icu.text;
import com.ibm.icu.lang.UScript;
import java.lang.Math;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.HashMap;
import java.util.Map;
import java.util.MissingResourceException;

/**
 * A transliterator that translates multiple input scripts to a single
 * output script.  It is named Any-T or Any-T/V, where T is the target
 * and V is the optional variant.  The target T is a script.
 *
 * <p>An AnyTransliterator partitions text into runs of the same
 * script, together with adjacent COMMON or INHERITED characters.
 * After determining the script of each run, it transliterates from
 * that script to the given target/variant.  It does so by
 * instantiating a transliterator from the source script to the
 * target/variant.  If a run consists only of the target script,
 * COMMON, or INHERITED characters, then the run is not changed.
 *
 * <p>At startup, all possible AnyTransliterators are registered with
 * the system, as determined by examining the registered script
 * transliterators.
 *
 * @since ICU 2.2
 * @author Alan Liu
 */
class AnyTransliterator extends Transliterator {

    //------------------------------------------------------------
    // Constants

    static final char TARGET_SEP = '-';
    static final char VARIANT_SEP = '/';
    static final String ANY = "Any";
    static final String NULL_ID = "Null";
    static final String LATIN_PIVOT = "-Latin;Latin-";

    /**
     * Cache mapping script codes (boxed as Integer) to the
     * Transliterator that converts that script to our target.  Only
     * successful lookups are stored.
     */
    private Map cache;

    /**
     * The target or target/variant string.
     */
    private String target;

    /**
     * The target script code.  Never UScript.INVALID_CODE.
     */
    private int targetScript;

    /**
     * Implements {@link Transliterator#handleTransliterate}.
     *
     * Walks the script runs of the context range and hands each run
     * that overlaps [pos.start, pos.limit) to the transliterator for
     * that run's script.  Runs for which no transliterator exists are
     * skipped unchanged.
     *
     * @param text the text being transliterated
     * @param pos the context and transliteration range; on return
     * pos.limit reflects any change in text length and pos.start marks
     * how far processing got
     * @param isIncremental true if more text may follow; only the run
     * that reaches the transliteration limit is processed incrementally
     */
    protected void handleTransliterate(Replaceable text,
                                       Position pos, boolean isIncremental) {
        int allStart = pos.start;
        int allLimit = pos.limit;

        ScriptRunIterator it =
            new ScriptRunIterator(text, pos.contextStart, pos.contextLimit);

        while (it.next()) {
            // Ignore runs in the ante context
            if (it.limit <= allStart) continue;

            // Try to instantiate transliterator from it.scriptCode to
            // our target or target/variant
            Transliterator t = getTransliterator(it.scriptCode);

            if (t == null) {
                // We have no transliterator.  Do nothing, but keep
                // pos.start up to date.  The run may extend into the
                // post context, so clamp to the transliteration limit;
                // otherwise start could end up beyond limit.
                pos.start = Math.min(allLimit, it.limit);
            } else {
                // If the run end is before the transliteration limit, do
                // a non-incremental transliteration.  Otherwise do an
                // incremental one.
                boolean incremental = isIncremental && (it.limit >= allLimit);

                pos.start = Math.max(allStart, it.start);
                pos.limit = Math.min(allLimit, it.limit);
                int limit = pos.limit;
                t.filteredTransliterate(text, pos, incremental);

                // Propagate any change in text length to our own limit
                // and to the run iterator.
                int delta = pos.limit - limit;
                allLimit += delta;
                it.adjustLimit(delta);
            }

            // We're done if we enter the post context.  This applies to
            // skipped runs as well, so that runs lying entirely in the
            // post context are never processed.
            if (it.limit >= allLimit) break;
        }

        // Restore limit.  pos.start is fine where the last transliterator
        // left it, or at the end of the last run.
        pos.limit = allLimit;
    }

    /**
     * Private constructor
     * @param id the ID of the form S-T or S-T/V, where T is theTarget
     * and V is theVariant.  Must not be empty.
     * @param theTarget the target name.  Must not be empty, and must
     * name a script corresponding to theTargetScript.
     * @param theVariant the variant name, or the empty string if
     * there is no variant
     * @param theTargetScript the script code corresponding to
     * theTarget.
     */
    private AnyTransliterator(String id,
                              String theTarget,
                              String theVariant,
                              int theTargetScript) {
        super(id, null);
        targetScript = theTargetScript;
        cache = new HashMap();

        target = theTarget;
        if (theVariant.length() > 0) {
            target = theTarget + VARIANT_SEP + theVariant;
        }
    }

    /**
     * Returns a transliterator from the given source to our target or
     * target/variant.  Returns null if the source is the same as our
     * target script, if the source is UScript.INVALID_CODE, or if no
     * transliterator (direct or via the Latin pivot) can be
     * instantiated.  A successfully created transliterator is cached
     * and the same instance is returned the next time.
     *
     * @param source the script code of the run to be transliterated
     * @return the cached or newly created transliterator, or null
     */
    private Transliterator getTransliterator(int source) {
        if (source == targetScript || source == UScript.INVALID_CODE) {
            return null;
        }

        Integer key = new Integer(source);
        Transliterator t = (Transliterator) cache.get(key);
        if (t == null) {
            String sourceName = UScript.getName(source);

            t = instantiate(sourceName + TARGET_SEP + target);
            if (t == null) {
                // Try to pivot around Latin, our most common script
                t = instantiate(sourceName + LATIN_PIVOT + target);
            }

            if (t != null) {
                cache.put(key, t);
            }
        }

        return t;
    }

    /**
     * Attempts to create a forward transliterator for the given ID,
     * returning null instead of propagating a failure.
     *
     * Transliterator.getInstance signals an unknown or invalid ID by
     * throwing rather than by returning null, so the failure has to be
     * caught here for the Latin-pivot fallback to be reachable and for
     * unsupported scripts to be skipped.  The lookup is deliberately
     * best-effort, hence the broad catch.
     *
     * @param id the transliterator ID to instantiate
     * @return the transliterator, or null if it cannot be created
     */
    private static Transliterator instantiate(String id) {
        try {
            return Transliterator.getInstance(id, FORWARD);
        } catch (RuntimeException ignored) {
            // No such transliterator; the caller treats null as "none".
            return null;
        }
    }

    /**
     * Registers standard transliterators with the system.  Called by
     * Transliterator during initialization.  Scan all current targets
     * and register those that are scripts T as Any-T/V.
     */
    static void register() {

        HashSet seen = new HashSet();

        for (Enumeration s=Transliterator.getAvailableSources(); s.hasMoreElements(); ) {
            String source = (String) s.nextElement();

            // Ignore the "Any" source
            if (source.equalsIgnoreCase(ANY)) continue;

            for (Enumeration t=Transliterator.getAvailableTargets(source);
                 t.hasMoreElements(); ) {
                String target = (String) t.nextElement();

                // Only process each target once
                if (seen.contains(target)) continue;
                seen.add(target);

                // Get the script code for the target.  If not a script, ignore.
                int targetScript = scriptNameToCode(target);
                if (targetScript == UScript.INVALID_CODE) continue;

                for (Enumeration v=Transliterator.getAvailableVariants(source, target);
                     v.hasMoreElements(); ) {
                    String variant = (String) v.nextElement();

                    String id;
                    id = TransliteratorIDParser.STVtoID(ANY, target, variant);
                    AnyTransliterator trans = new AnyTransliterator(id, target, variant,
                                                                    targetScript);
                    Transliterator.registerInstance(trans);
                    Transliterator.registerSpecialInverse(target, NULL_ID, false);
                }
            }
        }
    }

    /**
     * Return the script code for a given name, or
     * UScript.INVALID_CODE if not found.
     *
     * A null or empty result from UScript.getCode, and a
     * MissingResourceException raised by it, are all treated as "not
     * a script" so that a single unrecognized target name cannot abort
     * registration.
     *
     * @param name a candidate script name
     * @return the first script code for the name, or
     * UScript.INVALID_CODE
     */
    private static int scriptNameToCode(String name) {
        try {
            int[] codes = UScript.getCode(name);
            return (codes != null && codes.length > 0)
                ? codes[0] : UScript.INVALID_CODE;
        } catch (MissingResourceException e) {
            return UScript.INVALID_CODE;
        }
    }

    //------------------------------------------------------------
    // ScriptRunIterator

    /**
     * Returns a series of ranges corresponding to scripts.  They will be
     * of the form:
     *
     * ccccSScSSccccTTcTcccc   - c = common, S = first script, T = second
     * |            |          - first run (start, limit)
     *          |           |  - second run (start, limit)
     *
     * That is, the runs will overlap.  The reason for this is so that a
     * transliterator can consider common characters both before and after
     * the scripts.
     */
    private static class ScriptRunIterator {

        private Replaceable text;
        private int textStart;
        private int textLimit;

        /**
         * The code of the current run, valid after next() returns.  May
         * be UScript.INVALID_CODE if and only if the entire text is
         * COMMON/INHERITED.
         */
        public int scriptCode;

        /**
         * The start of the run, inclusive, valid after next() returns.
         */
        public int start;

        /**
         * The end of the run, exclusive, valid after next() returns.
         */
        public int limit;

        /**
         * Constructs a run iterator over the given text from start
         * (inclusive) to limit (exclusive).
         */
        public ScriptRunIterator(Replaceable text, int start, int limit) {
            this.text = text;
            this.textStart = start;
            this.textLimit = limit;
            this.limit = start;
        }


        /**
         * Advances to the next run.  Returns true if a run was found, in
         * which case the caller should examine scriptCode, start, and
         * limit.  Returns false once the previous run reached the end of
         * the text; for an empty range that is the very first call.
         */
        public boolean next() {
            int ch;
            int s;

            scriptCode = UScript.INVALID_CODE; // don't know script yet
            start = limit;

            // Are we done?
            if (start == textLimit) {
                return false;
            }

            // Move start back to include adjacent COMMON or INHERITED
            // characters
            while (start > textStart) {
                ch = text.char32At(start - 1); // look back
                s = UScript.getScript(ch);
                if (s == UScript.COMMON || s == UScript.INHERITED) {
                    --start;
                } else {
                    break;
                }
            }

            // Move limit ahead to include COMMON, INHERITED, and characters
            // of the current script.
            while (limit < textLimit) {
                ch = text.char32At(limit); // look ahead
                s = UScript.getScript(ch);
                if (s != UScript.COMMON && s != UScript.INHERITED) {
                    if (scriptCode == UScript.INVALID_CODE) {
                        scriptCode = s;
                    } else if (s != scriptCode) {
                        break;
                    }
                }
                ++limit;
            }

            // Return true even if the entire text is COMMON / INHERITED, in
            // which case scriptCode will be UScript.INVALID_CODE.
            return true;
        }

        /**
         * Adjusts internal indices for a change in the limit index of the
         * given delta.  A positive delta means the limit has increased.
         */
        public void adjustLimit(int delta) {
            limit += delta;
            textLimit += delta;
        }
    }
}

//eof