mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 06:53:45 +00:00
ICU-1532 move STVtoID to TransliteratorIDParser
X-SVN-Rev: 8845
This commit is contained in:
parent
0c224a31ef
commit
c18249ca2b
6 changed files with 364 additions and 337 deletions
|
@ -4,7 +4,7 @@
|
|||
* and others. All Rights Reserved.
|
||||
*****************************************************************
|
||||
* $Source: /xsrl/Nsvn/icu/icu/source/i18n/anytrans.cpp,v $
|
||||
* $Revision: 1.1 $
|
||||
* $Revision: 1.2 $
|
||||
*****************************************************************
|
||||
* Date Name Description
|
||||
* 06/06/2002 aliu Creation.
|
||||
|
@ -12,195 +12,186 @@
|
|||
*/
|
||||
#include "anytrans.h"
|
||||
#include "uvector.h"
|
||||
#include "tridpars.h"
|
||||
#include "hash.h"
|
||||
#include "unicode/nultrans.h"
|
||||
#include "unicode/uscript.h"
|
||||
|
||||
//------------------------------------------------------------
|
||||
// Constants
|
||||
|
||||
static const UChar HYPHEN = 45; // '-'
|
||||
static const UChar ANY[] = {65,110,121,45,0}; // "Any-"
|
||||
static const UChar TARGET_SEP = 45; // '-'
|
||||
static const UChar VARIANT_SEP = 47; // '/'
|
||||
static const UChar ANY[] = {65,110,121,0}; // "Any"
|
||||
static const UChar NULL_ID[] = {78,117,108,108,0}; // "Null"
|
||||
static const UChar LATIN_PIVOT[] = {45,76,97,116,105,110,59,76,97,116,105,110,45,0}; // "-Latin;Latin-"
|
||||
|
||||
//------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Deleter function for Transliterator*.
|
||||
*/
|
||||
static void _deleteTransliterator(void *obj) {
|
||||
delete (Transliterator*) obj;
|
||||
}
|
||||
|
||||
//------------------------------------------------------------
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
//------------------------------------------------------------
|
||||
// ScriptRunIterator
|
||||
|
||||
/**
|
||||
* Returns a series of ranges corresponding to scripts. They will be
|
||||
* of the form:
|
||||
*
|
||||
* ccccSScSSccccTTcTcccc - c = common, S = first script, T = second
|
||||
* | | - first run (start, limit)
|
||||
* | | - second run (start, limit)
|
||||
*
|
||||
* That is, the runs will overlap. The reason for this is so that a
|
||||
* transliterator can consider common characters both before and after
|
||||
* the scripts.
|
||||
*/
|
||||
class ScriptRunIterator {
|
||||
private:
|
||||
const Replaceable& text;
|
||||
int32_t textStart;
|
||||
int32_t textLimit;
|
||||
|
||||
public:
|
||||
/**
|
||||
* The code of the current run, valid after next() returns. May
|
||||
* be USCRIPT_INVALID_CODE if and only if the entire text is
|
||||
* COMMON/INHERITED.
|
||||
*/
|
||||
UScriptCode scriptCode;
|
||||
|
||||
/**
|
||||
* The start of the run, inclusive, valid after next() returns.
|
||||
*/
|
||||
int32_t start;
|
||||
|
||||
/**
|
||||
* The end of the run, exclusive, valid after next() returns.
|
||||
*/
|
||||
int32_t limit;
|
||||
|
||||
/**
|
||||
* Constructs a run iterator over the given text from start
|
||||
* (inclusive) to limit (exclusive).
|
||||
*/
|
||||
ScriptRunIterator(const Replaceable& text, int32_t start, int32_t limit);
|
||||
|
||||
/**
|
||||
* Returns TRUE if there are any more runs. TRUE is always
|
||||
* returned at least once. Upon return, the caller should
|
||||
* examine scriptCode, start, and limit.
|
||||
*/
|
||||
UBool next();
|
||||
|
||||
/**
|
||||
* Adjusts internal indices for a change in the limit index of the
|
||||
* given delta. A positive delta means the limit has increased.
|
||||
*/
|
||||
void adjustLimit(int32_t delta);
|
||||
};
|
||||
|
||||
/**
 * Constructs a run iterator over theText from start (inclusive) to
 * limit (exclusive).  The current-run limit is seeded with start so
 * that the first call to next() begins scanning at the beginning of
 * the range.  Every member is given a defined value: before the first
 * next() the current run is empty (start == limit) and its script is
 * USCRIPT_INVALID_CODE.
 */
ScriptRunIterator::ScriptRunIterator(const Replaceable& theText,
                                     int32_t start, int32_t limit) :
    text(theText),
    textStart(start),
    textLimit(limit),
    scriptCode(USCRIPT_INVALID_CODE),
    // In the initializers below, "start" names the constructor
    // parameter, not the member being initialized.
    start(start),
    limit(start)
{
}
|
||||
|
||||
/**
 * Computes the next run and stores it in start, limit, and scriptCode.
 * Returns FALSE, with start == limit == textLimit, when the previous
 * run already reached the end of the text; returns TRUE otherwise.
 */
UBool ScriptRunIterator::next() {
    UErrorCode status = U_ZERO_ERROR;

    // The script stays unknown until a character that is neither
    // COMMON nor INHERITED is seen.
    scriptCode = USCRIPT_INVALID_CODE;

    // The new run begins where the previous one stopped.
    start = limit;
    if (start == textLimit) {
        return FALSE;   // nothing left to scan
    }

    // Pull start backwards over the COMMON/INHERITED characters that
    // immediately precede it, so they belong to this run as well.
    for (; start > textStart; --start) {
        const UScriptCode before =
            uscript_getScript(text.char32At(start - 1), &status);
        if (before != USCRIPT_COMMON && before != USCRIPT_INHERITED) {
            break;
        }
    }

    // Push limit forwards over COMMON/INHERITED characters and over
    // characters of the run's script.  The first specific script seen
    // becomes the run's script; a different one ends the run.
    for (; limit < textLimit; ++limit) {
        const UScriptCode ahead =
            uscript_getScript(text.char32At(limit), &status);
        if (ahead == USCRIPT_COMMON || ahead == USCRIPT_INHERITED) {
            continue;
        }
        if (scriptCode == USCRIPT_INVALID_CODE) {
            scriptCode = ahead;
        } else if (ahead != scriptCode) {
            break;
        }
    }

    // TRUE is returned even when only COMMON/INHERITED characters were
    // found; scriptCode is then still USCRIPT_INVALID_CODE.
    return TRUE;
}
|
||||
|
||||
/**
 * Shifts both the end of the scanned range and the end of the current
 * run by delta, keeping them consistent after the text changed length.
 */
void ScriptRunIterator::adjustLimit(int32_t delta) {
    textLimit += delta;
    limit += delta;
}
|
||||
|
||||
//------------------------------------------------------------
|
||||
// AnyTransliterator
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/**
|
||||
* Try to create a transliterator with the given ID, which should be
|
||||
* of the form "Any-X". The "X" will be pulled off and passed to
|
||||
* createInstance().
|
||||
*/
|
||||
Transliterator* AnyTransliterator::_create(const UnicodeString& ID, Token /*context*/) {
|
||||
UnicodeString target(ID);
|
||||
int32_t i = target.indexOf(HYPHEN);
|
||||
if (i >= 0) {
|
||||
target.remove(0, i+1);
|
||||
}
|
||||
return AnyTransliterator::createInstance(target, TRUE, TRUE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Registers standard variants with the system. Called by
|
||||
* Transliterator during initialization.
|
||||
*/
|
||||
void AnyTransliterator::registerIDs() {
|
||||
Token t = integerToken(0);
|
||||
|
||||
// Register Any-Latin and make its inverse Null
|
||||
Transliterator::_registerFactory("Any-Latin", _create, t);
|
||||
Transliterator::_registerSpecialInverse("Latin", "Null", FALSE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the script code for a given name, or -1 if not found.
|
||||
*/
|
||||
int32_t AnyTransliterator::scriptNameToCode(const UnicodeString& name) {
|
||||
char buf[128];
|
||||
UScriptCode code;
|
||||
UErrorCode ec = U_ZERO_ERROR;
|
||||
|
||||
name.extract(0, 128, buf, 128, "");
|
||||
if (uscript_getCode(buf, &code, 1, &ec) != 1 ||
|
||||
U_FAILURE(ec)) {
|
||||
code = (UScriptCode) -1;
|
||||
}
|
||||
return (int32_t) code;
|
||||
}
|
||||
|
||||
/**
|
||||
* Factory method to create an Any-X transliterator. Relies on
|
||||
* registered transliterators at the time of the call to build the
|
||||
* Any-X transliterator. If there are no registered transliterators
|
||||
* of the form Y-X, then the logical result is Any-Null. If there is
|
||||
* exactly one transliterator of the form Y-X, then the logical result
|
||||
* is Y-X, a degenerate result. If there are 2 or more
|
||||
* transliterators of the form Y-X, then an AnyTransliterator is
|
||||
* instantiated and returned.
|
||||
* @param allowNull if true, then return Any-Null if there are no
|
||||
* transliterator to the given script; otherwise return NULL
|
||||
* @param allowDegenerate if true, then return a transliterator of the
|
||||
* form X-Y if there is only one such transliterator
|
||||
* the given script; otherwise return NULL
|
||||
*/
|
||||
Transliterator* AnyTransliterator::createInstance(const UnicodeString& toTarget,
|
||||
UBool allowNull,
|
||||
UBool allowDegenerate) {
|
||||
UErrorCode ec = U_ZERO_ERROR;
|
||||
UVector translits(ec);
|
||||
if (U_FAILURE(ec)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Count transliterators _to_ the given target. This is
|
||||
// inconvenient since we have to iterate over all sources.
|
||||
int32_t sourceCount = Transliterator::countAvailableSources();
|
||||
for (int32_t s=0; s<sourceCount; ++s) {
|
||||
UnicodeString source;
|
||||
Transliterator::getAvailableSource(s, source);
|
||||
int32_t targetCount = Transliterator::countAvailableTargets(source);
|
||||
for (int32_t t=0; t<targetCount; ++t) {
|
||||
UnicodeString target;
|
||||
Transliterator::getAvailableTarget(t, source, target);
|
||||
if (target.caseCompare(toTarget, 0 /*U_FOLD_CASE_DEFAULT*/) == 0) {
|
||||
// We have a source match. It must also be a script
|
||||
// or we can't use it.
|
||||
int32_t code = scriptNameToCode(source);
|
||||
if (code < 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Try to instantiate the given transliterator
|
||||
UnicodeString id(source);
|
||||
id.append(HYPHEN).append(toTarget);
|
||||
Transliterator* t = Transliterator::createInstance(
|
||||
id, UTRANS_FORWARD, ec);
|
||||
if (U_FAILURE(ec) || t == NULL) {
|
||||
delete t;
|
||||
continue;
|
||||
}
|
||||
|
||||
// We have a script code and a transliterator; save
|
||||
// them.
|
||||
translits.addElement(new Elem((UScriptCode) code, t), ec);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
switch (translits.size()) {
|
||||
case 0:
|
||||
// There is nothing registered going to the requested target,
|
||||
// so return Any-Null, if allowed
|
||||
return allowNull ? new NullTransliterator() : NULL;
|
||||
case 1:
|
||||
// Exactly one transliterator goes to the requested target, so
|
||||
// return it, if allowed
|
||||
{
|
||||
Transliterator* t = NULL;
|
||||
if (allowDegenerate) {
|
||||
Elem *e = (Elem*) translits.orphanElementAt(0);
|
||||
t = e->translit;
|
||||
delete e;
|
||||
}
|
||||
return t;
|
||||
}
|
||||
}
|
||||
|
||||
// We have 2 or more script-toTarget transliterators. Assemble an
|
||||
// AnyTransliterator and return it.
|
||||
UnicodeString id(ANY);
|
||||
id.append(toTarget);
|
||||
return new AnyTransliterator(id, translits);
|
||||
}
|
||||
|
||||
//|/**
|
||||
//| * Factory method to create an Any-X transliterator. Convenience
|
||||
//| * function that takes a script code.
|
||||
//| */
|
||||
//|Transliterator* AnyTransliterator::createInstance(UScriptCode target,
|
||||
//| UBool allowNull,
|
||||
//| UBool allowDegenerate) {
|
||||
//| UnicodeString name(uscript_getName(target), "");
|
||||
//| return createInstance(name, allowNull, allowDegenerate);
|
||||
//|}
|
||||
|
||||
/**
|
||||
* Constructs a transliterator with the given ID. The vector should
|
||||
* contain Elem objects. Each will be removed from the vector and
|
||||
* ownership taken of its storage, including the contained
|
||||
* transliterator. Upon return the vector will be empty.
|
||||
*/
|
||||
AnyTransliterator::AnyTransliterator(const UnicodeString& id, UVector& vec) :
|
||||
Transliterator(id, NULL)
|
||||
AnyTransliterator::AnyTransliterator(const UnicodeString& id,
|
||||
const UnicodeString& theTarget,
|
||||
const UnicodeString& theVariant,
|
||||
UScriptCode theTargetScript,
|
||||
UErrorCode& ec) :
|
||||
Transliterator(id, NULL),
|
||||
targetScript(theTargetScript)
|
||||
{
|
||||
count = vec.size();
|
||||
elems = new Elem[count];
|
||||
for (int32_t i=count-1; i>=0; --i) {
|
||||
Elem* e = (Elem*) vec.orphanElementAt(i);
|
||||
elems[i] = *e;
|
||||
delete e;
|
||||
cache = uhash_open(uhash_hashLong, uhash_compareLong, &ec);
|
||||
uhash_setValueDeleter(cache, _deleteTransliterator);
|
||||
|
||||
target = theTarget;
|
||||
if (theVariant.length() > 0) {
|
||||
target.append(VARIANT_SEP).append(theVariant);
|
||||
}
|
||||
}
|
||||
|
||||
AnyTransliterator::~AnyTransliterator() {
|
||||
for (int32_t i=0; i<count; ++i) {
|
||||
delete elems[i].translit;
|
||||
}
|
||||
delete[] elems;
|
||||
uhash_close(cache);
|
||||
}
|
||||
|
||||
/**
|
||||
* Copy constructor.
|
||||
*/
|
||||
AnyTransliterator::AnyTransliterator(const AnyTransliterator& o) :
|
||||
Transliterator(o)
|
||||
Transliterator(o),
|
||||
target(o.target),
|
||||
targetScript(o.targetScript)
|
||||
{
|
||||
count = o.count;
|
||||
elems = new Elem[count];
|
||||
for (int32_t i=0; i<count; ++i) {
|
||||
elems[i] = o.elems[i];
|
||||
elems[i].translit = elems[i].translit->clone();
|
||||
}
|
||||
// Don't copy the cache contents
|
||||
UErrorCode ec = U_ZERO_ERROR;
|
||||
cache = uhash_open(uhash_hashLong, uhash_compareLong, &ec);
|
||||
uhash_setValueDeleter(cache, _deleteTransliterator);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -215,85 +206,151 @@ Transliterator* AnyTransliterator::clone() const {
|
|||
*/
|
||||
void AnyTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
|
||||
UBool isIncremental) const {
|
||||
int32_t allStart = pos.start;
|
||||
int32_t allLimit = pos.limit;
|
||||
|
||||
// Compute indices relative to contextStart
|
||||
int32_t start = pos.start - pos.contextStart;
|
||||
int32_t limit = pos.limit - pos.contextStart;
|
||||
int32_t contextLimit = pos.contextLimit - pos.contextStart;
|
||||
ScriptRunIterator it(text, pos.contextStart, pos.contextLimit);
|
||||
|
||||
if (start == limit) return; // Short circuit
|
||||
while (it.next()) {
|
||||
// Ignore runs in the ante context
|
||||
if (it.limit <= allStart) continue;
|
||||
|
||||
// Extract contextStart..contextLimit
|
||||
UnicodeString ustext;
|
||||
text.extractBetween(pos.contextStart, pos.contextLimit, ustext);
|
||||
// Try to instantiate transliterator from it.scriptCode to
|
||||
// our target or target/variant
|
||||
Transliterator* t = getTransliterator(it.scriptCode);
|
||||
|
||||
if (t == NULL) {
|
||||
// We have no transliterator. Do nothing, but keep
|
||||
// pos.start up to date.
|
||||
pos.start = it.limit;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Work directly on the buffer. We don't need to release the
|
||||
// buffer since the UnicodeString is automatic scope.
|
||||
UChar* utext = ustext.getBuffer(-1);
|
||||
// If the run end is before the transliteration limit, do
|
||||
// a non-incremental transliteration. Otherwise do an
|
||||
// incremental one.
|
||||
UBool incremental = isIncremental && (it.limit >= allLimit);
|
||||
|
||||
pos.start = uprv_max(allStart, it.start);
|
||||
pos.limit = uprv_min(allLimit, it.limit);
|
||||
int32_t limit = pos.limit;
|
||||
t->filteredTransliterate(text, pos, incremental);
|
||||
int32_t delta = pos.limit - limit;
|
||||
allLimit += delta;
|
||||
it.adjustLimit(delta);
|
||||
|
||||
UErrorCode ec = U_ZERO_ERROR;
|
||||
UScriptRun* run = uscript_openRun(utext, contextLimit, &ec);
|
||||
if (U_FAILURE(ec)) {
|
||||
pos.start = pos.limit; // we're done
|
||||
uscript_closeRun(run);
|
||||
return;
|
||||
// We're done if we enter the post context
|
||||
if (it.limit >= allLimit) break;
|
||||
}
|
||||
|
||||
int32_t origLimit = pos.limit; // save original limit
|
||||
int32_t delta = 0; // cumulative change in length
|
||||
// Restore limit. pos.start is fine where the last transliterator
|
||||
// left it, or at the end of the last run.
|
||||
pos.limit = allLimit;
|
||||
}
|
||||
|
||||
// Iterate over runs
|
||||
int32_t runStart, runLimit;
|
||||
UScriptCode runScript;
|
||||
Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const {
|
||||
|
||||
// We're done if we've entered the post context or when there are
|
||||
// no more script runs (which should only happen when we call
|
||||
// nextRun _after_ runLimit has been returned at contextLimit).
|
||||
runLimit = 0;
|
||||
while (runLimit < limit &&
|
||||
uscript_nextRun(run, &runStart, &runLimit, &runScript)) {
|
||||
if (source == targetScript || source == USCRIPT_INVALID_CODE) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Do nothing if we're still in the ante context
|
||||
if (runLimit <= start) continue;
|
||||
|
||||
// See if we have a transliterator for this run
|
||||
Transliterator* t = NULL;
|
||||
for (int32_t i=0; i<count; ++i) {
|
||||
if (elems[i].script == runScript) {
|
||||
t = elems[i].translit;
|
||||
break;
|
||||
Transliterator* t = (Transliterator*) uhash_iget(cache, (int32_t) source);
|
||||
if (t == NULL) {
|
||||
UErrorCode ec = U_ZERO_ERROR;
|
||||
UnicodeString sourceName(uscript_getName(source), "");
|
||||
UnicodeString id(sourceName);
|
||||
id.append(TARGET_SEP).append(target);
|
||||
|
||||
t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
|
||||
if (U_FAILURE(ec) || t == NULL) {
|
||||
delete t;
|
||||
|
||||
// Try to pivot around Latin, our most common script
|
||||
id = sourceName;
|
||||
id.append(LATIN_PIVOT).append(target);
|
||||
t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
|
||||
if (U_FAILURE(ec) || t == NULL) {
|
||||
delete t;
|
||||
t = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
// Transliterate max(start, runStart) to min(limit, runLimit).
|
||||
// Adjust indices to text-relative ones
|
||||
pos.start = uprv_max(start, runStart) + pos.contextStart + delta;
|
||||
pos.limit = uprv_min(limit, runLimit) + pos.contextStart + delta;
|
||||
|
||||
// If we don't have a transliterator for this script, then
|
||||
// leave the text unchanged.
|
||||
if (t == NULL) {
|
||||
pos.start = pos.limit;
|
||||
}
|
||||
|
||||
else {
|
||||
// If the run end is before the transliteration limit, do
|
||||
// a non-incremental transliteration. Otherwise do an
|
||||
// incremental one.
|
||||
UBool incremental = isIncremental && (runLimit >= limit);
|
||||
|
||||
// Transliterate and record change in length
|
||||
int32_t l = pos.limit;
|
||||
t->filteredTransliterate(text, pos, incremental);
|
||||
delta += pos.limit - l;
|
||||
if (t != NULL) {
|
||||
uhash_iput(cache, (int32_t) source, t, &ec);
|
||||
}
|
||||
}
|
||||
|
||||
uscript_closeRun(run);
|
||||
return t;
|
||||
}
|
||||
|
||||
// pos.start can stay where the last transliterator left it. pos.limit
|
||||
// needs to be adjusted for changes in length.
|
||||
pos.limit = origLimit + delta;
|
||||
/**
|
||||
* Return the script code for a given name, or -1 if not found.
|
||||
*/
|
||||
UScriptCode AnyTransliterator::scriptNameToCode(const UnicodeString& name) {
|
||||
char buf[128];
|
||||
UScriptCode code;
|
||||
UErrorCode ec = U_ZERO_ERROR;
|
||||
|
||||
name.extract(0, 128, buf, 128, "");
|
||||
if (uscript_getCode(buf, &code, 1, &ec) != 1 ||
|
||||
U_FAILURE(ec)) {
|
||||
code = USCRIPT_INVALID_CODE;
|
||||
}
|
||||
return code;
|
||||
}
|
||||
|
||||
/**
|
||||
* Registers standard transliterators with the system. Called by
|
||||
* Transliterator during initialization. Scan all current targets and
|
||||
* register those that are scripts T as Any-T/V.
|
||||
*/
|
||||
void AnyTransliterator::registerIDs() {
|
||||
|
||||
UErrorCode ec;
|
||||
Hashtable seen(TRUE);
|
||||
|
||||
int32_t sourceCount = Transliterator::countAvailableSources();
|
||||
for (int32_t s=0; s<sourceCount; ++s) {
|
||||
UnicodeString source;
|
||||
Transliterator::getAvailableSource(s, source);
|
||||
|
||||
// Ignore the "Any" source
|
||||
if (source.caseCompare(ANY, 0 /*U_FOLD_CASE_DEFAULT*/) == 0) continue;
|
||||
|
||||
int32_t targetCount = Transliterator::countAvailableTargets(source);
|
||||
for (int32_t t=0; t<targetCount; ++t) {
|
||||
UnicodeString target;
|
||||
Transliterator::getAvailableTarget(t, source, target);
|
||||
|
||||
// Only process each target once
|
||||
if (seen.geti(target) != 0) continue;
|
||||
ec = U_ZERO_ERROR;
|
||||
seen.puti(target, 1, ec);
|
||||
|
||||
// Get the script code for the target. If not a script, ignore.
|
||||
UScriptCode targetScript = scriptNameToCode(target);
|
||||
if (targetScript == USCRIPT_INVALID_CODE) continue;
|
||||
|
||||
int32_t variantCount = Transliterator::countAvailableVariants(source, target);
|
||||
// assert(variantCount >= 1);
|
||||
for (int32_t v=0; v<variantCount; ++v) {
|
||||
UnicodeString variant;
|
||||
Transliterator::getAvailableVariant(v, source, target, variant);
|
||||
|
||||
UnicodeString id;
|
||||
TransliteratorIDParser::STVtoID(ANY, target, variant, id);
|
||||
ec = U_ZERO_ERROR;
|
||||
AnyTransliterator* t = new AnyTransliterator(id, target, variant,
|
||||
targetScript, ec);
|
||||
if (U_FAILURE(ec)) {
|
||||
delete t;
|
||||
} else {
|
||||
Transliterator::_registerInstance(t);
|
||||
Transliterator::_registerSpecialInverse(target, NULL_ID, FALSE);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
* and others. All Rights Reserved.
|
||||
*****************************************************************
|
||||
* $Source: /xsrl/Nsvn/icu/icu/source/i18n/anytrans.h,v $
|
||||
* $Revision: 1.1 $
|
||||
* $Revision: 1.2 $
|
||||
*****************************************************************
|
||||
* Date Name Description
|
||||
* 06/06/2002 aliu Creation.
|
||||
|
@ -15,94 +15,40 @@
|
|||
|
||||
#include "unicode/translit.h"
|
||||
#include "unicode/uscript.h"
|
||||
#include "uhash.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/**
|
||||
* A transliterator named Any-X, where X is the target, that contains
|
||||
* multiple transliterators, all going to X, all with script sources.
|
||||
* The target need not be a script. It uses the script run API
|
||||
* (uscript.h) to partition text into runs of the same script, and
|
||||
* then based on the script of each run, transliterates from that
|
||||
* script to the given target.
|
||||
*
|
||||
* <p>For example, "Any-Latin" might contain two transliterators,
|
||||
* "Greek-Latin" and "Hiragana-Latin". It would then transliterate
|
||||
* runs of Greek with Greek-Latin, runs of Hiragana with
|
||||
* Hiragana-Latin, and pass other runs through unchanged.
|
||||
*
|
||||
* <p>There is no inverse of an Any-X transliterator. Although it
|
||||
* would be possible to tag the output text with script markers to
|
||||
* make inversion possible, this is not currently implemented.
|
||||
* A transliterator named Any-T or Any-T/V, where T is the target
|
||||
* script and V is the optional variant, that uses multiple
|
||||
* transliterators, all going to T or T/V, all with script sources.
|
||||
* The target must be a script. It partitions text into runs of the
|
||||
* same script, and then based on the script of each run,
|
||||
* transliterates from that script to the given target or
|
||||
* target/variant. Adjacent COMMON or INHERITED script characters are
|
||||
* included in each run.
|
||||
*
|
||||
* @author Alan Liu
|
||||
*/
|
||||
class U_I18N_API AnyTransliterator : public Transliterator {
|
||||
|
||||
/**
|
||||
* A script code and associated transliterator. It does _not_ own
|
||||
* the transliterator.
|
||||
* Cache mapping UScriptCode values to Transliterator*.
|
||||
*/
|
||||
class Elem {
|
||||
public:
|
||||
UScriptCode script;
|
||||
Transliterator* translit;
|
||||
Elem(UScriptCode s=(UScriptCode)0, Transliterator* t=NULL) {
|
||||
script = s;
|
||||
translit = t;
|
||||
}
|
||||
Elem& operator=(const Elem& o) {
|
||||
script = o.script;
|
||||
translit = o.translit;
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
UHashtable* cache;
|
||||
|
||||
/**
|
||||
* Array of script codes and associated transliterators. We
|
||||
* own the transliterators.
|
||||
* The target or target/variant string.
|
||||
*/
|
||||
Elem* elems;
|
||||
UnicodeString target;
|
||||
|
||||
/**
|
||||
* Length of elems, always at least 2.
|
||||
* The target script code. Never USCRIPT_INVALID_CODE.
|
||||
*/
|
||||
int32_t count;
|
||||
UScriptCode targetScript;
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
* Factory method to create an Any-X transliterator. Relies on
|
||||
* registered transliterators at the time of the call to build the
|
||||
* Any-X transliterator. If there are no registered transliterators
|
||||
* of the form Y-X, then the logical result is Any-Null. If there is
|
||||
* exactly one transliterator of the form Y-X, then the logical result
|
||||
* is Y-X, a degenerate result. If there are 2 or more
|
||||
* transliterators of the form Y-X, then an AnyTransliterator is
|
||||
* instantiated and returned.
|
||||
* @param target the target, which need not be a script. This
|
||||
* must be a string such as "Latin", <em>not</em> "Any-Latin".
|
||||
* @param allowNull if true, then return Any-Null if there are no
|
||||
* transliterator to the given script; otherwise return NULL
|
||||
* @param allowDegenerate if true, then return a transliterator of the
|
||||
* form X-Y if there is only one such transliterator
|
||||
* to the given script; otherwise return NULL
|
||||
* @return a new Transliterator, or NULL. If allowNull or
|
||||
* allowDegenerate is TRUE, the result may not be an
|
||||
* AnyTransliterator. If they are both false, the result will be
|
||||
* an AnyTransliterator.
|
||||
*/
|
||||
static Transliterator* createInstance(const UnicodeString& target,
|
||||
UBool allowNull,
|
||||
UBool allowDegenerate);
|
||||
|
||||
//| /**
|
||||
//| * Factory method to create an Any-X transliterator. Convenience
|
||||
//| * function that takes a script code.
|
||||
//| */
|
||||
//| static Transliterator* createInstance(UScriptCode target,
|
||||
//| UBool allowNull,
|
||||
//| UBool allowDegenerate);
|
||||
|
||||
/**
|
||||
* Destructor.
|
||||
|
@ -128,18 +74,36 @@ public:
|
|||
private:
|
||||
|
||||
/**
|
||||
* Private constructor for Transliterator.
|
||||
* Private constructor
|
||||
* @param id the ID of the form S-T or S-T/V, where T is theTarget
|
||||
* and V is theVariant. Must not be empty.
|
||||
* @param theTarget the target name. Must not be empty, and must
|
||||
* name a script corresponding to theTargetScript.
|
||||
* @param theVariant the variant name, or the empty string if
|
||||
* there is no variant
|
||||
* @param theTargetScript the script code corresponding to
|
||||
* theTarget.
|
||||
* @param ec error code, fails if the internal hashtable cannot be
|
||||
* allocated
|
||||
*/
|
||||
AnyTransliterator(const UnicodeString& id, UVector& vec);
|
||||
AnyTransliterator(const UnicodeString& id,
|
||||
const UnicodeString& theTarget,
|
||||
const UnicodeString& theVariant,
|
||||
UScriptCode theTargetScript,
|
||||
UErrorCode& ec);
|
||||
|
||||
/**
|
||||
* Try to create a transliterator with the given ID, which should
|
||||
* be of the form "Any-X".
|
||||
* Returns a transliterator from the given source to our target or
|
||||
* target/variant. Returns NULL if the source is the same as our
|
||||
* target script, or if the source is USCRIPT_INVALID_CODE.
|
||||
* Caches the result and returns the same transliterator the next
|
||||
* time. The caller does NOT own the result and must not delete
|
||||
* it.
|
||||
*/
|
||||
static Transliterator* _create(const UnicodeString& ID, Token /*context*/);
|
||||
|
||||
Transliterator* getTransliterator(UScriptCode source) const;
|
||||
|
||||
/**
|
||||
* Registers standard variants with the system. Called by
|
||||
* Registers standard transliterators with the system. Called by
|
||||
* Transliterator during initialization.
|
||||
*/
|
||||
static void registerIDs();
|
||||
|
@ -147,9 +111,10 @@ private:
|
|||
friend class Transliterator; // for registerIDs()
|
||||
|
||||
/**
|
||||
* Return the script code for a given name, or -1 if not found.
|
||||
* Return the script code for a given name, or
|
||||
* USCRIPT_INVALID_CODE if not found.
|
||||
*/
|
||||
static int32_t scriptNameToCode(const UnicodeString& name);
|
||||
static UScriptCode scriptNameToCode(const UnicodeString& name);
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
|
|
@ -465,7 +465,7 @@ void TransliteratorRegistry::remove(const UnicodeString& ID) {
|
|||
TransliteratorIDParser::IDtoSTV(ID, source, target, variant, sawSource);
|
||||
// Only need to do this if ID.indexOf('-') < 0
|
||||
UnicodeString id;
|
||||
STVtoID(source, target, variant, id);
|
||||
TransliteratorIDParser::STVtoID(source, target, variant, id);
|
||||
registry.remove(id);
|
||||
removeSTV(source, target, variant);
|
||||
availableIDs.removeElement((void*) &id);
|
||||
|
@ -585,25 +585,6 @@ UnicodeString& TransliteratorRegistry::getAvailableVariant(int32_t index,
|
|||
// class TransliteratorRegistry: internal
|
||||
//----------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Given source, target, and variant strings, concatenate them into a
|
||||
* full ID. If the source is empty, then "Any" will be used for the
|
||||
* source, so the ID will always be of the form s-t/v or s-t.
|
||||
*/
|
||||
void TransliteratorRegistry::STVtoID(const UnicodeString& source,
|
||||
const UnicodeString& target,
|
||||
const UnicodeString& variant,
|
||||
UnicodeString& id) {
|
||||
id = source;
|
||||
if (id.length() == 0) {
|
||||
id = ANY;
|
||||
}
|
||||
id.append(ID_SEP).append(target);
|
||||
if (variant.length() != 0) {
|
||||
id.append(VARIANT_SEP).append(variant);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenience method. Calls 6-arg registerEntry().
|
||||
*/
|
||||
|
@ -617,7 +598,7 @@ void TransliteratorRegistry::registerEntry(const UnicodeString& source,
|
|||
if (s.length() == 0) {
|
||||
s = ANY;
|
||||
}
|
||||
STVtoID(source, target, variant, ID);
|
||||
TransliteratorIDParser::STVtoID(source, target, variant, ID);
|
||||
registerEntry(ID, s, target, variant, adopted, visible);
|
||||
}
|
||||
|
||||
|
@ -632,7 +613,7 @@ void TransliteratorRegistry::registerEntry(const UnicodeString& ID,
|
|||
TransliteratorIDParser::IDtoSTV(ID, source, target, variant, sawSource);
|
||||
// Only need to do this if ID.indexOf('-') < 0
|
||||
UnicodeString id;
|
||||
STVtoID(source, target, variant, id);
|
||||
TransliteratorIDParser::STVtoID(source, target, variant, id);
|
||||
registerEntry(id, source, target, variant, adopted, visible);
|
||||
}
|
||||
|
||||
|
@ -737,7 +718,7 @@ Entry* TransliteratorRegistry::findInDynamicStore(const Spec& src,
|
|||
const Spec& trg,
|
||||
const UnicodeString& variant) {
|
||||
UnicodeString ID;
|
||||
STVtoID(src, trg, variant, ID);
|
||||
TransliteratorIDParser::STVtoID(src, trg, variant, ID);
|
||||
Entry *e = (Entry*) registry.get(ID);
|
||||
DEBUG_useEntry(e);
|
||||
return e;
|
||||
|
|
|
@ -307,11 +307,6 @@ class TransliteratorRegistry {
|
|||
UParseError& parseError,
|
||||
UErrorCode& status);
|
||||
|
||||
static void STVtoID(const UnicodeString& source,
|
||||
const UnicodeString& target,
|
||||
const UnicodeString& variant,
|
||||
UnicodeString& id);
|
||||
|
||||
private:
|
||||
|
||||
/**
|
||||
|
|
|
@ -567,6 +567,25 @@ void TransliteratorIDParser::IDtoSTV(const UnicodeString& id,
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Given source, target, and variant strings, concatenate them into a
|
||||
* full ID. If the source is empty, then "Any" will be used for the
|
||||
* source, so the ID will always be of the form s-t/v or s-t.
|
||||
*/
|
||||
void TransliteratorIDParser::STVtoID(const UnicodeString& source,
|
||||
const UnicodeString& target,
|
||||
const UnicodeString& variant,
|
||||
UnicodeString& id) {
|
||||
id = source;
|
||||
if (id.length() == 0) {
|
||||
id = ANY;
|
||||
}
|
||||
id.append(TARGET_SEP).append(target);
|
||||
if (variant.length() != 0) {
|
||||
id.append(VARIANT_SEP).append(variant);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Register two targets as being inverses of one another. For
|
||||
* example, calling registerSpecialInverse("NFC", "NFD", TRUE) causes
|
||||
|
|
|
@ -222,6 +222,16 @@ class TransliteratorIDParser {
|
|||
UnicodeString& variant,
|
||||
UBool& isSourcePresent);
|
||||
|
||||
/**
|
||||
* Given source, target, and variant strings, concatenate them into a
|
||||
* full ID. If the source is empty, then "Any" will be used for the
|
||||
* source, so the ID will always be of the form s-t/v or s-t.
|
||||
*/
|
||||
static void STVtoID(const UnicodeString& source,
|
||||
const UnicodeString& target,
|
||||
const UnicodeString& variant,
|
||||
UnicodeString& id);
|
||||
|
||||
/**
|
||||
* Register two targets as being inverses of one another. For
|
||||
* example, calling registerSpecialInverse("NFC", "NFD", true) causes
|
||||
|
|
Loading…
Add table
Reference in a new issue