mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 06:53:45 +00:00
ICU-1532 initial implementation
X-SVN-Rev: 8815
This commit is contained in:
parent
cce2d770b6
commit
584503b5fc
5 changed files with 469 additions and 0 deletions
301
icu4c/source/i18n/anytrans.cpp
Normal file
301
icu4c/source/i18n/anytrans.cpp
Normal file
|
@ -0,0 +1,301 @@
|
|||
/*
|
||||
*****************************************************************
|
||||
* Copyright (c) 2002, International Business Machines Corporation
|
||||
* and others. All Rights Reserved.
|
||||
*****************************************************************
|
||||
* $Source: /xsrl/Nsvn/icu/icu/source/i18n/anytrans.cpp,v $
|
||||
* $Revision: 1.1 $
|
||||
*****************************************************************
|
||||
* Date Name Description
|
||||
* 06/06/2002 aliu Creation.
|
||||
*****************************************************************
|
||||
*/
|
||||
#include "anytrans.h"
|
||||
#include "uvector.h"
|
||||
#include "unicode/nultrans.h"
|
||||
#include "unicode/uscript.h"
|
||||
|
||||
//------------------------------------------------------------
|
||||
// Constants
|
||||
|
||||
static const UChar HYPHEN = 45; // '-'
|
||||
static const UChar ANY[] = {65,110,121,45,0}; // "Any-"
|
||||
|
||||
//------------------------------------------------------------
|
||||
// AnyTransliterator
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/**
|
||||
* Try to create a transliterator with the given ID, which should be
|
||||
* of the form "Any-X". The "X" will be pulled off and passed to
|
||||
* createInstance().
|
||||
*/
|
||||
Transliterator* AnyTransliterator::_create(const UnicodeString& ID, Token /*context*/) {
|
||||
UnicodeString target(ID);
|
||||
int32_t i = target.indexOf(HYPHEN);
|
||||
if (i >= 0) {
|
||||
target.remove(0, i+1);
|
||||
}
|
||||
return AnyTransliterator::createInstance(target, TRUE, TRUE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Registers standard variants with the system. Called by
|
||||
* Transliterator during initialization.
|
||||
*/
|
||||
void AnyTransliterator::registerIDs() {
|
||||
Token t = integerToken(0);
|
||||
|
||||
// Register Any-Latin and make its inverse Null
|
||||
Transliterator::_registerFactory("Any-Latin", _create, t);
|
||||
Transliterator::_registerSpecialInverse("Latin", "Null", FALSE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the script code for a given name, or -1 if not found.
|
||||
*/
|
||||
int32_t AnyTransliterator::scriptNameToCode(const UnicodeString& name) {
|
||||
char buf[128];
|
||||
UScriptCode code;
|
||||
UErrorCode ec = U_ZERO_ERROR;
|
||||
|
||||
name.extract(0, 128, buf, 128, "");
|
||||
if (uscript_getCode(buf, &code, 1, &ec) != 1 ||
|
||||
U_FAILURE(ec)) {
|
||||
code = (UScriptCode) -1;
|
||||
}
|
||||
return (int32_t) code;
|
||||
}
|
||||
|
||||
/**
|
||||
* Factory method to create an Any-X transliterator. Relies on
|
||||
* registered transliterators at the time of the call to build the
|
||||
* Any-X transliterator. If there are no registered transliterators
|
||||
* of the form Y-X, then the logical result is Any-Null. If there is
|
||||
* exactly one transliterator of the form Y-X, then the logical result
|
||||
* is Y-X, a degenerate result. If there are 2 or more
|
||||
* transliterators of the form Y-X, then an AnyTransliterator is
|
||||
* instantiated and returned.
|
||||
* @param allowNull if true, then return Any-Null if there are no
|
||||
* transliterator to the given script; otherwise return NULL
|
||||
* @param allowDegenerate if true, then return a transliterator of the
|
||||
* form X-Y if there is only one such transliterator
|
||||
* the given script; otherwise return NULL
|
||||
*/
|
||||
Transliterator* AnyTransliterator::createInstance(const UnicodeString& toTarget,
|
||||
UBool allowNull,
|
||||
UBool allowDegenerate) {
|
||||
UErrorCode ec = U_ZERO_ERROR;
|
||||
UVector translits(ec);
|
||||
if (U_FAILURE(ec)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Count transliterators _to_ the given target. This is
|
||||
// inconvenient since we have to iterate over all sources.
|
||||
int32_t sourceCount = Transliterator::countAvailableSources();
|
||||
for (int32_t s=0; s<sourceCount; ++s) {
|
||||
UnicodeString source;
|
||||
Transliterator::getAvailableSource(s, source);
|
||||
int32_t targetCount = Transliterator::countAvailableTargets(source);
|
||||
for (int32_t t=0; t<targetCount; ++t) {
|
||||
UnicodeString target;
|
||||
Transliterator::getAvailableTarget(t, source, target);
|
||||
if (target.caseCompare(toTarget, 0 /*U_FOLD_CASE_DEFAULT*/) == 0) {
|
||||
// We have a source match. It must also be a script
|
||||
// or we can't use it.
|
||||
int32_t code = scriptNameToCode(source);
|
||||
if (code < 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Try to instantiate the given transliterator
|
||||
UnicodeString id(source);
|
||||
id.append(HYPHEN).append(toTarget);
|
||||
Transliterator* t = Transliterator::createInstance(
|
||||
id, UTRANS_FORWARD, ec);
|
||||
if (U_FAILURE(ec) || t == NULL) {
|
||||
delete t;
|
||||
continue;
|
||||
}
|
||||
|
||||
// We have a script code and a transliterator; save
|
||||
// them.
|
||||
translits.addElement(new Elem((UScriptCode) code, t), ec);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
switch (translits.size()) {
|
||||
case 0:
|
||||
// There is nothing registered going to the requested target,
|
||||
// so return Any-Null, if allowed
|
||||
return allowNull ? new NullTransliterator() : NULL;
|
||||
case 1:
|
||||
// Exactly one transliterator goes to the requested target, so
|
||||
// return it, if allowed
|
||||
{
|
||||
Transliterator* t = NULL;
|
||||
if (allowDegenerate) {
|
||||
Elem *e = (Elem*) translits.orphanElementAt(0);
|
||||
t = e->translit;
|
||||
delete e;
|
||||
}
|
||||
return t;
|
||||
}
|
||||
}
|
||||
|
||||
// We have 2 or more script-toTarget transliterators. Assemble an
|
||||
// AnyTransliterator and return it.
|
||||
UnicodeString id(ANY);
|
||||
id.append(toTarget);
|
||||
return new AnyTransliterator(id, translits);
|
||||
}
|
||||
|
||||
//|/**
|
||||
//| * Factory method to create an Any-X transliterator. Convenience
|
||||
//| * function that takes a script code.
|
||||
//| */
|
||||
//|Transliterator* AnyTransliterator::createInstance(UScriptCode target,
|
||||
//| UBool allowNull,
|
||||
//| UBool allowDegenerate) {
|
||||
//| UnicodeString name(uscript_getName(target), "");
|
||||
//| return createInstance(name, allowNull, allowDegenerate);
|
||||
//|}
|
||||
|
||||
/**
|
||||
* Constructs aa transliterator with the given ID. The vector should
|
||||
* contain Elem objects. Each will be removed from the vector and
|
||||
* ownership taken of its storage, including the contained
|
||||
* transliterator. Upon return the vector will be empty.
|
||||
*/
|
||||
AnyTransliterator::AnyTransliterator(const UnicodeString& id, UVector& vec) :
|
||||
Transliterator(id, NULL)
|
||||
{
|
||||
count = vec.size();
|
||||
elems = new Elem[count];
|
||||
for (int32_t i=count-1; i>=0; --i) {
|
||||
Elem* e = (Elem*) vec.orphanElementAt(i);
|
||||
elems[i] = *e;
|
||||
delete e;
|
||||
}
|
||||
}
|
||||
|
||||
AnyTransliterator::~AnyTransliterator() {
|
||||
for (int32_t i=0; i<count; ++i) {
|
||||
delete elems[i].translit;
|
||||
}
|
||||
delete[] elems;
|
||||
}
|
||||
|
||||
/**
|
||||
* Copy constructor.
|
||||
*/
|
||||
AnyTransliterator::AnyTransliterator(const AnyTransliterator& o) :
|
||||
Transliterator(o)
|
||||
{
|
||||
count = o.count;
|
||||
elems = new Elem[count];
|
||||
for (int32_t i=0; i<count; ++i) {
|
||||
elems[i] = o.elems[i];
|
||||
elems[i].translit = elems[i].translit->clone();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Transliterator API.
|
||||
*/
|
||||
Transliterator* AnyTransliterator::clone() const {
|
||||
return new AnyTransliterator(*this);
|
||||
}
|
||||
|
||||
/**
|
||||
* Implements {@link Transliterator#handleTransliterate}.
|
||||
*/
|
||||
void AnyTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
|
||||
UBool isIncremental) const {
|
||||
|
||||
// Compute indices relative to contextStart
|
||||
int32_t start = pos.start - pos.contextStart;
|
||||
int32_t limit = pos.limit - pos.contextStart;
|
||||
int32_t contextLimit = pos.contextLimit - pos.contextStart;
|
||||
|
||||
if (start == limit) return; // Short circuit
|
||||
|
||||
// Extract contextStart..contextLimit
|
||||
UnicodeString ustext;
|
||||
text.extractBetween(pos.contextStart, pos.contextLimit, ustext);
|
||||
|
||||
// Work directly on the buffer. We don't need to release the
|
||||
// buffer since the UnicodeString is automatic scope.
|
||||
UChar* utext = ustext.getBuffer(-1);
|
||||
|
||||
UErrorCode ec = U_ZERO_ERROR;
|
||||
UScriptRun* run = uscript_openRun(utext, contextLimit, &ec);
|
||||
if (U_FAILURE(ec)) {
|
||||
pos.start = pos.limit; // we're done
|
||||
uscript_closeRun(run);
|
||||
return;
|
||||
}
|
||||
|
||||
int32_t origLimit = pos.limit; // save original limit
|
||||
int32_t delta = 0; // cumulative change in length
|
||||
|
||||
// Iterate over runs
|
||||
int32_t runStart, runLimit;
|
||||
UScriptCode runScript;
|
||||
|
||||
// We're done if we've entered the post context or when there are
|
||||
// no more script runs (which should only happen when we call
|
||||
// nextRun _after_ runLimit has been returned at contextLimit).
|
||||
runLimit = 0;
|
||||
while (runLimit < limit &&
|
||||
uscript_nextRun(run, &runStart, &runLimit, &runScript)) {
|
||||
|
||||
// Do nothing if we're still in the ante context
|
||||
if (runLimit <= start) continue;
|
||||
|
||||
// See if we have a transliterator for this run
|
||||
Transliterator* t = NULL;
|
||||
for (int32_t i=0; i<count; ++i) {
|
||||
if (elems[i].script == runScript) {
|
||||
t = elems[i].translit;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Transliterate max(start, runStart) to min(limit, runLimit).
|
||||
// Adjust indices to text-relative ones
|
||||
pos.start = uprv_max(start, runStart) + pos.contextStart + delta;
|
||||
pos.limit = uprv_min(limit, runLimit) + pos.contextStart + delta;
|
||||
|
||||
// If we don't have a transliterator for this script, then
|
||||
// leave the text unchanged.
|
||||
if (t == NULL) {
|
||||
pos.start = pos.limit;
|
||||
}
|
||||
|
||||
else {
|
||||
// If the run end is before the transliteration limit, do
|
||||
// a non-incremental transliteration. Otherwise do an
|
||||
// incremental one.
|
||||
UBool incremental = isIncremental && (runLimit >= limit);
|
||||
|
||||
// Transliterate and record change in length
|
||||
int32_t l = pos.limit;
|
||||
t->filteredTransliterate(text, pos, incremental);
|
||||
delta += pos.limit - l;
|
||||
}
|
||||
}
|
||||
|
||||
uscript_closeRun(run);
|
||||
|
||||
// pos.start can stay where the last transliterator left it. pos.limit
|
||||
// needs to be adjusted for changes in length.
|
||||
pos.limit = origLimit + delta;
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
//eof
|
157
icu4c/source/i18n/anytrans.h
Normal file
157
icu4c/source/i18n/anytrans.h
Normal file
|
@ -0,0 +1,157 @@
|
|||
/*
|
||||
*****************************************************************
|
||||
* Copyright (c) 2002, International Business Machines Corporation
|
||||
* and others. All Rights Reserved.
|
||||
*****************************************************************
|
||||
* $Source: /xsrl/Nsvn/icu/icu/source/i18n/anytrans.h,v $
|
||||
* $Revision: 1.1 $
|
||||
*****************************************************************
|
||||
* Date Name Description
|
||||
* 06/06/2002 aliu Creation.
|
||||
*****************************************************************
|
||||
*/
|
||||
#ifndef _ANYTRANS_H_
|
||||
#define _ANYTRANS_H_
|
||||
|
||||
#include "unicode/translit.h"
|
||||
#include "unicode/uscript.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/**
|
||||
* A transliterator named Any-X, where X is the target, that contains
|
||||
* multiple transliterators, all going to X, all with script sources.
|
||||
* The target need not be a script. It uses the script run API
|
||||
* (uscript.h) to partition text into runs of the same script, and
|
||||
* then based on the script of each run, transliterates from that
|
||||
* script to the given target.
|
||||
*
|
||||
* <p>For example, "Any-Latin" might contain two transliterators,
|
||||
* "Greek-Latin" and "Hiragana-Latin". It would then transliterate
|
||||
* runs of Greek with Greek-Latin, runs of Hiragana with
|
||||
* Hirgana-Latin, and pass other runs through unchanged.
|
||||
*
|
||||
* <p>There is no inverse of an Any-X transliterator. Although it
|
||||
* would be possible to tag the output text with script markers to
|
||||
* make inversion possible, this is not currently implemented.
|
||||
*
|
||||
* @author Alan Liu
|
||||
*/
|
||||
class U_I18N_API AnyTransliterator : public Transliterator {
|
||||
|
||||
/**
|
||||
* A script code and associated transliterator. It does _not_ own
|
||||
* the transliterator.
|
||||
*/
|
||||
class Elem {
|
||||
public:
|
||||
UScriptCode script;
|
||||
Transliterator* translit;
|
||||
Elem(UScriptCode s=(UScriptCode)0, Transliterator* t=NULL) {
|
||||
script = s;
|
||||
translit = t;
|
||||
}
|
||||
Elem& operator=(const Elem& o) {
|
||||
script = o.script;
|
||||
translit = o.translit;
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Array of script codes and associated transliterators. We
|
||||
* own the transliterators.
|
||||
*/
|
||||
Elem* elems;
|
||||
|
||||
/**
|
||||
* Length of elems, always at least 2.
|
||||
*/
|
||||
int32_t count;
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
* Factory method to create an Any-X transliterator. Relies on
|
||||
* registered transliterators at the time of the call to build the
|
||||
* Any-X transliterator. If there are no registered transliterators
|
||||
* of the form Y-X, then the logical result is Any-Null. If there is
|
||||
* exactly one transliterator of the form Y-X, then the logical result
|
||||
* is Y-X, a degenerate result. If there are 2 or more
|
||||
* transliterators of the form Y-X, then an AnyTransliterator is
|
||||
* instantiated and returned.
|
||||
* @param target the target, which need not be a script. This
|
||||
* be a string such as "Latin", <em>not</em> "Any-Latin".
|
||||
* @param allowNull if true, then return Any-Null if there are no
|
||||
* transliterator to the given script; otherwise return NULL
|
||||
* @param allowDegenerate if true, then return a transliterator of the
|
||||
* form X-Y if there is only one such transliterator
|
||||
* the given script; otherwise return NULL
|
||||
* @return a new Transliterator, or NULL. If allowNull or
|
||||
* allowDegenerate is TRUE, the result may not be an
|
||||
* AnyTransliterator. If they are both false, the result will be
|
||||
* an AnyTransliterator.
|
||||
*/
|
||||
static Transliterator* createInstance(const UnicodeString& target,
|
||||
UBool allowNull,
|
||||
UBool allowDegenerate);
|
||||
|
||||
//| /**
|
||||
//| * Factory method to create an Any-X transliterator. Convenience
|
||||
//| * function that takes a script code.
|
||||
//| */
|
||||
//| static Transliterator* createInstance(UScriptCode target,
|
||||
//| UBool allowNull,
|
||||
//| UBool allowDegenerate);
|
||||
|
||||
/**
|
||||
* Destructor.
|
||||
*/
|
||||
virtual ~AnyTransliterator();
|
||||
|
||||
/**
|
||||
* Copy constructor.
|
||||
*/
|
||||
AnyTransliterator(const AnyTransliterator&);
|
||||
|
||||
/**
|
||||
* Transliterator API.
|
||||
*/
|
||||
Transliterator* clone() const;
|
||||
|
||||
/**
|
||||
* Implements {@link Transliterator#handleTransliterate}.
|
||||
*/
|
||||
virtual void handleTransliterate(Replaceable& text, UTransPosition& index,
|
||||
UBool incremental) const;
|
||||
|
||||
private:
|
||||
|
||||
/**
|
||||
* Private constructor for Transliterator.
|
||||
*/
|
||||
AnyTransliterator(const UnicodeString& id, UVector& vec);
|
||||
|
||||
/**
|
||||
* Try to create a transliterator with the given ID, which should
|
||||
* be of the form "Any-X".
|
||||
*/
|
||||
static Transliterator* _create(const UnicodeString& ID, Token /*context*/);
|
||||
|
||||
/**
|
||||
* Registers standard variants with the system. Called by
|
||||
* Transliterator during initialization.
|
||||
*/
|
||||
static void registerIDs();
|
||||
|
||||
friend class Transliterator; // for registerIDs()
|
||||
|
||||
/**
|
||||
* Return the script code for a given name, or -1 if not found.
|
||||
*/
|
||||
static int32_t scriptNameToCode(const UnicodeString& name);
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif
|
|
@ -151,6 +151,10 @@ LINK32=link.exe
|
|||
# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\anytrans.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\bocsu.c
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
@ -423,6 +427,10 @@ SOURCE=.\utrans.cpp
|
|||
# PROP Default_Filter "h;hpp;hxx;hm;inl"
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\anytrans.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\bocsu.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
|
|
@ -38,6 +38,7 @@
|
|||
#include "unesctrn.h"
|
||||
#include "util.h"
|
||||
#include "tridpars.h"
|
||||
#include "anytrans.h"
|
||||
|
||||
static const UChar TARGET_SEP = 0x002D; /*-*/
|
||||
static const UChar ID_DELIM = 0x003B; /*;*/
|
||||
|
@ -1323,6 +1324,7 @@ void Transliterator::initializeRegistry(void) {
|
|||
EscapeTransliterator::registerIDs();
|
||||
UnescapeTransliterator::registerIDs();
|
||||
NormalizationTransliterator::registerIDs();
|
||||
AnyTransliterator::registerIDs();
|
||||
ucln_i18n_registerCleanup();
|
||||
}
|
||||
|
||||
|
|
|
@ -557,6 +557,7 @@ protected:
|
|||
UBool incremental) const;
|
||||
|
||||
friend class CompoundTransliterator; // for filteredTransliterate()
|
||||
friend class AnyTransliterator; // for filteredTransliterate()
|
||||
|
||||
private:
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue