diff --git a/icu4c/data/resfiles.mk b/icu4c/data/resfiles.mk index 4162ea46bfa..33729f97dd8 100644 --- a/icu4c/data/resfiles.mk +++ b/icu4c/data/resfiles.mk @@ -106,7 +106,7 @@ zh.txt zh__PINYIN.txt zh_CN.txt zh_HK.txt zh_SG.txt zh_TW.txt zh_TW_STROKE.txt TRANSLIT_SOURCE=fullhalf.txt translit_index.txt kana.txt kbdescl1.txt\ larabic.txt lcyril.txt ldevan.txt\ lgreek.txt lhebrew.txt ljamo.txt\ -lkana.txt quotes.txt ucname.txt\ +lkana.txt quotes.txt\ Bengali_InterIndic.txt\ Devanagari_InterIndic.txt\ Gujarati_InterIndic.txt\ diff --git a/icu4c/data/translit_index.txt b/icu4c/data/translit_index.txt index b687f273145..04bd4c11d5a 100644 --- a/icu4c/data/translit_index.txt +++ b/icu4c/data/translit_index.txt @@ -97,7 +97,9 @@ translit_index { // Java only: { "Kanji-English", "file", "-", "FORWARD" }, // Java only: { "Kanji-OnRomaji", "file", "-", "FORWARD" }, { "KeyboardEscape-Latin1", "file", "kbdescl1", "FORWARD" }, -{ "UnicodeName-UnicodeChar", "file", "ucname", "FORWARD" }, + +// Replaced by algorithmic transliterator: +// { "UnicodeName-UnicodeChar", "file", "ucname", "FORWARD" }, // Compound rules diff --git a/icu4c/source/data/locales/resfiles.mk b/icu4c/source/data/locales/resfiles.mk index 4162ea46bfa..33729f97dd8 100644 --- a/icu4c/source/data/locales/resfiles.mk +++ b/icu4c/source/data/locales/resfiles.mk @@ -106,7 +106,7 @@ zh.txt zh__PINYIN.txt zh_CN.txt zh_HK.txt zh_SG.txt zh_TW.txt zh_TW_STROKE.txt TRANSLIT_SOURCE=fullhalf.txt translit_index.txt kana.txt kbdescl1.txt\ larabic.txt lcyril.txt ldevan.txt\ lgreek.txt lhebrew.txt ljamo.txt\ -lkana.txt quotes.txt ucname.txt\ +lkana.txt quotes.txt\ Bengali_InterIndic.txt\ Devanagari_InterIndic.txt\ Gujarati_InterIndic.txt\ diff --git a/icu4c/source/data/translit/translit_index.txt b/icu4c/source/data/translit/translit_index.txt index b687f273145..04bd4c11d5a 100644 --- a/icu4c/source/data/translit/translit_index.txt +++ b/icu4c/source/data/translit/translit_index.txt @@ -97,7 +97,9 @@ translit_index { // Java 
only: { "Kanji-English", "file", "-", "FORWARD" }, // Java only: { "Kanji-OnRomaji", "file", "-", "FORWARD" }, { "KeyboardEscape-Latin1", "file", "kbdescl1", "FORWARD" }, -{ "UnicodeName-UnicodeChar", "file", "ucname", "FORWARD" }, + +// Replaced by algorithmic transliterator: +// { "UnicodeName-UnicodeChar", "file", "ucname", "FORWARD" }, // Compound rules diff --git a/icu4c/source/i18n/hextouni.cpp b/icu4c/source/i18n/hextouni.cpp index 3d7a30d6c69..85113f13c5b 100644 --- a/icu4c/source/i18n/hextouni.cpp +++ b/icu4c/source/i18n/hextouni.cpp @@ -15,7 +15,7 @@ /** * ID for this transliterator. */ -const char* HexToUnicodeTransliterator::_ID = "Hex-Unicode"; +const char* HexToUnicodeTransliterator::_ID = "Hex-Any"; /** * This pattern encodes the following specs for the default constructor: diff --git a/icu4c/source/i18n/translit.cpp b/icu4c/source/i18n/translit.cpp index 62102684675..162acc1a799 100644 --- a/icu4c/source/i18n/translit.cpp +++ b/icu4c/source/i18n/translit.cpp @@ -39,12 +39,17 @@ // keep in sync with CompoundTransliterator static const UChar ID_SEP = 0x002D; /*-*/ static const UChar ID_DELIM = 0x003B; /*;*/ +static const UChar VARIANT_SEP = 0x002F; // '/' static const UChar OPEN_PAREN = 40; static const UChar CLOSE_PAREN = 41; static Hashtable _cache(TRUE); // TRUE = keys are case insensitive static Hashtable _internalCache(TRUE); // TRUE = keys are case insensitive +// Map of source name to (Hashtable mapping target to (UVector of +// variant names)). +static Hashtable sourceMap(TRUE); + /** * Cache of public system transliterators. Keys are UnicodeString * names, values are CacheEntry objects.
@@ -1350,7 +1355,7 @@ void Transliterator::_registerFactory(const UnicodeString& id, CacheEntry* entry = (CacheEntry*) cache->get(id); if (entry == 0) { - cacheIDs.addElement((void*) new UnicodeString(id)); + _registerID(id); entry = new CacheEntry(); } entry->setFactory(factory); @@ -1396,7 +1401,7 @@ void Transliterator::_registerInstance(Transliterator* adoptedPrototype, CacheEntry* entry = (CacheEntry*) cache->get(id); if (entry == 0) { - cacheIDs.addElement((void*) new UnicodeString(id)); + _registerID(id); entry = new CacheEntry(); } @@ -1464,6 +1469,114 @@ const UnicodeString& Transliterator::getAvailableID(int32_t index) { return *(const UnicodeString*) cacheIDs[index]; } +int32_t Transliterator::countAvailableSources(void) { + if (!cacheInitialized) { + initializeCache(); + } + Mutex lock(&cacheMutex); + return sourceMap.count(); +} + +UnicodeString& Transliterator::getAvailableSource(int32_t index, + UnicodeString& result) { + if (!cacheInitialized) { + initializeCache(); + } + Mutex lock(&cacheMutex); + int32_t pos = -1; + const UHashElement *e = 0; + while (index-- >= 0) { + e = sourceMap.nextElement(pos); + if (e == 0) { + break; + } + } + if (e == 0) { + result.truncate(0); + } else { + result = *(UnicodeString*) e->key.pointer; + } + return result; +} + +int32_t Transliterator::countAvailableTargets(const UnicodeString& source) { + if (!cacheInitialized) { + initializeCache(); + } + Mutex lock(&cacheMutex); + Hashtable *targets = (Hashtable*) sourceMap.get(source); + return (targets == 0) ? 
0 : targets->count(); +} + +UnicodeString& Transliterator::getAvailableTarget(int32_t index, + const UnicodeString& source, + UnicodeString& result) { + if (!cacheInitialized) { + initializeCache(); + } + Mutex lock(&cacheMutex); + Hashtable *targets = (Hashtable*) sourceMap.get(source); + if (targets == 0) { + result.truncate(0); // invalid source + return result; + } + int32_t pos = -1; + const UHashElement *e = 0; + while (index-- >= 0) { + e = targets->nextElement(pos); + if (e == 0) { + break; + } + } + if (e == 0) { + result.truncate(0); // invalid index + } else { + result = *(UnicodeString*) e->key.pointer; + } + return result; +} + +int32_t Transliterator::countAvailableVariants(const UnicodeString& source, + const UnicodeString& target) { + if (!cacheInitialized) { + initializeCache(); + } + Mutex lock(&cacheMutex); + Hashtable *targets = (Hashtable*) sourceMap.get(source); + if (targets == 0) { + return 0; + } + UVector *variants = (UVector*) targets->get(target); + return (variants == 0) ? 0 : variants->size(); +} + +UnicodeString& Transliterator::getAvailableVariant(int32_t index, + const UnicodeString& source, + const UnicodeString& target, + UnicodeString& result) { + if (!cacheInitialized) { + initializeCache(); + } + Mutex lock(&cacheMutex); + Hashtable *targets = (Hashtable*) sourceMap.get(source); + if (targets == 0) { + result.truncate(0); // invalid source + return result; + } + UVector *variants = (UVector*) targets->get(target); + if (variants == 0) { + result.truncate(0); // invalid target + return result; + } + UnicodeString *v = (UnicodeString*) variants->elementAt(index); + if (v == 0) { + result.truncate(0); // invalid index + } else { + result = *v; + } + return result; +} + /** * Method for subclasses to use to obtain a character in the given * string, with filtering. @@ -1477,6 +1590,68 @@ UChar Transliterator::filteredCharAt(const Replaceable& text, int32_t i) const { (localFilter->contains(c = text.charAt(i)) ? 
c : (UChar)0xFFFE); } +/** + * Register an ID (with no whitespace in it, no inline filter, and + * not compound) in the Source-Target/Variant record. + */ +void Transliterator::_registerID(const UnicodeString& id) { + // cacheMutex must already be held (by caller) + cacheIDs.addElement((void*) new UnicodeString(id)); + + UnicodeString source, target, variant; + int32_t dash = id.indexOf(ID_SEP); + int32_t stroke = id.indexOf(VARIANT_SEP); + int32_t start = 0; + int32_t limit = id.length(); + if (dash < 0) { + source = UnicodeString("Any", ""); + } else { + id.extractBetween(0, dash, source); + start = dash + 1; + } + if (stroke >= 0) { + id.extractBetween(stroke + 1, id.length(), variant); + limit = stroke; + } + id.extractBetween(start, limit, target); + _registerSTV(source, target, variant); +} + +/** + * Register a source-target/variant in the Source-Target/Variant record. + * Variant may be empty, but source and target must not be. + */ +void Transliterator::_registerSTV(const UnicodeString& source, + const UnicodeString& target, + const UnicodeString& variant) { + // cacheMutex must already be held (by caller) + // assert(source.length() > 0); + // assert(target.length() > 0); + UErrorCode status = U_ZERO_ERROR; + Hashtable *targets = (Hashtable*) sourceMap.get(source); + if (targets == 0) { + targets = new Hashtable(TRUE); + if (targets == 0) { + return; + } + targets->setValueDeleter(uhash_deleteUVector); + sourceMap.put(source, targets, status); + } + UVector *variants = (UVector*) targets->get(target); + if (variants == 0) { + variants = new UVector(uhash_deleteUnicodeString, + uhash_compareCaselessUnicodeString); + if (variants == 0) { + return; + } + targets->put(target, variants, status); + } + if (variant.length() > 0 && + !variants->contains((void*) &variant)) { + variants->addElement(new UnicodeString(variant)); + } +} + void Transliterator::initializeCache(void) { // Lock first, check init boolean second Mutex lock(&cacheMutex); @@ -1492,6 +1667,8 
@@ void Transliterator::initializeCache(void) { cacheIDs.setDeleter(uhash_deleteUnicodeString); cacheIDs.setComparer(uhash_compareCaselessUnicodeString); + sourceMap.setValueDeleter(uhash_deleteHashtable); + /* The following code parses the index table located in * icu/data/translit_index.txt. The index is an n x 4 table * that follows this format: @@ -1555,9 +1732,8 @@ void Transliterator::initializeCache(void) { Hashtable* c = isInternal ? internalCache : cache; c->put(id, entry, status); - // cacheIDs owns & should delete the following string if (!isInternal) { - cacheIDs.addElement((void*) new UnicodeString(id)); + _registerID(id); } } } diff --git a/icu4c/source/i18n/unicode/translit.h b/icu4c/source/i18n/unicode/translit.h index 6bce71a093f..d0ea253d3c3 100644 --- a/icu4c/source/i18n/unicode/translit.h +++ b/icu4c/source/i18n/unicode/translit.h @@ -858,6 +858,12 @@ private: friend class NormalizationTransliterator; + static void _registerID(const UnicodeString& id); + + static void _registerSTV(const UnicodeString& source, + const UnicodeString& target, + const UnicodeString& variant); + public: /** @@ -918,6 +924,66 @@ public: */ static const UnicodeString& getAvailableID(int32_t index); + /** + * Return the number of registered source specifiers. + */ + static int32_t countAvailableSources(void); + + /** + * Return a registered source specifier. + * @param index which specifier to return, from 0 to n-1, where + * n = countAvailableSources() + * @param result fill-in parameter to receive the source specifier. + * If index is out of range, result will be empty. + * @return reference to result + */ + static UnicodeString& getAvailableSource(int32_t index, + UnicodeString& result); + + /** + * Return the number of registered target specifiers for a given + * source specifier. + */ + static int32_t countAvailableTargets(const UnicodeString& source); + + /** + * Return a registered target specifier for a given source.
+ * @param index which specifier to return, from 0 to n-1, where + * n = countAvailableTargets(source) + * @param source the source specifier + * @param result fill-in parameter to receive the target specifier. + * If source is invalid or if index is out of range, result will + * be empty. + * @return reference to result + */ + static UnicodeString& getAvailableTarget(int32_t index, + const UnicodeString& source, + UnicodeString& result); + + /** + * Return the number of registered variant specifiers for a given + * source-target pair. + */ + static int32_t countAvailableVariants(const UnicodeString& source, + const UnicodeString& target); + + /** + * Return a registered variant specifier for a given source-target + * pair. + * @param index which specifier to return, from 0 to n-1, where + * n = countAvailableVariants(source, target) + * @param source the source specifier + * @param target the target specifier + * @param result fill-in parameter to receive the variant + * specifier. If source is invalid or if target is invalid or if + * index is out of range, result will be empty. + * @return reference to result + */ + static UnicodeString& getAvailableVariant(int32_t index, + const UnicodeString& source, + const UnicodeString& target, + UnicodeString& result); + /** * Return the class ID for this class. This is useful only for * comparing to a return value from getDynamicClassID(). For example: diff --git a/icu4c/source/i18n/unitohex.cpp b/icu4c/source/i18n/unitohex.cpp index 21660e14813..4bfe4594bce 100644 --- a/icu4c/source/i18n/unitohex.cpp +++ b/icu4c/source/i18n/unitohex.cpp @@ -14,7 +14,7 @@ /** * ID for this transliterator.
*/ -const char* UnicodeToHexTransliterator::_ID = "Unicode-Hex"; +const char* UnicodeToHexTransliterator::_ID = "Any-Hex"; const UChar UnicodeToHexTransliterator::HEX_DIGITS[32] = { // Use Unicode hex values for EBCDIC compatibility diff --git a/icu4c/source/test/intltest/cpdtrtst.cpp b/icu4c/source/test/intltest/cpdtrtst.cpp index fc12396177b..5414da2aef2 100644 --- a/icu4c/source/test/intltest/cpdtrtst.cpp +++ b/icu4c/source/test/intltest/cpdtrtst.cpp @@ -188,7 +188,7 @@ void CompoundTransliteratorTest::TestGetCount(){ logln("Testing the getCount() API of CompoundTransliterator"); UErrorCode status = U_ZERO_ERROR; CompoundTransliterator *ct1=new CompoundTransliterator("Halfwidth-Fullwidth;Fullwidth-Halfwidth", status); - CompoundTransliterator *ct2=new CompoundTransliterator("Unicode-Hex;Hex-Unicode;Cyrillic-Latin;Latin-Cyrillic", status); + CompoundTransliterator *ct2=new CompoundTransliterator("Any-Hex;Hex-Any;Cyrillic-Latin;Latin-Cyrillic", status); CompoundTransliterator *ct3=(CompoundTransliterator*)ct1; CompoundTransliterator *ct4=new CompoundTransliterator("Latin-Devanagari", status); CompoundTransliterator *ct5=new CompoundTransliterator(*ct4); @@ -211,7 +211,7 @@ void CompoundTransliteratorTest::TestGetCount(){ void CompoundTransliteratorTest::TestGetSetAdoptTransliterator(){ logln("Testing the getTransliterator() API of CompoundTransliterator"); - UnicodeString ID("Latin-Greek;Greek-Latin;Latin-Devanagari;Devanagari-Latin;Latin-Cyrillic;Cyrillic-Latin;Unicode-Hex;Hex-Unicode"); + UnicodeString ID("Latin-Greek;Greek-Latin;Latin-Devanagari;Devanagari-Latin;Latin-Cyrillic;Cyrillic-Latin;Any-Hex;Hex-Any"); UErrorCode status = U_ZERO_ERROR; CompoundTransliterator *ct1=new CompoundTransliterator(ID, status); if(U_FAILURE(status)){ @@ -233,7 +233,7 @@ void CompoundTransliteratorTest::TestGetSetAdoptTransliterator(){ logln("Testing setTransliterator() API of CompoundTransliterator"); - UnicodeString 
ID2("Hex-Unicode;Unicode-Hex;Latin-Cyrillic;Cyrillic-Latin;Halfwidth-Fullwidth;Fullwidth-Halfwidth"); + UnicodeString ID2("Hex-Any;Any-Hex;Latin-Cyrillic;Cyrillic-Latin;Halfwidth-Fullwidth;Fullwidth-Halfwidth"); array=split(ID2, 0x003b, count); Transliterator** transarray=new Transliterator*[count]; for(i=0;itransliterate(gotResBuf, 10, 5); if(gotResBuf == temp) @@ -337,7 +337,7 @@ void TransliteratorAPITest::TestTransliterate3(){ }; int start, limit; UnicodeString message; - Transliterator *t=Transliterator::createInstance("Unicode-Hex"); + Transliterator *t=Transliterator::createInstance("Any-Hex"); if(t == 0) errln("FAIL : construction"); for(uint32_t i=0; igetFilter(); diff --git a/icu4c/source/test/intltest/transtst.cpp b/icu4c/source/test/intltest/transtst.cpp index e74f8d3b09c..cb5fd51b8e4 100644 --- a/icu4c/source/test/intltest/transtst.cpp +++ b/icu4c/source/test/intltest/transtst.cpp @@ -69,6 +69,7 @@ TransliteratorTest::runIndexedTest(int32_t index, UBool exec, TESTCASE(33,TestContext); TESTCASE(34,TestSupplemental); TESTCASE(35,TestQuantifier); + TESTCASE(36,TestSTV); default: name = ""; break; } } @@ -444,8 +445,8 @@ void TransliteratorTest::TestCompoundKana(void) { * Compose the hex transliterators forward and reverse. */ void TransliteratorTest::TestCompoundHex(void) { - Transliterator* a = Transliterator::createInstance("Unicode-Hex"); - Transliterator* b = Transliterator::createInstance("Hex-Unicode"); + Transliterator* a = Transliterator::createInstance("Any-Hex"); + Transliterator* b = Transliterator::createInstance("Hex-Any"); Transliterator* transab[] = { a, b }; Transliterator* transba[] = { b, a }; if (a == 0 || b == 0) { @@ -490,9 +491,9 @@ class TestFilter : public UnicodeFilter { * Do some basic tests of filtering. 
*/ void TransliteratorTest::TestFiltering(void) { - Transliterator* hex = Transliterator::createInstance("Unicode-Hex"); + Transliterator* hex = Transliterator::createInstance("Any-Hex"); if (hex == 0) { - errln("FAIL: createInstance(Unicode-Hex) failed"); + errln("FAIL: createInstance(Any-Hex) failed"); return; } hex->adoptFilter(new TestFilter()); @@ -644,7 +645,7 @@ void TransliteratorTest::TestJ243(void) { UErrorCode status = U_ZERO_ERROR; #if !defined(HPUX) - // Test default Hex-Unicode, which should handle + // Test default Hex-Any, which should handle // \u, \U, u+, and U+ HexToUnicodeTransliterator hex; expect(hex, UnicodeString("\\u0041+\\U0042,u+0043uu+0044z", ""), "A+B,CuDz"); @@ -654,7 +655,7 @@ void TransliteratorTest::TestJ243(void) { HexToUnicodeTransliterator hex2(UnicodeString("\\\\u###0;&\\#x###0\\;", ""), status); expect(hex2, UnicodeString("\\u61\\u062\\u0063\\u00645\\u66x0123", ""), "abcd5fx0123"); - // Try custom Unicode-Hex (default is tested elsewhere) + // Try custom Any-Hex (default is tested elsewhere) status = U_ZERO_ERROR; UnicodeToHexTransliterator hex3(UnicodeString("&\\#x###0;", ""), status); expect(hex3, "012", "012"); @@ -952,13 +953,13 @@ void TransliteratorTest::TestFilterIDs(void) { // Array of 3n strings: // , , , const char* DATA[] = { - "Unicode[aeiou]-Hex", - "Hex[aeiou]-Unicode", + "Any[aeiou]-Hex", + "Hex[aeiou]-Any", "quizzical", "q\\u0075\\u0069zz\\u0069c\\u0061l", - "Unicode[aeiou]-Hex;Hex[^5]-Unicode", - "Unicode[^5]-Hex;Hex[aeiou]-Unicode", + "Any[aeiou]-Hex;Hex[^5]-Any", + "Any[^5]-Hex;Hex[aeiou]-Any", "quizzical", "q\\u0075izzical", @@ -1218,7 +1219,7 @@ void TransliteratorTest::TestCompoundRBT(void) { // Careful with spacing and ';' here: Phrase this exactly // as toRules() is going to return it. If toRules() changes // with regard to spacing or ';', then adjust this string. 
- UnicodeString rule("::Hex-Unicode;\n" + UnicodeString rule("::Hex-Any;\n" "::Any-Lower;\n" "a > '.A.';\n" "b > '.B.';\n" @@ -1553,6 +1554,42 @@ void TransliteratorTest::TestQuantifier() { "bb x xb"); } +/** + * Test Source-Target/Variant. + */ +void TransliteratorTest::TestSTV(void) { + int32_t ns = Transliterator::countAvailableSources(); + for (int32_t i=0; i