diff --git a/coding/string_utf8_multilang.cpp b/coding/string_utf8_multilang.cpp index 86fe581068..2579255e0c 100644 --- a/coding/string_utf8_multilang.cpp +++ b/coding/string_utf8_multilang.cpp @@ -15,70 +15,70 @@ namespace // Note that it's not feasible to increase languages number here due to current encoding (6 bit to // store language code). array const kLanguages = { - {{"default", "Native for each country", "Any-Latin"}, - {"en", "English", ""}, - {"ja", "日本語", ""}, - {"fr", "Français", ""}, - {"ko_rm", "Korean (Romanized)", "Korean-Latin/BGN"}, - {"ar", "العربية", "Any-Latin"}, - {"de", "Deutsch", ""}, - {"int_name", "International (Latin)", "Any-Latin"}, - {"ru", "Русский", "Russian-Latin/BGN"}, - {"sv", "Svenska", ""}, - {"zh", "中文", "Any-Latin"}, - {"fi", "Suomi", ""}, - {"be", "Беларуская", "Belarusian-Latin/BGN"}, - {"ka", "ქართული", "Georgian-Latin"}, - {"ko", "한국어", "Hangul-Latin/BGN"}, - {"he", "עברית", "Hebrew-Latin"}, - {"nl", "Nederlands", ""}, - {"ga", "Gaeilge", ""}, - {"ja_rm", "Japanese (Romanized)", "Any-Latin"}, - {"el", "Ελληνικά", "Greek-Latin"}, - {"it", "Italiano", ""}, - {"es", "Español", ""}, - {"zh_pinyin", "Chinese (Pinyin)", "Any-Latin"}, - {"th", "ไทย", ""}, // Thai-Latin - {"cy", "Cymraeg", ""}, - {"sr", "Српски", "Serbian-Latin/BGN"}, - {"uk", "Українська", "Ukrainian-Latin/BGN"}, - {"ca", "Català", ""}, - {"hu", "Magyar", ""}, - {StringUtf8Multilang::kReservedLang /* hsb */, "", ""}, - {"eu", "Euskara", ""}, - {"fa", "فارسی", "Any-Latin"}, - {StringUtf8Multilang::kReservedLang /* br */, "", ""}, - {"pl", "Polski", ""}, - {"hy", "Հայերէն", "Armenian-Latin"}, - {StringUtf8Multilang::kReservedLang /* kn */, "", ""}, - {"sl", "Slovenščina", ""}, - {"ro", "Română", ""}, - {"sq", "Shqip", ""}, - {"am", "አማርኛ", "Amharic-Latin/BGN"}, - {"no", "Norsk", ""}, // Was "fy" before December 2018. - {"cs", "Čeština", ""}, - {"id", "Bahasa Indonesia", ""}, // Was "gd" before December 2018. - {"sk", "Slovenčina", ""}, - {"af", "Afrikaans", ""}, - {"ja_kana", "日本語(カタカナ)", "Katakana-Latin"}, - {StringUtf8Multilang::kReservedLang /* lb */, "", ""}, - {"pt", "Português", ""}, - {"hr", "Hrvatski", ""}, - {"da", "Dansk", ""}, // Was "fur" before December 2018. - {"vi", "Tiếng Việt", ""}, - {"tr", "Türkçe", ""}, - {"bg", "Български", "Bulgarian-Latin/BGN"}, - {"alt_name", "Alternative name", "Any-Latin"}, // Was "eo" before December 2018. - {"lt", "Lietuvių", ""}, - {"old_name", "Old/Previous name", "Any-Latin"}, // Was "la" before December 2018. - {"kk", "Қазақ", "Kazakh-Latin/BGN"}, - {StringUtf8Multilang::kReservedLang /* gsw */, "", ""}, - {"et", "Eesti", ""}, - {"ku", "Kurdish", "Any-Latin"}, - {"mn", "Mongolian", "Mongolian-Latin/BGN"}, - {"mk", "Македонски", "Macedonian-Latin/BGN"}, - {"lv", "Latviešu", ""}, - {"hi", "हिन्दी", "Any-Latin"}}}; + {{"default", "Native for each country", {"Any-Latin"}}, + {"en", "English", {}}, + {"ja", "日本語", {}}, + {"fr", "Français", {}}, + {"ko_rm", "Korean (Romanized)", {"Korean-Latin/BGN"}}, + {"ar", "العربية", {"Any-Latin"}}, + {"de", "Deutsch", {}}, + {"int_name", "International (Latin)", {"Any-Latin"}}, + {"ru", "Русский", {"Russian-Latin/BGN"}}, + {"sv", "Svenska", {}}, + {"zh", "中文", {"Any-Latin"}}, + {"fi", "Suomi", {}}, + {"be", "Беларуская", {"Belarusian-Latin/BGN"}}, + {"ka", "ქართული", {"Georgian-Latin"}}, + {"ko", "한국어", {"Hangul-Latin/BGN"}}, + {"he", "עברית", {"Hebrew-Latin"}}, + {"nl", "Nederlands", {}}, + {"ga", "Gaeilge", {}}, + {"ja_rm", "Japanese (Romanized)", {"Any-Latin"}}, + {"el", "Ελληνικά", {"Greek-Latin"}}, + {"it", "Italiano", {}}, + {"es", "Español", {}}, + {"zh_pinyin", "Chinese (Pinyin)", {"Any-Latin"}}, + {"th", "ไทย", {}}, // Thai-Latin + {"cy", "Cymraeg", {}}, + {"sr", "Српски", {"Serbian-Latin/BGN"}}, + {"uk", "Українська", {"Ukrainian-Latin/BGN"}}, + {"ca", "Català", {}}, + {"hu", "Magyar", {}}, + {StringUtf8Multilang::kReservedLang /* hsb */, "", {}}, + {"eu", "Euskara", {}}, + {"fa", "فارسی", {"Any-Latin"}}, + {StringUtf8Multilang::kReservedLang /* br */, "", {}}, + {"pl", "Polski", {}}, + {"hy", "Հայերէն", {"Armenian-Latin"}}, + {StringUtf8Multilang::kReservedLang /* kn */, "", {}}, + {"sl", "Slovenščina", {}}, + {"ro", "Română", {}}, + {"sq", "Shqip", {}}, + {"am", "አማርኛ", {"Amharic-Latin/BGN"}}, + {"no", "Norsk", {}}, // Was "fy" before December 2018. + {"cs", "Čeština", {}}, + {"id", "Bahasa Indonesia", {}}, // Was "gd" before December 2018. + {"sk", "Slovenčina", {}}, + {"af", "Afrikaans", {}}, + {"ja_kana", "日本語(カタカナ)", {"Katakana-Latin", "Hiragana-Latin"}}, + {StringUtf8Multilang::kReservedLang /* lb */, "", {}}, + {"pt", "Português", {}}, + {"hr", "Hrvatski", {}}, + {"da", "Dansk", {}}, // Was "fur" before December 2018. + {"vi", "Tiếng Việt", {}}, + {"tr", "Türkçe", {}}, + {"bg", "Български", {"Bulgarian-Latin/BGN"}}, + {"alt_name", "Alternative name", {"Any-Latin"}}, // Was "eo" before December 2018. + {"lt", "Lietuvių", {}}, + {"old_name", "Old/Previous name", {"Any-Latin"}}, // Was "la" before December 2018. + {"kk", "Қазақ", {"Kazakh-Latin/BGN"}}, + {StringUtf8Multilang::kReservedLang /* gsw */, "", {}}, + {"et", "Eesti", {}}, + {"ku", "Kurdish", {"Any-Latin"}}, + {"mn", "Mongolian", {"Mongolian-Latin/BGN"}}, + {"mk", "Македонски", {"Macedonian-Latin/BGN"}}, + {"lv", "Latviešu", {}}, + {"hi", "हिन्दी", {"Any-Latin"}}}}; static_assert( kLanguages.size() == StringUtf8Multilang::kMaxSupportedLanguages, @@ -137,7 +137,7 @@ char const * StringUtf8Multilang::GetLangByCode(int8_t langCode) if (!IsSupportedLangCode(langCode)) return ""; - return kLanguages[langCode].m_code; + return kLanguages[langCode].m_code.c_str(); } // static @@ -146,16 +146,17 @@ char const * StringUtf8Multilang::GetLangNameByCode(int8_t langCode) if (!IsSupportedLangCode(langCode)) return ""; - return kLanguages[langCode].m_name; + return kLanguages[langCode].m_name.c_str(); } // static -char const * StringUtf8Multilang::GetTransliteratorIdByCode(int8_t langCode) +vector const & StringUtf8Multilang::GetTransliteratorsIdsByCode(int8_t langCode) { + static const vector empty; if (!IsSupportedLangCode(langCode)) - return ""; + return empty; - return kLanguages[langCode].m_transliteratorId; + return kLanguages[langCode].m_transliteratorsIds; } size_t StringUtf8Multilang::GetNextIndex(size_t i) const diff --git a/coding/string_utf8_multilang.hpp b/coding/string_utf8_multilang.hpp index 9e663af9e0..108257a8b3 100644 --- a/coding/string_utf8_multilang.hpp +++ b/coding/string_utf8_multilang.hpp @@ -13,6 +13,7 @@ #include #include #include +#include namespace utils { @@ -67,11 +68,11 @@ public: struct Lang { /// OSM language code (e.g. for name:en it's "en" part). - char const * m_code; + std::string m_code; /// Native language name. - char const * m_name; - /// Transliterator to latin id. - char const * m_transliteratorId; + std::string m_name; + /// Transliterators to latin ids. + std::vector m_transliteratorsIds; }; struct Position @@ -107,8 +108,8 @@ public: static char const * GetLangByCode(int8_t langCode); /// @returns empty string if langCode is invalid. static char const * GetLangNameByCode(int8_t langCode); - /// @returns empty string if langCode is invalid. - static char const * GetTransliteratorIdByCode(int8_t langCode); + /// @returns empty vector if langCode is invalid. + static std::vector const & GetTransliteratorsIdsByCode(int8_t langCode); inline bool operator==(StringUtf8Multilang const & rhs) const { return m_s == rhs.m_s; } inline bool operator!=(StringUtf8Multilang const & rhs) const { return !(*this == rhs); } diff --git a/coding/transliteration.cpp b/coding/transliteration.cpp index 22f21d2bcf..f0ccf2c405 100644 --- a/coding/transliteration.cpp +++ b/coding/transliteration.cpp @@ -54,10 +54,11 @@ void Transliteration::Init(std::string const & icuDataDir) for (auto const & lang : StringUtf8Multilang::GetSupportedLanguages()) { - if (strlen(lang.m_transliteratorId) == 0 || m_transliterators.count(lang.m_transliteratorId) != 0) - continue; - - m_transliterators.emplace(lang.m_transliteratorId, std::make_unique()); + for (auto const & t : lang.m_transliteratorsIds) + { + if (m_transliterators.count(t) == 0) + m_transliterators.emplace(t, std::make_unique()); + } } } @@ -74,47 +75,57 @@ bool Transliteration::Transliterate(std::string const & str, int8_t langCode, st if (str.empty() || strings::IsASCIIString(str)) return false; - std::string transliteratorId(StringUtf8Multilang::GetTransliteratorIdByCode(langCode)); - - if (transliteratorId.empty()) - return false; - - auto it = m_transliterators.find(transliteratorId); - if (it == m_transliterators.end()) - { - LOG(LWARNING, ("Transliteration failed, unknown transliterator \"", transliteratorId, "\"")); - return false; - } - - if (!it->second->m_initialized) - { - std::lock_guard lock(it->second->m_mutex); - if (!it->second->m_initialized) - { - UErrorCode status = U_ZERO_ERROR; - - std::string const removeDiacriticRule = ";NFD;[\u02B9-\u02D3\u0301-\u0358\u00B7\u0027]Remove;NFC"; - transliteratorId.append(removeDiacriticRule); - - UnicodeString translitId(transliteratorId.c_str()); - - it->second->m_transliterator.reset(Transliterator::createInstance(translitId, UTRANS_FORWARD, status)); - - if (it->second->m_transliterator == nullptr) - LOG(LWARNING, ("Cannot create transliterator \"", transliteratorId, "\", icu error =", status)); - - it->second->m_initialized = true; - } - } - - if (it->second->m_transliterator == nullptr) + auto const & transliteratorsIds = StringUtf8Multilang::GetTransliteratorsIdsByCode(langCode); + if (transliteratorsIds.empty()) return false; UnicodeString ustr(str.c_str()); - it->second->m_transliterator->transliterate(ustr); + for (auto transliteratorId : transliteratorsIds) + { + if (transliteratorId.empty()) + return false; - if (ustr.isEmpty()) - return false; + auto it = m_transliterators.find(transliteratorId); + if (it == m_transliterators.end()) + { + LOG(LWARNING, ("Transliteration failed, unknown transliterator \"", transliteratorId, "\"")); + return false; + } + + if (!it->second->m_initialized) + { + std::lock_guard lock(it->second->m_mutex); + if (!it->second->m_initialized) + { + UErrorCode status = U_ZERO_ERROR; + + std::string const removeDiacriticRule = + ";NFD;[\u02B9-\u02D3\u0301-\u0358\u00B7\u0027]Remove;NFC"; + transliteratorId.append(removeDiacriticRule); + + UnicodeString translitId(transliteratorId.c_str()); + + it->second->m_transliterator.reset( + Transliterator::createInstance(translitId, UTRANS_FORWARD, status)); + + if (it->second->m_transliterator == nullptr) + { + LOG(LWARNING, + ("Cannot create transliterator \"", transliteratorId, "\", icu error =", status)); + } + + it->second->m_initialized = true; + } + } + + if (it->second->m_transliterator == nullptr) + return false; + + it->second->m_transliterator->transliterate(ustr); + + if (ustr.isEmpty()) + return false; + } ustr.toUTF8String(out); return true; diff --git a/map/map_tests/transliteration_test.cpp b/map/map_tests/transliteration_test.cpp index 8e8a4662a7..b764af1aae 100644 --- a/map/map_tests/transliteration_test.cpp +++ b/map/map_tests/transliteration_test.cpp @@ -38,6 +38,8 @@ UNIT_TEST(Transliteration_CompareSamples) TestTransliteration(translit, "hy", "Հայերէն", "Hayeren"); TestTransliteration(translit, "am", "አማርኛ", "amarinya"); TestTransliteration(translit, "ja_kana", "カタカナ", "katakana"); + TestTransliteration(translit, "ja_kana", "ひらがな", "hiragana"); + TestTransliteration(translit, "ja_kana", "カタカナ ひらがな", "katakana hiragana"); TestTransliteration(translit, "bg", "Български", "Bulgarski"); TestTransliteration(translit, "kk", "Қазақ", "Qazaq"); TestTransliteration(translit, "mn", "Монгол хэл", "Mongol hel");