From 8f5a93d1e7093e055b6f39dc72719e61e110baf1 Mon Sep 17 00:00:00 2001 From: tatiana-yan Date: Wed, 1 Jul 2020 15:42:20 +0300 Subject: [PATCH] [coding] Transliteration: support Hiragana-Katakana transliteration; threadsafe Init(). --- coding/transliteration.cpp | 123 ++++++++++++++++++++++++------------- coding/transliteration.hpp | 16 ++++- 2 files changed, 93 insertions(+), 46 deletions(-) diff --git a/coding/transliteration.cpp b/coding/transliteration.cpp index 09d71bb189..fc3939d524 100644 --- a/coding/transliteration.cpp +++ b/coding/transliteration.cpp @@ -47,9 +47,16 @@ Transliteration & Transliteration::Instance() void Transliteration::Init(std::string const & icuDataDir) { - // This function should be called at most once in a process, - // before the first ICU operation that will require the loading of an ICU data file. - // This function is not thread-safe. Use it before calling ICU APIs from multiple threads. + // Fast atomic check before mutex lock. + if (m_inited) + return; + + std::lock_guard lock(m_initializationMutex); + if (m_inited) + return; + + // This function should be called before the first ICU operation that will require the loading of + // an ICU data file. u_setDataDirectory(icuDataDir.c_str()); for (auto const & lang : StringUtf8Multilang::GetSupportedLanguages()) @@ -60,6 +67,11 @@ void Transliteration::Init(std::string const & icuDataDir) m_transliterators.emplace(t, std::make_unique()); } } + + // We need "Hiragana-Katakana" for strings normalization, not for latin transliteration. + // That's why it is not mentioned in StringUtf8Multilang transliterators list. + m_transliterators.emplace("Hiragana-Katakana", std::make_unique()); + m_inited = true; } void Transliteration::SetMode(Transliteration::Mode mode) @@ -67,7 +79,66 @@ void Transliteration::SetMode(Transliteration::Mode mode) m_mode = mode; } -bool Transliteration::Transliterate(std::string const & str, int8_t langCode, std::string & out) const +bool Transliteration::Transliterate(std::string transliteratorId, UnicodeString & ustr) const +{ + CHECK(!transliteratorId.empty(), (transliteratorId)); + + auto it = m_transliterators.find(transliteratorId); + if (it == m_transliterators.end()) + { + LOG(LWARNING, ("Transliteration failed, unknown transliterator \"", transliteratorId, "\"")); + return false; + } + + if (!it->second->m_initialized) + { + std::lock_guard lock(it->second->m_mutex); + if (!it->second->m_initialized) + { + UErrorCode status = U_ZERO_ERROR; + + std::string const removeDiacriticRule = + ";NFD;[\u02B9-\u02D3\u0301-\u0358\u00B7\u0027]Remove;NFC"; + transliteratorId.append(removeDiacriticRule); + + UnicodeString translitId(transliteratorId.c_str()); + + it->second->m_transliterator.reset( + Transliterator::createInstance(translitId, UTRANS_FORWARD, status)); + + if (it->second->m_transliterator == nullptr) + { + LOG(LWARNING, + ("Cannot create transliterator \"", transliteratorId, "\", icu error =", status)); + } + + it->second->m_initialized = true; + } + } + + if (it->second->m_transliterator == nullptr) + return false; + + it->second->m_transliterator->transliterate(ustr); + + if (ustr.isEmpty()) + return false; + + return true; +} + +bool Transliteration::Transliterate(std::string const & str, std::string transliteratorId, + std::string & out) const +{ + UnicodeString ustr(str.c_str()); + auto const res = Transliterate(transliteratorId, ustr); + if (res) + ustr.toUTF8String(out); + return res; +} + +bool Transliteration::Transliterate(std::string const & str, int8_t langCode, + std::string & out) const { if (m_mode != Mode::Enabled) return false; @@ -82,50 +153,14 @@ bool Transliteration::Transliterate(std::string const & str, int8_t langCode, st UnicodeString ustr(str.c_str()); for (auto transliteratorId : transliteratorsIds) { - CHECK(!transliteratorId.empty(), (transliteratorId)); - - auto it = m_transliterators.find(transliteratorId); - if (it == m_transliterators.end()) - { - LOG(LWARNING, ("Transliteration failed, unknown transliterator \"", transliteratorId, "\"")); - continue; - } - - if (!it->second->m_initialized) - { - std::lock_guard lock(it->second->m_mutex); - if (!it->second->m_initialized) - { - UErrorCode status = U_ZERO_ERROR; - - std::string const removeDiacriticRule = - ";NFD;[\u02B9-\u02D3\u0301-\u0358\u00B7\u0027]Remove;NFC"; - transliteratorId.append(removeDiacriticRule); - - UnicodeString translitId(transliteratorId.c_str()); - - it->second->m_transliterator.reset( - Transliterator::createInstance(translitId, UTRANS_FORWARD, status)); - - if (it->second->m_transliterator == nullptr) - { - LOG(LWARNING, - ("Cannot create transliterator \"", transliteratorId, "\", icu error =", status)); - } - - it->second->m_initialized = true; - } - } - - if (it->second->m_transliterator == nullptr) + if (!Transliterate(transliteratorId, ustr)) continue; - it->second->m_transliterator->transliterate(ustr); - - if (ustr.isEmpty()) - return false; } + if (ustr.isEmpty()) + return false; + ustr.toUTF8String(out); return true; } diff --git a/coding/transliteration.hpp b/coding/transliteration.hpp index cefe519d82..e2c1e73fef 100644 --- a/coding/transliteration.hpp +++ b/coding/transliteration.hpp @@ -4,8 +4,14 @@ #include #include #include +#include #include +namespace icu +{ +class UnicodeString; +} // namespace icu + class Transliteration { public: @@ -22,13 +28,19 @@ public: void Init(std::string const & icuDataDir); void SetMode(Mode mode); + bool Transliterate(std::string const & str, std::string transliteratorId, + std::string & out) const; bool Transliterate(std::string const & str, int8_t langCode, std::string & out) const; private: + struct TransliteratorInfo; + Transliteration(); - std::atomic m_mode; + bool Transliterate(std::string transliteratorId, icu::UnicodeString & ustr) const; - struct TransliteratorInfo; + std::mutex m_initializationMutex; + std::atomic m_inited; + std::atomic m_mode; std::map> m_transliterators; };