Using a pool of transliterators.

This commit is contained in:
Daria Volvenkova 2017-03-24 14:22:54 +03:00
parent 955d43ea1e
commit bfdc4ee246
15 changed files with 165 additions and 67 deletions

View file

@ -5,7 +5,8 @@ add_definitions(
-DU_STATIC_IMPLEMENTATION
-DU_COMMON_IMPLEMENTATION
-DU_I18N_IMPLEMENTATION
-DU_NO_DEFAULT_INCLUDE_UTF_HEADERS)
-DU_NO_DEFAULT_INCLUDE_UTF_HEADERS
-DU_DISABLE_RENAMING)
add_compile_options(
"-Wall"

View file

@ -13,7 +13,8 @@ DEFINES *= U_CHARSET_IS_UTF8=1
DEFINES *= U_STATIC_IMPLEMENTATION
DEFINES *= U_COMMON_IMPLEMENTATION
DEFINES *= U_I18N_IMPLEMENTATION
#DEFINES *= U_NO_DEFAULT_INCLUDE_UTF_HEADERS
DEFINES *= U_NO_DEFAULT_INCLUDE_UTF_HEADERS
DEFINES *= U_DISABLE_RENAMING
INCLUDEPATH *= common \
i18n

View file

@ -1,8 +1,12 @@
project(coding)
add_definitions(-DU_DISABLE_RENAMING)
include_directories(
${OMIM_ROOT}/coding
${OMIM_ROOT}/3party/expat
${OMIM_ROOT}/3party/icu/common
${OMIM_ROOT}/3party/icu/i18n
)
set(
@ -66,7 +70,9 @@ set(
streams_sink.hpp
succinct_mapper.hpp
traffic.cpp
traffic.hpp
traffic.hpp
transliteration.cpp
transliteration.hpp
uri.cpp
uri.hpp
url_encode.hpp

View file

@ -2,6 +2,9 @@
TARGET = coding
TEMPLATE = lib
CONFIG += staticlib warn_on
INCLUDEPATH += ../3party/icu/common ../3party/icu/i18n
DEFINES *= U_DISABLE_RENAMING
ROOT_DIR = ..
@ -24,6 +27,7 @@ SOURCES += \
reader_writer_ops.cpp \
simple_dense_coding.cpp \
traffic.cpp \
transliteration.cpp \
uri.cpp \
# varint_vector.cpp \
zip_creator.cpp \
@ -76,6 +80,7 @@ HEADERS += \
streams_sink.hpp \
succinct_mapper.hpp \
traffic.hpp \
transliteration.hpp \
uri.hpp \
url_encode.hpp \
value_opt_string.hpp \

View file

@ -9,22 +9,22 @@ namespace
// Languages below were chosen after sorting name:<lang> tags in 2011.
// Note that it's not feasible to increase the number of languages here due to
// our current encoding (6 bits to store a language code).
StringUtf8Multilang::Languages const g_languages = {{ {"default", "Native for each country", "Any"},
{"en", "English", "English"}, {"ja", "日本語", "Japanese"}, {"fr", "Français", "French"}, {"ko_rm", "Korean (Romanized)", "Korean"},
{"ar", "العربية", "Arabic"}, {"de", "Deutsch", "German"}, {"int_name", "International (Latin)", "Latin"}, {"ru", "Русский", "Russian"},
{"sv", "Svenska", "Swedish"}, {"zh", "中文", "Chinese"}, {"fi", "Suomi", "Finnish"}, {"be", "Беларуская", "Belarusian"}, {"ka", "ქართული", "Georgian"},
{"ko", "한국어", "Korean"}, {"he", "עברית", "Hebrew"}, {"nl", "Nederlands", "Dutch"}, {"ga", "Gaeilge", "Irish"},
{"ja_rm", "Japanese (Romanized)", "Japanese"}, {"el", "Ελληνικά", "Greek"}, {"it", "Italiano", "Italian"}, {"es", "Español", "Spanish"},
{"zh_pinyin", "Chinese (Pinyin)", "Chinese"}, {"th", "ไทย", "Thailand"}, {"cy", "Cymraeg", "Welsh"}, {"sr", "Српски", "Serbian"},
{"uk", "Українська", "Ukrainian"}, {"ca", "Català", "Catalan"}, {"hu", "Magyar", "Hungarian"}, {"hsb", "Hornjoserbšćina", "Upper Sorbian"}, {"eu", "Euskara", "Basque"},
{"fa", "فارسی", "Farsi"}, {"br", "Breton", "Breton"}, {"pl", "Polski", "Polish"}, {"hy", "Հայերէն", "Armenian"}, {"kn", "ಕನ್ನಡ", "Kannada"},
{"sl", "Slovenščina", "Slovene"}, {"ro", "Română", "Romanian"}, {"sq", "Shqipe", "Shqipe"}, {"am", "አማርኛ", "Amharic"}, {"fy", "Frysk", "Frisian"},
{"cs", "Čeština", "Czech"}, {"gd", "Gàidhlig", "Scots Gaelic"}, {"sk", "Slovenčina", "Slovak"}, {"af", "Afrikaans", "Afrikaans"},
{"ja_kana", "日本語(カタカナ)", "Japanese (Katakana)"}, {"lb", "Luxembourgish", "Luxembourgish"}, {"pt", "Português", "Portuguese"}, {"hr", "Hrvatski", "Croatian"},
{"fur", "Friulian", "Friulian"}, {"vi", "Tiếng Việt", "Vietnamese"}, {"tr", "Türkçe", "Turkish"}, {"bg", "Български", "Bulgarian"},
{"eo", "Esperanto", "Esperanto"}, {"lt", "Lietuvių", "Lithuanian"}, {"la", "Latin", "Latin"}, {"kk", "Қазақ", "Kazakh"},
{"gsw", "Schwiizertüütsch", "Swiss German"}, {"et", "Eesti", "Estonian"}, {"ku", "Kurdish", "Kurdish"}, {"mn", "Mongolian", "Mongolian"},
{"mk", "Македонски", "Macedonian"}, {"lv", "Latviešu", "Latvian"}, {"hi", "हिन्दी", "Hindi"}
StringUtf8Multilang::Languages const g_languages = {{ {"default", "Native for each country", "Any-Latin"},
{"en", "English", ""}, {"ja", "日本語", "Any-Latin"}, {"fr", "Français", ""}, {"ko_rm", "Korean (Romanized)", "Korean-Latin/BGN"},
{"ar", "العربية", "Any-Latin"}, {"de", "Deutsch", ""}, {"int_name", "International (Latin)", "Any-Latin"}, {"ru", "Русский", "Russian-Latin/BGN"},
{"sv", "Svenska", "Any-Latin"}, {"zh", "中文", "Any-Latin"}, {"fi", "Suomi", "Any-Latin"}, {"be", "Беларуская", "Belarusian-Latin/BGN"}, {"ka", "ქართული", "Georgian-Latin"},
{"ko", "한국어", "Hangul"}, {"he", "עברית", "Hebrew"}, {"nl", "Nederlands", ""}, {"ga", "Gaeilge", "Any-Latin"},
{"ja_rm", "Japanese (Romanized)", "Any-Latin"}, {"el", "Ελληνικά", "Greek-Latin"}, {"it", "Italiano", ""}, {"es", "Español", ""},
{"zh_pinyin", "Chinese (Pinyin)", "Any-Latin"}, {"th", "ไทย", "Thai-Latin"}, {"cy", "Cymraeg", "Any-Latin"}, {"sr", "Српски", "Serbian-Latin/BGN"},
{"uk", "Українська", "Ukrainian-Latin/BGN"}, {"ca", "Català", "Any-Latin"}, {"hu", "Magyar", "Any-Latin"}, {"hsb", "Hornjoserbšćina", "Any-Latin"}, {"eu", "Euskara", "Any-Latin"},
{"fa", "فارسی", "Any-Latin"}, {"br", "Breton", "Any-Latin"}, {"pl", "Polski", "Any-Latin"}, {"hy", "Հայերէն", "Armenian-Latin"}, {"kn", "ಕನ್ನಡ", "Kannada-Latin"},
{"sl", "Slovenščina", "Any-Latin"}, {"ro", "Română", "Any-Latin"}, {"sq", "Shqipe", "Any-Latin"}, {"am", "አማርኛ", "Amharic-Latin/BGN"}, {"fy", "Frysk", "Any-Latin"},
{"cs", "Čeština", "Any-Latin"}, {"gd", "Gàidhlig", "Any-Latin"}, {"sk", "Slovenčina", "Any-Latin"}, {"af", "Afrikaans", "Any-Latin"},
{"ja_kana", "日本語(カタカナ)", "Katakana-Latin"}, {"lb", "Luxembourgish", "Any-Latin"}, {"pt", "Português", "Any-Latin"}, {"hr", "Hrvatski", "Any-Latin"},
{"fur", "Friulian", "Any-Latin"}, {"vi", "Tiếng Việt", "Any-Latin"}, {"tr", "Türkçe", "Any-Latin"}, {"bg", "Български", "Bulgarian-Latin/BGN"},
{"eo", "Esperanto", "Any-Latin"}, {"lt", "Lietuvių", "Any-Latin"}, {"la", "Latin", ""}, {"kk", "Қазақ", "Kazakh-Latin/BGN"},
{"gsw", "Schwiizertüütsch", "Any-Latin"}, {"et", "Eesti", "Any-Latin"}, {"ku", "Kurdish", "Any-Latin"}, {"mn", "Mongolian", "Mongolian-Latin/BGN"},
{"mk", "Македонски", "Macedonian-Latin/BGN"}, {"lv", "Latviešu", "Any-Latin"}, {"hi", "हिन्दी", "Any-Latin"}
}};
static_assert(g_languages.size() == StringUtf8Multilang::kMaxSupportedLanguages,
@ -72,11 +72,11 @@ char const * StringUtf8Multilang::GetLangNameByCode(int8_t langCode)
}
// static
char const * StringUtf8Multilang::GetLangEnNameByCode(int8_t langCode)
char const * StringUtf8Multilang::GetTransliteratorIdByCode(int8_t langCode)
{
if (langCode < 0 || langCode >= static_cast<int8_t>(g_languages.size()))
return "";
return g_languages[langCode].m_enName;
return g_languages[langCode].m_transliteratorId;
}
size_t StringUtf8Multilang::GetNextIndex(size_t i) const

View file

@ -50,8 +50,8 @@ public:
char const * m_code;
/// Native language name.
char const * m_name;
/// Native language name in English.
char const * m_enName;
/// ID of the ICU transliterator that converts this language to Latin (empty if there is none).
char const * m_transliteratorId;
};
using Languages = array<Lang, kMaxSupportedLanguages>;
@ -64,7 +64,7 @@ public:
/// @returns empty string if langCode is invalid.
static char const * GetLangNameByCode(int8_t langCode);
/// @returns empty string if langCode is invalid.
static char const * GetLangEnNameByCode(int8_t langCode);
static char const * GetTransliteratorIdByCode(int8_t langCode);
inline bool operator== (StringUtf8Multilang const & rhs) const
{

View file

@ -0,0 +1,59 @@
#include "coding/transliteration.hpp"
#include "coding/multilang_utf8_string.hpp"
#include "base/logging.hpp"
#include "3party/icu/common/unicode/unistr.h"
#include "3party/icu/common/unicode/utypes.h"
#include "3party/icu/i18n/unicode/translit.h"
#include "3party/icu/i18n/unicode/utrans.h"
// Defined in this translation unit rather than in the header: the header only
// forward-declares icu::Transliterator, while this file includes its full
// definition, which is needed to destroy the owned transliterators.
Transliteration::~Transliteration()
{
// NOTE(review): the u_cleanup() call below is left disabled; the reason is not
// visible here — confirm before re-enabling it.
//u_cleanup();
}
/// @returns the single shared Transliteration object. It is a function-local
/// static, so it is constructed the first time this function is called.
Transliteration & Transliteration::GetInstance()
{
  static Transliteration s_singleton;
  return s_singleton;
}
void Transliteration::Init(std::string const & icuDataDir)
{
u_setDataDirectory(icuDataDir.c_str());
for (auto const & lang : StringUtf8Multilang::GetSupportedLanguages())
{
if (strlen(lang.m_transliteratorId) == 0 || m_transliterators.count(lang.m_transliteratorId) != 0)
continue;
UErrorCode status = U_ZERO_ERROR;
std::unique_ptr<Transliterator> transliterator(
Transliterator::createInstance(lang.m_transliteratorId, UTRANS_FORWARD, status));
if (transliterator != nullptr)
m_transliterators.emplace(lang.m_transliteratorId, std::move(transliterator));
else
LOG(LWARNING, ("Cannot create transliterator \"", lang.m_transliteratorId, "\", icu error =", status));
}
}
std::string Transliteration::Transliterate(std::string const & str, int8_t langCode) const
{
auto const transliteratorId = StringUtf8Multilang::GetTransliteratorIdByCode(langCode);
auto const & it = m_transliterators.find(transliteratorId);
if (it == m_transliterators.end())
{
LOG(LWARNING, ("Transliteration failed, unknown transliterator \"", transliteratorId, "\""));
return "";
}
UnicodeString ustr(str.c_str());
it->second->transliterate(ustr);
std::string resultStr;
ustr.toUTF8String(resultStr);
return resultStr;
}

View file

@ -0,0 +1,28 @@
#pragma once
#include <cstdint>
#include <map>
#include <memory>
#include <string>
namespace icu
{
class Transliterator;
}

/// Singleton that owns a pool of ICU transliterators keyed by transliterator
/// id, so that a transliterator is created once in Init() and then reused by
/// every Transliterate() call.
class Transliteration
{
public:
  ~Transliteration();

  // The only instance is the one returned by GetInstance().
  Transliteration(Transliteration const &) = delete;
  Transliteration & operator=(Transliteration const &) = delete;

  /// @returns the single shared instance.
  static Transliteration & GetInstance();

  /// Sets the ICU data directory and creates the transliterators for all
  /// supported languages that have a non-empty transliterator id.
  /// @param icuDataDir directory passed to ICU as its data directory.
  void Init(std::string const & icuDataDir);

  /// @param str      UTF-8 text to convert.
  /// @param langCode language code selecting the transliterator.
  /// @returns the transliterated text, or an empty string if no transliterator
  ///          is available for langCode.
  std::string Transliterate(std::string const & str, int8_t langCode) const;

private:
  Transliteration() = default;

  /// Transliterator id -> owned transliterator.
  std::map<std::string, std::unique_ptr<icu::Transliterator>> m_transliterators;
};

View file

@ -1,6 +1,6 @@
project(indexer)
include_directories(${OMIM_ROOT}/3party/protobuf/src ${OMIM_ROOT}/3party/icu/common ${OMIM_ROOT}/3party/icu/i18n)
include_directories(${OMIM_ROOT}/3party/protobuf/src)
set(
SRC

View file

@ -8,14 +8,10 @@
#include "geometry/point2d.hpp"
#include "coding/multilang_utf8_string.hpp"
#include "coding/transliteration.hpp"
#include "base/base.hpp"
#include "3party/icu/i18n/unicode/translit.h"
#include "3party/icu/i18n/unicode/utrans.h"
#include "3party/icu/common/unicode/utypes.h"
#include "3party/icu/common/unicode/unistr.h"
#include "std/vector.hpp"
namespace
@ -44,13 +40,13 @@ void GetTransliteratedName(feature::RegionData const & regionData, StringUtf8Mul
{
if (src.GetString(code, srcName))
{
out = Transliterate(srcName, StringUtf8Multilang::GetLangEnNameByCode(code));
out = Transliteration::GetInstance().Transliterate(srcName, code);
if (!out.empty())
return;
}
}
if (!mwmLangCodes.empty() && src.GetString(StringUtf8Multilang::kDefaultCode, srcName))
out = Transliterate(srcName, StringUtf8Multilang::GetLangEnNameByCode(mwmLangCodes[0]));
out = Transliteration::GetInstance().Transliterate(srcName, mwmLangCodes[0]);
}
void GetBestName(StringUtf8Multilang const & src, vector<int8_t> const & priorityList, string & out)
@ -87,34 +83,6 @@ void GetBestName(StringUtf8Multilang const & src, vector<int8_t> const & priorit
}
} // namespace
void initICU(std::string const & icuDataDir)
{
u_setDataDirectory(icuDataDir.c_str());
}
std::string Transliterate(std::string const & str, std::string const & lang)
{
UnicodeString ustr(str.c_str());
UErrorCode status = U_ZERO_ERROR;
const std::string id = lang + "-Latin/BGN";
std::unique_ptr<Transliterator> latinTransliterator(Transliterator::createInstance(id.c_str(), UTRANS_FORWARD, status));
if (latinTransliterator == nullptr)
{
LOG(LWARNING, ("Cannot create transliterator", id));
return "";
}
latinTransliterator->transliterate(ustr);
std::string resultStr;
ustr.toUTF8String(resultStr);
LOG(LDEBUG, ("Transliterated", str, "->", resultStr, "id =", id));
return resultStr;
}
namespace feature
{

View file

@ -7,9 +7,6 @@
struct FeatureID;
class StringUtf8Multilang;
void initICU(std::string const & icuDataDir);
std::string Transliterate(std::string const & str, std::string const & lang);
namespace feature
{
class TypesHolder;

View file

@ -3,7 +3,7 @@
TARGET = indexer
TEMPLATE = lib
CONFIG += staticlib warn_on
INCLUDEPATH += ../3party/protobuf/src ../3party/icu/common ../3party/icu/i18n
INCLUDEPATH += ../3party/protobuf/src
ROOT_DIR = ..

View file

@ -71,9 +71,10 @@
#include "platform/socket.hpp"
#include "coding/internal/file_data.hpp"
#include "coding/zip_reader.hpp"
#include "coding/url_encode.hpp"
#include "coding/file_name_utils.hpp"
#include "coding/transliteration.hpp"
#include "coding/url_encode.hpp"
#include "coding/zip_reader.hpp"
#include "geometry/angles.hpp"
#include "geometry/any_rect2d.hpp"
@ -510,7 +511,7 @@ Framework::Framework()
kICUDataFile,
GetPlatform().WritableDir() + kICUDataFile);
#endif
initICU(GetPlatform().WritableDir());
Transliteration::GetInstance().Init(GetPlatform().WritableDir());
}
Framework::~Framework()

View file

@ -124,6 +124,8 @@
67E8DB751BBC17490053C5BA /* writer_test.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 67E8DB2B1BBC16C70053C5BA /* writer_test.cpp */; };
67E8DB761BBC17490053C5BA /* zip_creator_test.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 67E8DB2C1BBC16C70053C5BA /* zip_creator_test.cpp */; };
67E8DB771BBC17490053C5BA /* zip_reader_test.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 67E8DB2D1BBC16C70053C5BA /* zip_reader_test.cpp */; };
BB537C5F1E8490120074D9D3 /* transliteration.cpp in Sources */ = {isa = PBXBuildFile; fileRef = BB537C5D1E8490120074D9D3 /* transliteration.cpp */; };
BB537C601E8490120074D9D3 /* transliteration.hpp in Headers */ = {isa = PBXBuildFile; fileRef = BB537C5E1E8490120074D9D3 /* transliteration.hpp */; };
/* End PBXBuildFile section */
/* Begin PBXCopyFilesBuildPhase section */
@ -262,6 +264,8 @@
67E8DB2B1BBC16C70053C5BA /* writer_test.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = writer_test.cpp; sourceTree = "<group>"; };
67E8DB2C1BBC16C70053C5BA /* zip_creator_test.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = zip_creator_test.cpp; sourceTree = "<group>"; };
67E8DB2D1BBC16C70053C5BA /* zip_reader_test.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = zip_reader_test.cpp; sourceTree = "<group>"; };
BB537C5D1E8490120074D9D3 /* transliteration.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = transliteration.cpp; sourceTree = "<group>"; };
BB537C5E1E8490120074D9D3 /* transliteration.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = transliteration.hpp; sourceTree = "<group>"; };
/* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */
@ -380,6 +384,8 @@
6753421D1A3F586300A0A8C3 /* coding */ = {
isa = PBXGroup;
children = (
BB537C5D1E8490120074D9D3 /* transliteration.cpp */,
BB537C5E1E8490120074D9D3 /* transliteration.hpp */,
34A129D11DF99E43001B4531 /* zlib.cpp */,
34A129D21DF99E43001B4531 /* zlib.hpp */,
675E889A1DB7B0D000F8EBDA /* traffic.cpp */,
@ -479,6 +485,7 @@
675342CD1A3F588C00A0A8C3 /* varint_vector.hpp in Headers */,
675342B51A3F588C00A0A8C3 /* reader_cache.hpp in Headers */,
675342CE1A3F588C00A0A8C3 /* varint.hpp in Headers */,
BB537C601E8490120074D9D3 /* transliteration.hpp in Headers */,
675342D01A3F588C00A0A8C3 /* writer.hpp in Headers */,
675342CA1A3F588C00A0A8C3 /* var_serial_vector.hpp in Headers */,
347F33391C4540F0009758CC /* fixed_bits_ddvector.hpp in Headers */,
@ -660,6 +667,7 @@
6753429F1A3F588C00A0A8C3 /* file_reader.cpp in Sources */,
34A129D31DF99E43001B4531 /* zlib.cpp in Sources */,
676818201DC3ABD80094C0AC /* traffic_test.cpp in Sources */,
BB537C5F1E8490120074D9D3 /* transliteration.cpp in Sources */,
675342C51A3F588C00A0A8C3 /* uri.cpp in Sources */,
675342BB1A3F588C00A0A8C3 /* reader.cpp in Sources */,
670BAACB1D0B0C1E000302DA /* huffman.cpp in Sources */,
@ -700,6 +708,17 @@
isa = XCBuildConfiguration;
baseConfigurationReference = 34A72A431DBE4989003D1F5F /* common-debug.xcconfig */;
buildSettings = {
GCC_PREPROCESSOR_DEFINITIONS = (
"$(inherited)",
U_DISABLE_RENAMING,
);
HEADER_SEARCH_PATHS = (
"$(inherited)",
"$(OMIM_ROOT)",
"$(BOOST_ROOT)",
"$(OMIM_ROOT)/3party/icu/common",
"$(OMIM_ROOT)/3party/icu/i18n",
);
};
name = Debug;
};
@ -707,6 +726,17 @@
isa = XCBuildConfiguration;
baseConfigurationReference = 34A72A441DBE4989003D1F5F /* common-release.xcconfig */;
buildSettings = {
GCC_PREPROCESSOR_DEFINITIONS = (
"$(inherited)",
U_DISABLE_RENAMING,
);
HEADER_SEARCH_PATHS = (
"$(inherited)",
"$(OMIM_ROOT)",
"$(BOOST_ROOT)",
"$(OMIM_ROOT)/3party/icu/common",
"$(OMIM_ROOT)/3party/icu/i18n",
);
};
name = Release;
};

View file

@ -2522,6 +2522,7 @@
U_COMMON_IMPLEMENTATION,
U_STATIC_IMPLEMENTATION,
"U_CHARSET_IS_UTF8=1",
U_DISABLE_RENAMING,
);
HEADER_SEARCH_PATHS = (
"$(inherited)",
@ -2544,6 +2545,7 @@
U_COMMON_IMPLEMENTATION,
U_STATIC_IMPLEMENTATION,
"U_CHARSET_IS_UTF8=1",
U_DISABLE_RENAMING,
);
HEADER_SEARCH_PATHS = (
"$(inherited)",