[generator] Use reasonable order for transliteration of region names

This commit is contained in:
LaGrunge 2019-06-17 17:00:24 +03:00 committed by mpimenov
parent 9b149eef29
commit 529bdebbe7
5 changed files with 192 additions and 35 deletions

View file

@ -147,3 +147,50 @@ UNIT_TEST(MultilangString_HasString)
TEST(!s.HasString(1), ());
TEST(!s.HasString(32), ());
}
// Checks StringUtf8Multilang::ForEachLanguage:
//  - the requested languages are visited in exactly the order given by the caller;
//  - the call returns true when the callback stops the iteration with ControlFlow::Break;
//  - language names that are not found produce no callback calls and a false result.
UNIT_TEST(MultilangString_ForEachLanguage)
{
  using Translations = vector<pair<string, string>>;

  StringUtf8Multilang s;
  Translations const scotlandTranslations = {
      {"be", "Шатландыя"}, {"cs", "Skotsko"}, {"cy", "Yr Alban"}, {"da", "Skotland"},
      {"de", "Schottland"}, {"eo", "Skotlando"}, {"es", "Escocia"}, {"eu", "Eskozia"},
      {"fi", "Skotlanti"}, {"fr", "Écosse"}, {"ga", "Albain"}, {"gd", "Alba"},
      {"hr", "Škotska"}, {"ia", "Scotia"}, {"io", "Skotia"}, {"ja", "スコットランド"},
      {"ku", "Skotland"}, {"lfn", "Scotland"}, {"nl", "Schotland"}, {"pl", "Szkocja"},
      {"ru", "Шотландия"}, {"sco", "Scotland"}, {"sk", "Škótsko"}, {"sr", "Шкотска"},
      {"sv", "Skottland"}, {"tok", "Sukosi"}, {"tzl", "Escot"}, {"uk", "Шотландія"},
      {"vo", "Skotän"}, {"zh", "苏格兰"}};
  for (auto const & langAndTranslation : scotlandTranslations)
    s.AddString(langAndTranslation.first, langAndTranslation.second);

  // The preferred order deliberately differs from the insertion order above.
  vector<string> const preferredLanguages = {"cs", "eu", "be", "zh"};
  vector<string> const preferredTranslations = {"Skotsko", "Eskozia", "Шатландыя", "苏格兰"};

  // A vector (rather than a set) keeps the order in which names were visited, so the
  // ordering guarantee itself is verified and not only the set of visited names.
  vector<string> visited;
  auto const fn = [&visited, &preferredTranslations](int8_t /* code */, string const & name) {
    visited.push_back(name);
    // Ask to stop as soon as every expected translation has been seen.
    if (visited.size() < preferredTranslations.size())
      return base::ControlFlow::Continue;
    return base::ControlFlow::Break;
  };

  TEST(s.ForEachLanguage(preferredLanguages, fn), ());
  TEST_EQUAL(visited, preferredTranslations, ());

  visited.clear();
  vector<string> const corruptedLanguages = {"Матерный", "Детский", "BirdLanguage"};
  TEST(!s.ForEachLanguage(corruptedLanguages, fn), ());
  TEST(visited.empty(), ());
}

View file

@ -229,6 +229,30 @@ bool StringUtf8Multilang::GetString(int8_t lang, string & utf8s) const
return false;
}
// Walks over the records packed into m_s and builds a map from language code to the
// location of that language's translation inside m_s. Records whose language is the
// reserved one are left out.
StringUtf8Multilang::TranslationPositions StringUtf8Multilang::GenerateTranslationPositions() const
{
  TranslationPositions positions;
  size_t const size = m_s.size();
  for (size_t begin = 0, end = 0; begin < size; begin = end)
  {
    end = GetNextIndex(begin);
    // The language code is kept in the low 6 bits of the record's first byte.
    int8_t const langCode = m_s[begin] & 0x3F;
    // The translation itself starts right after that byte and lasts until |end|.
    if (GetLangByCode(langCode) != kReservedLang)
      positions[langCode] = Position{begin + 1, end - begin - 1};
  }
  return positions;
}
// Returns a copy of the slice of m_s that |position| describes:
// m_length characters starting at offset m_begin.
std::string StringUtf8Multilang::GetTranslation(
    StringUtf8Multilang::Position const & position) const
{
  std::string translation(m_s, position.m_begin, position.m_length);
  return translation;
}
bool StringUtf8Multilang::HasString(int8_t lang) const
{
if (!IsSupportedLangCode(lang))

View file

@ -73,6 +73,14 @@ public:
char const * m_transliteratorId;
};
// Location of one translation inside the packed string.
struct Position
{
  // Offset of the first character of the translation.
  size_t m_begin{0};
  // Number of characters the translation occupies.
  size_t m_length{0};
};
// Maps a language code to the Position of that language's translation.
using TranslationPositions = std::map<int8_t, Position>;
static int8_t constexpr kUnsupportedLanguageCode = -1;
static int8_t constexpr kDefaultCode = 0;
static int8_t constexpr kEnglishCode = 1;
@ -129,6 +137,33 @@ public:
}
}
/// Used for ordered languages, if you want to do something with priority of that order.
/// \param languages ordered languages names.
/// \param fn function or functor, using base::ControlFlow as return value.
/// \return true if ForEachLanguage was stopped by base::ControlFlow::Break, false otherwise.
template <typename Fn>
bool ForEachLanguage(std::vector<std::string> const & languages, Fn && fn) const
{
auto const & translationPositions = GenerateTranslationPositions();
base::ControlFlowWrapper<Fn> wrapper(std::forward<Fn>(fn));
for (std::string const & language : languages)
{
int8_t const languageCode = GetLangIndex(language);
if (GetLangByCode(languageCode) != kReservedLang)
{
auto const & translationPositionsIt = translationPositions.find(languageCode);
if (translationPositionsIt != translationPositions.end() &&
wrapper(languageCode, GetTranslation(translationPositionsIt->second)) ==
base::ControlFlow::Break)
{
return true;
}
}
}
return false;
};
bool GetString(int8_t lang, std::string & utf8s) const;
bool GetString(std::string const & lang, std::string & utf8s) const
{
@ -157,6 +192,9 @@ public:
}
private:
// Scans m_s and returns, for every stored language code that is not reserved,
// the position of its translation inside m_s.
TranslationPositions GenerateTranslationPositions() const;
// Returns a copy of the part of m_s described by |position|.
std::string GetTranslation(Position const & position) const;
size_t GetNextIndex(size_t i) const;
std::string m_s;

View file

@ -10,6 +10,8 @@
#include "platform/platform.hpp"
#include "coding/transliteration.hpp"
#include "base/file_name_utils.hpp"
#include "base/macros.hpp"
@ -18,8 +20,8 @@
#include <limits>
#include <memory>
#include <string>
#include <vector>
#include <utility>
#include <vector>
using namespace generator_tests;
using namespace generator::regions;
@ -33,7 +35,7 @@ using Tags = std::vector<std::pair<std::string, std::string>>;
FeatureBuilder const kEmptyFeature;
OsmElement CreateOsmRelation(uint64_t id, std::string const & adminLevel,
std::string const & place = "")
std::string const & place = "")
{
OsmElement el;
el.m_id = id;
@ -67,7 +69,8 @@ RegionsBuilder::Regions MakeTestDataSet1(RegionInfo & collector)
FeatureBuilder fb;
fb.AddName("default", "Country_1");
fb.SetOsmId(MakeOsmRelation(1 /* id */));
vector<m2::PointD> poly = {{2, 8}, {3, 12}, {8, 15}, {13, 12}, {15, 7}, {11, 2}, {4, 4}, {2, 8}};
vector<m2::PointD> poly = {{2, 8}, {3, 12}, {8, 15}, {13, 12},
{15, 7}, {11, 2}, {4, 4}, {2, 8}};
fb.AddPolygon(poly);
fb.SetHoles({{{5, 8}, {7, 10}, {10, 10}, {11, 7}, {10, 4}, {7, 5}, {5, 8}}});
fb.SetArea();
@ -108,8 +111,8 @@ RegionsBuilder::Regions MakeTestDataSet1(RegionInfo & collector)
FeatureBuilder fb;
fb.AddName("default", "Country_1_Region_4");
fb.SetOsmId(MakeOsmRelation(4 /* id */));
vector<m2::PointD> poly = {{7, 10}, {9, 12}, {8, 15}, {13, 12}, {15, 7}, {12, 9},
{11, 7}, {10, 10}, {7, 10}};
vector<m2::PointD> poly = {{7, 10}, {9, 12}, {8, 15}, {13, 12}, {15, 7},
{12, 9}, {11, 7}, {10, 10}, {7, 10}};
fb.AddPolygon(poly);
fb.SetArea();
regions.emplace_back(Region(fb, collector.Get(MakeOsmRelation(4 /* id */))));
@ -119,8 +122,8 @@ RegionsBuilder::Regions MakeTestDataSet1(RegionInfo & collector)
FeatureBuilder fb;
fb.AddName("default", "Country_1_Region_5");
fb.SetOsmId(MakeOsmRelation(5 /* id */));
vector<m2::PointD> poly = {{4, 4}, {2, 8}, {3, 12}, {8, 15}, {9, 12}, {7, 10}, {5, 8},
{7, 5}, {4, 4}};
vector<m2::PointD> poly = {{4, 4}, {2, 8}, {3, 12}, {8, 15}, {9, 12},
{7, 10}, {5, 8}, {7, 5}, {4, 4}};
fb.AddPolygon(poly);
fb.SetArea();
regions.emplace_back(Region(fb, collector.Get(MakeOsmRelation(5 /* id */))));
@ -140,7 +143,8 @@ RegionsBuilder::Regions MakeTestDataSet1(RegionInfo & collector)
FeatureBuilder fb;
fb.AddName("default", "Country_1_Region_5_Subregion_7");
fb.SetOsmId(MakeOsmRelation(7 /* id */));
vector<m2::PointD> poly = {{3, 12}, {8, 15}, {9, 12}, {7, 10}, {5, 8}, {5, 10}, {4, 10}, {3, 12}};
vector<m2::PointD> poly = {{3, 12}, {8, 15}, {9, 12}, {7, 10},
{5, 8}, {5, 10}, {4, 10}, {3, 12}};
fb.AddPolygon(poly);
fb.SetArea();
regions.emplace_back(Region(fb, collector.Get(MakeOsmRelation(7 /* id */))));
@ -176,8 +180,7 @@ bool NameExists(std::vector<std::string> const & coll, std::string const & name)
{
auto const end = std::end(coll);
return std::find(std::begin(coll), end, name) != end;
}
;
};
} // namespace
UNIT_TEST(RegionsBuilderTest_GetCountryNames)
@ -199,9 +202,11 @@ UNIT_TEST(RegionsBuilderTest_GetCountries)
auto const & countries = builder.GetCountriesOuters();
TEST_EQUAL(countries.size(), 3, ());
TEST_EQUAL(std::count_if(std::begin(countries), std::end(countries),
[](const Region & r) {return r.GetName() == "Country_1"; }), 1, ());
[](const Region & r) { return r.GetName() == "Country_1"; }),
1, ());
TEST_EQUAL(std::count_if(std::begin(countries), std::end(countries),
[](const Region & r) {return r.GetName() == "Country_2"; }), 2, ());
[](const Region & r) { return r.GetName() == "Country_2"; }),
2, ());
}
UNIT_TEST(RegionsBuilderTest_GetCountryTrees)
@ -230,3 +235,44 @@ UNIT_TEST(RegionsBuilderTest_GetCountryTrees)
TEST(NameExists(bankOfNames, "Country_1Country_1_Region_5Country_1_Region_5_Subregion_6"), ());
TEST(NameExists(bankOfNames, "Country_1Country_1_Region_5Country_1_Region_5_Subregion_7"), ());
}
using Translations = std::vector<std::pair<std::string, std::string>>;
bool TestTransliteration(Translations const & translations,
std::string const & expectedTransliteration)
{
StringUtf8Multilang regionName;
for (auto const & langAndTranslation : translations)
{
regionName.AddString(langAndTranslation.first, langAndTranslation.second);
}
RegionWithName region(regionName);
return region.GetEnglishOrTransliteratedName() == expectedTransliteration;
}
// Checks which name RegionWithName::GetEnglishOrTransliteratedName() picks for regions
// that carry many translations.
UNIT_TEST(RegionTransliteration)
{
  // The transliterator has to be initialized from the resources directory first.
  Transliteration::Instance().Init(GetPlatform().ResourcesDir());

  // No "en" entry here: the expected result below is a transliteration.
  Translations const scotland = {
      {"default", "Scotland"}, {"be", "Шатландыя"}, {"cs", "Skotsko"}, {"cy", "Yr Alban"},
      {"da", "Skotland"}, {"de", "Schottland"}, {"eo", "Skotlando"}, {"es", "Escocia"},
      {"eu", "Eskozia"}, {"fi", "Skotlanti"}, {"fr", "Écosse"}, {"ga", "Albain"},
      {"gd", "Alba"}, {"hr", "Škotska"}, {"ia", "Scotia"}, {"io", "Skotia"},
      {"ja", "スコットランド"}, {"ku", "Skotland"}, {"lfn", "Scotland"}, {"nl", "Schotland"},
      {"pl", "Szkocja"}, {"ru", "Шотландия"}, {"sco", "Scotland"}, {"sk", "Škótsko"},
      {"sr", "Шкотска"}, {"sv", "Skottland"}, {"tok", "Sukosi"}, {"tzl", "Escot"},
      {"uk", "Шотландія"}, {"vo", "Skotän"}, {"zh", "苏格兰"}};

  // Has an "en" entry, which is expected to be returned as is.
  Translations const michigan = {
      {"default", "Michigan"}, {"ar", "ميشيغان"}, {"az", "Miçiqan"}, {"be", "Мічыган"},
      {"bg", "Мичиган"}, {"br", "Michigan"}, {"en", "Michigan"}, {"eo", "Miĉigano"},
      {"es", "Míchigan"}, {"fa", "میشیگان"}, {"haw", "Mikikana"}, {"he", "מישיגן"},
      {"hy", "Միչիգան"}, {"ja", "ミシガン州"}, {"ko", "미시간"}, {"lt", "Mičiganas"},
      {"lv", "Mičigana"}, {"nv", "Míshigin"}, {"pl", "Michigan"}, {"ru", "Мичиган"},
      {"sr", "Мичиген"}, {"ta", "மிச்சிகன்"}, {"th", "รัฐมิชิแกน"}, {"tl", "Misigan"},
      {"uk", "Мічиган"}, {"yi", "מישיגן"}, {"zh", "密歇根州"}};

  TEST(TestTransliteration(scotland, "Shotlandiya"), ());
  TEST(TestTransliteration(michigan, "Michigan"), ());
}

View file

@ -4,11 +4,22 @@
#include "base/assert.hpp"
#include "base/control_flow.hpp"
#include "base/string_utils.hpp"
namespace generator
{
namespace regions
{
namespace
{
// Languages, in priority order, that give better transliterations for Russian.
// This is a temporary workaround until real translations are available.
std::vector<std::string> const kRuPreferredLanguagesForTransliterate = {"en" /* English */,
                                                                        "ru" /* Russian */};
}  // namespace
std::string RegionWithName::GetName(int8_t lang) const
{
std::string s;
@ -19,12 +30,16 @@ std::string RegionWithName::GetName(int8_t lang) const
std::string RegionWithName::GetEnglishOrTransliteratedName() const
{
std::string s = GetName(StringUtf8Multilang::kEnglishCode);
if (!s.empty())
if (!s.empty() && strings::IsASCIIString(s))
return s;
s = GetName(StringUtf8Multilang::kInternationalCode);
if (!s.empty() && strings::IsASCIIString(s))
return s;
auto const fn = [&s](int8_t code, std::string const & name) {
if (code != StringUtf8Multilang::kDefaultCode &&
Transliteration::Instance().Transliterate(name, code, s))
Transliteration::Instance().Transliterate(name, code, s) && strings::IsASCIIString(s))
{
return base::ControlFlow::Break;
}
@ -32,33 +47,20 @@ std::string RegionWithName::GetEnglishOrTransliteratedName() const
return base::ControlFlow::Continue;
};
m_name.ForEach(fn);
if (!m_name.ForEachLanguage(kRuPreferredLanguagesForTransliterate, fn))
m_name.ForEach(fn);
return s;
}
StringUtf8Multilang const & RegionWithName::GetMultilangName() const
{
return m_name;
}
StringUtf8Multilang const & RegionWithName::GetMultilangName() const { return m_name; }
void RegionWithName::SetMultilangName(StringUtf8Multilang const & name)
{
m_name = name;
}
void RegionWithName::SetMultilangName(StringUtf8Multilang const & name) { m_name = name; }
base::GeoObjectId RegionWithData::GetId() const
{
return m_regionData.GetOsmId();
}
base::GeoObjectId RegionWithData::GetId() const { return m_regionData.GetOsmId(); }
bool RegionWithData::HasIsoCode() const
{
return m_regionData.HasIsoCodeAlpha2();
}
bool RegionWithData::HasIsoCode() const { return m_regionData.HasIsoCodeAlpha2(); }
std::string RegionWithData::GetIsoCode() const
{
return m_regionData.GetIsoCodeAlpha2();
}
std::string RegionWithData::GetIsoCode() const { return m_regionData.GetIsoCodeAlpha2(); }
} // namespace regions
} // namespace generator