From aaac8554bd7e81f584354810a423aae0579fa028 Mon Sep 17 00:00:00 2001 From: Anatoly Serdtcev Date: Thu, 1 Aug 2019 19:07:11 +0300 Subject: [PATCH] [geocoder] Index all locales names --- geocoder/geocoder.cpp | 5 +- geocoder/geocoder_cli/geocoder_cli.cpp | 3 +- geocoder/geocoder_tests/geocoder_tests.cpp | 22 +++- geocoder/hierarchy.cpp | 114 ++++++++++++++------- geocoder/hierarchy.hpp | 11 +- geocoder/hierarchy_reader.cpp | 4 +- geocoder/index.cpp | 54 ++++++---- geocoder/index.hpp | 2 + geocoder/name_dictionary.cpp | 71 +++++++++++-- geocoder/name_dictionary.hpp | 39 ++++++- 10 files changed, 248 insertions(+), 77 deletions(-) diff --git a/geocoder/geocoder.cpp b/geocoder/geocoder.cpp index 9475ef7afb..8a5e6fb4f0 100644 --- a/geocoder/geocoder.cpp +++ b/geocoder/geocoder.cpp @@ -374,8 +374,9 @@ void Geocoder::FillBuildingsLayer(Context & ctx, Tokens const & subquery, vector { m_index.ForEachRelatedBuilding(docId, [&](Index::DocId const & buildingDocId) { auto const & bld = m_index.GetDoc(buildingDocId); - auto const & realHN = bld.GetNormalizedName(Type::Building, - m_hierarchy.GetNormalizedNameDictionary()); + auto const & multipleHN = bld.GetNormalizedMultipleNames( + Type::Building, m_hierarchy.GetNormalizedNameDictionary()); + auto const & realHN = multipleHN.GetMainName(); auto const & realHNUniStr = strings::MakeUniString(realHN); if (search::house_numbers::HouseNumbersMatch(realHNUniStr, subqueryHN, false /* queryIsPrefix */)) diff --git a/geocoder/geocoder_cli/geocoder_cli.cpp b/geocoder/geocoder_cli/geocoder_cli.cpp index 1fa8602c11..b8ac491472 100644 --- a/geocoder/geocoder_cli/geocoder_cli.cpp +++ b/geocoder/geocoder_cli/geocoder_cli.cpp @@ -40,7 +40,8 @@ void PrintResults(Hierarchy const & hierarchy, vector const & results) if (e->m_normalizedAddress[i] != NameDictionary::kUnspecifiedPosition) { auto type = static_cast(i); - cout << delimiter << ToString(type) << ": " << e->GetNormalizedName(type, dictionary); + auto multipleNames = e->GetNormalizedMultipleNames(type, dictionary); + cout << delimiter << ToString(type) << ": " << multipleNames.GetMainName(); delimiter = ", "; } } diff --git a/geocoder/geocoder_tests/geocoder_tests.cpp b/geocoder/geocoder_tests/geocoder_tests.cpp index abfe1e7fd5..b6d9fda2da 100644 --- a/geocoder/geocoder_tests/geocoder_tests.cpp +++ b/geocoder/geocoder_tests/geocoder_tests.cpp @@ -76,9 +76,25 @@ UNIT_TEST(Geocoder_Hierarchy) }); TEST_EQUAL(entries.size(), 1, ()); - TEST_EQUAL(entries[0].GetNormalizedName(Type::Country, dictionary), "cuba", ()); - TEST_EQUAL(entries[0].GetNormalizedName(Type::Region, dictionary), "ciego de avila", ()); - TEST_EQUAL(entries[0].GetNormalizedName(Type::Subregion, dictionary), "florencia", ()); + TEST_EQUAL(entries[0].GetNormalizedMultipleNames(Type::Country, dictionary).GetMainName(), "cuba", + ()); + TEST_EQUAL(entries[0].GetNormalizedMultipleNames(Type::Region, dictionary).GetMainName(), + "ciego de avila", ()); + TEST_EQUAL(entries[0].GetNormalizedMultipleNames(Type::Subregion, dictionary).GetMainName(), + "florencia", ()); +} + +UNIT_TEST(Geocoder_EnglishNames) +{ + string const kData = R"#( +10 {"properties": {"locales": {"default": {"address": {"locality": "Москва"}}, "en": {"address": {"locality": "Moscow"}}}}} +11 {"properties": {"locales": {"default": {"address": {"locality": "Москва", "street": "улица Новый Арбат"}}, "en": {"address": {"locality": "Moscow", "street": "New Arbat Avenue"}}}}} +)#"; + + ScopedFile const regionsJsonFile("regions.jsonl", kData); + Geocoder geocoder(regionsJsonFile.GetFullPath()); + + TestGeocoder(geocoder, "Moscow, New Arbat", {{Id{0x11}, 1.0}, {Id{0x10}, 0.6}}); } UNIT_TEST(Geocoder_OnlyBuildings) diff --git a/geocoder/hierarchy.cpp b/geocoder/hierarchy.cpp index 146bf34658..4e65fa6988 100644 --- a/geocoder/hierarchy.cpp +++ b/geocoder/hierarchy.cpp @@ -43,34 +43,44 @@ bool Hierarchy::Entry::DeserializeFromJSONImpl( MYTHROW(base::Json::Exception, ("Not a json object.")); } + if (!DeserializeAddressFromJSON(root, normalizedNameDictionaryBuilder, stats)) + return false; + auto const defaultLocale = base::GetJSONObligatoryFieldByPath(root, "properties", "locales", "default"); - auto const address = base::GetJSONObligatoryField(defaultLocale, "address"); + FromJSONObjectOptionalField(defaultLocale, "name", m_name); + if (m_name.empty()) + ++stats.m_emptyNames; + + if (m_type == Type::Count) + { + LOG(LDEBUG, ("No address in an hierarchy entry:", jsonStr)); + ++stats.m_emptyAddresses; + } + return true; +} + +bool Hierarchy::Entry::DeserializeAddressFromJSON( + json_t * const root, NameDictionaryBuilder & normalizedNameDictionaryBuilder, + ParsingStats & stats) +{ + auto const locales = base::GetJSONObligatoryFieldByPath(root, "properties", "locales"); m_normalizedAddress= {}; - Tokens tokens; for (size_t i = 0; i < static_cast(Type::Count); ++i) { Type const type = static_cast(i); - string const & levelKey = ToString(type); - auto const levelJson = base::GetJSONOptionalField(address, levelKey); - if (!levelJson) - continue; - - if (base::JSONIsNull(levelJson)) + MultipleNames multipleNames; + if (!FetchAddressFieldNames(locales, type, multipleNames, normalizedNameDictionaryBuilder, + stats)) + { return false; + } - string levelValue; - FromJSON(levelJson, levelValue); - if (levelValue.empty()) - continue; - - search::NormalizeAndTokenizeAsUtf8(levelValue, tokens); - if (tokens.empty()) - continue; - - auto normalizedValue = strings::JoinStrings(tokens, " "); - m_normalizedAddress[i] = normalizedNameDictionaryBuilder.Add(normalizedValue); - m_type = static_cast(i); + if (!multipleNames.GetMainName().empty()) + { + m_normalizedAddress[i] = normalizedNameDictionaryBuilder.Add(move(multipleNames)); + m_type = static_cast(i); + } } auto const & subregion = m_normalizedAddress[static_cast(Type::Subregion)]; @@ -88,22 +98,53 @@ bool Hierarchy::Entry::DeserializeFromJSONImpl( return false; } - FromJSONObjectOptionalField(defaultLocale, "name", m_name); - if (m_name.empty()) - ++stats.m_emptyNames; - - if (m_type == Type::Count) - { - LOG(LDEBUG, ("No address in an hierarchy entry:", jsonStr)); - ++stats.m_emptyAddresses; - } return true; } -std::string const & Hierarchy::Entry::GetNormalizedName( +// static +bool Hierarchy::Entry::FetchAddressFieldNames( + json_t * const locales, Type type, MultipleNames & multipleNames, + NameDictionaryBuilder & normalizedNameDictionaryBuilder, ParsingStats & stats) +{ + char const * localeName = nullptr; + json_t * localisedNames = nullptr; + string const & levelKey = ToString(type); + Tokens tokens; + json_object_foreach(locales, localeName, localisedNames) + { + auto const address = base::GetJSONObligatoryField(localisedNames, "address"); + auto const levelJson = base::GetJSONOptionalField(address, levelKey); + if (!levelJson) + continue; + + if (base::JSONIsNull(levelJson)) + return false; + + string levelValue; + FromJSON(levelJson, levelValue); + if (levelValue.empty()) + continue; + + search::NormalizeAndTokenizeAsUtf8(levelValue, tokens); + if (tokens.empty()) + continue; + + auto normalizedValue = strings::JoinStrings(tokens, " "); + static std::string defaultLocale = "default"; + if (localeName == defaultLocale) + multipleNames.SetMainName(normalizedValue); + else + multipleNames.AddAltName(normalizedValue); + } + + return true; +} + +MultipleNames const & Hierarchy::Entry::GetNormalizedMultipleNames( Type type, NameDictionary const & normalizedNameDictionary) const { - return normalizedNameDictionary.Get(m_normalizedAddress[static_cast(type)]); + auto const & addressField = m_normalizedAddress[static_cast(type)]; + return normalizedNameDictionary.Get(addressField); } // Hierarchy --------------------------------------------------------------------------------------- @@ -151,13 +192,16 @@ bool Hierarchy::IsParentTo(Hierarchy::Entry const & entry, Hierarchy::Entry cons if (toEntry.m_normalizedAddress[i] == NameDictionary::kUnspecifiedPosition) return false; + auto const pos1 = entry.m_normalizedAddress[i]; auto const pos2 = toEntry.m_normalizedAddress[i]; - if (pos1 != pos2 && - m_normalizedNameDictionary.Get(pos1) != m_normalizedNameDictionary.Get(pos2)) - { + if (pos1 == pos2) + continue; + + auto const & name1 = m_normalizedNameDictionary.Get(pos1).GetMainName(); + auto const & name2 = m_normalizedNameDictionary.Get(pos2).GetMainName(); + if (name1 != name2) return false; - } } return true; } diff --git a/geocoder/hierarchy.hpp b/geocoder/hierarchy.hpp index 86c0508562..a9534e3445 100644 --- a/geocoder/hierarchy.hpp +++ b/geocoder/hierarchy.hpp @@ -65,9 +65,16 @@ public: bool DeserializeFromJSONImpl(json_t * const root, std::string const & jsonStr, NameDictionaryBuilder & normalizedNameDictionaryBuilder, ParsingStats & stats); + bool DeserializeAddressFromJSON(json_t * const root, + NameDictionaryBuilder & normalizedNameDictionaryBuilder, + ParsingStats & stats); + static bool FetchAddressFieldNames(json_t * const locales, Type type, + MultipleNames & multipleNames, + NameDictionaryBuilder & normalizedNameDictionaryBuilder, + ParsingStats & stats); - std::string const & GetNormalizedName(Type type, - NameDictionary const & normalizedNameDictionary) const; + MultipleNames const & GetNormalizedMultipleNames( + Type type, NameDictionary const & normalizedNameDictionary) const; bool operator<(Entry const & rhs) const { return m_osmId < rhs.m_osmId; } base::GeoObjectId m_osmId = base::GeoObjectId(base::GeoObjectId::kInvalid); diff --git a/geocoder/hierarchy_reader.cpp b/geocoder/hierarchy_reader.cpp index 9013ba5957..ffababcb7b 100644 --- a/geocoder/hierarchy_reader.cpp +++ b/geocoder/hierarchy_reader.cpp @@ -85,8 +85,8 @@ Hierarchy HierarchyReader::Read(unsigned int readersCount) { if (auto & position = entry.m_normalizedAddress[i]) { - auto const & name = taskNameDictionary.Get(position); - position = nameDictionaryBuilder.Add(name); + auto const & multipleNames = taskNameDictionary.Get(position); + position = nameDictionaryBuilder.Add(MultipleNames{multipleNames}); } } } diff --git a/geocoder/index.cpp b/geocoder/index.cpp index 3aef2674e0..4938456128 100644 --- a/geocoder/index.cpp +++ b/geocoder/index.cpp @@ -75,9 +75,11 @@ void Index::AddEntries() } else { - auto const & name = doc.GetNormalizedName(doc.m_type, dictionary); - search::NormalizeAndTokenizeAsUtf8(name, tokens); - m_docIdsByTokens[MakeIndexKey(tokens)].emplace_back(docId); + for (auto const & name : doc.GetNormalizedMultipleNames(doc.m_type, dictionary)) + { + search::NormalizeAndTokenizeAsUtf8(name, tokens); + InsertToIndex(tokens, docId); + } } ++numIndexed; @@ -98,26 +100,28 @@ void Index::AddStreet(DocId const & docId, Index::Doc const & doc) }; auto const & dictionary = m_hierarchy.GetNormalizedNameDictionary(); - auto const & name = doc.GetNormalizedName(Type::Street, dictionary); Tokens tokens; - search::NormalizeAndTokenizeAsUtf8(name, tokens); - - if (all_of(begin(tokens), end(tokens), isStreetSynonym)) + for (auto const & name : doc.GetNormalizedMultipleNames(Type::Street, dictionary)) { - if (tokens.size() > 1) - m_docIdsByTokens[MakeIndexKey(tokens)].emplace_back(docId); - return; - } + search::NormalizeAndTokenizeAsUtf8(name, tokens); - m_docIdsByTokens[MakeIndexKey(tokens)].emplace_back(docId); + if (all_of(begin(tokens), end(tokens), isStreetSynonym)) + { + if (tokens.size() > 1) + InsertToIndex(tokens, docId); + return; + } - for (size_t i = 0; i < tokens.size(); ++i) - { - if (!isStreetSynonym(tokens[i])) - continue; - auto addr = tokens; - addr.erase(addr.begin() + i); - m_docIdsByTokens[MakeIndexKey(addr)].emplace_back(docId); + InsertToIndex(tokens, docId); + + for (size_t i = 0; i < tokens.size(); ++i) + { + if (!isStreetSynonym(tokens[i])) + continue; + auto addr = tokens; + addr.erase(addr.begin() + i); + InsertToIndex(addr, docId); + } } } @@ -157,9 +161,12 @@ void Index::AddHouses(unsigned int loadThreadsCount) else continue; - auto const & relationName = dictionary.Get(relation); + auto const & relationMultipleNames = dictionary.Get(relation); + auto const & relationName = relationMultipleNames.GetMainName(); Tokens relationNameTokens; search::NormalizeAndTokenizeAsUtf8(relationName, relationNameTokens); + CHECK(!relationNameTokens.empty(), ()); + bool indexed = false; ForEachDocId(relationNameTokens, [&](DocId const & candidate) { auto const & candidateDoc = GetDoc(candidate); @@ -188,4 +195,11 @@ void Index::AddHouses(unsigned int loadThreadsCount) if (numIndexed % kLogBatch != 0) LOG(LINFO, ("Indexed", numIndexed, "houses")); } + +void Index::InsertToIndex(Tokens const & tokens, DocId docId) +{ + auto & ids = m_docIdsByTokens[MakeIndexKey(tokens)]; + if (0 == count(ids.begin(), ids.end(), docId)) + ids.emplace_back(docId); +} } // namespace geocoder diff --git a/geocoder/index.hpp b/geocoder/index.hpp index 50f58471b9..1d9aeff3d2 100644 --- a/geocoder/index.hpp +++ b/geocoder/index.hpp @@ -55,6 +55,8 @@ public: } private: + void InsertToIndex(Tokens const & tokens, DocId docId); + // Converts |tokens| to a single UTF-8 string that can be used // as a key in the |m_docIdsByTokens| map. static std::string MakeIndexKey(Tokens const & tokens); diff --git a/geocoder/name_dictionary.cpp b/geocoder/name_dictionary.cpp index 7d45ab5081..560e566215 100644 --- a/geocoder/name_dictionary.cpp +++ b/geocoder/name_dictionary.cpp @@ -2,34 +2,91 @@ #include "base/assert.hpp" +#include +#include #include #include namespace geocoder { +// MultipleName ------------------------------------------------------------------------------------ +MultipleNames::MultipleNames(std::string const & mainName) + : m_names{mainName} +{ } + +std::string const & MultipleNames::GetMainName() const noexcept +{ + return m_names[0]; +} + +std::vector const & MultipleNames::GetNames() const noexcept +{ + return m_names; +} + +MultipleNames::const_iterator MultipleNames::begin() const noexcept +{ + return m_names.begin(); +} + +MultipleNames::const_iterator MultipleNames::end() const noexcept +{ + return m_names.end(); +} + +void MultipleNames::SetMainName(std::string const & name) +{ + m_names[0] = name; +} + +void MultipleNames::AddAltName(std::string const & name) +{ + m_names.emplace_back(std::move(name)); + // Sort for operator==. + ASSERT_GREATER_OR_EQUAL(m_names.size(), 2, ()); + std::inplace_merge(std::next(m_names.begin()), std::prev(m_names.end()), m_names.end()); +} + +bool operator==(MultipleNames const & lhs, MultipleNames const & rhs) noexcept +{ + return lhs.m_names == rhs.m_names; +} + +bool operator!=(MultipleNames const & lhs, MultipleNames const & rhs) noexcept +{ + return !(lhs == rhs); +} + // NameDictionary ---------------------------------------------------------------------------------- -std::string const & NameDictionary::Get(Position position) const +MultipleNames const & NameDictionary::Get(Position position) const { CHECK_GREATER(position, 0, ()); CHECK_LESS_OR_EQUAL(position, m_stock.size(), ()); return m_stock[position - 1]; } -NameDictionary::Position NameDictionary::Add(std::string const & s) +NameDictionary::Position NameDictionary::Add(MultipleNames && names) { + CHECK(!names.GetMainName().empty(), ()); CHECK_LESS(m_stock.size(), std::numeric_limits::max(), ()); - m_stock.push_back(s); + m_stock.push_back(std::move(names)); return m_stock.size(); // index + 1 } -// NameDictionaryBuilder ----------------------------------------------------------------------------- -NameDictionary::Position NameDictionaryBuilder::Add(std::string const & s) +// NameDictionaryBuilder::Hash --------------------------------------------------------------------- +size_t NameDictionaryBuilder::Hash::operator()(MultipleNames const & names) const noexcept { - auto indexItem = m_index.find(s); + return std::hash{}(names.GetMainName()); +} + +// NameDictionaryBuilder ----------------------------------------------------------------------------- +NameDictionary::Position NameDictionaryBuilder::Add(MultipleNames && names) +{ + auto indexItem = m_index.find(names); if (indexItem != m_index.end()) return indexItem->second; - auto p = m_dictionary.Add(s); + auto p = m_dictionary.Add(std::move(names)); auto indexEmplace = m_index.emplace(m_dictionary.Get(p), p); CHECK(indexEmplace.second, ()); return p; diff --git a/geocoder/name_dictionary.hpp b/geocoder/name_dictionary.hpp index eaac18a6e1..fdf5b390f9 100644 --- a/geocoder/name_dictionary.hpp +++ b/geocoder/name_dictionary.hpp @@ -7,6 +7,30 @@ namespace geocoder { +class MultipleNames +{ +public: + using const_iterator = std::vector::const_iterator; + + explicit MultipleNames(std::string const & mainName = {}); + + std::string const & GetMainName() const noexcept; + std::vector const & GetNames() const noexcept; + + const_iterator begin() const noexcept; + const_iterator end() const noexcept; + + void SetMainName(std::string const & name); + // Complexity: O(N-1) - a best case, O(N*log(N)) - a worst case. + void AddAltName(std::string const & name); + + friend bool operator==(MultipleNames const & lhs, MultipleNames const & rhs) noexcept; + friend bool operator!=(MultipleNames const & lhs, MultipleNames const & rhs) noexcept; + +private: + std::vector m_names; +}; + class NameDictionary { public: @@ -22,11 +46,11 @@ public: NameDictionary(NameDictionary const &) = delete; NameDictionary & operator=(NameDictionary const &) = delete; - std::string const & Get(Position position) const; - Position Add(std::string const & s); + MultipleNames const & Get(Position position) const; + Position Add(MultipleNames && s); private: - std::vector m_stock; + std::vector m_stock; }; class NameDictionaryBuilder @@ -36,11 +60,16 @@ public: NameDictionaryBuilder(NameDictionaryBuilder const &) = delete; NameDictionaryBuilder & operator=(NameDictionaryBuilder const &) = delete; - NameDictionary::Position Add(std::string const & s); + NameDictionary::Position Add(MultipleNames && s); NameDictionary Release(); private: + struct Hash + { + size_t operator()(MultipleNames const & names) const noexcept; + }; + NameDictionary m_dictionary; - std::unordered_map m_index; + std::unordered_map m_index; }; } // namespace geocoder