From 6837cafdbd697ddb1363956cd2466dfc4d4554c6 Mon Sep 17 00:00:00 2001 From: Anatoly Serdtcev Date: Mon, 29 Jul 2019 16:21:31 +0300 Subject: [PATCH] [geocoder] Optimize memory: name dictionary for address parts --- geocoder/CMakeLists.txt | 2 + geocoder/geocoder.cpp | 12 ++- geocoder/geocoder_cli/geocoder_cli.cpp | 17 ++++- geocoder/geocoder_tests/geocoder_tests.cpp | 15 ++-- geocoder/hierarchy.cpp | 86 ++++++++++++---------- geocoder/hierarchy.hpp | 23 +++--- geocoder/hierarchy_reader.cpp | 20 ++++- geocoder/hierarchy_reader.hpp | 4 +- geocoder/index.cpp | 70 +++++++++++------- geocoder/index.hpp | 1 + geocoder/name_dictionary.cpp | 42 +++++++++++ geocoder/name_dictionary.hpp | 46 ++++++++++++ 12 files changed, 245 insertions(+), 93 deletions(-) create mode 100644 geocoder/name_dictionary.cpp create mode 100644 geocoder/name_dictionary.hpp diff --git a/geocoder/CMakeLists.txt b/geocoder/CMakeLists.txt index 5c82e4398b..277d94629d 100644 --- a/geocoder/CMakeLists.txt +++ b/geocoder/CMakeLists.txt @@ -12,6 +12,8 @@ set( hierarchy_reader.hpp index.cpp index.hpp + name_dictionary.cpp + name_dictionary.hpp result.cpp result.hpp types.cpp diff --git a/geocoder/geocoder.cpp b/geocoder/geocoder.cpp index 57fce89ac9..9475ef7afb 100644 --- a/geocoder/geocoder.cpp +++ b/geocoder/geocoder.cpp @@ -374,10 +374,14 @@ void Geocoder::FillBuildingsLayer(Context & ctx, Tokens const & subquery, vector { m_index.ForEachRelatedBuilding(docId, [&](Index::DocId const & buildingDocId) { auto const & bld = m_index.GetDoc(buildingDocId); - auto const bt = static_cast(Type::Building); - auto const & realHN = MakeHouseNumber(bld.m_address[bt]); - if (search::house_numbers::HouseNumbersMatch(realHN, subqueryHN, false /* queryIsPrefix */)) + auto const & realHN = bld.GetNormalizedName(Type::Building, + m_hierarchy.GetNormalizedNameDictionary()); + auto const & realHNUniStr = strings::MakeUniString(realHN); + if (search::house_numbers::HouseNumbersMatch(realHNUniStr, subqueryHN, + false /* queryIsPrefix */)) + { curLayer.m_entries.emplace_back(buildingDocId); + } }); } }); @@ -405,7 +409,7 @@ bool Geocoder::HasParent(vector const & layers, Hierarchy::Entr // Note that the relationship is somewhat inverted: every ancestor // is stored in the address but the nodes have no information // about their children. - if (m_index.GetDoc(docId).IsParentTo(e)) + if (m_hierarchy.IsParentTo(m_index.GetDoc(docId), e)) return true; } return false; diff --git a/geocoder/geocoder_cli/geocoder_cli.cpp b/geocoder/geocoder_cli/geocoder_cli.cpp index f3b78d569f..98bc9b176e 100644 --- a/geocoder/geocoder_cli/geocoder_cli.cpp +++ b/geocoder/geocoder_cli/geocoder_cli.cpp @@ -24,13 +24,28 @@ void PrintResults(Hierarchy const & hierarchy, vector const & results) if (results.empty()) return; cout << "Top results:" << endl; + + auto const & dictionary = hierarchy.GetNormalizedNameDictionary(); for (size_t i = 0; i < results.size(); ++i) { if (FLAGS_top >= 0 && static_cast(i) >= FLAGS_top) break; cout << " " << DebugPrint(results[i]); if (auto const && e = hierarchy.GetEntryForOsmId(results[i].m_osmId)) - cout << " " << DebugPrint(e->m_address); + { + cout << " ["; + auto const * delimiter = ""; + for (size_t i = 0; i < static_cast(Type::Count); ++i) + { + if (e->m_normalizedAddress[i]) + { + auto type = static_cast(i); + cout << delimiter << ToString(type) << ": " << e->GetNormalizedName(type, dictionary); + delimiter = ", "; + } + } + cout << "]"; + } cout << endl; } } diff --git a/geocoder/geocoder_tests/geocoder_tests.cpp b/geocoder/geocoder_tests/geocoder_tests.cpp index 85f1d4efc8..abfe1e7fd5 100644 --- a/geocoder/geocoder_tests/geocoder_tests.cpp +++ b/geocoder/geocoder_tests/geocoder_tests.cpp @@ -29,13 +29,6 @@ C00000000004B279 {"type": "Feature", "geometry": {"type": "Point", "coordinates" C0000000001C4CA7 {"type": "Feature", "geometry": {"type": "Point", "coordinates": [-78.7260117405499, 21.74300205]}, "properties": {"locales": {"default": {"name": "Ciego de Ávila", "address": {"region": "Ciego de Ávila", "country": "Cuba"}}}, "rank": 4}} C00000000059D6B5 {"type": "Feature", "geometry": {"type": "Point", "coordinates": [-78.9263054493181, 22.08185765]}, "properties": {"locales": {"default": {"name": "Florencia", "address": {"subregion": "Florencia", "region": "Ciego de Ávila", "country": "Cuba"}}}, "rank": 6}} )#"; - -geocoder::Tokens Split(string const & s) -{ - geocoder::Tokens result; - search::NormalizeAndTokenizeAsUtf8(s, result); - return result; -} } // namespace namespace geocoder @@ -74,6 +67,8 @@ UNIT_TEST(Geocoder_Hierarchy) { ScopedFile const regionsJsonFile("regions.jsonl", kRegionsData); Geocoder geocoder(regionsJsonFile.GetFullPath()); + auto const & hierarchy = geocoder.GetHierarchy(); + auto const & dictionary = hierarchy.GetNormalizedNameDictionary(); vector entries; geocoder.GetIndex().ForEachDocId({("florencia")}, [&](Index::DocId const & docId) { @@ -81,9 +76,9 @@ UNIT_TEST(Geocoder_Hierarchy) }); TEST_EQUAL(entries.size(), 1, ()); - TEST_EQUAL(entries[0].m_address[static_cast(Type::Country)], Split("cuba"), ()); - TEST_EQUAL(entries[0].m_address[static_cast(Type::Region)], Split("ciego de avila"), ()); - TEST_EQUAL(entries[0].m_address[static_cast(Type::Subregion)], Split("florencia"), ()); + TEST_EQUAL(entries[0].GetNormalizedName(Type::Country, dictionary), "cuba", ()); + TEST_EQUAL(entries[0].GetNormalizedName(Type::Region, dictionary), "ciego de avila", ()); + TEST_EQUAL(entries[0].GetNormalizedName(Type::Subregion, dictionary), "florencia", ()); } UNIT_TEST(Geocoder_OnlyBuildings) diff --git a/geocoder/hierarchy.cpp b/geocoder/hierarchy.cpp index a6e99f5511..718934ec1c 100644 --- a/geocoder/hierarchy.cpp +++ b/geocoder/hierarchy.cpp @@ -16,12 +16,14 @@ using namespace std; namespace geocoder { // Hierarchy::Entry -------------------------------------------------------------------------------- -bool Hierarchy::Entry::DeserializeFromJSON(string const & jsonStr, ParsingStats & stats) +bool Hierarchy::Entry::DeserializeFromJSON(string const & jsonStr, + NameDictionaryMaker & normalizedNameDictionaryMaker, + ParsingStats & stats) { try { base::Json root(jsonStr.c_str()); - return DeserializeFromJSONImpl(root.get(), jsonStr, stats); + return DeserializeFromJSONImpl(root.get(), jsonStr, normalizedNameDictionaryMaker, stats); } catch (base::Json::Exception const & e) { @@ -32,6 +34,7 @@ bool Hierarchy::Entry::DeserializeFromJSON(string const & jsonStr, ParsingStats // todo(@m) Factor out to geojson.hpp? Add geojson to myjansson? bool Hierarchy::Entry::DeserializeFromJSONImpl(json_t * const root, string const & jsonStr, + NameDictionaryMaker & normalizedNameDictionaryMaker, ParsingStats & stats) { if (!json_is_object(root)) @@ -43,7 +46,8 @@ bool Hierarchy::Entry::DeserializeFromJSONImpl(json_t * const root, string const auto const defaultLocale = base::GetJSONObligatoryFieldByPath(root, "properties", "locales", "default"); auto const address = base::GetJSONObligatoryField(defaultLocale, "address"); - bool hasDuplicateAddress = false; + m_normalizedAddress= {}; + Tokens tokens; for (size_t i = 0; i < static_cast(Type::Count); ++i) { Type const type = static_cast(i); @@ -60,69 +64,52 @@ bool Hierarchy::Entry::DeserializeFromJSONImpl(json_t * const root, string const if (levelValue.empty()) continue; - if (!m_address[i].empty()) - { - LOG(LDEBUG, ("Duplicate address field", type, "when parsing", jsonStr)); - hasDuplicateAddress = true; - } + search::NormalizeAndTokenizeAsUtf8(levelValue, tokens); + if (tokens.empty()) + continue; - search::NormalizeAndTokenizeAsUtf8(levelValue, m_address[i]); - - if (!m_address[i].empty()) - m_type = static_cast(i); + auto normalizedValue = strings::JoinStrings(tokens, " "); + m_normalizedAddress[i] = normalizedNameDictionaryMaker.Add(normalizedValue); + m_type = static_cast(i); } - auto const & subregion = m_address[static_cast(Type::Subregion)]; - auto const & locality = m_address[static_cast(Type::Locality)]; - if (m_type == Type::Street && locality.empty() && subregion.empty() /* if locality detection fail */) + auto const & subregion = m_normalizedAddress[static_cast(Type::Subregion)]; + auto const & locality = m_normalizedAddress[static_cast(Type::Locality)]; + if (m_type == Type::Street && !locality && !subregion) { ++stats.m_noLocalityStreets; return false; } - if (m_type == Type::Building && locality.empty() && subregion.empty() /* if locality detection fail */) + if (m_type == Type::Building && !locality && !subregion) { ++stats.m_noLocalityBuildings; return false; } - m_nameTokens.clear(); FromJSONObjectOptionalField(defaultLocale, "name", m_name); - search::NormalizeAndTokenizeAsUtf8(m_name, m_nameTokens); - if (m_name.empty()) ++stats.m_emptyNames; - if (hasDuplicateAddress) - ++stats.m_duplicateAddresses; - if (m_type == Type::Count) { LOG(LDEBUG, ("No address in an hierarchy entry:", jsonStr)); ++stats.m_emptyAddresses; } - else if (m_nameTokens != m_address[static_cast(m_type)]) - { - ++stats.m_mismatchedNames; - } - return true; } -bool Hierarchy::Entry::IsParentTo(Hierarchy::Entry const & e) const +std::string const & Hierarchy::Entry::GetNormalizedName( + Type type, NameDictionary const & normalizedNameDictionary) const { - for (size_t i = 0; i < static_cast(geocoder::Type::Count); ++i) - { - if (!m_address[i].empty() && m_address[i] != e.m_address[i]) - return false; - } - return true; + return normalizedNameDictionary.Get(m_normalizedAddress[static_cast(type)]); } // Hierarchy --------------------------------------------------------------------------------------- -Hierarchy::Hierarchy(vector && entries, bool sorted) - : m_entries{std::move(entries)} +Hierarchy::Hierarchy(vector && entries, NameDictionary && normalizedNameDictionary) + : m_entries{move(entries)} + , m_normalizedNameDictionary{move(normalizedNameDictionary)} { - if (!sorted) + if (!is_sorted(m_entries.begin(), m_entries.end())) { LOG(LINFO, ("Sorting entries...")); sort(m_entries.begin(), m_entries.end()); @@ -134,6 +121,11 @@ vector const & Hierarchy::GetEntries() const return m_entries; } +NameDictionary const & Hierarchy::GetNormalizedNameDictionary() const +{ + return m_normalizedNameDictionary; +} + Hierarchy::Entry const * Hierarchy::GetEntryForOsmId(base::GeoObjectId const & osmId) const { auto const cmp = [](Hierarchy::Entry const & e, base::GeoObjectId const & id) { @@ -147,4 +139,24 @@ Hierarchy::Entry const * Hierarchy::GetEntryForOsmId(base::GeoObjectId const & o return &(*it); } + +bool Hierarchy::IsParentTo(Hierarchy::Entry const & entry, Hierarchy::Entry const & toEntry) const +{ + for (size_t i = 0; i < static_cast(geocoder::Type::Count); ++i) + { + if (!entry.m_normalizedAddress[i]) + continue; + + if (!toEntry.m_normalizedAddress[i]) + return false; + auto const pos1 = entry.m_normalizedAddress[i]; + auto const pos2 = toEntry.m_normalizedAddress[i]; + if (pos1 != pos2 && + m_normalizedNameDictionary.Get(pos1) != m_normalizedNameDictionary.Get(pos2)) + { + return false; + } + } + return true; +} } // namespace geocoder diff --git a/geocoder/hierarchy.hpp b/geocoder/hierarchy.hpp index 087f694074..71d56d90c8 100644 --- a/geocoder/hierarchy.hpp +++ b/geocoder/hierarchy.hpp @@ -1,5 +1,6 @@ #pragma once +#include "geocoder/name_dictionary.hpp" #include "geocoder/types.hpp" #include "base/geo_object_id.hpp" @@ -58,36 +59,38 @@ public: // part of the geojson entry. struct Entry { - bool DeserializeFromJSON(std::string const & jsonStr, ParsingStats & stats); - + bool DeserializeFromJSON(std::string const & jsonStr, + NameDictionaryMaker & normalizedNameDictionaryMaker, + ParsingStats & stats); bool DeserializeFromJSONImpl(json_t * const root, std::string const & jsonStr, + NameDictionaryMaker & normalizedNameDictionaryMaker, ParsingStats & stats); - // Checks whether this entry is a parent of |e|. - bool IsParentTo(Entry const & e) const; - + std::string const & GetNormalizedName(Type type, + NameDictionary const & normalizedNameDictionary) const; bool operator<(Entry const & rhs) const { return m_osmId < rhs.m_osmId; } base::GeoObjectId m_osmId = base::GeoObjectId(base::GeoObjectId::kInvalid); // Original name of the entry. Useful for debugging. std::string m_name; - // Tokenized and simplified name of the entry. - Tokens m_nameTokens; Type m_type = Type::Count; - // The address fields of this entry, one per Type. - std::array(Type::Count)> m_address; + // The positions of entry address fields in normalized name dictionary, one per Type. + std::array(Type::Count)> m_normalizedAddress{}; }; - explicit Hierarchy(std::vector && entries, bool sorted); + explicit Hierarchy(std::vector && entries, NameDictionary && normalizeNameDictionary); std::vector const & GetEntries() const; + NameDictionary const & GetNormalizedNameDictionary() const; Entry const * GetEntryForOsmId(base::GeoObjectId const & osmId) const; + bool IsParentTo(Hierarchy::Entry const & entry, Hierarchy::Entry const & toEntry) const; private: std::vector m_entries; + NameDictionary m_normalizedNameDictionary; }; } // namespace geocoder diff --git a/geocoder/hierarchy_reader.cpp b/geocoder/hierarchy_reader.cpp index ce9269309e..12827c87c2 100644 --- a/geocoder/hierarchy_reader.cpp +++ b/geocoder/hierarchy_reader.cpp @@ -61,6 +61,7 @@ Hierarchy HierarchyReader::Read(unsigned int readersCount) LOG(LINFO, ("Reading entries...")); vector entries; + NameDictionaryMaker nameDictionaryMaker; ParsingStats stats{}; base::thread_pool::computational::ThreadPool threadPool{readersCount}; @@ -77,6 +78,18 @@ Hierarchy HierarchyReader::Read(unsigned int readersCount) tasks.pop_front(); auto & taskEntries = taskResult.m_entries; + auto const & taskNameDictionary = taskResult.m_nameDictionary; + for (auto & entry : taskEntries) + { + for (size_t i = 0; i < static_cast(Type::Count); ++i) + { + if (auto & position = entry.m_normalizedAddress[i]) + { + auto const & name = taskNameDictionary.Get(position); + position = nameDictionaryMaker.Add(name); + } + } + } move(begin(taskEntries), end(taskEntries), back_inserter(entries)); stats += taskResult.m_stats; @@ -105,7 +118,7 @@ Hierarchy HierarchyReader::Read(unsigned int readersCount) ("Entries whose names do not match their most specific addresses:", stats.m_mismatchedNames)); LOG(LINFO, ("(End of stats.)")); - return Hierarchy{move(entries), true}; + return Hierarchy{move(entries), nameDictionaryMaker.Release()}; } void HierarchyReader::CheckDuplicateOsmIds(vector const & entries, @@ -155,6 +168,7 @@ HierarchyReader::ParsingResult HierarchyReader::DeserializeEntries( { vector entries; entries.reserve(bufferSize); + NameDictionaryMaker nameDictionaryMaker; ParsingStats stats; for (size_t i = 0; i < bufferSize; ++i) @@ -178,7 +192,7 @@ HierarchyReader::ParsingResult HierarchyReader::DeserializeEntries( auto const osmId = base::GeoObjectId(encodedId); entry.m_osmId = osmId; - if (!entry.DeserializeFromJSON(json, stats)) + if (!entry.DeserializeFromJSON(json, nameDictionaryMaker, stats)) continue; if (entry.m_type == Type::Count) @@ -193,7 +207,7 @@ HierarchyReader::ParsingResult HierarchyReader::DeserializeEntries( entries.push_back(move(entry)); } - return {move(entries), move(stats)}; + return {move(entries), nameDictionaryMaker.Release(), move(stats)}; } // static diff --git a/geocoder/hierarchy_reader.hpp b/geocoder/hierarchy_reader.hpp index 9f1bd8b2b7..4d089a7f23 100644 --- a/geocoder/hierarchy_reader.hpp +++ b/geocoder/hierarchy_reader.hpp @@ -1,6 +1,7 @@ #pragma once #include "geocoder/hierarchy.hpp" +#include "geocoder/name_dictionary.hpp" #include "base/exception.hpp" #include "base/geo_object_id.hpp" @@ -33,12 +34,13 @@ private: struct ParsingResult { std::vector m_entries; + NameDictionary m_nameDictionary; ParsingStats m_stats; }; ParsingResult ReadEntries(size_t count); ParsingResult DeserializeEntries(std::vector const & linesBuffer, - std::size_t const bufferSize); + std::size_t const bufferSize); static bool DeserializeId(std::string const & str, uint64_t & id); static std::string SerializeId(uint64_t id); diff --git a/geocoder/index.cpp b/geocoder/index.cpp index ae47e36e1c..1a8a06fab1 100644 --- a/geocoder/index.cpp +++ b/geocoder/index.cpp @@ -26,6 +26,7 @@ namespace geocoder { Index::Index(Hierarchy const & hierarchy, unsigned int loadThreadsCount) : m_docs(hierarchy.GetEntries()) + , m_hierarchy{hierarchy} { CHECK_GREATER_OR_EQUAL(loadThreadsCount, 1, ()); @@ -55,6 +56,8 @@ string Index::MakeIndexKey(Tokens const & tokens) void Index::AddEntries() { size_t numIndexed = 0; + auto const & dictionary = m_hierarchy.GetNormalizedNameDictionary(); + Tokens tokens; for (DocId docId = 0; docId < static_cast(m_docs.size()); ++docId) { auto const & doc = m_docs[static_cast(docId)]; @@ -72,8 +75,9 @@ void Index::AddEntries() } else { - size_t const t = static_cast(doc.m_type); - m_docIdsByTokens[MakeIndexKey(doc.m_address[t])].emplace_back(docId); + auto const & name = doc.GetNormalizedName(doc.m_type, dictionary); + search::NormalizeAndTokenizeAsUtf8(name, tokens); + m_docIdsByTokens[MakeIndexKey(tokens)].emplace_back(docId); } ++numIndexed; @@ -88,28 +92,30 @@ void Index::AddEntries() void Index::AddStreet(DocId const & docId, Index::Doc const & doc) { CHECK_EQUAL(doc.m_type, Type::Street, ()); - size_t const t = static_cast(doc.m_type); auto isStreetSynonym = [] (string const & s) { return search::IsStreetSynonym(strings::MakeUniString(s)); }; - if (all_of(begin(doc.m_address[t]), end(doc.m_address[t]), isStreetSynonym)) + auto const & dictionary = m_hierarchy.GetNormalizedNameDictionary(); + auto const & name = doc.GetNormalizedName(Type::Street, dictionary); + Tokens tokens; + search::NormalizeAndTokenizeAsUtf8(name, tokens); + + if (all_of(begin(tokens), end(tokens), isStreetSynonym)) { - LOG(LDEBUG, ("Undefined proper name in tokens", doc.m_address[t], "of street entry", - doc.m_osmId, "(", doc.m_address, ")")); - if (doc.m_address[t].size() > 1) - m_docIdsByTokens[MakeIndexKey(doc.m_address[t])].emplace_back(docId); + if (tokens.size() > 1) + m_docIdsByTokens[MakeIndexKey(tokens)].emplace_back(docId); return; } - m_docIdsByTokens[MakeIndexKey(doc.m_address[t])].emplace_back(docId); + m_docIdsByTokens[MakeIndexKey(tokens)].emplace_back(docId); - for (size_t i = 0; i < doc.m_address[t].size(); ++i) + for (size_t i = 0; i < tokens.size(); ++i) { - if (!isStreetSynonym(doc.m_address[t][i])) + if (!isStreetSynonym(tokens[i])) continue; - auto addr = doc.m_address[t]; + auto addr = tokens; addr.erase(addr.begin() + i); m_docIdsByTokens[MakeIndexKey(addr)].emplace_back(docId); } @@ -123,6 +129,8 @@ void Index::AddHouses(unsigned int loadThreadsCount) vector threads(loadThreadsCount); CHECK_GREATER(threads.size(), 0, ()); + auto const & dictionary = m_hierarchy.GetNormalizedNameDictionary(); + for (size_t t = 0; t < threads.size(); ++t) { threads[t] = thread([&, t, this]() { @@ -137,31 +145,39 @@ void Index::AddHouses(unsigned int loadThreadsCount) if (buildingDoc.m_type != Type::Building) continue; - auto const & street = buildingDoc.m_address[static_cast(Type::Street)]; - auto const & locality = buildingDoc.m_address[static_cast(Type::Locality)]; + auto const & street = buildingDoc.m_normalizedAddress[static_cast(Type::Street)]; + auto const & locality = + buildingDoc.m_normalizedAddress[static_cast(Type::Locality)]; - Tokens const * relationName = nullptr; - - if (!street.empty()) - relationName = &street; - else if (!locality.empty()) - relationName = &locality; - - if (!relationName) + NameDictionary::Position relation = NameDictionary::kUnspecifiedPosition; + if (street) + relation = street; + else if (locality) + relation = locality; + else continue; - ForEachDocId(*relationName, [&](DocId const & candidate) { + auto const & relationName = dictionary.Get(relation); + Tokens relationNameTokens; + search::NormalizeAndTokenizeAsUtf8(relationName, relationNameTokens); + bool indexed = false; + ForEachDocId(relationNameTokens, [&](DocId const & candidate) { auto const & candidateDoc = GetDoc(candidate); - if (candidateDoc.IsParentTo(buildingDoc)) + if (m_hierarchy.IsParentTo(candidateDoc, buildingDoc)) { + indexed = true; + lock_guard lock(buildingsMutex); m_relatedBuildings[candidate].emplace_back(docId); } }); - auto processedCount = numIndexed.fetch_add(1) + 1; - if (processedCount % kLogBatch == 0) - LOG(LINFO, ("Indexed", processedCount, "houses")); + if (indexed) + { + auto processedCount = numIndexed.fetch_add(1) + 1; + if (processedCount % kLogBatch == 0) + LOG(LINFO, ("Indexed", processedCount, "houses")); + } } }); } diff --git a/geocoder/index.hpp b/geocoder/index.hpp index 34a5941384..50f58471b9 100644 --- a/geocoder/index.hpp +++ b/geocoder/index.hpp @@ -70,6 +70,7 @@ private: void AddHouses(unsigned int loadThreadsCount); std::vector const & m_docs; + Hierarchy const & m_hierarchy; std::unordered_map> m_docIdsByTokens; diff --git a/geocoder/name_dictionary.cpp b/geocoder/name_dictionary.cpp new file mode 100644 index 0000000000..98bbfd4d27 --- /dev/null +++ b/geocoder/name_dictionary.cpp @@ -0,0 +1,42 @@ +#include "geocoder/name_dictionary.hpp" + +#include "base/assert.hpp" + +#include + +namespace geocoder +{ +// NameDictionary ---------------------------------------------------------------------------------- +std::string const & NameDictionary::Get(Position position) const +{ + CHECK_GREATER(position, 0, ()); + CHECK_LESS_OR_EQUAL(position, m_stock.size(), ()); + return m_stock[position - 1]; +} + +NameDictionary::Position NameDictionary::Add(std::string const & s) +{ + CHECK_LESS(m_stock.size(), UINT32_MAX, ()); + m_stock.push_back(s); + return m_stock.size(); // index + 1 +} + +// NameDictionaryMaker ----------------------------------------------------------------------------- +NameDictionary::Position NameDictionaryMaker::Add(std::string const & s) +{ + auto indexItem = m_index.find(s); + if (indexItem != m_index.end()) + return indexItem->second; + + auto p = m_dictionary.Add(s); + auto indexEmplace = m_index.emplace(m_dictionary.Get(p), p); + CHECK(indexEmplace.second, ()); + return p; +} + +NameDictionary NameDictionaryMaker::Release() +{ + m_index.clear(); + return std::move(m_dictionary); +} +} // namespace geocoder diff --git a/geocoder/name_dictionary.hpp b/geocoder/name_dictionary.hpp new file mode 100644 index 0000000000..344046cb0e --- /dev/null +++ b/geocoder/name_dictionary.hpp @@ -0,0 +1,46 @@ +#pragma once + +#include +#include +#include +#include + +namespace geocoder +{ +class NameDictionary +{ +public: + // Values of Position type: kUnspecifiedPosition or >= 1. + using Position = std::uint32_t; + + static constexpr Position kUnspecifiedPosition = 0; + + NameDictionary() = default; + NameDictionary(NameDictionary &&) = default; + NameDictionary & operator=(NameDictionary &&) = default; + + NameDictionary(NameDictionary const &) = delete; + NameDictionary & operator=(NameDictionary const &) = delete; + + std::string const & Get(Position position) const; + Position Add(std::string const & s); + +private: + std::vector m_stock; +}; + +class NameDictionaryMaker +{ +public: + NameDictionaryMaker() = default; + NameDictionaryMaker(NameDictionaryMaker const &) = delete; + NameDictionaryMaker & operator=(NameDictionaryMaker const &) = delete; + + NameDictionary::Position Add(std::string const & s); + NameDictionary Release(); + +private: + NameDictionary m_dictionary; + std::unordered_map m_index; +}; +} // namespace geocoder