From 5bc28544212d997c0045e68c64edc4314124a8d8 Mon Sep 17 00:00:00 2001 From: Maxim Pimenov Date: Sat, 29 Dec 2018 18:29:29 +0300 Subject: [PATCH] [geocoder] Index now uses integer document ids instead of pointers. --- geocoder/geocoder.cpp | 68 +++++++--------- geocoder/geocoder.hpp | 6 +- geocoder/geocoder_tests/geocoder_tests.cpp | 16 ++-- geocoder/hierarchy.cpp | 20 ++--- geocoder/hierarchy.hpp | 6 +- geocoder/index.cpp | 95 ++++++++++------------ geocoder/index.hpp | 56 ++++++++++--- 7 files changed, 142 insertions(+), 125 deletions(-) diff --git a/geocoder/geocoder.cpp b/geocoder/geocoder.cpp index 6b9a091877..3ef10c88b0 100644 --- a/geocoder/geocoder.cpp +++ b/geocoder/geocoder.cpp @@ -73,22 +73,6 @@ geocoder::Type NextType(geocoder::Type type) return static_cast(t + 1); } -bool HasParent(vector const & layers, - geocoder::Hierarchy::Entry const & e) -{ - CHECK(!layers.empty(), ()); - auto const & layer = layers.back(); - for (auto const * pe : layer.m_entries) - { - // Note that the relationship is somewhat inverted: every ancestor - // is stored in the address but the nodes have no information - // about their children. - if (pe->IsParentTo(e)) - return true; - } - return false; -} - strings::UniString MakeHouseNumber(geocoder::Tokens const & tokens) { return strings::MakeUniString(strings::JoinStrings(tokens, " ")); @@ -285,8 +269,11 @@ void Geocoder::Go(Context & ctx, Type type) const allTypes.push_back(t); } - for (auto const * e : curLayer.m_entries) - ctx.AddResult(e->m_osmId, certainty, type, move(allTypes), ctx.AllTokensUsed()); + for (auto const & docId : curLayer.m_entries) + { + ctx.AddResult(m_index.GetDoc(docId).m_osmId, certainty, type, move(allTypes), + ctx.AllTokensUsed()); + } ctx.GetLayers().emplace_back(move(curLayer)); SCOPE_GUARD(pop, [&] { ctx.GetLayers().pop_back(); }); @@ -316,36 +303,43 @@ void Geocoder::FillBuildingsLayer(Context & ctx, Tokens const & subquery, Layer // let's stay on the safer side and set the house number bit. ctx.SetHouseNumberBit(); - for (auto const & se : layer.m_entries) + for (auto const & streetDocId : layer.m_entries) { - auto const * buildings = m_index.GetBuildingsOnStreet(se->m_osmId); - if (buildings == nullptr) - continue; - for (auto const & be : *buildings) - { + m_index.ForEachBuildingOnStreet(streetDocId, [&](Index::DocId const & buildingDocId) { + auto const & bld = m_index.GetDoc(buildingDocId); auto const bt = static_cast(Type::Building); - auto const & realHN = MakeHouseNumber(be->m_address[bt]); + auto const & realHN = MakeHouseNumber(bld.m_address[bt]); if (search::house_numbers::HouseNumbersMatch(realHN, subqueryHN, false /* queryIsPrefix */)) - curLayer.m_entries.emplace_back(be); - } + curLayer.m_entries.emplace_back(buildingDocId); + }); } } void Geocoder::FillRegularLayer(Context const & ctx, Type type, Tokens const & subquery, Layer & curLayer) const { - auto const * entries = m_index.GetEntries(subquery); - if (!entries || entries->empty()) - return; + m_index.ForEachDocId(subquery, [&](Index::DocId const & docId) { + auto const & d = m_index.GetDoc(docId); + if (d.m_type != type) + return; - for (auto const * e : *entries) + if (ctx.GetLayers().empty() || HasParent(ctx.GetLayers(), d)) + curLayer.m_entries.emplace_back(docId); + }); +} + +bool Geocoder::HasParent(vector const & layers, Hierarchy::Entry const & e) const +{ + CHECK(!layers.empty(), ()); + auto const & layer = layers.back(); + for (auto const & docId : layer.m_entries) { - CHECK(e, ()); - if (e->m_type != type) - continue; - - if (ctx.GetLayers().empty() || HasParent(ctx.GetLayers(), *e)) - curLayer.m_entries.emplace_back(e); + // Note that the relationship is somewhat inverted: every ancestor + // is stored in the address but the nodes have no information + // about their children. + if (m_hierarchy.IsParent(m_index.GetDoc(docId), e)) + return true; } + return false; } } // namespace geocoder diff --git a/geocoder/geocoder.hpp b/geocoder/geocoder.hpp index 08434b113f..f987de5f11 100644 --- a/geocoder/geocoder.hpp +++ b/geocoder/geocoder.hpp @@ -42,7 +42,7 @@ public: struct Layer { Type m_type = Type::Count; - std::vector m_entries; + std::vector m_entries; }; // This class is very similar to the one we use in search/. @@ -134,6 +134,10 @@ private: void FillRegularLayer(Context const & ctx, Type type, Tokens const & subquery, Layer & curLayer) const; + // Returns whether any of the paths through |layers| can be extended + // by appending |e|. + bool HasParent(std::vector const & layers, Hierarchy::Entry const & e) const; + Hierarchy m_hierarchy; Index m_index; diff --git a/geocoder/geocoder_tests/geocoder_tests.cpp b/geocoder/geocoder_tests/geocoder_tests.cpp index 111c6f718a..58c951e9fa 100644 --- a/geocoder/geocoder_tests/geocoder_tests.cpp +++ b/geocoder/geocoder_tests/geocoder_tests.cpp @@ -72,15 +72,15 @@ UNIT_TEST(Geocoder_Hierarchy) ScopedFile const regionsJsonFile("regions.jsonl", kRegionsData); Geocoder geocoder(regionsJsonFile.GetFullPath()); - auto entries = geocoder.GetIndex().GetEntries({("florencia")}); + vector entries; + geocoder.GetIndex().ForEachDocId({("florencia")}, [&](Index::DocId const & docId) { + entries.emplace_back(geocoder.GetIndex().GetDoc(docId)); + }); - TEST(entries, ()); - TEST_EQUAL(entries->size(), 1, ()); - TEST_EQUAL((*entries)[0]->m_address[static_cast(Type::Country)], Split("cuba"), ()); - TEST_EQUAL((*entries)[0]->m_address[static_cast(Type::Region)], Split("ciego de avila"), - ()); - TEST_EQUAL((*entries)[0]->m_address[static_cast(Type::Subregion)], Split("florencia"), - ()); + TEST_EQUAL(entries.size(), 1, ()); + TEST_EQUAL(entries[0].m_address[static_cast(Type::Country)], Split("cuba"), ()); + TEST_EQUAL(entries[0].m_address[static_cast(Type::Region)], Split("ciego de avila"), ()); + TEST_EQUAL(entries[0].m_address[static_cast(Type::Subregion)], Split("florencia"), ()); } UNIT_TEST(Geocoder_OnlyBuildings) diff --git a/geocoder/hierarchy.cpp b/geocoder/hierarchy.cpp index 43e334b0bf..8579ad3ff2 100644 --- a/geocoder/hierarchy.cpp +++ b/geocoder/hierarchy.cpp @@ -127,16 +127,6 @@ bool Hierarchy::Entry::DeserializeFromJSONImpl(json_t * const root, string const return true; } -bool Hierarchy::Entry::IsParentTo(Hierarchy::Entry const & e) const -{ - for (size_t i = 0; i < static_cast(geocoder::Type::Count); ++i) - { - if (!m_address[i].empty() && m_address[i] != e.m_address[i]) - return false; - } - return true; -} - // Hierarchy --------------------------------------------------------------------------------------- Hierarchy::Hierarchy(string const & pathToJsonHierarchy) { @@ -216,4 +206,14 @@ Hierarchy::Entry const * Hierarchy::GetEntryForOsmId(base::GeoObjectId const & o return &(*it); } + +bool Hierarchy::IsParent(Hierarchy::Entry const & pe, Hierarchy::Entry const & e) const +{ + for (size_t i = 0; i < static_cast(geocoder::Type::Count); ++i) + { + if (!pe.m_address[i].empty() && pe.m_address[i] != e.m_address[i]) + return false; + } + return true; +} } // namespace geocoder diff --git a/geocoder/hierarchy.hpp b/geocoder/hierarchy.hpp index bae7d2fc38..952ab6bbc0 100644 --- a/geocoder/hierarchy.hpp +++ b/geocoder/hierarchy.hpp @@ -57,9 +57,6 @@ public: bool DeserializeFromJSONImpl(json_t * const root, std::string const & jsonStr, ParsingStats & stats); - // Checks whether this entry is a parent of |e|. - bool IsParentTo(Entry const & e) const; - bool operator<(Entry const & rhs) const { return m_osmId < rhs.m_osmId; } base::GeoObjectId m_osmId = base::GeoObjectId(base::GeoObjectId::kInvalid); @@ -81,6 +78,9 @@ public: Entry const * GetEntryForOsmId(base::GeoObjectId const & osmId) const; + // Checks whether |pe| is a parent of |e|. + bool IsParent(Entry const & pe, Entry const & e) const; + private: std::vector m_entries; }; diff --git a/geocoder/index.cpp b/geocoder/index.cpp index 512668dfdf..eed857fe3b 100644 --- a/geocoder/index.cpp +++ b/geocoder/index.cpp @@ -8,69 +8,58 @@ #include "base/logging.hpp" #include "base/string_utils.hpp" +#include + using namespace std; namespace { -// Information will be logged for every |kLogBatch| entries. +// Information will be logged for every |kLogBatch| docs. size_t const kLogBatch = 100000; - -string MakeIndexKey(geocoder::Tokens const & tokens) { return strings::JoinStrings(tokens, " "); } } // namespace namespace geocoder { -Index::Index(Hierarchy const & hierarchy) : m_entries(hierarchy.GetEntries()) +Index::Index(Hierarchy const & hierarchy) : m_docs(hierarchy.GetEntries()) { - LOG(LINFO, ("Indexing entries...")); + LOG(LINFO, ("Indexing hierarchy entries...")); AddEntries(); LOG(LINFO, ("Indexing houses...")); - AddHouses(); + AddHouses(hierarchy); } -vector const * const Index::GetEntries(Tokens const & tokens) const +Index::Doc const & Index::GetDoc(DocId const id) const { - auto const it = m_entriesByTokens.find(MakeIndexKey(tokens)); - if (it == m_entriesByTokens.end()) - return {}; - - return &it->second; + ASSERT_LESS(static_cast(id), m_docs.size(), ()); + return m_docs[static_cast(id)]; } -vector const * const Index::GetBuildingsOnStreet( - base::GeoObjectId const & osmId) const +string Index::MakeIndexKey(Tokens const & tokens) const { - auto const it = m_buildingsOnStreet.find(osmId); - if (it == m_buildingsOnStreet.end()) - return {}; - - return &it->second; + return strings::JoinStrings(tokens, " "); } void Index::AddEntries() { size_t numIndexed = 0; - for (auto const & e : m_entries) + for (DocId docId = 0; docId < static_cast(m_docs.size()); ++docId) { - // The entry is indexed only by its address. + auto const & doc = m_docs[static_cast(docId)]; + // The doc is indexed only by its address. // todo(@m) Index it by name too. - if (e.m_type == Type::Count) + if (doc.m_type == Type::Count) continue; - if (e.m_type == Type::Street) + if (doc.m_type == Type::Street) { - AddStreet(e); + AddStreet(docId, doc); } else { - size_t const t = static_cast(e.m_type); - m_entriesByTokens[MakeIndexKey(e.m_address[t])].emplace_back(&e); + size_t const t = static_cast(doc.m_type); + m_docIdsByTokens[MakeIndexKey(doc.m_address[t])].emplace_back(docId); } - // Index every token but do not index prefixes. - // for (auto const & tok : entry.m_address[t]) - // m_entriesByTokens[{tok}].emplace_back(entry); - ++numIndexed; if (numIndexed % kLogBatch == 0) LOG(LINFO, ("Indexed", numIndexed, "entries")); @@ -80,45 +69,45 @@ void Index::AddEntries() LOG(LINFO, ("Indexed", numIndexed, "entries")); } -void Index::AddStreet(Hierarchy::Entry const & e) +void Index::AddStreet(DocId const & docId, Index::Doc const & doc) { - CHECK_EQUAL(e.m_type, Type::Street, ()); - size_t const t = static_cast(e.m_type); - m_entriesByTokens[MakeIndexKey(e.m_address[t])].emplace_back(&e); + CHECK_EQUAL(doc.m_type, Type::Street, ()); + size_t const t = static_cast(doc.m_type); + m_docIdsByTokens[MakeIndexKey(doc.m_address[t])].emplace_back(docId); - for (size_t i = 0; i < e.m_address[t].size(); ++i) + for (size_t i = 0; i < doc.m_address[t].size(); ++i) { - if (!search::IsStreetSynonym(strings::MakeUniString(e.m_address[t][i]))) + if (!search::IsStreetSynonym(strings::MakeUniString(doc.m_address[t][i]))) continue; - auto addr = e.m_address[t]; + auto addr = doc.m_address[t]; addr.erase(addr.begin() + i); - m_entriesByTokens[MakeIndexKey(addr)].emplace_back(&e); + m_docIdsByTokens[MakeIndexKey(addr)].emplace_back(docId); } } -void Index::AddHouses() +void Index::AddHouses(Hierarchy const & hierarchy) { size_t numIndexed = 0; - for (auto & be : m_entries) + for (DocId docId = 0; docId < static_cast(hierarchy.GetEntries().size()); ++docId) { - if (be.m_type != Type::Building) + auto const & buildingDoc = GetDoc(docId); + + if (buildingDoc.m_type != Type::Building) continue; size_t const t = static_cast(Type::Street); - auto const * streetCandidates = GetEntries(be.m_address[t]); - if (streetCandidates == nullptr) - continue; + ForEachDocId(buildingDoc.m_address[t], [&](DocId const & streetCandidate) { + auto const & streetDoc = GetDoc(streetCandidate); + if (hierarchy.IsParent(streetDoc, buildingDoc)) + { + m_buildingsOnStreet[streetCandidate].emplace_back(docId); - for (auto & se : *streetCandidates) - { - if (se->IsParentTo(be)) - m_buildingsOnStreet[se->m_osmId].emplace_back(&be); - } - - ++numIndexed; - if (numIndexed % kLogBatch == 0) - LOG(LINFO, ("Indexed", numIndexed, "houses")); + ++numIndexed; + if (numIndexed % kLogBatch == 0) + LOG(LINFO, ("Indexed", numIndexed, "houses")); + } + }); } if (numIndexed % kLogBatch != 0) diff --git a/geocoder/index.hpp b/geocoder/index.hpp index 569ebb2e40..bc9b283c3f 100644 --- a/geocoder/index.hpp +++ b/geocoder/index.hpp @@ -4,6 +4,7 @@ #include "base/geo_object_id.hpp" +#include #include #include #include @@ -13,36 +14,65 @@ namespace geocoder class Index { public: - using EntryPtr = Hierarchy::Entry const *; + // Number of the entry in the list of all hierarchy entries + // that the index was constructed from. + using DocId = uint32_t; + + using Doc = Hierarchy::Entry; explicit Index(Hierarchy const & hierarchy); - // Returns a pointer to entries whose names exactly match |tokens| - // (the order matters) or nullptr if there are no such entries. + Doc const & GetDoc(DocId const id) const; + + // Calls |fn| for DocIds of Docs whose names exactly match |tokens| (the order matters). // // todo This method (and the whole class, in fact) is in the // prototype stage and may be too slow. Proper indexing should // be implemented to perform this type of queries. - std::vector const * const GetEntries(Tokens const & tokens) const; + template + void ForEachDocId(Tokens const & tokens, Fn && fn) const + { + auto const it = m_docIdsByTokens.find(MakeIndexKey(tokens)); + if (it == m_docIdsByTokens.end()) + return; - std::vector const * const GetBuildingsOnStreet(base::GeoObjectId const & osmId) const; + for (DocId const & docId : it->second) + fn(docId); + } + + // Calls |fn| for DocIds of buildings that are located on the + // street whose DocId is |streetDocId|. + template + void ForEachBuildingOnStreet(DocId const & streetDocId, Fn && fn) const + { + auto const it = m_buildingsOnStreet.find(streetDocId); + if (it == m_buildingsOnStreet.end()) + return; + + for (DocId const & docId : it->second) + fn(docId); + } private: - // Adds address information of entries to the index. + // Converts |tokens| to a single UTF-8 string that can be used + // as a key in the |m_docIdsByTokens| map. + std::string MakeIndexKey(Tokens const & tokens) const; + + // Adds address information of |m_docs| to the index. void AddEntries(); - // Adds the street |e| to the index, with and without synonyms - // of the word "street". - void AddStreet(Hierarchy::Entry const & e); + // Adds the street |e| (which has the id of |docId|) to the index, + // with and without synonyms of the word "street". + void AddStreet(DocId const & docId, Doc const & e); // Fills the |m_buildingsOnStreet| field. - void AddHouses(); + void AddHouses(Hierarchy const & hierarchy); - std::vector const & m_entries; + std::vector const & m_docs; - std::unordered_map> m_entriesByTokens; + std::unordered_map> m_docIdsByTokens; // Lists of houses grouped by the streets they belong to. - std::unordered_map> m_buildingsOnStreet; + std::unordered_map> m_buildingsOnStreet; }; } // namespace geocoder