From 87e199b36f911e2e87fbfbdd01646fed3fef3d37 Mon Sep 17 00:00:00 2001 From: Maxim Pimenov Date: Tue, 27 Nov 2018 13:52:55 +0300 Subject: [PATCH] [geocoder] Better indexing. --- geocoder/geocoder.cpp | 16 ++-- geocoder/geocoder_tests/geocoder_tests.cpp | 15 ++-- geocoder/hierarchy.cpp | 97 ++++++++++++++-------- geocoder/hierarchy.hpp | 10 ++- 4 files changed, 91 insertions(+), 47 deletions(-) diff --git a/geocoder/geocoder.cpp b/geocoder/geocoder.cpp index f3515daa58..1bc73b2575 100644 --- a/geocoder/geocoder.cpp +++ b/geocoder/geocoder.cpp @@ -156,6 +156,13 @@ void Geocoder::Context::FillResults(vector & results) const results.emplace_back(e.m_key /* osmId */, e.m_value /* certainty */); } + if (!results.empty()) + { + auto const by = results.front().m_certainty; + for (auto & r : results) + r.m_certainty /= by; + } + ASSERT(is_sorted(results.rbegin(), results.rend(), base::LessBy(&Result::m_certainty)), ()); ASSERT_LESS_OR_EQUAL(results.size(), kMaxResults, ()); } @@ -234,7 +241,6 @@ void Geocoder::Go(Context & ctx, Type type) const CHECK(it != kWeight.end(), ()); certainty += it->second; } - LOG(LINFO, (ctx.GetTokenTypes(), certainty)); for (auto const * e : curLayer.m_entries) { @@ -284,13 +290,13 @@ void Geocoder::FillRegularLayer(Context const & ctx, Type type, Tokens const & s if (!entries || entries->empty()) return; - for (auto const & e : *entries) + for (auto const * e : *entries) { - if (e.m_type != type) + if (e->m_type != type) continue; - if (ctx.GetLayers().empty() || FindParent(ctx.GetLayers(), e)) - curLayer.m_entries.emplace_back(&e); + if (ctx.GetLayers().empty() || FindParent(ctx.GetLayers(), *e)) + curLayer.m_entries.emplace_back(e); } } } // namespace geocoder diff --git a/geocoder/geocoder_tests/geocoder_tests.cpp b/geocoder/geocoder_tests/geocoder_tests.cpp index 5de5c5f2b9..52730b8d6e 100644 --- a/geocoder/geocoder_tests/geocoder_tests.cpp +++ b/geocoder/geocoder_tests/geocoder_tests.cpp @@ -46,6 +46,7 @@ void TestGeocoder(Geocoder & geocoder, string const & query, vector && e sort(expected.begin(), expected.end(), base::LessBy(&Result::m_osmId)); for (size_t i = 0; i < actual.size(); ++i) { + TEST(actual[i].m_certainty >= 0.0 && actual[i].m_certainty <= 1.0, (actual[i].m_certainty)); TEST_EQUAL(actual[i].m_osmId, expected[i].m_osmId, ()); TEST(base::AlmostEqualAbs(actual[i].m_certainty, expected[i].m_certainty, kCertaintyEps), (query, actual[i].m_certainty, expected[i].m_certainty)); @@ -60,10 +61,9 @@ UNIT_TEST(Geocoder_Smoke) base::GeoObjectId const florenciaId(0xc00000000059d6b5); base::GeoObjectId const cubaId(0xc00000000004b279); - // todo(@m) Return the certainty levels back to the [0.0, 1.0] range. - TestGeocoder(geocoder, "florencia", {{florenciaId, 4.0}}); - TestGeocoder(geocoder, "cuba florencia", {{florenciaId, 14.0}, {cubaId, 10.0}}); - TestGeocoder(geocoder, "florencia somewhere in cuba", {{cubaId, 10.0}, {florenciaId, 14.0}}); + TestGeocoder(geocoder, "florencia", {{florenciaId, 1.0}}); + TestGeocoder(geocoder, "cuba florencia", {{florenciaId, 1.0}, {cubaId, 0.714286}}); + TestGeocoder(geocoder, "florencia somewhere in cuba", {{cubaId, 0.714286}, {florenciaId, 1.0}}); } UNIT_TEST(Geocoder_Hierarchy) @@ -75,9 +75,10 @@ UNIT_TEST(Geocoder_Hierarchy) TEST(entries, ()); TEST_EQUAL(entries->size(), 1, ()); - TEST_EQUAL((*entries)[0].m_address[static_cast(Type::Country)], Split("cuba"), ()); - TEST_EQUAL((*entries)[0].m_address[static_cast(Type::Region)], Split("ciego de avila"), + TEST_EQUAL((*entries)[0]->m_address[static_cast(Type::Country)], Split("cuba"), ()); + TEST_EQUAL((*entries)[0]->m_address[static_cast(Type::Region)], Split("ciego de avila"), + ()); + TEST_EQUAL((*entries)[0]->m_address[static_cast(Type::Subregion)], Split("florencia"), ()); - TEST_EQUAL((*entries)[0].m_address[static_cast(Type::Subregion)], Split("florencia"), ()); } } // namespace geocoder diff --git a/geocoder/hierarchy.cpp b/geocoder/hierarchy.cpp index 168a965b6b..1288335bf1 100644 --- a/geocoder/hierarchy.cpp +++ b/geocoder/hierarchy.cpp @@ -11,9 +11,16 @@ #include "base/string_utils.hpp" #include +#include using namespace std; +namespace +{ +// Information will be logged for every |kLogBatch| entries. +size_t const kLogBatch = 100000; +} // namespace + namespace geocoder { // Hierarchy::Entry -------------------------------------------------------------------------------- @@ -104,6 +111,7 @@ Hierarchy::Hierarchy(string const & pathToJsonHierarchy) string line; ParsingStats stats; + LOG(LINFO, ("Reading entries")); while (getline(ifs, line)) { if (line.empty()) @@ -126,23 +134,21 @@ Hierarchy::Hierarchy(string const & pathToJsonHierarchy) if (!entry.DeserializeFromJSON(line, stats)) continue; - // The entry is indexed only its address. - // todo(@m) Index it by name too. - if (entry.m_type != Type::Count) - { - ++stats.m_numLoaded; - size_t const t = static_cast(entry.m_type); - m_entries[entry.m_address[t]].emplace_back(entry); + if (entry.m_type == Type::Count) + continue; - // Index every token but do not index prefixes. - // for (auto const & tok : entry.m_address[t]) - // m_entries[{tok}].emplace_back(entry); - } + ++stats.m_numLoaded; + if (stats.m_numLoaded % kLogBatch == 0) + LOG(LINFO, ("Read", stats.m_numLoaded, "entries")); + m_entriesStorage.emplace_back(move(entry)); } + LOG(LINFO, ("Indexing entries")); + IndexEntries(); + LOG(LINFO, ("Indexing houses")); IndexHouses(); - LOG(LINFO, ("Finished reading the hierarchy. Stats:")); + LOG(LINFO, ("Finished reading and indexing the hierarchy. Stats:")); LOG(LINFO, ("Entries indexed:", stats.m_numLoaded)); LOG(LINFO, ("Corrupted json lines:", stats.m_badJsons)); LOG(LINFO, ("Unreadable base::GeoObjectIds:", stats.m_badOsmIds)); @@ -154,39 +160,64 @@ Hierarchy::Hierarchy(string const & pathToJsonHierarchy) LOG(LINFO, ("(End of stats.)")); } -vector const * const Hierarchy::GetEntries( +vector const * const Hierarchy::GetEntries( vector const & tokens) const { - auto it = m_entries.find(tokens); - if (it == m_entries.end()) + auto it = m_entriesByTokens.find(tokens); + if (it == m_entriesByTokens.end()) return {}; return &it->second; } +void Hierarchy::IndexEntries() +{ + size_t numIndexed = 0; + for (Entry & e : m_entriesStorage) + { + // The entry is indexed only by its address. + // todo(@m) Index it by name too. + if (e.m_type == Type::Count) + continue; + + size_t const t = static_cast(e.m_type); + m_entriesByTokens[e.m_address[t]].emplace_back(&e); + + // Index every token but do not index prefixes. + // for (auto const & tok : entry.m_address[t]) + // m_entriesByTokens[{tok}].emplace_back(entry); + + ++numIndexed; + if (numIndexed % kLogBatch == 0) + LOG(LINFO, ("Indexed", numIndexed, "entries")); + } +} + void Hierarchy::IndexHouses() { - for (auto const & it : m_entries) + size_t numIndexed = 0; + for (auto const & be : m_entriesStorage) { - for (auto const & be : it.second) + if (be.m_type != Type::Building) + continue; + + size_t const t = static_cast(Type::Street); + + // GetEntries() cannot be used here because one of the + // "consts" in it conflicts with the emplace_back below. + auto streetCandidatesIt = m_entriesByTokens.find(be.m_address[t]); + if (streetCandidatesIt == m_entriesByTokens.end()) + continue; + + for (auto & se : streetCandidatesIt->second) { - if (be.m_type != Type::Building) - continue; - - size_t const t = static_cast(Type::Street); - - // GetEntries() cannot be used here because one of the - // "consts" in it conflicts with the emplace_back below. - auto streetCandidatesIt = m_entries.find(be.m_address[t]); - if (streetCandidatesIt == m_entries.end()) - continue; - - for (auto & se : streetCandidatesIt->second) - { - if (se.IsParentTo(be)) - se.m_buildingsOnStreet.emplace_back(&be); - } + if (se->IsParentTo(be)) + se->m_buildingsOnStreet.emplace_back(&be); } + + ++numIndexed; + if (numIndexed % kLogBatch == 0) + LOG(LINFO, ("Indexed", numIndexed, "houses")); } } } // namespace geocoder diff --git a/geocoder/hierarchy.hpp b/geocoder/hierarchy.hpp index 95d9753552..d5bad91d6d 100644 --- a/geocoder/hierarchy.hpp +++ b/geocoder/hierarchy.hpp @@ -85,12 +85,18 @@ public: // todo This method (and the whole class, in fact) is in the // prototype stage and may be too slow. Proper indexing should // be implemented to perform this type of queries. - std::vector const * const GetEntries(std::vector const & tokens) const; + std::vector const * const GetEntries( + std::vector const & tokens) const; private: + // Adds address information of entries to the index. + void IndexEntries(); + // Fills |m_buildingsOnStreet| field for all street entries. void IndexHouses(); - std::map> m_entries; + std::map> m_entriesByTokens; + + std::vector m_entriesStorage; }; } // namespace geocoder