From dbd3473e5d6bca8e6e49f7a127d7fa11f58b1d12 Mon Sep 17 00:00:00 2001 From: Maxim Pimenov Date: Mon, 26 Nov 2018 17:43:52 +0300 Subject: [PATCH] [geocoder] Improvements. * Keeping only the top results (this is not a proper beam search yet but we probably will still need the data structure in the future). * Fixed the parent-child layer check. * A separate code path to index the buildings. --- geocoder/CMakeLists.txt | 2 + geocoder/beam.cpp | 31 ++++ geocoder/beam.hpp | 40 +++++ geocoder/geocoder.cpp | 161 ++++++++++++++++----- geocoder/geocoder.hpp | 14 +- geocoder/geocoder_cli/CMakeLists.txt | 1 + geocoder/geocoder_tests/CMakeLists.txt | 1 + geocoder/geocoder_tests/geocoder_tests.cpp | 7 +- geocoder/hierarchy.cpp | 45 +++++- geocoder/hierarchy.hpp | 14 +- geocoder/result.hpp | 2 + geocoder/types.cpp | 3 +- geocoder/types.hpp | 1 + 13 files changed, 272 insertions(+), 50 deletions(-) create mode 100644 geocoder/beam.cpp create mode 100644 geocoder/beam.hpp diff --git a/geocoder/CMakeLists.txt b/geocoder/CMakeLists.txt index 88065b006c..01f2cf1a35 100644 --- a/geocoder/CMakeLists.txt +++ b/geocoder/CMakeLists.txt @@ -4,6 +4,8 @@ include_directories(${OMIM_ROOT}/3party/jansson/src) set( SRC + beam.cpp + beam.hpp geocoder.cpp geocoder.hpp hierarchy.cpp diff --git a/geocoder/beam.cpp b/geocoder/beam.cpp new file mode 100644 index 0000000000..ab0882232a --- /dev/null +++ b/geocoder/beam.cpp @@ -0,0 +1,31 @@ +#include "geocoder/beam.hpp" + +#include "base/assert.hpp" + +#include + +namespace geocoder +{ +Beam::Beam(size_t capacity) : m_capacity(capacity) { m_entries.reserve(m_capacity); } + +void Beam::Add(Key const & key, Value value) +{ + if (m_capacity == 0) + return; + + Entry const e(key, value); + auto it = std::lower_bound(m_entries.begin(), m_entries.end(), e); + + if (it == m_entries.end()) + { + if (m_entries.size() < m_capacity) + m_entries.emplace_back(e); + return; + } + + if (m_entries.size() == m_capacity) + m_entries.pop_back(); + + m_entries.insert(it, e); +} +} // namespace geocoder diff --git a/geocoder/beam.hpp b/geocoder/beam.hpp new file mode 100644 index 0000000000..68534ce125 --- /dev/null +++ b/geocoder/beam.hpp @@ -0,0 +1,40 @@ +#pragma once + +#include "base/geo_object_id.hpp" + +#include + +namespace geocoder +{ +// A data structure to perform the beam search with. +// Maintains a list of (Key, Value) pairs sorted in the decreasing +// order of Values. +class Beam +{ +public: + using Key = base::GeoObjectId; + using Value = double; + + struct Entry + { + Key m_key; + Value m_value; + + Entry(Key const & key, Value value) : m_key(key), m_value(value) {} + + bool operator<(Entry const & rhs) const { return m_value > rhs.m_value; } + }; + + explicit Beam(size_t capacity); + + // O(log(n) + n) for |n| entries. + // O(|m_capacity|) in the worst case. + void Add(Key const & key, Value value); + + std::vector const & GetEntries() const { return m_entries; } + +private: + size_t m_capacity; + std::vector m_entries; +}; +} // namespace geocoder diff --git a/geocoder/geocoder.cpp b/geocoder/geocoder.cpp index d74b9adaee..bed0f8af60 100644 --- a/geocoder/geocoder.cpp +++ b/geocoder/geocoder.cpp @@ -1,10 +1,14 @@ #include "geocoder/geocoder.hpp" +#include "search/house_numbers_matcher.hpp" + #include "indexer/search_string_utils.hpp" #include "base/assert.hpp" #include "base/logging.hpp" #include "base/scope_guard.hpp" +#include "base/stl_helpers.hpp" +#include "base/string_utils.hpp" #include "base/timer.hpp" #include @@ -14,6 +18,20 @@ using namespace std; namespace { +size_t const kMaxResults = 100; + +map const kWeight = { + {geocoder::Type::Country, 10.0}, + {geocoder::Type::Region, 5.0}, + {geocoder::Type::Subregion, 4.0}, + {geocoder::Type::Locality, 3.0}, + {geocoder::Type::Suburb, 3.0}, + {geocoder::Type::Sublocality, 2.0}, + {geocoder::Type::Street, 1.0}, + {geocoder::Type::Building, 0.1}, + {geocoder::Type::Count, 0.0}, +}; + // todo(@m) This is taken from search/geocoder.hpp. Refactor. struct ScopedMarkTokens { @@ -52,25 +70,29 @@ geocoder::Type NextType(geocoder::Type type) bool FindParent(vector const & layers, geocoder::Hierarchy::Entry const & e) { - for (auto const & layer : layers) + CHECK(!layers.empty(), ()); + auto const & layer = layers.back(); + for (auto const * pe : layer.m_entries) { - for (auto const * pe : layer.m_entries) - { - // Note that the relationship is somewhat inverted: every ancestor - // is stored in the address but the nodes have no information - // about their children. - if (e.m_address[static_cast(pe->m_type)] == pe->m_nameTokens) - return true; - } + // Note that the relationship is somewhat inverted: every ancestor + // is stored in the address but the nodes have no information + // about their children. + if (pe->IsParentTo(e)) + return true; } return false; } + +strings::UniString MakeHouseNumber(geocoder::Tokens const & tokens) +{ + return strings::JoinStrings(tokens, strings::MakeUniString("")); +} } // namespace namespace geocoder { // Geocoder::Context ------------------------------------------------------------------------------- -Geocoder::Context::Context(string const & query) +Geocoder::Context::Context(string const & query) : m_beam(kMaxResults) { search::NormalizeAndTokenizeString(query, m_tokens); m_tokenTypes.assign(m_tokens.size(), Type::Count); @@ -116,15 +138,25 @@ bool Geocoder::Context::AllTokensUsed() const { return m_numUsedTokens == m_toke void Geocoder::Context::AddResult(base::GeoObjectId const & osmId, double certainty) { - m_results[osmId] = max(m_results[osmId], certainty); + m_beam.Add(osmId, certainty); } void Geocoder::Context::FillResults(vector & results) const { results.clear(); - results.reserve(m_results.size()); - for (auto const & e : m_results) - results.emplace_back(e.first /* osmId */, e.second /* certainty */); + results.reserve(m_beam.GetEntries().size()); + + set seen; + for (auto const & e : m_beam.GetEntries()) + { + if (!seen.insert(e.m_key).second) + continue; + + results.emplace_back(e.m_key /* osmId */, e.m_value /* certainty */); + } + + ASSERT(is_sorted(results.rbegin(), results.rend(), base::LessBy(&Result::m_certainty)), ()); + ASSERT_LESS_OR_EQUAL(results.size(), kMaxResults, ()); } vector & Geocoder::Context::GetLayers() { return m_layers; } @@ -132,7 +164,7 @@ vector & Geocoder::Context::GetLayers() { return m_layers; } vector const & Geocoder::Context::GetLayers() const { return m_layers; } // Geocoder ---------------------------------------------------------------------------------------- -Geocoder::Geocoder(string pathToJsonHierarchy) : m_hierarchy(pathToJsonHierarchy) {} +Geocoder::Geocoder(string const & pathToJsonHierarchy) : m_hierarchy(pathToJsonHierarchy) {} void Geocoder::ProcessQuery(string const & query, vector & results) const { @@ -172,39 +204,92 @@ void Geocoder::Go(Context & ctx, Type type) const subquery.push_back(ctx.GetToken(j)); - auto const * entries = m_hierarchy.GetEntries(subquery); - if (!entries || entries->empty()) - continue; - Layer curLayer; curLayer.m_type = type; - for (auto const & e : *entries) - { - if (e.m_type != type) - continue; - if (ctx.GetLayers().empty() || FindParent(ctx.GetLayers(), e)) - curLayer.m_entries.emplace_back(&e); + // Buildings are indexed separately. + if (type == Type::Building) + { + FillBuildingsLayer(ctx, subquery, curLayer); + } + else + { + FillRegularLayer(ctx, type, subquery, curLayer); } - if (!curLayer.m_entries.empty()) + if (curLayer.m_entries.empty()) + continue; + + ScopedMarkTokens mark(ctx, type, i, j + 1); + + // double const certainty = + // static_cast(ctx.GetNumUsedTokens()) / + // static_cast(ctx.GetNumTokens()); + + double certainty = 0; + for (auto const t : ctx.GetTokenTypes()) { - ScopedMarkTokens mark(ctx, type, i, j + 1); - - double const certainty = - static_cast(ctx.GetNumUsedTokens()) / static_cast(ctx.GetNumTokens()); - - for (auto const * e : curLayer.m_entries) - ctx.AddResult(e->m_osmId, certainty); - - ctx.GetLayers().emplace_back(move(curLayer)); - SCOPE_GUARD(pop, [&] { ctx.GetLayers().pop_back(); }); - - Go(ctx, NextType(type)); + auto const it = kWeight.find(t); + CHECK(it != kWeight.end(), ()); + certainty += it->second; } + LOG(LINFO, (ctx.GetTokenTypes(), certainty)); + + for (auto const * e : curLayer.m_entries) + { + ctx.AddResult(e->m_osmId, certainty); + } + + ctx.GetLayers().emplace_back(move(curLayer)); + SCOPE_GUARD(pop, [&] { ctx.GetLayers().pop_back(); }); + + Go(ctx, NextType(type)); } } Go(ctx, NextType(type)); } + +void Geocoder::FillBuildingsLayer(Context const & ctx, Tokens const & subquery, + Layer & curLayer) const +{ + if (ctx.GetLayers().empty()) + return; + auto const & layer = ctx.GetLayers().back(); + if (layer.m_type != Type::Street) + return; + + auto const & subqueryHN = MakeHouseNumber(subquery); + + if (!search::house_numbers::LooksLikeHouseNumber(subqueryHN, false /* isPrefix */)) + return; + + for (auto const & se : layer.m_entries) + { + for (auto const & be : se->m_buildingsOnStreet) + { + auto const bt = static_cast(Type::Building); + auto const & realHN = MakeHouseNumber(be->m_address[bt]); + if (search::house_numbers::HouseNumbersMatch(realHN, subqueryHN, false /* queryIsPrefix */)) + curLayer.m_entries.emplace_back(be); + } + } +} + +void Geocoder::FillRegularLayer(Context const & ctx, Type type, Tokens const & subquery, + Layer & curLayer) const +{ + auto const * entries = m_hierarchy.GetEntries(subquery); + if (!entries || entries->empty()) + return; + + for (auto const & e : *entries) + { + if (e.m_type != type) + continue; + + if (ctx.GetLayers().empty() || FindParent(ctx.GetLayers(), e)) + curLayer.m_entries.emplace_back(&e); + } +} } // namespace geocoder diff --git a/geocoder/geocoder.hpp b/geocoder/geocoder.hpp index 6075ad8fda..077b4e7402 100644 --- a/geocoder/geocoder.hpp +++ b/geocoder/geocoder.hpp @@ -1,5 +1,6 @@ #pragma once +#include "geocoder/beam.hpp" #include "geocoder/hierarchy.hpp" #include "geocoder/result.hpp" #include "geocoder/types.hpp" @@ -9,6 +10,7 @@ #include #include +#include #include namespace geocoder @@ -78,13 +80,14 @@ public: size_t m_numUsedTokens = 0; - // The highest value of certainty for each retrieved osm id. - std::unordered_map m_results; + // The highest value of certainty for a fixed amount of + // the most relevant retrieved osm ids. + Beam m_beam; std::vector m_layers; }; - explicit Geocoder(std::string pathToJsonHierarchy); + explicit Geocoder(std::string const & pathToJsonHierarchy); void ProcessQuery(std::string const & query, std::vector & results) const; @@ -93,7 +96,10 @@ public: private: void Go(Context & ctx, Type type) const; - void EmitResult() const; + void FillBuildingsLayer(Context const & ctx, Tokens const & subquery, Layer & curLayer) const; + + void FillRegularLayer(Context const & ctx, Type type, Tokens const & subquery, + Layer & curLayer) const; Hierarchy m_hierarchy; }; diff --git a/geocoder/geocoder_cli/CMakeLists.txt b/geocoder/geocoder_cli/CMakeLists.txt index 1080e7d468..d2e5655ab6 100644 --- a/geocoder/geocoder_cli/CMakeLists.txt +++ b/geocoder/geocoder_cli/CMakeLists.txt @@ -12,6 +12,7 @@ omim_add_executable(${PROJECT_NAME} ${SRC}) omim_link_libraries( ${PROJECT_NAME} geocoder + search indexer platform coding diff --git a/geocoder/geocoder_tests/CMakeLists.txt b/geocoder/geocoder_tests/CMakeLists.txt index cf9478c276..a59bd7e1ac 100644 --- a/geocoder/geocoder_tests/CMakeLists.txt +++ b/geocoder/geocoder_tests/CMakeLists.txt @@ -11,6 +11,7 @@ omim_link_libraries( ${PROJECT_NAME} platform_tests_support geocoder + search indexer platform coding diff --git a/geocoder/geocoder_tests/geocoder_tests.cpp b/geocoder/geocoder_tests/geocoder_tests.cpp index 13d8f61909..5de5c5f2b9 100644 --- a/geocoder/geocoder_tests/geocoder_tests.cpp +++ b/geocoder/geocoder_tests/geocoder_tests.cpp @@ -60,9 +60,10 @@ UNIT_TEST(Geocoder_Smoke) base::GeoObjectId const florenciaId(0xc00000000059d6b5); base::GeoObjectId const cubaId(0xc00000000004b279); - TestGeocoder(geocoder, "florencia", {{florenciaId, 1.0}}); - TestGeocoder(geocoder, "cuba florencia", {{florenciaId, 1.0}, {cubaId, 0.5}}); - TestGeocoder(geocoder, "florencia somewhere in cuba", {{cubaId, 0.25}, {florenciaId, 0.5}}); + // todo(@m) Return the certainty levels back to the [0.0, 1.0] range. + TestGeocoder(geocoder, "florencia", {{florenciaId, 4.0}}); + TestGeocoder(geocoder, "cuba florencia", {{florenciaId, 14.0}, {cubaId, 10.0}}); + TestGeocoder(geocoder, "florencia somewhere in cuba", {{cubaId, 10.0}, {florenciaId, 14.0}}); } UNIT_TEST(Geocoder_Hierarchy) diff --git a/geocoder/hierarchy.cpp b/geocoder/hierarchy.cpp index 8a47aefc27..30dc423e00 100644 --- a/geocoder/hierarchy.cpp +++ b/geocoder/hierarchy.cpp @@ -7,6 +7,9 @@ #include "base/logging.hpp" #include "base/macros.hpp" +#include "base/stl_helpers.hpp" +#include "base/string_utils.hpp" + #include using namespace std; @@ -81,10 +84,19 @@ void Hierarchy::Entry::DeserializeFromJSONImpl(json_t * const root, string const else if (m_nameTokens != m_address[static_cast(m_type)]) { ++stats.m_mismatchedNames; - LOG(LDEBUG, ("Hierarchy entry name is not the most detailed field in its address:", jsonStr)); } } +bool Hierarchy::Entry::IsParentTo(Hierarchy::Entry const & e) const +{ + for (size_t i = 0; i < static_cast(geocoder::Type::Count); ++i) + { + if (!m_address[i].empty() && m_address[i] != e.m_address[i]) + return false; + } + return true; +} + // Hierarchy --------------------------------------------------------------------------------------- Hierarchy::Hierarchy(string const & pathToJsonHierarchy) { @@ -114,16 +126,22 @@ Hierarchy::Hierarchy(string const & pathToJsonHierarchy) if (!entry.DeserializeFromJSON(line, stats)) continue; - // The entry is indexed only by its address. + // The entry is indexed only its address. // todo(@m) Index it by name too. if (entry.m_type != Type::Count) { ++stats.m_numLoaded; size_t const t = static_cast(entry.m_type); m_entries[entry.m_address[t]].emplace_back(entry); + + // Index every token but do not index prefixes. + // for (auto const & tok : entry.m_address[t]) + // m_entries[{tok}].emplace_back(entry); } } + IndexHouses(); + LOG(LINFO, ("Finished reading the hierarchy. Stats:")); LOG(LINFO, ("Entries indexed:", stats.m_numLoaded)); LOG(LINFO, ("Corrupted json lines:", stats.m_badJsons)); @@ -145,4 +163,27 @@ vector const * const Hierarchy::GetEntries( return &it->second; } + +void Hierarchy::IndexHouses() +{ + for (auto const & it : m_entries) + { + for (auto const & be : it.second) + { + if (be.m_type != Type::Building) + continue; + + size_t const t = static_cast(Type::Street); + auto const * streetCandidates = GetEntries(be.m_address[t]); + if (streetCandidates == nullptr) + continue; + + for (auto & se : *streetCandidates) + { + if (se.IsParentTo(be)) + se.m_buildingsOnStreet.emplace_back(&be); + } + } + } +} } // namespace geocoder diff --git a/geocoder/hierarchy.hpp b/geocoder/hierarchy.hpp index 77b66f7793..57cc09e6f7 100644 --- a/geocoder/hierarchy.hpp +++ b/geocoder/hierarchy.hpp @@ -57,17 +57,24 @@ public: void DeserializeFromJSONImpl(json_t * const root, std::string const & jsonStr, ParsingStats & stats); + // Checks whether this entry is a parent of |e|. + bool IsParentTo(Entry const & e) const; + base::GeoObjectId m_osmId = base::GeoObjectId(base::GeoObjectId::kInvalid); // Original name of the entry. Useful for debugging. std::string m_name; // Tokenized and simplified name of the entry. - std::vector m_nameTokens; + Tokens m_nameTokens; Type m_type = Type::Count; // The address fields of this entry, one per Type. std::array(Type::Count) + 1> m_address; + + // List of houses that belong to the street that is desribed by this entry. + // Only valid if |m_type| is Type::Street. + mutable std::vector m_buildingsOnStreet; }; explicit Hierarchy(std::string const & pathToJsonHierarchy); @@ -77,10 +84,13 @@ public: // // todo This method (and the whole class, in fact) is in the // prototype stage and may be too slow. Proper indexing should - // be implemented to perform this type of queries.a + // be implemented to perform this type of queries. std::vector const * const GetEntries(std::vector const & tokens) const; private: + // Fills |m_buildingsOnStreet| field for all street entries. + void IndexHouses(); + std::map> m_entries; }; } // namespace geocoder diff --git a/geocoder/result.hpp b/geocoder/result.hpp index ba04bad46d..c073125ea7 100644 --- a/geocoder/result.hpp +++ b/geocoder/result.hpp @@ -8,6 +8,8 @@ namespace geocoder { struct Result { + Result() = default; + Result(base::GeoObjectId const & osmId, double certainty) : m_osmId(osmId), m_certainty(certainty) { } diff --git a/geocoder/types.cpp b/geocoder/types.cpp index cd61053f04..52b6024467 100644 --- a/geocoder/types.cpp +++ b/geocoder/types.cpp @@ -14,8 +14,9 @@ string ToString(Type type) case Type::Region: return "region"; case Type::Subregion: return "subregion"; case Type::Locality: return "locality"; - case Type::Sublocality: return "sublocality"; case Type::Suburb: return "suburb"; + case Type::Sublocality: return "sublocality"; + case Type::Street: return "street"; case Type::Building: return "building"; case Type::Count: return "count"; } diff --git a/geocoder/types.hpp b/geocoder/types.hpp index 776fd71c18..41977c2643 100644 --- a/geocoder/types.hpp +++ b/geocoder/types.hpp @@ -19,6 +19,7 @@ enum class Type Locality, Suburb, Sublocality, + Street, Building, Count