[geocoder] Improvements.

* Keeping only the top results (this is not a proper beam search
  yet but we probably will still need the data structure in the future).
* Fixed the parent-child layer check.
* A separate code path to index the buildings.
This commit is contained in:
Maxim Pimenov 2018-11-26 17:43:52 +03:00 committed by Tatiana Yan
parent cac79f6091
commit dbd3473e5d
13 changed files with 272 additions and 50 deletions

View file

@ -4,6 +4,8 @@ include_directories(${OMIM_ROOT}/3party/jansson/src)
set(
SRC
beam.cpp
beam.hpp
geocoder.cpp
geocoder.hpp
hierarchy.cpp

31
geocoder/beam.cpp Normal file
View file

@ -0,0 +1,31 @@
#include "geocoder/beam.hpp"
#include "base/assert.hpp"
#include <algorithm>
namespace geocoder
{
Beam::Beam(size_t capacity) : m_capacity(capacity) { m_entries.reserve(m_capacity); }
void Beam::Add(Key const & key, Value value)
{
if (m_capacity == 0)
return;
Entry const e(key, value);
auto it = std::lower_bound(m_entries.begin(), m_entries.end(), e);
if (it == m_entries.end())
{
if (m_entries.size() < m_capacity)
m_entries.emplace_back(e);
return;
}
if (m_entries.size() == m_capacity)
m_entries.pop_back();
m_entries.insert(it, e);
}
} // namespace geocoder

40
geocoder/beam.hpp Normal file
View file

@ -0,0 +1,40 @@
#pragma once
#include "base/geo_object_id.hpp"
#include <vector>
namespace geocoder
{
// A data structure to perform the beam search with.
// Maintains a list of (Key, Value) pairs sorted in the decreasing
// order of Values.
class Beam
{
public:
using Key = base::GeoObjectId;
using Value = double;
struct Entry
{
Key m_key;
Value m_value;
Entry(Key const & key, Value value) : m_key(key), m_value(value) {}
bool operator<(Entry const & rhs) const { return m_value > rhs.m_value; }
};
explicit Beam(size_t capacity);
// O(log(n) + n) for |n| entries.
// O(|m_capacity|) in the worst case.
void Add(Key const & key, Value value);
std::vector<Entry> const & GetEntries() const { return m_entries; }
private:
size_t m_capacity;
std::vector<Entry> m_entries;
};
} // namespace geocoder

View file

@ -1,10 +1,14 @@
#include "geocoder/geocoder.hpp"
#include "search/house_numbers_matcher.hpp"
#include "indexer/search_string_utils.hpp"
#include "base/assert.hpp"
#include "base/logging.hpp"
#include "base/scope_guard.hpp"
#include "base/stl_helpers.hpp"
#include "base/string_utils.hpp"
#include "base/timer.hpp"
#include <algorithm>
@ -14,6 +18,20 @@ using namespace std;
namespace
{
size_t const kMaxResults = 100;
map<geocoder::Type, double> const kWeight = {
{geocoder::Type::Country, 10.0},
{geocoder::Type::Region, 5.0},
{geocoder::Type::Subregion, 4.0},
{geocoder::Type::Locality, 3.0},
{geocoder::Type::Suburb, 3.0},
{geocoder::Type::Sublocality, 2.0},
{geocoder::Type::Street, 1.0},
{geocoder::Type::Building, 0.1},
{geocoder::Type::Count, 0.0},
};
// todo(@m) This is taken from search/geocoder.hpp. Refactor.
struct ScopedMarkTokens
{
@ -52,25 +70,29 @@ geocoder::Type NextType(geocoder::Type type)
bool FindParent(vector<geocoder::Geocoder::Layer> const & layers,
geocoder::Hierarchy::Entry const & e)
{
for (auto const & layer : layers)
CHECK(!layers.empty(), ());
auto const & layer = layers.back();
for (auto const * pe : layer.m_entries)
{
for (auto const * pe : layer.m_entries)
{
// Note that the relationship is somewhat inverted: every ancestor
// is stored in the address but the nodes have no information
// about their children.
if (e.m_address[static_cast<size_t>(pe->m_type)] == pe->m_nameTokens)
return true;
}
// Note that the relationship is somewhat inverted: every ancestor
// is stored in the address but the nodes have no information
// about their children.
if (pe->IsParentTo(e))
return true;
}
return false;
}
strings::UniString MakeHouseNumber(geocoder::Tokens const & tokens)
{
return strings::JoinStrings(tokens, strings::MakeUniString(""));
}
} // namespace
namespace geocoder
{
// Geocoder::Context -------------------------------------------------------------------------------
Geocoder::Context::Context(string const & query)
Geocoder::Context::Context(string const & query) : m_beam(kMaxResults)
{
search::NormalizeAndTokenizeString(query, m_tokens);
m_tokenTypes.assign(m_tokens.size(), Type::Count);
@ -116,15 +138,25 @@ bool Geocoder::Context::AllTokensUsed() const { return m_numUsedTokens == m_toke
void Geocoder::Context::AddResult(base::GeoObjectId const & osmId, double certainty)
{
m_results[osmId] = max(m_results[osmId], certainty);
m_beam.Add(osmId, certainty);
}
void Geocoder::Context::FillResults(vector<Result> & results) const
{
results.clear();
results.reserve(m_results.size());
for (auto const & e : m_results)
results.emplace_back(e.first /* osmId */, e.second /* certainty */);
results.reserve(m_beam.GetEntries().size());
set<decltype(m_beam)::Key> seen;
for (auto const & e : m_beam.GetEntries())
{
if (!seen.insert(e.m_key).second)
continue;
results.emplace_back(e.m_key /* osmId */, e.m_value /* certainty */);
}
ASSERT(is_sorted(results.rbegin(), results.rend(), base::LessBy(&Result::m_certainty)), ());
ASSERT_LESS_OR_EQUAL(results.size(), kMaxResults, ());
}
vector<Geocoder::Layer> & Geocoder::Context::GetLayers() { return m_layers; }
@ -132,7 +164,7 @@ vector<Geocoder::Layer> & Geocoder::Context::GetLayers() { return m_layers; }
vector<Geocoder::Layer> const & Geocoder::Context::GetLayers() const { return m_layers; }
// Geocoder ----------------------------------------------------------------------------------------
Geocoder::Geocoder(string pathToJsonHierarchy) : m_hierarchy(pathToJsonHierarchy) {}
Geocoder::Geocoder(string const & pathToJsonHierarchy) : m_hierarchy(pathToJsonHierarchy) {}
void Geocoder::ProcessQuery(string const & query, vector<Result> & results) const
{
@ -172,39 +204,92 @@ void Geocoder::Go(Context & ctx, Type type) const
subquery.push_back(ctx.GetToken(j));
auto const * entries = m_hierarchy.GetEntries(subquery);
if (!entries || entries->empty())
continue;
Layer curLayer;
curLayer.m_type = type;
for (auto const & e : *entries)
{
if (e.m_type != type)
continue;
if (ctx.GetLayers().empty() || FindParent(ctx.GetLayers(), e))
curLayer.m_entries.emplace_back(&e);
// Buildings are indexed separately.
if (type == Type::Building)
{
FillBuildingsLayer(ctx, subquery, curLayer);
}
else
{
FillRegularLayer(ctx, type, subquery, curLayer);
}
if (!curLayer.m_entries.empty())
if (curLayer.m_entries.empty())
continue;
ScopedMarkTokens mark(ctx, type, i, j + 1);
// double const certainty =
// static_cast<double>(ctx.GetNumUsedTokens()) /
// static_cast<double>(ctx.GetNumTokens());
double certainty = 0;
for (auto const t : ctx.GetTokenTypes())
{
ScopedMarkTokens mark(ctx, type, i, j + 1);
double const certainty =
static_cast<double>(ctx.GetNumUsedTokens()) / static_cast<double>(ctx.GetNumTokens());
for (auto const * e : curLayer.m_entries)
ctx.AddResult(e->m_osmId, certainty);
ctx.GetLayers().emplace_back(move(curLayer));
SCOPE_GUARD(pop, [&] { ctx.GetLayers().pop_back(); });
Go(ctx, NextType(type));
auto const it = kWeight.find(t);
CHECK(it != kWeight.end(), ());
certainty += it->second;
}
LOG(LINFO, (ctx.GetTokenTypes(), certainty));
for (auto const * e : curLayer.m_entries)
{
ctx.AddResult(e->m_osmId, certainty);
}
ctx.GetLayers().emplace_back(move(curLayer));
SCOPE_GUARD(pop, [&] { ctx.GetLayers().pop_back(); });
Go(ctx, NextType(type));
}
}
Go(ctx, NextType(type));
}
void Geocoder::FillBuildingsLayer(Context const & ctx, Tokens const & subquery,
Layer & curLayer) const
{
if (ctx.GetLayers().empty())
return;
auto const & layer = ctx.GetLayers().back();
if (layer.m_type != Type::Street)
return;
auto const & subqueryHN = MakeHouseNumber(subquery);
if (!search::house_numbers::LooksLikeHouseNumber(subqueryHN, false /* isPrefix */))
return;
for (auto const & se : layer.m_entries)
{
for (auto const & be : se->m_buildingsOnStreet)
{
auto const bt = static_cast<size_t>(Type::Building);
auto const & realHN = MakeHouseNumber(be->m_address[bt]);
if (search::house_numbers::HouseNumbersMatch(realHN, subqueryHN, false /* queryIsPrefix */))
curLayer.m_entries.emplace_back(be);
}
}
}
void Geocoder::FillRegularLayer(Context const & ctx, Type type, Tokens const & subquery,
Layer & curLayer) const
{
auto const * entries = m_hierarchy.GetEntries(subquery);
if (!entries || entries->empty())
return;
for (auto const & e : *entries)
{
if (e.m_type != type)
continue;
if (ctx.GetLayers().empty() || FindParent(ctx.GetLayers(), e))
curLayer.m_entries.emplace_back(&e);
}
}
} // namespace geocoder

View file

@ -1,5 +1,6 @@
#pragma once
#include "geocoder/beam.hpp"
#include "geocoder/hierarchy.hpp"
#include "geocoder/result.hpp"
#include "geocoder/types.hpp"
@ -9,6 +10,7 @@
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
namespace geocoder
@ -78,13 +80,14 @@ public:
size_t m_numUsedTokens = 0;
// The highest value of certainty for each retrieved osm id.
std::unordered_map<base::GeoObjectId, double> m_results;
// The highest value of certainty for a fixed amount of
// the most relevant retrieved osm ids.
Beam m_beam;
std::vector<Layer> m_layers;
};
explicit Geocoder(std::string pathToJsonHierarchy);
explicit Geocoder(std::string const & pathToJsonHierarchy);
void ProcessQuery(std::string const & query, std::vector<Result> & results) const;
@ -93,7 +96,10 @@ public:
private:
void Go(Context & ctx, Type type) const;
void EmitResult() const;
void FillBuildingsLayer(Context const & ctx, Tokens const & subquery, Layer & curLayer) const;
void FillRegularLayer(Context const & ctx, Type type, Tokens const & subquery,
Layer & curLayer) const;
Hierarchy m_hierarchy;
};

View file

@ -12,6 +12,7 @@ omim_add_executable(${PROJECT_NAME} ${SRC})
omim_link_libraries(
${PROJECT_NAME}
geocoder
search
indexer
platform
coding

View file

@ -11,6 +11,7 @@ omim_link_libraries(
${PROJECT_NAME}
platform_tests_support
geocoder
search
indexer
platform
coding

View file

@ -60,9 +60,10 @@ UNIT_TEST(Geocoder_Smoke)
base::GeoObjectId const florenciaId(0xc00000000059d6b5);
base::GeoObjectId const cubaId(0xc00000000004b279);
TestGeocoder(geocoder, "florencia", {{florenciaId, 1.0}});
TestGeocoder(geocoder, "cuba florencia", {{florenciaId, 1.0}, {cubaId, 0.5}});
TestGeocoder(geocoder, "florencia somewhere in cuba", {{cubaId, 0.25}, {florenciaId, 0.5}});
// todo(@m) Return the certainty levels back to the [0.0, 1.0] range.
TestGeocoder(geocoder, "florencia", {{florenciaId, 4.0}});
TestGeocoder(geocoder, "cuba florencia", {{florenciaId, 14.0}, {cubaId, 10.0}});
TestGeocoder(geocoder, "florencia somewhere in cuba", {{cubaId, 10.0}, {florenciaId, 14.0}});
}
UNIT_TEST(Geocoder_Hierarchy)

View file

@ -7,6 +7,9 @@
#include "base/logging.hpp"
#include "base/macros.hpp"
#include "base/stl_helpers.hpp"
#include "base/string_utils.hpp"
#include <fstream>
using namespace std;
@ -81,10 +84,19 @@ void Hierarchy::Entry::DeserializeFromJSONImpl(json_t * const root, string const
else if (m_nameTokens != m_address[static_cast<size_t>(m_type)])
{
++stats.m_mismatchedNames;
LOG(LDEBUG, ("Hierarchy entry name is not the most detailed field in its address:", jsonStr));
}
}
bool Hierarchy::Entry::IsParentTo(Hierarchy::Entry const & e) const
{
for (size_t i = 0; i < static_cast<size_t>(geocoder::Type::Count); ++i)
{
if (!m_address[i].empty() && m_address[i] != e.m_address[i])
return false;
}
return true;
}
// Hierarchy ---------------------------------------------------------------------------------------
Hierarchy::Hierarchy(string const & pathToJsonHierarchy)
{
@ -114,16 +126,22 @@ Hierarchy::Hierarchy(string const & pathToJsonHierarchy)
if (!entry.DeserializeFromJSON(line, stats))
continue;
// The entry is indexed only by its address.
// The entry is indexed only its address.
// todo(@m) Index it by name too.
if (entry.m_type != Type::Count)
{
++stats.m_numLoaded;
size_t const t = static_cast<size_t>(entry.m_type);
m_entries[entry.m_address[t]].emplace_back(entry);
// Index every token but do not index prefixes.
// for (auto const & tok : entry.m_address[t])
// m_entries[{tok}].emplace_back(entry);
}
}
IndexHouses();
LOG(LINFO, ("Finished reading the hierarchy. Stats:"));
LOG(LINFO, ("Entries indexed:", stats.m_numLoaded));
LOG(LINFO, ("Corrupted json lines:", stats.m_badJsons));
@ -145,4 +163,27 @@ vector<Hierarchy::Entry> const * const Hierarchy::GetEntries(
return &it->second;
}
void Hierarchy::IndexHouses()
{
for (auto const & it : m_entries)
{
for (auto const & be : it.second)
{
if (be.m_type != Type::Building)
continue;
size_t const t = static_cast<size_t>(Type::Street);
auto const * streetCandidates = GetEntries(be.m_address[t]);
if (streetCandidates == nullptr)
continue;
for (auto & se : *streetCandidates)
{
if (se.IsParentTo(be))
se.m_buildingsOnStreet.emplace_back(&be);
}
}
}
}
} // namespace geocoder

View file

@ -57,17 +57,24 @@ public:
void DeserializeFromJSONImpl(json_t * const root, std::string const & jsonStr,
ParsingStats & stats);
// Checks whether this entry is a parent of |e|.
bool IsParentTo(Entry const & e) const;
base::GeoObjectId m_osmId = base::GeoObjectId(base::GeoObjectId::kInvalid);
// Original name of the entry. Useful for debugging.
std::string m_name;
// Tokenized and simplified name of the entry.
std::vector<strings::UniString> m_nameTokens;
Tokens m_nameTokens;
Type m_type = Type::Count;
// The address fields of this entry, one per Type.
std::array<Tokens, static_cast<size_t>(Type::Count) + 1> m_address;
// List of houses that belong to the street that is desribed by this entry.
// Only valid if |m_type| is Type::Street.
mutable std::vector<Entry const *> m_buildingsOnStreet;
};
explicit Hierarchy(std::string const & pathToJsonHierarchy);
@ -77,10 +84,13 @@ public:
//
// todo This method (and the whole class, in fact) is in the
// prototype stage and may be too slow. Proper indexing should
// be implemented to perform this type of queries.a
// be implemented to perform this type of queries.
std::vector<Entry> const * const GetEntries(std::vector<strings::UniString> const & tokens) const;
private:
// Fills |m_buildingsOnStreet| field for all street entries.
void IndexHouses();
std::map<Tokens, std::vector<Entry>> m_entries;
};
} // namespace geocoder

View file

@ -8,6 +8,8 @@ namespace geocoder
{
struct Result
{
Result() = default;
Result(base::GeoObjectId const & osmId, double certainty) : m_osmId(osmId), m_certainty(certainty)
{
}

View file

@ -14,8 +14,9 @@ string ToString(Type type)
case Type::Region: return "region";
case Type::Subregion: return "subregion";
case Type::Locality: return "locality";
case Type::Sublocality: return "sublocality";
case Type::Suburb: return "suburb";
case Type::Sublocality: return "sublocality";
case Type::Street: return "street";
case Type::Building: return "building";
case Type::Count: return "count";
}

View file

@ -19,6 +19,7 @@ enum class Type
Locality,
Suburb,
Sublocality,
Street,
Building,
Count