forked from organicmaps/organicmaps
[geocoder] Improvements.
* Keeping only the top results (this is not a proper beam search yet but we probably will still need the data structure in the future). * Fixed the parent-child layer check. * A separate code path to index the buildings.
This commit is contained in:
parent
cac79f6091
commit
dbd3473e5d
13 changed files with 272 additions and 50 deletions
|
@ -4,6 +4,8 @@ include_directories(${OMIM_ROOT}/3party/jansson/src)
|
|||
|
||||
set(
|
||||
SRC
|
||||
beam.cpp
|
||||
beam.hpp
|
||||
geocoder.cpp
|
||||
geocoder.hpp
|
||||
hierarchy.cpp
|
||||
|
|
31
geocoder/beam.cpp
Normal file
31
geocoder/beam.cpp
Normal file
|
@ -0,0 +1,31 @@
|
|||
#include "geocoder/beam.hpp"
|
||||
|
||||
#include "base/assert.hpp"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
namespace geocoder
|
||||
{
|
||||
// Creates an empty beam limited to |capacity| entries.
// Storage for |capacity| entries is reserved up front.
Beam::Beam(size_t capacity)
  : m_capacity(capacity)
{
  m_entries.reserve(capacity);
}
|
||||
|
||||
// Inserts the (|key|, |value|) pair into the beam while keeping |m_entries|
// sorted in the decreasing order of values and no longer than |m_capacity|.
//
// If the beam is full, the entry with the lowest value is evicted to make room,
// unless the new entry itself would be placed last, in which case it is dropped.
// Keys are not deduplicated: the same key may be stored several times.
void Beam::Add(Key const & key, Value value)
{
  // A beam with no capacity can never hold anything.
  if (m_capacity == 0)
    return;

  Entry const e(key, value);
  // Entry::operator< orders by decreasing value, so this finds the first
  // stored entry whose value does not exceed the new one.
  auto const it = std::lower_bound(m_entries.begin(), m_entries.end(), e);

  if (it == m_entries.end())
  {
    // The new entry ranks below everything stored: it fits only if there is spare room.
    if (m_entries.size() < m_capacity)
      m_entries.emplace_back(e);
    return;
  }

  // Remember the insertion point as an offset rather than as an iterator:
  // pop_back() invalidates iterators to the last element, and |it| may
  // be pointing exactly at it.
  auto const offset = it - m_entries.begin();

  if (m_entries.size() == m_capacity)
    m_entries.pop_back();

  m_entries.insert(m_entries.begin() + offset, e);
}
|
||||
} // namespace geocoder
|
40
geocoder/beam.hpp
Normal file
40
geocoder/beam.hpp
Normal file
|
@ -0,0 +1,40 @@
|
|||
#pragma once
|
||||
|
||||
#include "base/geo_object_id.hpp"
|
||||
|
||||
#include <vector>
|
||||
|
||||
namespace geocoder
|
||||
{
|
||||
// A data structure to perform the beam search with.
|
||||
// Maintains a list of (Key, Value) pairs sorted in the decreasing
|
||||
// order of Values.
|
||||
class Beam
|
||||
{
|
||||
public:
|
||||
using Key = base::GeoObjectId;
|
||||
using Value = double;
|
||||
|
||||
struct Entry
|
||||
{
|
||||
Key m_key;
|
||||
Value m_value;
|
||||
|
||||
Entry(Key const & key, Value value) : m_key(key), m_value(value) {}
|
||||
|
||||
bool operator<(Entry const & rhs) const { return m_value > rhs.m_value; }
|
||||
};
|
||||
|
||||
explicit Beam(size_t capacity);
|
||||
|
||||
// O(log(n) + n) for |n| entries.
|
||||
// O(|m_capacity|) in the worst case.
|
||||
void Add(Key const & key, Value value);
|
||||
|
||||
std::vector<Entry> const & GetEntries() const { return m_entries; }
|
||||
|
||||
private:
|
||||
size_t m_capacity;
|
||||
std::vector<Entry> m_entries;
|
||||
};
|
||||
} // namespace geocoder
|
|
@ -1,10 +1,14 @@
|
|||
#include "geocoder/geocoder.hpp"
|
||||
|
||||
#include "search/house_numbers_matcher.hpp"
|
||||
|
||||
#include "indexer/search_string_utils.hpp"
|
||||
|
||||
#include "base/assert.hpp"
|
||||
#include "base/logging.hpp"
|
||||
#include "base/scope_guard.hpp"
|
||||
#include "base/stl_helpers.hpp"
|
||||
#include "base/string_utils.hpp"
|
||||
#include "base/timer.hpp"
|
||||
|
||||
#include <algorithm>
|
||||
|
@ -14,6 +18,20 @@ using namespace std;
|
|||
|
||||
namespace
|
||||
{
|
||||
size_t const kMaxResults = 100;
|
||||
|
||||
map<geocoder::Type, double> const kWeight = {
|
||||
{geocoder::Type::Country, 10.0},
|
||||
{geocoder::Type::Region, 5.0},
|
||||
{geocoder::Type::Subregion, 4.0},
|
||||
{geocoder::Type::Locality, 3.0},
|
||||
{geocoder::Type::Suburb, 3.0},
|
||||
{geocoder::Type::Sublocality, 2.0},
|
||||
{geocoder::Type::Street, 1.0},
|
||||
{geocoder::Type::Building, 0.1},
|
||||
{geocoder::Type::Count, 0.0},
|
||||
};
|
||||
|
||||
// todo(@m) This is taken from search/geocoder.hpp. Refactor.
|
||||
struct ScopedMarkTokens
|
||||
{
|
||||
|
@ -52,25 +70,29 @@ geocoder::Type NextType(geocoder::Type type)
|
|||
bool FindParent(vector<geocoder::Geocoder::Layer> const & layers,
|
||||
geocoder::Hierarchy::Entry const & e)
|
||||
{
|
||||
for (auto const & layer : layers)
|
||||
CHECK(!layers.empty(), ());
|
||||
auto const & layer = layers.back();
|
||||
for (auto const * pe : layer.m_entries)
|
||||
{
|
||||
for (auto const * pe : layer.m_entries)
|
||||
{
|
||||
// Note that the relationship is somewhat inverted: every ancestor
|
||||
// is stored in the address but the nodes have no information
|
||||
// about their children.
|
||||
if (e.m_address[static_cast<size_t>(pe->m_type)] == pe->m_nameTokens)
|
||||
return true;
|
||||
}
|
||||
// Note that the relationship is somewhat inverted: every ancestor
|
||||
// is stored in the address but the nodes have no information
|
||||
// about their children.
|
||||
if (pe->IsParentTo(e))
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Concatenates |tokens| with an empty delimiter, producing a single
// string that can be handed to the house number matcher.
strings::UniString MakeHouseNumber(geocoder::Tokens const & tokens)
{
  auto const emptyDelimiter = strings::MakeUniString("");
  return strings::JoinStrings(tokens, emptyDelimiter);
}
|
||||
} // namespace
|
||||
|
||||
namespace geocoder
|
||||
{
|
||||
// Geocoder::Context -------------------------------------------------------------------------------
|
||||
Geocoder::Context::Context(string const & query)
|
||||
Geocoder::Context::Context(string const & query) : m_beam(kMaxResults)
|
||||
{
|
||||
search::NormalizeAndTokenizeString(query, m_tokens);
|
||||
m_tokenTypes.assign(m_tokens.size(), Type::Count);
|
||||
|
@ -116,15 +138,25 @@ bool Geocoder::Context::AllTokensUsed() const { return m_numUsedTokens == m_toke
|
|||
|
||||
void Geocoder::Context::AddResult(base::GeoObjectId const & osmId, double certainty)
|
||||
{
|
||||
m_results[osmId] = max(m_results[osmId], certainty);
|
||||
m_beam.Add(osmId, certainty);
|
||||
}
|
||||
|
||||
void Geocoder::Context::FillResults(vector<Result> & results) const
|
||||
{
|
||||
results.clear();
|
||||
results.reserve(m_results.size());
|
||||
for (auto const & e : m_results)
|
||||
results.emplace_back(e.first /* osmId */, e.second /* certainty */);
|
||||
results.reserve(m_beam.GetEntries().size());
|
||||
|
||||
set<decltype(m_beam)::Key> seen;
|
||||
for (auto const & e : m_beam.GetEntries())
|
||||
{
|
||||
if (!seen.insert(e.m_key).second)
|
||||
continue;
|
||||
|
||||
results.emplace_back(e.m_key /* osmId */, e.m_value /* certainty */);
|
||||
}
|
||||
|
||||
ASSERT(is_sorted(results.rbegin(), results.rend(), base::LessBy(&Result::m_certainty)), ());
|
||||
ASSERT_LESS_OR_EQUAL(results.size(), kMaxResults, ());
|
||||
}
|
||||
|
||||
vector<Geocoder::Layer> & Geocoder::Context::GetLayers() { return m_layers; }
|
||||
|
@ -132,7 +164,7 @@ vector<Geocoder::Layer> & Geocoder::Context::GetLayers() { return m_layers; }
|
|||
vector<Geocoder::Layer> const & Geocoder::Context::GetLayers() const { return m_layers; }
|
||||
|
||||
// Geocoder ----------------------------------------------------------------------------------------
|
||||
Geocoder::Geocoder(string pathToJsonHierarchy) : m_hierarchy(pathToJsonHierarchy) {}
|
||||
Geocoder::Geocoder(string const & pathToJsonHierarchy) : m_hierarchy(pathToJsonHierarchy) {}
|
||||
|
||||
void Geocoder::ProcessQuery(string const & query, vector<Result> & results) const
|
||||
{
|
||||
|
@ -172,39 +204,92 @@ void Geocoder::Go(Context & ctx, Type type) const
|
|||
|
||||
subquery.push_back(ctx.GetToken(j));
|
||||
|
||||
auto const * entries = m_hierarchy.GetEntries(subquery);
|
||||
if (!entries || entries->empty())
|
||||
continue;
|
||||
|
||||
Layer curLayer;
|
||||
curLayer.m_type = type;
|
||||
for (auto const & e : *entries)
|
||||
{
|
||||
if (e.m_type != type)
|
||||
continue;
|
||||
|
||||
if (ctx.GetLayers().empty() || FindParent(ctx.GetLayers(), e))
|
||||
curLayer.m_entries.emplace_back(&e);
|
||||
// Buildings are indexed separately.
|
||||
if (type == Type::Building)
|
||||
{
|
||||
FillBuildingsLayer(ctx, subquery, curLayer);
|
||||
}
|
||||
else
|
||||
{
|
||||
FillRegularLayer(ctx, type, subquery, curLayer);
|
||||
}
|
||||
|
||||
if (!curLayer.m_entries.empty())
|
||||
if (curLayer.m_entries.empty())
|
||||
continue;
|
||||
|
||||
ScopedMarkTokens mark(ctx, type, i, j + 1);
|
||||
|
||||
// double const certainty =
|
||||
// static_cast<double>(ctx.GetNumUsedTokens()) /
|
||||
// static_cast<double>(ctx.GetNumTokens());
|
||||
|
||||
double certainty = 0;
|
||||
for (auto const t : ctx.GetTokenTypes())
|
||||
{
|
||||
ScopedMarkTokens mark(ctx, type, i, j + 1);
|
||||
|
||||
double const certainty =
|
||||
static_cast<double>(ctx.GetNumUsedTokens()) / static_cast<double>(ctx.GetNumTokens());
|
||||
|
||||
for (auto const * e : curLayer.m_entries)
|
||||
ctx.AddResult(e->m_osmId, certainty);
|
||||
|
||||
ctx.GetLayers().emplace_back(move(curLayer));
|
||||
SCOPE_GUARD(pop, [&] { ctx.GetLayers().pop_back(); });
|
||||
|
||||
Go(ctx, NextType(type));
|
||||
auto const it = kWeight.find(t);
|
||||
CHECK(it != kWeight.end(), ());
|
||||
certainty += it->second;
|
||||
}
|
||||
LOG(LINFO, (ctx.GetTokenTypes(), certainty));
|
||||
|
||||
for (auto const * e : curLayer.m_entries)
|
||||
{
|
||||
ctx.AddResult(e->m_osmId, certainty);
|
||||
}
|
||||
|
||||
ctx.GetLayers().emplace_back(move(curLayer));
|
||||
SCOPE_GUARD(pop, [&] { ctx.GetLayers().pop_back(); });
|
||||
|
||||
Go(ctx, NextType(type));
|
||||
}
|
||||
}
|
||||
|
||||
Go(ctx, NextType(type));
|
||||
}
|
||||
|
||||
// Fills |curLayer| with the buildings whose house number matches |subquery|.
//
// Buildings are looked up only among the ones attached to the entries of
// the most recently added layer, and only when that layer is a street layer.
// Nothing is added if |subquery| does not look like a house number.
void Geocoder::FillBuildingsLayer(Context const & ctx, Tokens const & subquery,
                                  Layer & curLayer) const
{
  auto const & layers = ctx.GetLayers();
  if (layers.empty())
    return;

  auto const & streets = layers.back();
  if (streets.m_type != Type::Street)
    return;

  auto const queryHN = MakeHouseNumber(subquery);
  bool const isHouseNumber =
      search::house_numbers::LooksLikeHouseNumber(queryHN, false /* isPrefix */);
  if (!isHouseNumber)
    return;

  // The house number of a building is stored in the Building slot of its address.
  auto const buildingIdx = static_cast<size_t>(Type::Building);

  for (auto const * street : streets.m_entries)
  {
    for (auto const * building : street->m_buildingsOnStreet)
    {
      auto const buildingHN = MakeHouseNumber(building->m_address[buildingIdx]);
      if (!search::house_numbers::HouseNumbersMatch(buildingHN, queryHN,
                                                    false /* queryIsPrefix */))
      {
        continue;
      }

      curLayer.m_entries.emplace_back(building);
    }
  }
}
|
||||
|
||||
// Fills |curLayer| with the hierarchy entries of type |type| that are indexed
// by |subquery|. When some layers have already been matched, an entry is taken
// only if FindParent() accepts it against those layers.
void Geocoder::FillRegularLayer(Context const & ctx, Type type, Tokens const & subquery,
                                Layer & curLayer) const
{
  auto const * candidates = m_hierarchy.GetEntries(subquery);
  if (candidates == nullptr || candidates->empty())
    return;

  auto const & layers = ctx.GetLayers();

  for (auto const & candidate : *candidates)
  {
    bool const typeMatches = candidate.m_type == type;
    if (typeMatches && (layers.empty() || FindParent(layers, candidate)))
      curLayer.m_entries.emplace_back(&candidate);
  }
}
|
||||
} // namespace geocoder
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
#pragma once
|
||||
|
||||
#include "geocoder/beam.hpp"
|
||||
#include "geocoder/hierarchy.hpp"
|
||||
#include "geocoder/result.hpp"
|
||||
#include "geocoder/types.hpp"
|
||||
|
@ -9,6 +10,7 @@
|
|||
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
namespace geocoder
|
||||
|
@ -78,13 +80,14 @@ public:
|
|||
|
||||
size_t m_numUsedTokens = 0;
|
||||
|
||||
// The highest value of certainty for each retrieved osm id.
|
||||
std::unordered_map<base::GeoObjectId, double> m_results;
|
||||
// The highest value of certainty for a fixed amount of
|
||||
// the most relevant retrieved osm ids.
|
||||
Beam m_beam;
|
||||
|
||||
std::vector<Layer> m_layers;
|
||||
};
|
||||
|
||||
explicit Geocoder(std::string pathToJsonHierarchy);
|
||||
explicit Geocoder(std::string const & pathToJsonHierarchy);
|
||||
|
||||
void ProcessQuery(std::string const & query, std::vector<Result> & results) const;
|
||||
|
||||
|
@ -93,7 +96,10 @@ public:
|
|||
private:
|
||||
void Go(Context & ctx, Type type) const;
|
||||
|
||||
void EmitResult() const;
|
||||
void FillBuildingsLayer(Context const & ctx, Tokens const & subquery, Layer & curLayer) const;
|
||||
|
||||
void FillRegularLayer(Context const & ctx, Type type, Tokens const & subquery,
|
||||
Layer & curLayer) const;
|
||||
|
||||
Hierarchy m_hierarchy;
|
||||
};
|
||||
|
|
|
@ -12,6 +12,7 @@ omim_add_executable(${PROJECT_NAME} ${SRC})
|
|||
omim_link_libraries(
|
||||
${PROJECT_NAME}
|
||||
geocoder
|
||||
search
|
||||
indexer
|
||||
platform
|
||||
coding
|
||||
|
|
|
@ -11,6 +11,7 @@ omim_link_libraries(
|
|||
${PROJECT_NAME}
|
||||
platform_tests_support
|
||||
geocoder
|
||||
search
|
||||
indexer
|
||||
platform
|
||||
coding
|
||||
|
|
|
@ -60,9 +60,10 @@ UNIT_TEST(Geocoder_Smoke)
|
|||
base::GeoObjectId const florenciaId(0xc00000000059d6b5);
|
||||
base::GeoObjectId const cubaId(0xc00000000004b279);
|
||||
|
||||
TestGeocoder(geocoder, "florencia", {{florenciaId, 1.0}});
|
||||
TestGeocoder(geocoder, "cuba florencia", {{florenciaId, 1.0}, {cubaId, 0.5}});
|
||||
TestGeocoder(geocoder, "florencia somewhere in cuba", {{cubaId, 0.25}, {florenciaId, 0.5}});
|
||||
// todo(@m) Return the certainty levels back to the [0.0, 1.0] range.
|
||||
TestGeocoder(geocoder, "florencia", {{florenciaId, 4.0}});
|
||||
TestGeocoder(geocoder, "cuba florencia", {{florenciaId, 14.0}, {cubaId, 10.0}});
|
||||
TestGeocoder(geocoder, "florencia somewhere in cuba", {{cubaId, 10.0}, {florenciaId, 14.0}});
|
||||
}
|
||||
|
||||
UNIT_TEST(Geocoder_Hierarchy)
|
||||
|
|
|
@ -7,6 +7,9 @@
|
|||
#include "base/logging.hpp"
|
||||
#include "base/macros.hpp"
|
||||
|
||||
#include "base/stl_helpers.hpp"
|
||||
#include "base/string_utils.hpp"
|
||||
|
||||
#include <fstream>
|
||||
|
||||
using namespace std;
|
||||
|
@ -81,10 +84,19 @@ void Hierarchy::Entry::DeserializeFromJSONImpl(json_t * const root, string const
|
|||
else if (m_nameTokens != m_address[static_cast<size_t>(m_type)])
|
||||
{
|
||||
++stats.m_mismatchedNames;
|
||||
LOG(LDEBUG, ("Hierarchy entry name is not the most detailed field in its address:", jsonStr));
|
||||
}
|
||||
}
|
||||
|
||||
// Returns true iff every non-empty address field of this entry
// is equal to the corresponding address field of |e|.
// Empty fields of this entry impose no restriction on |e|.
bool Hierarchy::Entry::IsParentTo(Hierarchy::Entry const & e) const
{
  size_t const numTypes = static_cast<size_t>(geocoder::Type::Count);

  for (size_t i = 0; i != numTypes; ++i)
  {
    auto const & field = m_address[i];
    if (field.empty())
      continue;

    if (field != e.m_address[i])
      return false;
  }

  return true;
}
|
||||
|
||||
// Hierarchy ---------------------------------------------------------------------------------------
|
||||
Hierarchy::Hierarchy(string const & pathToJsonHierarchy)
|
||||
{
|
||||
|
@ -114,16 +126,22 @@ Hierarchy::Hierarchy(string const & pathToJsonHierarchy)
|
|||
if (!entry.DeserializeFromJSON(line, stats))
|
||||
continue;
|
||||
|
||||
// The entry is indexed only by its address.
|
||||
// The entry is indexed only by its address.
|
||||
// todo(@m) Index it by name too.
|
||||
if (entry.m_type != Type::Count)
|
||||
{
|
||||
++stats.m_numLoaded;
|
||||
size_t const t = static_cast<size_t>(entry.m_type);
|
||||
m_entries[entry.m_address[t]].emplace_back(entry);
|
||||
|
||||
// Index every token but do not index prefixes.
|
||||
// for (auto const & tok : entry.m_address[t])
|
||||
// m_entries[{tok}].emplace_back(entry);
|
||||
}
|
||||
}
|
||||
|
||||
IndexHouses();
|
||||
|
||||
LOG(LINFO, ("Finished reading the hierarchy. Stats:"));
|
||||
LOG(LINFO, ("Entries indexed:", stats.m_numLoaded));
|
||||
LOG(LINFO, ("Corrupted json lines:", stats.m_badJsons));
|
||||
|
@ -145,4 +163,27 @@ vector<Hierarchy::Entry> const * const Hierarchy::GetEntries(
|
|||
|
||||
return &it->second;
|
||||
}
|
||||
|
||||
void Hierarchy::IndexHouses()
|
||||
{
|
||||
for (auto const & it : m_entries)
|
||||
{
|
||||
for (auto const & be : it.second)
|
||||
{
|
||||
if (be.m_type != Type::Building)
|
||||
continue;
|
||||
|
||||
size_t const t = static_cast<size_t>(Type::Street);
|
||||
auto const * streetCandidates = GetEntries(be.m_address[t]);
|
||||
if (streetCandidates == nullptr)
|
||||
continue;
|
||||
|
||||
for (auto & se : *streetCandidates)
|
||||
{
|
||||
if (se.IsParentTo(be))
|
||||
se.m_buildingsOnStreet.emplace_back(&be);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace geocoder
|
||||
|
|
|
@ -57,17 +57,24 @@ public:
|
|||
void DeserializeFromJSONImpl(json_t * const root, std::string const & jsonStr,
|
||||
ParsingStats & stats);
|
||||
|
||||
// Checks whether this entry is a parent of |e|.
|
||||
bool IsParentTo(Entry const & e) const;
|
||||
|
||||
base::GeoObjectId m_osmId = base::GeoObjectId(base::GeoObjectId::kInvalid);
|
||||
|
||||
// Original name of the entry. Useful for debugging.
|
||||
std::string m_name;
|
||||
// Tokenized and simplified name of the entry.
|
||||
std::vector<strings::UniString> m_nameTokens;
|
||||
Tokens m_nameTokens;
|
||||
|
||||
Type m_type = Type::Count;
|
||||
|
||||
// The address fields of this entry, one per Type.
|
||||
std::array<Tokens, static_cast<size_t>(Type::Count) + 1> m_address;
|
||||
|
||||
// List of houses that belong to the street that is described by this entry.
|
||||
// Only valid if |m_type| is Type::Street.
|
||||
mutable std::vector<Entry const *> m_buildingsOnStreet;
|
||||
};
|
||||
|
||||
explicit Hierarchy(std::string const & pathToJsonHierarchy);
|
||||
|
@ -77,10 +84,13 @@ public:
|
|||
//
|
||||
// todo This method (and the whole class, in fact) is in the
|
||||
// prototype stage and may be too slow. Proper indexing should
|
||||
// be implemented to perform this type of queries.a
|
||||
// be implemented to perform this type of queries.
|
||||
std::vector<Entry> const * const GetEntries(std::vector<strings::UniString> const & tokens) const;
|
||||
|
||||
private:
|
||||
// Fills |m_buildingsOnStreet| field for all street entries.
|
||||
void IndexHouses();
|
||||
|
||||
std::map<Tokens, std::vector<Entry>> m_entries;
|
||||
};
|
||||
} // namespace geocoder
|
||||
|
|
|
@ -8,6 +8,8 @@ namespace geocoder
|
|||
{
|
||||
struct Result
|
||||
{
|
||||
Result() = default;
|
||||
|
||||
Result(base::GeoObjectId const & osmId, double certainty) : m_osmId(osmId), m_certainty(certainty)
|
||||
{
|
||||
}
|
||||
|
|
|
@ -14,8 +14,9 @@ string ToString(Type type)
|
|||
case Type::Region: return "region";
|
||||
case Type::Subregion: return "subregion";
|
||||
case Type::Locality: return "locality";
|
||||
case Type::Sublocality: return "sublocality";
|
||||
case Type::Suburb: return "suburb";
|
||||
case Type::Sublocality: return "sublocality";
|
||||
case Type::Street: return "street";
|
||||
case Type::Building: return "building";
|
||||
case Type::Count: return "count";
|
||||
}
|
||||
|
|
|
@ -19,6 +19,7 @@ enum class Type
|
|||
Locality,
|
||||
Suburb,
|
||||
Sublocality,
|
||||
Street,
|
||||
Building,
|
||||
|
||||
Count
|
||||
|
|
Loading…
Add table
Reference in a new issue